mirror of https://github.com/zama-ai/concrete.git, synced 2026-02-08 19:44:57 -05:00
chore(cuda): Refactor device.cu functions to take pointers to cudaStream_t instead of void
@@ -2,13 +2,14 @@
 #include <cuda_runtime.h>
 
 extern "C" {
 
-void *cuda_create_stream(uint32_t gpu_index);
+cudaStream_t *cuda_create_stream(uint32_t gpu_index);
 
-int cuda_destroy_stream(void *v_stream, uint32_t gpu_index);
+int cuda_destroy_stream(cudaStream_t *stream, uint32_t gpu_index);
 
 void *cuda_malloc(uint64_t size, uint32_t gpu_index);
 
-void *cuda_malloc_async(uint64_t size, cudaStream_t stream, uint32_t gpu_index);
+void *cuda_malloc_async(uint64_t size, cudaStream_t *stream,
+                        uint32_t gpu_index);
 
 int cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
 
@@ -16,20 +17,20 @@ int cuda_memcpy_to_cpu(void *dest, const void *src, uint64_t size,
                        uint32_t gpu_index);
 
 int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
-                             void *v_stream, uint32_t gpu_index);
+                             cudaStream_t *stream, uint32_t gpu_index);
 
 int cuda_memcpy_to_gpu(void *dest, void *src, uint64_t size,
                        uint32_t gpu_index);
 
 int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
-                             void *v_stream, uint32_t gpu_index);
+                             cudaStream_t *stream, uint32_t gpu_index);
 int cuda_get_number_of_gpus();
 
 int cuda_synchronize_device(uint32_t gpu_index);
 
 int cuda_drop(void *ptr, uint32_t gpu_index);
 
-int cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index);
+int cuda_drop_async(void *ptr, cudaStream_t *stream, uint32_t gpu_index);
 
 int cuda_get_max_shared_memory(uint32_t gpu_index);
 }
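For context, a minimal sketch of how a caller might drive the refactored API above, with the stream handled as a cudaStream_t * end to end. The function name, buffer size, host pointers, and GPU index below are illustrative only and not taken from the codebase.

// Hypothetical caller of the declarations above.
// Assumes the extern "C" prototypes shown in the hunk above are in scope.
#include <cstdint>
#include <cuda_runtime.h>

void copy_roundtrip(void *h_src, void *h_dst, uint64_t size) {
  uint32_t gpu_index = 0;                                  // illustrative GPU index
  cudaStream_t *stream = cuda_create_stream(gpu_index);    // now returns a typed pointer

  void *d_buf = cuda_malloc_async(size, stream, gpu_index);         // pointer passed as-is
  cuda_memcpy_async_to_gpu(d_buf, h_src, size, stream, gpu_index);
  cuda_memcpy_async_to_cpu(h_dst, d_buf, size, stream, gpu_index);

  cudaStreamSynchronize(*stream);              // plain CUDA calls still dereference the pointer
  cuda_drop_async(d_buf, stream, gpu_index);
  cuda_destroy_stream(stream, gpu_index);
}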
@@ -317,7 +317,7 @@ __host__ void host_bootstrap_amortized(
   // of shared memory)
   if (max_shared_memory < SM_PART) {
     d_mem = (char *)cuda_malloc_async(DM_FULL * input_lwe_ciphertext_count,
-                                      *stream, gpu_index);
+                                      stream, gpu_index);
     device_bootstrap_amortized<Torus, params, NOSM><<<grid, thds, 0, *stream>>>(
         lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
         bootstrapping_key, d_mem, input_lwe_dimension, polynomial_size,
@@ -328,7 +328,7 @@ __host__ void host_bootstrap_amortized(
     cudaFuncSetCacheConfig(device_bootstrap_amortized<Torus, params, PARTIALSM>,
                            cudaFuncCachePreferShared);
     d_mem = (char *)cuda_malloc_async(DM_PART * input_lwe_ciphertext_count,
-                                      *stream, gpu_index);
+                                      stream, gpu_index);
     device_bootstrap_amortized<Torus, params, PARTIALSM>
         <<<grid, thds, SM_PART, *stream>>>(
             lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
@@ -346,7 +346,7 @@ __host__ void host_bootstrap_amortized(
     checkCudaErrors(cudaFuncSetCacheConfig(
         device_bootstrap_amortized<Torus, params, FULLSM>,
         cudaFuncCachePreferShared));
-    d_mem = (char *)cuda_malloc_async(0, *stream, gpu_index);
+    d_mem = (char *)cuda_malloc_async(0, stream, gpu_index);
 
     device_bootstrap_amortized<Torus, params, FULLSM>
         <<<grid, thds, SM_FULL, *stream>>>(
@@ -359,7 +359,7 @@ __host__ void host_bootstrap_amortized(
   // Synchronize the streams before copying the result to lwe_array_out at the
   // right place
   cudaStreamSynchronize(*stream);
-  cuda_drop_async(d_mem, *stream, gpu_index);
+  cuda_drop_async(d_mem, stream, gpu_index);
 }
 
 template <typename Torus, class params>
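The pattern in these host-side hunks repeats throughout the PBS code: the allocation and free helpers now take the cudaStream_t * itself, while kernel launches and cudaStreamSynchronize still dereference it. A hedged sketch of that shape follows; the kernel, function name, and scratch size are placeholders, not code from this repository, and the helper prototypes from the header hunk above are assumed to be in scope.

#include <cstdint>
#include <cuda_runtime.h>

// Placeholder kernel; only the stream-handling shape matters here.
__global__ void placeholder_kernel(char *scratch) { scratch[threadIdx.x] = 0; }

void host_pattern(cudaStream_t *stream, uint32_t gpu_index) {
  const uint64_t scratch_bytes = 256;                                        // illustrative size
  char *d_mem = (char *)cuda_malloc_async(scratch_bytes, stream, gpu_index); // helper takes the pointer
  placeholder_kernel<<<1, 256, 0, *stream>>>(d_mem);                         // the launch dereferences it
  cudaStreamSynchronize(*stream);                                            // so does synchronization
  cuda_drop_async(d_mem, stream, gpu_index);                                 // helper takes the pointer again
}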
@@ -268,9 +268,9 @@ __host__ void host_bootstrap_low_latency(
   int buffer_size_per_gpu = level_count * input_lwe_ciphertext_count *
                             polynomial_size / 2 * sizeof(double2);
   double2 *mask_buffer_fft =
-      (double2 *)cuda_malloc_async(buffer_size_per_gpu, *stream, gpu_index);
+      (double2 *)cuda_malloc_async(buffer_size_per_gpu, stream, gpu_index);
   double2 *body_buffer_fft =
-      (double2 *)cuda_malloc_async(buffer_size_per_gpu, *stream, gpu_index);
+      (double2 *)cuda_malloc_async(buffer_size_per_gpu, stream, gpu_index);
 
   // With SM each block corresponds to either the mask or body, no need to
   // duplicate data for each
@@ -308,7 +308,7 @@ __host__ void host_bootstrap_low_latency(
     checkCudaErrors(cudaGetLastError());
     d_mem = (char *)cuda_malloc_async(DM_FULL * input_lwe_ciphertext_count *
                                           level_count * 2,
-                                      *stream, gpu_index);
+                                      stream, gpu_index);
     checkCudaErrors(cudaGetLastError());
     checkCudaErrors(cudaLaunchCooperativeKernel(
         (void *)device_bootstrap_low_latency<Torus, params, NOSM>, grid, thds,
@@ -317,7 +317,7 @@ __host__ void host_bootstrap_low_latency(
     kernel_args[11] = &DM_PART;
     d_mem = (char *)cuda_malloc_async(DM_PART * input_lwe_ciphertext_count *
                                           level_count * 2,
-                                      *stream, gpu_index);
+                                      stream, gpu_index);
     checkCudaErrors(cudaFuncSetAttribute(
         device_bootstrap_low_latency<Torus, params, PARTIALSM>,
         cudaFuncAttributeMaxDynamicSharedMemorySize, SM_PART));
@@ -332,7 +332,7 @@ __host__ void host_bootstrap_low_latency(
   } else {
     int DM_NONE = 0;
     kernel_args[11] = &DM_NONE;
-    d_mem = (char *)cuda_malloc_async(0, *stream, gpu_index);
+    d_mem = (char *)cuda_malloc_async(0, stream, gpu_index);
     checkCudaErrors(cudaFuncSetAttribute(
         device_bootstrap_low_latency<Torus, params, FULLSM>,
         cudaFuncAttributeMaxDynamicSharedMemorySize, SM_FULL));
@@ -347,9 +347,9 @@ __host__ void host_bootstrap_low_latency(
   // Synchronize the streams before copying the result to lwe_array_out at the
   // right place
   cudaStreamSynchronize(*stream);
-  cuda_drop_async(mask_buffer_fft, *stream, gpu_index);
-  cuda_drop_async(body_buffer_fft, *stream, gpu_index);
-  cuda_drop_async(d_mem, *stream, gpu_index);
+  cuda_drop_async(mask_buffer_fft, stream, gpu_index);
+  cuda_drop_async(body_buffer_fft, stream, gpu_index);
+  cuda_drop_async(d_mem, stream, gpu_index);
 }
 
 #endif // LOWLAT_PBS_H
@@ -296,10 +296,10 @@ void host_cmux_tree(void *v_stream, uint32_t gpu_index, Torus *glwe_array_out,
                   (glwe_dimension + 1) * level_count;
 
   double2 *d_ggsw_fft_in = (double2 *)cuda_malloc_async(
-      r * ggsw_size * sizeof(double), *stream, gpu_index);
+      r * ggsw_size * sizeof(double), stream, gpu_index);
 
   batch_fft_ggsw_vector<Torus, STorus, params>(
-      v_stream, d_ggsw_fft_in, ggsw_in, r, glwe_dimension, polynomial_size,
+      stream, d_ggsw_fft_in, ggsw_in, r, glwe_dimension, polynomial_size,
       level_count, gpu_index, max_shared_memory);
 
   //////////////////////
@@ -308,7 +308,7 @@ void host_cmux_tree(void *v_stream, uint32_t gpu_index, Torus *glwe_array_out,
   char *d_mem;
   if (max_shared_memory < memory_needed_per_block) {
     d_mem = (char *)cuda_malloc_async(memory_needed_per_block * (1 << (r - 1)),
-                                      *stream, gpu_index);
+                                      stream, gpu_index);
   } else {
     checkCudaErrors(cudaFuncSetAttribute(
         device_batch_cmux<Torus, STorus, params, FULLSM>,
@@ -322,9 +322,9 @@ void host_cmux_tree(void *v_stream, uint32_t gpu_index, Torus *glwe_array_out,
   int glwe_size = (glwe_dimension + 1) * polynomial_size;
 
   Torus *d_buffer1 = (Torus *)cuda_malloc_async(
-      num_lut * glwe_size * sizeof(Torus), *stream, gpu_index);
+      num_lut * glwe_size * sizeof(Torus), stream, gpu_index);
   Torus *d_buffer2 = (Torus *)cuda_malloc_async(
-      num_lut * glwe_size * sizeof(Torus), *stream, gpu_index);
+      num_lut * glwe_size * sizeof(Torus), stream, gpu_index);
 
   checkCudaErrors(cudaMemcpyAsync(d_buffer1, lut_vector,
                                   num_lut * glwe_size * sizeof(Torus),
@@ -368,11 +368,11 @@ void host_cmux_tree(void *v_stream, uint32_t gpu_index, Torus *glwe_array_out,
   checkCudaErrors(cudaStreamSynchronize(*stream));
 
   // Free memory
-  cuda_drop_async(d_ggsw_fft_in, *stream, gpu_index);
-  cuda_drop_async(d_buffer1, *stream, gpu_index);
-  cuda_drop_async(d_buffer2, *stream, gpu_index);
+  cuda_drop_async(d_ggsw_fft_in, stream, gpu_index);
+  cuda_drop_async(d_buffer1, stream, gpu_index);
+  cuda_drop_async(d_buffer2, stream, gpu_index);
   if (max_shared_memory < memory_needed_per_block)
-    cuda_drop_async(d_mem, *stream, gpu_index);
+    cuda_drop_async(d_mem, stream, gpu_index);
 }
 
 // only works for big lwe for ks+bs case
@@ -723,7 +723,7 @@ void host_blind_rotate_and_sample_extraction(
 
   char *d_mem;
   if (max_shared_memory < memory_needed_per_block)
-    d_mem = (char *)cuda_malloc_async(memory_needed_per_block * tau, *stream,
+    d_mem = (char *)cuda_malloc_async(memory_needed_per_block * tau, stream,
                                       gpu_index);
   else {
     checkCudaErrors(cudaFuncSetAttribute(
@@ -740,11 +740,11 @@ void host_blind_rotate_and_sample_extraction(
   int ggsw_size =
       polynomial_size * (glwe_dimension + 1) * (glwe_dimension + 1) * l_gadget;
   double2 *d_ggsw_fft_in = (double2 *)cuda_malloc_async(
-      mbr_size * ggsw_size * sizeof(double), *stream, gpu_index);
+      mbr_size * ggsw_size * sizeof(double), stream, gpu_index);
 
   batch_fft_ggsw_vector<Torus, STorus, params>(
-      v_stream, d_ggsw_fft_in, ggsw_in, mbr_size, glwe_dimension,
-      polynomial_size, l_gadget, gpu_index, max_shared_memory);
+      stream, d_ggsw_fft_in, ggsw_in, mbr_size, glwe_dimension, polynomial_size,
+      l_gadget, gpu_index, max_shared_memory);
   checkCudaErrors(cudaGetLastError());
 
   //
@@ -768,9 +768,9 @@ void host_blind_rotate_and_sample_extraction(
   checkCudaErrors(cudaGetLastError());
 
   //
-  cuda_drop_async(d_ggsw_fft_in, *stream, gpu_index);
+  cuda_drop_async(d_ggsw_fft_in, stream, gpu_index);
   if (max_shared_memory < memory_needed_per_block)
-    cuda_drop_async(d_mem, *stream, gpu_index);
+    cuda_drop_async(d_mem, stream, gpu_index);
 }
 
 template <typename Torus, class params>
@@ -852,25 +852,25 @@ __host__ void host_circuit_bootstrap_vertical_packing(
   int ggsw_size = level_count_cbs * (glwe_dimension + 1) *
                   (glwe_dimension + 1) * polynomial_size;
   Torus *ggsw_out = (Torus *)cuda_malloc_async(
-      number_of_inputs * ggsw_size * sizeof(Torus), *stream, gpu_index);
+      number_of_inputs * ggsw_size * sizeof(Torus), stream, gpu_index);
   // input lwe array for fp-ks
   Torus *lwe_array_in_fp_ks_buffer = (Torus *)cuda_malloc_async(
       number_of_inputs * level_count_cbs * (glwe_dimension + 1) *
           (polynomial_size + 1) * sizeof(Torus),
-      *stream, gpu_index);
+      stream, gpu_index);
   // buffer for pbs output
   Torus *lwe_array_out_pbs_buffer =
       (Torus *)cuda_malloc_async(number_of_inputs * level_count_cbs *
                                      (polynomial_size + 1) * sizeof(Torus),
-                                 *stream, gpu_index);
+                                 stream, gpu_index);
   // vector for shifted lwe input
   Torus *lwe_array_in_shifted_buffer = (Torus *)cuda_malloc_async(
       number_of_inputs * level_count_cbs * (lwe_dimension + 1) * sizeof(Torus),
-      *stream, gpu_index);
+      stream, gpu_index);
   // lut vector buffer for cbs
   Torus *lut_vector_cbs = (Torus *)cuda_malloc_async(
       level_count_cbs * (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
-      *stream, gpu_index);
+      stream, gpu_index);
   // indexes of lut vectors for cbs
   uint32_t *h_lut_vector_indexes =
       (uint32_t *)malloc(number_of_inputs * level_count_cbs * sizeof(uint32_t));
@@ -878,12 +878,10 @@ __host__ void host_circuit_bootstrap_vertical_packing(
     h_lut_vector_indexes[index] = index % level_count_cbs;
   }
   uint32_t *lut_vector_indexes = (uint32_t *)cuda_malloc_async(
-      number_of_inputs * level_count_cbs * sizeof(uint32_t), *stream,
-      gpu_index);
-  cuda_memcpy_async_to_gpu(lut_vector_indexes, h_lut_vector_indexes,
-                           number_of_inputs * level_count_cbs *
-                               sizeof(uint32_t),
-                           v_stream, gpu_index);
+      number_of_inputs * level_count_cbs * sizeof(uint32_t), stream, gpu_index);
+  cuda_memcpy_async_to_gpu(
+      lut_vector_indexes, h_lut_vector_indexes,
+      number_of_inputs * level_count_cbs * sizeof(uint32_t), stream, gpu_index);
   checkCudaErrors(cudaGetLastError());
 
   uint32_t bits = sizeof(Torus) * 8;
@@ -898,17 +896,17 @@ __host__ void host_circuit_bootstrap_vertical_packing(
       base_log_cbs, number_of_inputs, max_shared_memory);
   checkCudaErrors(cudaGetLastError());
   // Free memory
-  cuda_drop_async(lwe_array_in_fp_ks_buffer, *stream, gpu_index);
-  cuda_drop_async(lwe_array_in_shifted_buffer, *stream, gpu_index);
-  cuda_drop_async(lwe_array_out_pbs_buffer, *stream, gpu_index);
-  cuda_drop_async(lut_vector_cbs, *stream, gpu_index);
-  cuda_drop_async(lut_vector_indexes, *stream, gpu_index);
+  cuda_drop_async(lwe_array_in_fp_ks_buffer, stream, gpu_index);
+  cuda_drop_async(lwe_array_in_shifted_buffer, stream, gpu_index);
+  cuda_drop_async(lwe_array_out_pbs_buffer, stream, gpu_index);
+  cuda_drop_async(lut_vector_cbs, stream, gpu_index);
+  cuda_drop_async(lut_vector_indexes, stream, gpu_index);
   free(h_lut_vector_indexes);
 
   // we need to expand the lut to fill the masks with zeros
   Torus *lut_vector_glwe = (Torus *)cuda_malloc_async(
       (glwe_dimension + 1) * lut_number * polynomial_size * sizeof(Torus),
-      *stream, gpu_index);
+      stream, gpu_index);
   int num_blocks = 0, num_threads = 0;
   int num_entries = glwe_dimension * polynomial_size * lut_number;
   getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
@@ -928,7 +926,7 @@ __host__ void host_circuit_bootstrap_vertical_packing(
         (ptrdiff_t)(i * (glwe_dimension + 1) * polynomial_size);
     // CMUX Tree
     Torus *glwe_array_out = (Torus *)cuda_malloc_async(
-        (glwe_dimension + 1) * polynomial_size * sizeof(Torus), *stream,
+        (glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream,
         gpu_index);
     checkCudaErrors(cudaGetLastError());
     // r = tau * p - log2(N)
@@ -947,7 +945,7 @@ __host__ void host_circuit_bootstrap_vertical_packing(
         number_of_inputs - r, 1, glwe_dimension, polynomial_size,
         base_log_cbs, level_count_cbs, max_shared_memory);
 
-    cuda_drop_async(glwe_array_out, *stream, gpu_index);
+    cuda_drop_async(glwe_array_out, stream, gpu_index);
     }
 
   } else {
@@ -963,7 +961,7 @@ __host__ void host_circuit_bootstrap_vertical_packing(
           max_shared_memory);
     }
   }
-  cuda_drop_async(ggsw_out, *stream, gpu_index);
+  cuda_drop_async(ggsw_out, stream, gpu_index);
 }
 
 #endif // WOP_PBS_H
@@ -117,7 +117,7 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
   switch (polynomial_size) {
   case 512:
     if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
-      buffer = (double2 *)cuda_malloc_async(0, *stream, gpu_index);
+      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
       checkCudaErrors(cudaFuncSetAttribute(
           batch_NSMFFT<FFTDegree<Degree<512>, ForwardFFT>, FULLSM>,
           cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -129,14 +129,14 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
          buffer);
     } else {
       buffer = (double2 *)cuda_malloc_async(
-          shared_memory_size * total_polynomials, *stream, gpu_index);
+          shared_memory_size * total_polynomials, stream, gpu_index);
       batch_NSMFFT<FFTDegree<Degree<512>, ForwardFFT>, NOSM>
           <<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
     }
     break;
   case 1024:
     if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
-      buffer = (double2 *)cuda_malloc_async(0, *stream, gpu_index);
+      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
       checkCudaErrors(cudaFuncSetAttribute(
           batch_NSMFFT<FFTDegree<Degree<1024>, ForwardFFT>, FULLSM>,
           cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -148,14 +148,14 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
          buffer);
     } else {
       buffer = (double2 *)cuda_malloc_async(
-          shared_memory_size * total_polynomials, *stream, gpu_index);
+          shared_memory_size * total_polynomials, stream, gpu_index);
       batch_NSMFFT<FFTDegree<Degree<1024>, ForwardFFT>, NOSM>
          <<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
     }
     break;
   case 2048:
     if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
-      buffer = (double2 *)cuda_malloc_async(0, *stream, gpu_index);
+      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
       checkCudaErrors(cudaFuncSetAttribute(
           batch_NSMFFT<FFTDegree<Degree<2048>, ForwardFFT>, FULLSM>,
           cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -167,14 +167,14 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
          buffer);
     } else {
       buffer = (double2 *)cuda_malloc_async(
-          shared_memory_size * total_polynomials, *stream, gpu_index);
+          shared_memory_size * total_polynomials, stream, gpu_index);
       batch_NSMFFT<FFTDegree<Degree<2048>, ForwardFFT>, NOSM>
          <<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
     }
     break;
   case 4096:
     if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
-      buffer = (double2 *)cuda_malloc_async(0, *stream, gpu_index);
+      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
       checkCudaErrors(cudaFuncSetAttribute(
           batch_NSMFFT<FFTDegree<Degree<4096>, ForwardFFT>, FULLSM>,
           cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -186,14 +186,14 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
          buffer);
     } else {
       buffer = (double2 *)cuda_malloc_async(
-          shared_memory_size * total_polynomials, *stream, gpu_index);
+          shared_memory_size * total_polynomials, stream, gpu_index);
       batch_NSMFFT<FFTDegree<Degree<4096>, ForwardFFT>, NOSM>
          <<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
     }
     break;
   case 8192:
     if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
-      buffer = (double2 *)cuda_malloc_async(0, *stream, gpu_index);
+      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
       checkCudaErrors(cudaFuncSetAttribute(
           batch_NSMFFT<FFTDegree<Degree<8192>, ForwardFFT>, FULLSM>,
           cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -205,7 +205,7 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
          buffer);
     } else {
       buffer = (double2 *)cuda_malloc_async(
-          shared_memory_size * total_polynomials, *stream, gpu_index);
+          shared_memory_size * total_polynomials, stream, gpu_index);
       batch_NSMFFT<FFTDegree<Degree<8192>, ForwardFFT>, NOSM>
          <<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
     }
@@ -214,8 +214,8 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
     break;
   }
 
-  cuda_drop_async(d_bsk, *stream, gpu_index);
-  cuda_drop_async(buffer, *stream, gpu_index);
+  cuda_drop_async(d_bsk, stream, gpu_index);
+  cuda_drop_async(buffer, stream, gpu_index);
   free(h_bsk);
 }
@@ -49,12 +49,10 @@ __global__ void device_batch_fft_ggsw_vector(double2 *dest, T *src,
  * global memory
  */
 template <typename T, typename ST, class params>
-void batch_fft_ggsw_vector(void *v_stream, double2 *dest, T *src, uint32_t r,
-                           uint32_t glwe_dim, uint32_t polynomial_size,
-                           uint32_t level_count, uint32_t gpu_index,
-                           uint32_t max_shared_memory) {
-
-  auto stream = static_cast<cudaStream_t *>(v_stream);
+void batch_fft_ggsw_vector(cudaStream_t *stream, double2 *dest, T *src,
+                           uint32_t r, uint32_t glwe_dim,
+                           uint32_t polynomial_size, uint32_t level_count,
+                           uint32_t gpu_index, uint32_t max_shared_memory) {
+
   int shared_memory_size = sizeof(double) * polynomial_size;
 
@@ -63,11 +61,11 @@ void batch_fft_ggsw_vector(void *v_stream, double2 *dest, T *src, uint32_t r,
 
   char *d_mem;
   if (max_shared_memory < shared_memory_size) {
-    d_mem = (char *)cuda_malloc_async(shared_memory_size, *stream, gpu_index);
+    d_mem = (char *)cuda_malloc_async(shared_memory_size, stream, gpu_index);
     device_batch_fft_ggsw_vector<T, ST, params, NOSM>
         <<<gridSize, blockSize, 0, *stream>>>(dest, src, d_mem);
     checkCudaErrors(cudaGetLastError());
-    cuda_drop_async(d_mem, *stream, gpu_index);
+    cuda_drop_async(d_mem, stream, gpu_index);
   } else {
     device_batch_fft_ggsw_vector<T, ST, params, FULLSM>
         <<<gridSize, blockSize, shared_memory_size, *stream>>>(dest, src,
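The signature change above also removes the static_cast that every helper used to perform on the opaque pointer. In rough outline; demo_kernel and the helper names below are placeholders, not functions from this codebase.

#include <cuda_runtime.h>

// Placeholder kernel used only to show where the stream is dereferenced.
__global__ void demo_kernel() {}

// Before: the helper received an opaque pointer and recovered the type by hand.
void helper_before(void *v_stream) {
  auto stream = static_cast<cudaStream_t *>(v_stream);
  demo_kernel<<<1, 1, 0, *stream>>>();
}

// After: the typed pointer flows straight through, and a wrong argument fails to compile.
void helper_after(cudaStream_t *stream) {
  demo_kernel<<<1, 1, 0, *stream>>>();
}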
@@ -5,7 +5,7 @@
 #include <helper_cuda.h>
 
 /// Unsafe function to create a CUDA stream, must check first that GPU exists
-void *cuda_create_stream(uint32_t gpu_index) {
+cudaStream_t *cuda_create_stream(uint32_t gpu_index) {
   cudaSetDevice(gpu_index);
   cudaStream_t *stream = new cudaStream_t;
   cudaStreamCreate(stream);
@@ -13,9 +13,8 @@ void *cuda_create_stream(uint32_t gpu_index) {
 }
 
 /// Unsafe function to destroy CUDA stream, must check first the GPU exists
-int cuda_destroy_stream(void *v_stream, uint32_t gpu_index) {
+int cuda_destroy_stream(cudaStream_t *stream, uint32_t gpu_index) {
   cudaSetDevice(gpu_index);
-  auto stream = static_cast<cudaStream_t *>(v_stream);
   cudaStreamDestroy(*stream);
   return 0;
 }
@@ -33,7 +32,7 @@ void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
 
 /// Allocates a size-byte array at the device memory. Tries to do it
 /// asynchronously.
-void *cuda_malloc_async(uint64_t size, cudaStream_t stream,
+void *cuda_malloc_async(uint64_t size, cudaStream_t *stream,
                         uint32_t gpu_index) {
   void *ptr;
 
@@ -42,7 +41,7 @@ void *cuda_malloc_async(uint64_t size, cudaStream_t stream,
       &support_async_alloc, cudaDevAttrMemoryPoolsSupported, gpu_index));
 
   if (support_async_alloc)
-    checkCudaErrors(cudaMallocAsync((void **)&ptr, size, stream));
+    checkCudaErrors(cudaMallocAsync((void **)&ptr, size, *stream));
   else
     checkCudaErrors(cudaMalloc((void **)&ptr, size));
   return ptr;
@@ -74,7 +73,7 @@ int cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index) {
 /// -2: error, gpu index doesn't exist
 /// -3: error, zero copy size
 int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
-                             void *v_stream, uint32_t gpu_index) {
+                             cudaStream_t *stream, uint32_t gpu_index) {
   if (size == 0) {
     // error code: zero copy size
     return -3;
@@ -90,7 +89,7 @@ int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
     // error code: invalid device pointer
     return -1;
   }
-  auto stream = static_cast<cudaStream_t *>(v_stream);
+
   cudaSetDevice(gpu_index);
   checkCudaErrors(
       cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice, *stream));
@@ -116,7 +115,7 @@ int cuda_synchronize_device(uint32_t gpu_index) {
 /// -2: error, gpu index doesn't exist
 /// -3: error, zero copy size
 int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
-                             void *v_stream, uint32_t gpu_index) {
+                             cudaStream_t *stream, uint32_t gpu_index) {
   if (size == 0) {
     // error code: zero copy size
     return -3;
@@ -132,7 +131,7 @@ int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
     // error code: invalid device pointer
     return -1;
   }
-  auto stream = static_cast<cudaStream_t *>(v_stream);
+
   cudaSetDevice(gpu_index);
   checkCudaErrors(
       cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, *stream));
@@ -158,14 +157,14 @@ int cuda_drop(void *ptr, uint32_t gpu_index) {
 }
 
 /// Drop a cuda array. Tries to do it asynchronously
-int cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index) {
+int cuda_drop_async(void *ptr, cudaStream_t *stream, uint32_t gpu_index) {
 
   int support_async_alloc;
   checkCudaErrors(cudaDeviceGetAttribute(
       &support_async_alloc, cudaDevAttrMemoryPoolsSupported, gpu_index));
 
   if (support_async_alloc)
-    checkCudaErrors(cudaFreeAsync(ptr, stream));
+    checkCudaErrors(cudaFreeAsync(ptr, *stream));
   else
     checkCudaErrors(cudaFree(ptr));
   return 0;