chore(cuda): Refactor device.cu functions to take pointers to cudaStream_t instead of void

Author: Pedro Alves, 2022-12-14 10:15:43 -03:00
Committed by: Agnès Leroy
parent e4ba380594
commit 9bcf0f8a70
7 changed files with 80 additions and 84 deletions
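Across all seven files the change is mechanical: host functions that used to receive the stream as an untyped void *v_stream (and recover it with static_cast<cudaStream_t *>) now take a typed cudaStream_t *stream, helpers such as cuda_malloc_async and cuda_drop_async receive the pointer itself instead of the dereferenced *stream, and only kernel launches and CUDA runtime calls still dereference it. A minimal sketch of that pattern, with hypothetical host-function and kernel names (only cuda_malloc_async and cuda_drop_async are taken from the header diff below), assuming it is compiled as a .cu file and linked against the backend:

#include <cstdint>
#include <cuda_runtime.h>

// Declarations as they appear after this commit (see the header diff below).
extern "C" {
void *cuda_malloc_async(uint64_t size, cudaStream_t *stream, uint32_t gpu_index);
int cuda_drop_async(void *ptr, cudaStream_t *stream, uint32_t gpu_index);
}

// Hypothetical kernel, used only to illustrate the launch-site pattern.
__global__ void example_kernel(char *d_mem) { d_mem[threadIdx.x] = 0; }

// After the refactor: the stream keeps its cudaStream_t * type, the helpers
// take the pointer as-is, and only the kernel launch dereferences it.
// Before, this function would have taken `void *v_stream`, recovered the
// stream with `static_cast<cudaStream_t *>(v_stream)`, and passed `*stream`
// to cuda_malloc_async / cuda_drop_async.
void example_host_fn(cudaStream_t *stream, uint32_t gpu_index) {
  char *d_mem = (char *)cuda_malloc_async(256, stream, gpu_index);
  example_kernel<<<1, 256, 0, *stream>>>(d_mem);
  cuda_drop_async(d_mem, stream, gpu_index);
}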

View File

@@ -2,13 +2,14 @@
#include <cuda_runtime.h>
extern "C" {
void *cuda_create_stream(uint32_t gpu_index);
cudaStream_t *cuda_create_stream(uint32_t gpu_index);
int cuda_destroy_stream(void *v_stream, uint32_t gpu_index);
int cuda_destroy_stream(cudaStream_t *stream, uint32_t gpu_index);
void *cuda_malloc(uint64_t size, uint32_t gpu_index);
void *cuda_malloc_async(uint64_t size, cudaStream_t stream, uint32_t gpu_index);
void *cuda_malloc_async(uint64_t size, cudaStream_t *stream,
uint32_t gpu_index);
int cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
@@ -16,20 +17,20 @@ int cuda_memcpy_to_cpu(void *dest, const void *src, uint64_t size,
uint32_t gpu_index);
int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
void *v_stream, uint32_t gpu_index);
cudaStream_t *stream, uint32_t gpu_index);
int cuda_memcpy_to_gpu(void *dest, void *src, uint64_t size,
uint32_t gpu_index);
int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
void *v_stream, uint32_t gpu_index);
cudaStream_t *stream, uint32_t gpu_index);
int cuda_get_number_of_gpus();
int cuda_synchronize_device(uint32_t gpu_index);
int cuda_drop(void *ptr, uint32_t gpu_index);
int cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index);
int cuda_drop_async(void *ptr, cudaStream_t *stream, uint32_t gpu_index);
int cuda_get_max_shared_memory(uint32_t gpu_index);
}
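For callers, the practical effect of the header change is that the stream handle keeps its real type from cuda_create_stream to cuda_destroy_stream, with no cast anywhere in between. A minimal round-trip sketch against the declarations above, assuming they are exposed through a device.h header (hypothetical include path) and ignoring the integer error codes for brevity:

#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <cuda_runtime.h>
#include "device.h" // hypothetical include path for the declarations above

int main() {
  const uint32_t gpu_index = 0;
  const uint64_t size = 1024;

  // The stream is typed from creation to destruction; no static_cast needed.
  cudaStream_t *stream = cuda_create_stream(gpu_index);
  void *d_buf = cuda_malloc_async(size, stream, gpu_index);

  void *h_in = malloc(size);
  void *h_out = malloc(size);
  memset(h_in, 1, size);

  // Helpers take the pointer itself; ordering is preserved on the stream.
  cuda_memcpy_async_to_gpu(d_buf, h_in, size, stream, gpu_index);
  cuda_memcpy_async_to_cpu(h_out, d_buf, size, stream, gpu_index);
  cudaStreamSynchronize(*stream); // dereference only where the runtime needs it

  cuda_drop_async(d_buf, stream, gpu_index);
  cuda_destroy_stream(stream, gpu_index);
  free(h_in);
  free(h_out);
  return 0;
}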

View File

@@ -317,7 +317,7 @@ __host__ void host_bootstrap_amortized(
// of shared memory)
if (max_shared_memory < SM_PART) {
d_mem = (char *)cuda_malloc_async(DM_FULL * input_lwe_ciphertext_count,
*stream, gpu_index);
stream, gpu_index);
device_bootstrap_amortized<Torus, params, NOSM><<<grid, thds, 0, *stream>>>(
lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
bootstrapping_key, d_mem, input_lwe_dimension, polynomial_size,
@@ -328,7 +328,7 @@ __host__ void host_bootstrap_amortized(
cudaFuncSetCacheConfig(device_bootstrap_amortized<Torus, params, PARTIALSM>,
cudaFuncCachePreferShared);
d_mem = (char *)cuda_malloc_async(DM_PART * input_lwe_ciphertext_count,
*stream, gpu_index);
stream, gpu_index);
device_bootstrap_amortized<Torus, params, PARTIALSM>
<<<grid, thds, SM_PART, *stream>>>(
lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
@@ -346,7 +346,7 @@ __host__ void host_bootstrap_amortized(
checkCudaErrors(cudaFuncSetCacheConfig(
device_bootstrap_amortized<Torus, params, FULLSM>,
cudaFuncCachePreferShared));
d_mem = (char *)cuda_malloc_async(0, *stream, gpu_index);
d_mem = (char *)cuda_malloc_async(0, stream, gpu_index);
device_bootstrap_amortized<Torus, params, FULLSM>
<<<grid, thds, SM_FULL, *stream>>>(
@@ -359,7 +359,7 @@ __host__ void host_bootstrap_amortized(
// Synchronize the streams before copying the result to lwe_array_out at the
// right place
cudaStreamSynchronize(*stream);
cuda_drop_async(d_mem, *stream, gpu_index);
cuda_drop_async(d_mem, stream, gpu_index);
}
template <typename Torus, class params>

View File

@@ -268,9 +268,9 @@ __host__ void host_bootstrap_low_latency(
int buffer_size_per_gpu = level_count * input_lwe_ciphertext_count *
polynomial_size / 2 * sizeof(double2);
double2 *mask_buffer_fft =
(double2 *)cuda_malloc_async(buffer_size_per_gpu, *stream, gpu_index);
(double2 *)cuda_malloc_async(buffer_size_per_gpu, stream, gpu_index);
double2 *body_buffer_fft =
(double2 *)cuda_malloc_async(buffer_size_per_gpu, *stream, gpu_index);
(double2 *)cuda_malloc_async(buffer_size_per_gpu, stream, gpu_index);
// With SM each block corresponds to either the mask or body, no need to
// duplicate data for each
@@ -308,7 +308,7 @@ __host__ void host_bootstrap_low_latency(
checkCudaErrors(cudaGetLastError());
d_mem = (char *)cuda_malloc_async(DM_FULL * input_lwe_ciphertext_count *
level_count * 2,
*stream, gpu_index);
stream, gpu_index);
checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaLaunchCooperativeKernel(
(void *)device_bootstrap_low_latency<Torus, params, NOSM>, grid, thds,
@@ -317,7 +317,7 @@ __host__ void host_bootstrap_low_latency(
kernel_args[11] = &DM_PART;
d_mem = (char *)cuda_malloc_async(DM_PART * input_lwe_ciphertext_count *
level_count * 2,
*stream, gpu_index);
stream, gpu_index);
checkCudaErrors(cudaFuncSetAttribute(
device_bootstrap_low_latency<Torus, params, PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, SM_PART));
@@ -332,7 +332,7 @@ __host__ void host_bootstrap_low_latency(
} else {
int DM_NONE = 0;
kernel_args[11] = &DM_NONE;
d_mem = (char *)cuda_malloc_async(0, *stream, gpu_index);
d_mem = (char *)cuda_malloc_async(0, stream, gpu_index);
checkCudaErrors(cudaFuncSetAttribute(
device_bootstrap_low_latency<Torus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, SM_FULL));
@@ -347,9 +347,9 @@ __host__ void host_bootstrap_low_latency(
// Synchronize the streams before copying the result to lwe_array_out at the
// right place
cudaStreamSynchronize(*stream);
cuda_drop_async(mask_buffer_fft, *stream, gpu_index);
cuda_drop_async(body_buffer_fft, *stream, gpu_index);
cuda_drop_async(d_mem, *stream, gpu_index);
cuda_drop_async(mask_buffer_fft, stream, gpu_index);
cuda_drop_async(body_buffer_fft, stream, gpu_index);
cuda_drop_async(d_mem, stream, gpu_index);
}
#endif // LOWLAT_PBS_H

View File

@@ -296,10 +296,10 @@ void host_cmux_tree(void *v_stream, uint32_t gpu_index, Torus *glwe_array_out,
(glwe_dimension + 1) * level_count;
double2 *d_ggsw_fft_in = (double2 *)cuda_malloc_async(
r * ggsw_size * sizeof(double), *stream, gpu_index);
r * ggsw_size * sizeof(double), stream, gpu_index);
batch_fft_ggsw_vector<Torus, STorus, params>(
v_stream, d_ggsw_fft_in, ggsw_in, r, glwe_dimension, polynomial_size,
stream, d_ggsw_fft_in, ggsw_in, r, glwe_dimension, polynomial_size,
level_count, gpu_index, max_shared_memory);
//////////////////////
@@ -308,7 +308,7 @@ void host_cmux_tree(void *v_stream, uint32_t gpu_index, Torus *glwe_array_out,
char *d_mem;
if (max_shared_memory < memory_needed_per_block) {
d_mem = (char *)cuda_malloc_async(memory_needed_per_block * (1 << (r - 1)),
*stream, gpu_index);
stream, gpu_index);
} else {
checkCudaErrors(cudaFuncSetAttribute(
device_batch_cmux<Torus, STorus, params, FULLSM>,
@@ -322,9 +322,9 @@ void host_cmux_tree(void *v_stream, uint32_t gpu_index, Torus *glwe_array_out,
int glwe_size = (glwe_dimension + 1) * polynomial_size;
Torus *d_buffer1 = (Torus *)cuda_malloc_async(
num_lut * glwe_size * sizeof(Torus), *stream, gpu_index);
num_lut * glwe_size * sizeof(Torus), stream, gpu_index);
Torus *d_buffer2 = (Torus *)cuda_malloc_async(
num_lut * glwe_size * sizeof(Torus), *stream, gpu_index);
num_lut * glwe_size * sizeof(Torus), stream, gpu_index);
checkCudaErrors(cudaMemcpyAsync(d_buffer1, lut_vector,
num_lut * glwe_size * sizeof(Torus),
@@ -368,11 +368,11 @@ void host_cmux_tree(void *v_stream, uint32_t gpu_index, Torus *glwe_array_out,
checkCudaErrors(cudaStreamSynchronize(*stream));
// Free memory
cuda_drop_async(d_ggsw_fft_in, *stream, gpu_index);
cuda_drop_async(d_buffer1, *stream, gpu_index);
cuda_drop_async(d_buffer2, *stream, gpu_index);
cuda_drop_async(d_ggsw_fft_in, stream, gpu_index);
cuda_drop_async(d_buffer1, stream, gpu_index);
cuda_drop_async(d_buffer2, stream, gpu_index);
if (max_shared_memory < memory_needed_per_block)
cuda_drop_async(d_mem, *stream, gpu_index);
cuda_drop_async(d_mem, stream, gpu_index);
}
// only works for big lwe for ks+bs case
@@ -723,7 +723,7 @@ void host_blind_rotate_and_sample_extraction(
char *d_mem;
if (max_shared_memory < memory_needed_per_block)
d_mem = (char *)cuda_malloc_async(memory_needed_per_block * tau, *stream,
d_mem = (char *)cuda_malloc_async(memory_needed_per_block * tau, stream,
gpu_index);
else {
checkCudaErrors(cudaFuncSetAttribute(
@@ -740,11 +740,11 @@ void host_blind_rotate_and_sample_extraction(
int ggsw_size =
polynomial_size * (glwe_dimension + 1) * (glwe_dimension + 1) * l_gadget;
double2 *d_ggsw_fft_in = (double2 *)cuda_malloc_async(
mbr_size * ggsw_size * sizeof(double), *stream, gpu_index);
mbr_size * ggsw_size * sizeof(double), stream, gpu_index);
batch_fft_ggsw_vector<Torus, STorus, params>(
v_stream, d_ggsw_fft_in, ggsw_in, mbr_size, glwe_dimension,
polynomial_size, l_gadget, gpu_index, max_shared_memory);
stream, d_ggsw_fft_in, ggsw_in, mbr_size, glwe_dimension, polynomial_size,
l_gadget, gpu_index, max_shared_memory);
checkCudaErrors(cudaGetLastError());
//
@@ -768,9 +768,9 @@ void host_blind_rotate_and_sample_extraction(
checkCudaErrors(cudaGetLastError());
//
cuda_drop_async(d_ggsw_fft_in, *stream, gpu_index);
cuda_drop_async(d_ggsw_fft_in, stream, gpu_index);
if (max_shared_memory < memory_needed_per_block)
cuda_drop_async(d_mem, *stream, gpu_index);
cuda_drop_async(d_mem, stream, gpu_index);
}
template <typename Torus, class params>
@@ -852,25 +852,25 @@ __host__ void host_circuit_bootstrap_vertical_packing(
int ggsw_size = level_count_cbs * (glwe_dimension + 1) *
(glwe_dimension + 1) * polynomial_size;
Torus *ggsw_out = (Torus *)cuda_malloc_async(
number_of_inputs * ggsw_size * sizeof(Torus), *stream, gpu_index);
number_of_inputs * ggsw_size * sizeof(Torus), stream, gpu_index);
// input lwe array for fp-ks
Torus *lwe_array_in_fp_ks_buffer = (Torus *)cuda_malloc_async(
number_of_inputs * level_count_cbs * (glwe_dimension + 1) *
(polynomial_size + 1) * sizeof(Torus),
*stream, gpu_index);
stream, gpu_index);
// buffer for pbs output
Torus *lwe_array_out_pbs_buffer =
(Torus *)cuda_malloc_async(number_of_inputs * level_count_cbs *
(polynomial_size + 1) * sizeof(Torus),
*stream, gpu_index);
stream, gpu_index);
// vector for shifted lwe input
Torus *lwe_array_in_shifted_buffer = (Torus *)cuda_malloc_async(
number_of_inputs * level_count_cbs * (lwe_dimension + 1) * sizeof(Torus),
*stream, gpu_index);
stream, gpu_index);
// lut vector buffer for cbs
Torus *lut_vector_cbs = (Torus *)cuda_malloc_async(
level_count_cbs * (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
*stream, gpu_index);
stream, gpu_index);
// indexes of lut vectors for cbs
uint32_t *h_lut_vector_indexes =
(uint32_t *)malloc(number_of_inputs * level_count_cbs * sizeof(uint32_t));
@@ -878,12 +878,10 @@ __host__ void host_circuit_bootstrap_vertical_packing(
h_lut_vector_indexes[index] = index % level_count_cbs;
}
uint32_t *lut_vector_indexes = (uint32_t *)cuda_malloc_async(
number_of_inputs * level_count_cbs * sizeof(uint32_t), *stream,
gpu_index);
cuda_memcpy_async_to_gpu(lut_vector_indexes, h_lut_vector_indexes,
number_of_inputs * level_count_cbs *
sizeof(uint32_t),
v_stream, gpu_index);
number_of_inputs * level_count_cbs * sizeof(uint32_t), stream, gpu_index);
cuda_memcpy_async_to_gpu(
lut_vector_indexes, h_lut_vector_indexes,
number_of_inputs * level_count_cbs * sizeof(uint32_t), stream, gpu_index);
checkCudaErrors(cudaGetLastError());
uint32_t bits = sizeof(Torus) * 8;
@@ -898,17 +896,17 @@ __host__ void host_circuit_bootstrap_vertical_packing(
base_log_cbs, number_of_inputs, max_shared_memory);
checkCudaErrors(cudaGetLastError());
// Free memory
cuda_drop_async(lwe_array_in_fp_ks_buffer, *stream, gpu_index);
cuda_drop_async(lwe_array_in_shifted_buffer, *stream, gpu_index);
cuda_drop_async(lwe_array_out_pbs_buffer, *stream, gpu_index);
cuda_drop_async(lut_vector_cbs, *stream, gpu_index);
cuda_drop_async(lut_vector_indexes, *stream, gpu_index);
cuda_drop_async(lwe_array_in_fp_ks_buffer, stream, gpu_index);
cuda_drop_async(lwe_array_in_shifted_buffer, stream, gpu_index);
cuda_drop_async(lwe_array_out_pbs_buffer, stream, gpu_index);
cuda_drop_async(lut_vector_cbs, stream, gpu_index);
cuda_drop_async(lut_vector_indexes, stream, gpu_index);
free(h_lut_vector_indexes);
// we need to expand the lut to fill the masks with zeros
Torus *lut_vector_glwe = (Torus *)cuda_malloc_async(
(glwe_dimension + 1) * lut_number * polynomial_size * sizeof(Torus),
*stream, gpu_index);
stream, gpu_index);
int num_blocks = 0, num_threads = 0;
int num_entries = glwe_dimension * polynomial_size * lut_number;
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
@@ -928,7 +926,7 @@ __host__ void host_circuit_bootstrap_vertical_packing(
(ptrdiff_t)(i * (glwe_dimension + 1) * polynomial_size);
// CMUX Tree
Torus *glwe_array_out = (Torus *)cuda_malloc_async(
(glwe_dimension + 1) * polynomial_size * sizeof(Torus), *stream,
(glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream,
gpu_index);
checkCudaErrors(cudaGetLastError());
// r = tau * p - log2(N)
@@ -947,7 +945,7 @@ __host__ void host_circuit_bootstrap_vertical_packing(
number_of_inputs - r, 1, glwe_dimension, polynomial_size,
base_log_cbs, level_count_cbs, max_shared_memory);
cuda_drop_async(glwe_array_out, *stream, gpu_index);
cuda_drop_async(glwe_array_out, stream, gpu_index);
}
} else {
@@ -963,7 +961,7 @@ __host__ void host_circuit_bootstrap_vertical_packing(
max_shared_memory);
}
}
cuda_drop_async(ggsw_out, *stream, gpu_index);
cuda_drop_async(ggsw_out, stream, gpu_index);
}
#endif // WOP_PBS_H

View File

@@ -117,7 +117,7 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
switch (polynomial_size) {
case 512:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, *stream, gpu_index);
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
checkCudaErrors(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<Degree<512>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -129,14 +129,14 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, *stream, gpu_index);
shared_memory_size * total_polynomials, stream, gpu_index);
batch_NSMFFT<FFTDegree<Degree<512>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
}
break;
case 1024:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, *stream, gpu_index);
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
checkCudaErrors(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<Degree<1024>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -148,14 +148,14 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, *stream, gpu_index);
shared_memory_size * total_polynomials, stream, gpu_index);
batch_NSMFFT<FFTDegree<Degree<1024>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
}
break;
case 2048:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, *stream, gpu_index);
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
checkCudaErrors(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<Degree<2048>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -167,14 +167,14 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, *stream, gpu_index);
shared_memory_size * total_polynomials, stream, gpu_index);
batch_NSMFFT<FFTDegree<Degree<2048>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
}
break;
case 4096:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, *stream, gpu_index);
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
checkCudaErrors(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<Degree<4096>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -186,14 +186,14 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, *stream, gpu_index);
shared_memory_size * total_polynomials, stream, gpu_index);
batch_NSMFFT<FFTDegree<Degree<4096>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
}
break;
case 8192:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, *stream, gpu_index);
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
checkCudaErrors(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<Degree<8192>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -205,7 +205,7 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, *stream, gpu_index);
shared_memory_size * total_polynomials, stream, gpu_index);
batch_NSMFFT<FFTDegree<Degree<8192>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
}
@@ -214,8 +214,8 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
break;
}
cuda_drop_async(d_bsk, *stream, gpu_index);
cuda_drop_async(buffer, *stream, gpu_index);
cuda_drop_async(d_bsk, stream, gpu_index);
cuda_drop_async(buffer, stream, gpu_index);
free(h_bsk);
}

View File

@@ -49,12 +49,10 @@ __global__ void device_batch_fft_ggsw_vector(double2 *dest, T *src,
* global memory
*/
template <typename T, typename ST, class params>
void batch_fft_ggsw_vector(void *v_stream, double2 *dest, T *src, uint32_t r,
uint32_t glwe_dim, uint32_t polynomial_size,
uint32_t level_count, uint32_t gpu_index,
uint32_t max_shared_memory) {
auto stream = static_cast<cudaStream_t *>(v_stream);
void batch_fft_ggsw_vector(cudaStream_t *stream, double2 *dest, T *src,
uint32_t r, uint32_t glwe_dim,
uint32_t polynomial_size, uint32_t level_count,
uint32_t gpu_index, uint32_t max_shared_memory) {
int shared_memory_size = sizeof(double) * polynomial_size;
@@ -63,11 +61,11 @@ void batch_fft_ggsw_vector(void *v_stream, double2 *dest, T *src, uint32_t r,
char *d_mem;
if (max_shared_memory < shared_memory_size) {
d_mem = (char *)cuda_malloc_async(shared_memory_size, *stream, gpu_index);
d_mem = (char *)cuda_malloc_async(shared_memory_size, stream, gpu_index);
device_batch_fft_ggsw_vector<T, ST, params, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(dest, src, d_mem);
checkCudaErrors(cudaGetLastError());
cuda_drop_async(d_mem, *stream, gpu_index);
cuda_drop_async(d_mem, stream, gpu_index);
} else {
device_batch_fft_ggsw_vector<T, ST, params, FULLSM>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(dest, src,

View File

@@ -5,7 +5,7 @@
#include <helper_cuda.h>
/// Unsafe function to create a CUDA stream, must check first that GPU exists
void *cuda_create_stream(uint32_t gpu_index) {
cudaStream_t *cuda_create_stream(uint32_t gpu_index) {
cudaSetDevice(gpu_index);
cudaStream_t *stream = new cudaStream_t;
cudaStreamCreate(stream);
@@ -13,9 +13,8 @@ void *cuda_create_stream(uint32_t gpu_index) {
}
/// Unsafe function to destroy CUDA stream, must check first the GPU exists
int cuda_destroy_stream(void *v_stream, uint32_t gpu_index) {
int cuda_destroy_stream(cudaStream_t *stream, uint32_t gpu_index) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
cudaStreamDestroy(*stream);
return 0;
}
@@ -33,7 +32,7 @@ void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
/// Allocates a size-byte array at the device memory. Tries to do it
/// asynchronously.
void *cuda_malloc_async(uint64_t size, cudaStream_t stream,
void *cuda_malloc_async(uint64_t size, cudaStream_t *stream,
uint32_t gpu_index) {
void *ptr;
@@ -42,7 +41,7 @@ void *cuda_malloc_async(uint64_t size, cudaStream_t stream,
&support_async_alloc, cudaDevAttrMemoryPoolsSupported, gpu_index));
if (support_async_alloc)
checkCudaErrors(cudaMallocAsync((void **)&ptr, size, stream));
checkCudaErrors(cudaMallocAsync((void **)&ptr, size, *stream));
else
checkCudaErrors(cudaMalloc((void **)&ptr, size));
return ptr;
@@ -74,7 +73,7 @@ int cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index) {
/// -2: error, gpu index doesn't exist
/// -3: error, zero copy size
int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
void *v_stream, uint32_t gpu_index) {
cudaStream_t *stream, uint32_t gpu_index) {
if (size == 0) {
// error code: zero copy size
return -3;
@@ -90,7 +89,7 @@ int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
// error code: invalid device pointer
return -1;
}
auto stream = static_cast<cudaStream_t *>(v_stream);
cudaSetDevice(gpu_index);
checkCudaErrors(
cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice, *stream));
@@ -116,7 +115,7 @@ int cuda_synchronize_device(uint32_t gpu_index) {
/// -2: error, gpu index doesn't exist
/// -3: error, zero copy size
int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
void *v_stream, uint32_t gpu_index) {
cudaStream_t *stream, uint32_t gpu_index) {
if (size == 0) {
// error code: zero copy size
return -3;
@@ -132,7 +131,7 @@ int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
// error code: invalid device pointer
return -1;
}
auto stream = static_cast<cudaStream_t *>(v_stream);
cudaSetDevice(gpu_index);
checkCudaErrors(
cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, *stream));
@@ -158,14 +157,14 @@ int cuda_drop(void *ptr, uint32_t gpu_index) {
}
/// Drop a cuda array. Tries to do it asynchronously
int cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index) {
int cuda_drop_async(void *ptr, cudaStream_t *stream, uint32_t gpu_index) {
int support_async_alloc;
checkCudaErrors(cudaDeviceGetAttribute(
&support_async_alloc, cudaDevAttrMemoryPoolsSupported, gpu_index));
if (support_async_alloc)
checkCudaErrors(cudaFreeAsync(ptr, stream));
checkCudaErrors(cudaFreeAsync(ptr, *stream));
else
checkCudaErrors(cudaFree(ptr));
return 0;