diff --git a/include/device.h b/include/device.h
index 89ef42b6d..a993aaf5e 100644
--- a/include/device.h
+++ b/include/device.h
@@ -2,13 +2,14 @@
 #include

 extern "C" {
-void *cuda_create_stream(uint32_t gpu_index);
+cudaStream_t *cuda_create_stream(uint32_t gpu_index);

-int cuda_destroy_stream(void *v_stream, uint32_t gpu_index);
+int cuda_destroy_stream(cudaStream_t *stream, uint32_t gpu_index);

 void *cuda_malloc(uint64_t size, uint32_t gpu_index);

-void *cuda_malloc_async(uint64_t size, cudaStream_t stream, uint32_t gpu_index);
+void *cuda_malloc_async(uint64_t size, cudaStream_t *stream,
+                        uint32_t gpu_index);

 int cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);

@@ -16,20 +17,20 @@ int cuda_memcpy_to_cpu(void *dest, const void *src, uint64_t size,
                        uint32_t gpu_index);

 int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
-                             void *v_stream, uint32_t gpu_index);
+                             cudaStream_t *stream, uint32_t gpu_index);

 int cuda_memcpy_to_gpu(void *dest, void *src, uint64_t size,
                        uint32_t gpu_index);

 int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
-                             void *v_stream, uint32_t gpu_index);
+                             cudaStream_t *stream, uint32_t gpu_index);

 int cuda_get_number_of_gpus();

 int cuda_synchronize_device(uint32_t gpu_index);

 int cuda_drop(void *ptr, uint32_t gpu_index);

-int cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index);
+int cuda_drop_async(void *ptr, cudaStream_t *stream, uint32_t gpu_index);

 int cuda_get_max_shared_memory(uint32_t gpu_index);
 }
diff --git a/src/bootstrap_amortized.cuh b/src/bootstrap_amortized.cuh
index cdb0876b9..067f2a9c5 100644
--- a/src/bootstrap_amortized.cuh
+++ b/src/bootstrap_amortized.cuh
@@ -317,7 +317,7 @@ __host__ void host_bootstrap_amortized(
   // of shared memory)
   if (max_shared_memory < SM_PART) {
     d_mem = (char *)cuda_malloc_async(DM_FULL * input_lwe_ciphertext_count,
-                                      *stream, gpu_index);
+                                      stream, gpu_index);
     device_bootstrap_amortized<<>>(
         lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
         bootstrapping_key, d_mem, input_lwe_dimension, polynomial_size,
@@ -328,7 +328,7 @@ __host__ void host_bootstrap_amortized(
     cudaFuncSetCacheConfig(device_bootstrap_amortized,
                            cudaFuncCachePreferShared);
     d_mem = (char *)cuda_malloc_async(DM_PART * input_lwe_ciphertext_count,
-                                      *stream, gpu_index);
+                                      stream, gpu_index);
     device_bootstrap_amortized
         <<>>(
             lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
@@ -346,7 +346,7 @@ __host__ void host_bootstrap_amortized(
     checkCudaErrors(cudaFuncSetCacheConfig(
         device_bootstrap_amortized,
         cudaFuncCachePreferShared));
-    d_mem = (char *)cuda_malloc_async(0, *stream, gpu_index);
+    d_mem = (char *)cuda_malloc_async(0, stream, gpu_index);

     device_bootstrap_amortized
         <<>>(
@@ -359,7 +359,7 @@ __host__ void host_bootstrap_amortized(
   // Synchronize the streams before copying the result to lwe_array_out at the
   // right place
   cudaStreamSynchronize(*stream);
-  cuda_drop_async(d_mem, *stream, gpu_index);
+  cuda_drop_async(d_mem, stream, gpu_index);
 }

 template
diff --git a/src/bootstrap_low_latency.cuh b/src/bootstrap_low_latency.cuh
index 64c0cdce6..4c2cca490 100644
--- a/src/bootstrap_low_latency.cuh
+++ b/src/bootstrap_low_latency.cuh
@@ -268,9 +268,9 @@ __host__ void host_bootstrap_low_latency(
   int buffer_size_per_gpu = level_count * input_lwe_ciphertext_count *
                             polynomial_size / 2 * sizeof(double2);
   double2 *mask_buffer_fft =
-      (double2 *)cuda_malloc_async(buffer_size_per_gpu, *stream, gpu_index);
+      (double2 *)cuda_malloc_async(buffer_size_per_gpu, stream, gpu_index);
   double2 *body_buffer_fft =
-      (double2 *)cuda_malloc_async(buffer_size_per_gpu, *stream, gpu_index);
+      (double2 *)cuda_malloc_async(buffer_size_per_gpu, stream, gpu_index);

   // With SM each block corresponds to either the mask or body, no need to
   // duplicate data for each
@@ -308,7 +308,7 @@ __host__ void host_bootstrap_low_latency(
     checkCudaErrors(cudaGetLastError());
     d_mem = (char *)cuda_malloc_async(DM_FULL * input_lwe_ciphertext_count *
                                           level_count * 2,
-                                      *stream, gpu_index);
+                                      stream, gpu_index);
     checkCudaErrors(cudaGetLastError());
     checkCudaErrors(cudaLaunchCooperativeKernel(
         (void *)device_bootstrap_low_latency, grid, thds,
@@ -317,7 +317,7 @@ __host__ void host_bootstrap_low_latency(
     kernel_args[11] = &DM_PART;
     d_mem = (char *)cuda_malloc_async(DM_PART * input_lwe_ciphertext_count *
                                           level_count * 2,
-                                      *stream, gpu_index);
+                                      stream, gpu_index);
     checkCudaErrors(cudaFuncSetAttribute(
         device_bootstrap_low_latency,
         cudaFuncAttributeMaxDynamicSharedMemorySize, SM_PART));
@@ -332,7 +332,7 @@ __host__ void host_bootstrap_low_latency(
   } else {
     int DM_NONE = 0;
     kernel_args[11] = &DM_NONE;
-    d_mem = (char *)cuda_malloc_async(0, *stream, gpu_index);
+    d_mem = (char *)cuda_malloc_async(0, stream, gpu_index);
     checkCudaErrors(cudaFuncSetAttribute(
         device_bootstrap_low_latency,
         cudaFuncAttributeMaxDynamicSharedMemorySize, SM_FULL));
@@ -347,9 +347,9 @@ __host__ void host_bootstrap_low_latency(
   // Synchronize the streams before copying the result to lwe_array_out at the
   // right place
   cudaStreamSynchronize(*stream);
-  cuda_drop_async(mask_buffer_fft, *stream, gpu_index);
-  cuda_drop_async(body_buffer_fft, *stream, gpu_index);
-  cuda_drop_async(d_mem, *stream, gpu_index);
+  cuda_drop_async(mask_buffer_fft, stream, gpu_index);
+  cuda_drop_async(body_buffer_fft, stream, gpu_index);
+  cuda_drop_async(d_mem, stream, gpu_index);
 }

 #endif // LOWLAT_PBS_H
diff --git a/src/bootstrap_wop.cuh b/src/bootstrap_wop.cuh
index b626a6fed..8e0fe5486 100644
--- a/src/bootstrap_wop.cuh
+++ b/src/bootstrap_wop.cuh
@@ -296,10 +296,10 @@ void host_cmux_tree(void *v_stream, uint32_t gpu_index, Torus *glwe_array_out,
                   (glwe_dimension + 1) * level_count;

   double2 *d_ggsw_fft_in = (double2 *)cuda_malloc_async(
-      r * ggsw_size * sizeof(double), *stream, gpu_index);
+      r * ggsw_size * sizeof(double), stream, gpu_index);

   batch_fft_ggsw_vector(
-      v_stream, d_ggsw_fft_in, ggsw_in, r, glwe_dimension, polynomial_size,
+      stream, d_ggsw_fft_in, ggsw_in, r, glwe_dimension, polynomial_size,
       level_count, gpu_index, max_shared_memory);

   //////////////////////
@@ -308,7 +308,7 @@ void host_cmux_tree(void *v_stream, uint32_t gpu_index, Torus *glwe_array_out,
   char *d_mem;
   if (max_shared_memory < memory_needed_per_block) {
     d_mem = (char *)cuda_malloc_async(memory_needed_per_block * (1 << (r - 1)),
-                                      *stream, gpu_index);
+                                      stream, gpu_index);
   } else {
     checkCudaErrors(cudaFuncSetAttribute(
         device_batch_cmux,
@@ -322,9 +322,9 @@ void host_cmux_tree(void *v_stream, uint32_t gpu_index, Torus *glwe_array_out,
   int glwe_size = (glwe_dimension + 1) * polynomial_size;

   Torus *d_buffer1 = (Torus *)cuda_malloc_async(
-      num_lut * glwe_size * sizeof(Torus), *stream, gpu_index);
+      num_lut * glwe_size * sizeof(Torus), stream, gpu_index);
   Torus *d_buffer2 = (Torus *)cuda_malloc_async(
-      num_lut * glwe_size * sizeof(Torus), *stream, gpu_index);
+      num_lut * glwe_size * sizeof(Torus), stream, gpu_index);

   checkCudaErrors(cudaMemcpyAsync(d_buffer1, lut_vector,
                                   num_lut * glwe_size * sizeof(Torus),
@@ -368,11 +368,11 @@ void host_cmux_tree(void *v_stream, uint32_t gpu_index, Torus *glwe_array_out,
   checkCudaErrors(cudaStreamSynchronize(*stream));

   // Free memory
-  cuda_drop_async(d_ggsw_fft_in, *stream, gpu_index);
-  cuda_drop_async(d_buffer1, *stream, gpu_index);
-  cuda_drop_async(d_buffer2, *stream, gpu_index);
+  cuda_drop_async(d_ggsw_fft_in, stream, gpu_index);
+  cuda_drop_async(d_buffer1, stream, gpu_index);
+  cuda_drop_async(d_buffer2, stream, gpu_index);
   if (max_shared_memory < memory_needed_per_block)
-    cuda_drop_async(d_mem, *stream, gpu_index);
+    cuda_drop_async(d_mem, stream, gpu_index);
 }

 // only works for big lwe for ks+bs case
@@ -723,7 +723,7 @@ void host_blind_rotate_and_sample_extraction(

   char *d_mem;
   if (max_shared_memory < memory_needed_per_block)
-    d_mem = (char *)cuda_malloc_async(memory_needed_per_block * tau, *stream,
+    d_mem = (char *)cuda_malloc_async(memory_needed_per_block * tau, stream,
                                       gpu_index);
   else {
     checkCudaErrors(cudaFuncSetAttribute(
@@ -740,11 +740,11 @@ void host_blind_rotate_and_sample_extraction(
   int ggsw_size = polynomial_size * (glwe_dimension + 1) *
                   (glwe_dimension + 1) * l_gadget;
   double2 *d_ggsw_fft_in = (double2 *)cuda_malloc_async(
-      mbr_size * ggsw_size * sizeof(double), *stream, gpu_index);
+      mbr_size * ggsw_size * sizeof(double), stream, gpu_index);

   batch_fft_ggsw_vector(
-      v_stream, d_ggsw_fft_in, ggsw_in, mbr_size, glwe_dimension,
-      polynomial_size, l_gadget, gpu_index, max_shared_memory);
+      stream, d_ggsw_fft_in, ggsw_in, mbr_size, glwe_dimension, polynomial_size,
+      l_gadget, gpu_index, max_shared_memory);
   checkCudaErrors(cudaGetLastError());

   //
@@ -768,9 +768,9 @@ void host_blind_rotate_and_sample_extraction(
   checkCudaErrors(cudaGetLastError());

   //
-  cuda_drop_async(d_ggsw_fft_in, *stream, gpu_index);
+  cuda_drop_async(d_ggsw_fft_in, stream, gpu_index);
   if (max_shared_memory < memory_needed_per_block)
-    cuda_drop_async(d_mem, *stream, gpu_index);
+    cuda_drop_async(d_mem, stream, gpu_index);
 }

 template
@@ -852,25 +852,25 @@ __host__ void host_circuit_bootstrap_vertical_packing(
   int ggsw_size = level_count_cbs * (glwe_dimension + 1) *
                   (glwe_dimension + 1) * polynomial_size;
   Torus *ggsw_out = (Torus *)cuda_malloc_async(
-      number_of_inputs * ggsw_size * sizeof(Torus), *stream, gpu_index);
+      number_of_inputs * ggsw_size * sizeof(Torus), stream, gpu_index);
   // input lwe array for fp-ks
   Torus *lwe_array_in_fp_ks_buffer = (Torus *)cuda_malloc_async(
       number_of_inputs * level_count_cbs * (glwe_dimension + 1) *
           (polynomial_size + 1) * sizeof(Torus),
-      *stream, gpu_index);
+      stream, gpu_index);
   // buffer for pbs output
   Torus *lwe_array_out_pbs_buffer =
       (Torus *)cuda_malloc_async(number_of_inputs * level_count_cbs *
                                      (polynomial_size + 1) * sizeof(Torus),
-                                 *stream, gpu_index);
+                                 stream, gpu_index);
   // vector for shifted lwe input
   Torus *lwe_array_in_shifted_buffer = (Torus *)cuda_malloc_async(
       number_of_inputs * level_count_cbs * (lwe_dimension + 1) * sizeof(Torus),
-      *stream, gpu_index);
+      stream, gpu_index);
   // lut vector buffer for cbs
   Torus *lut_vector_cbs = (Torus *)cuda_malloc_async(
       level_count_cbs * (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
-      *stream, gpu_index);
+      stream, gpu_index);
   // indexes of lut vectors for cbs
   uint32_t *h_lut_vector_indexes =
       (uint32_t *)malloc(number_of_inputs * level_count_cbs * sizeof(uint32_t));
@@ -878,12 +878,10 @@ __host__ void host_circuit_bootstrap_vertical_packing(
     h_lut_vector_indexes[index] = index % level_count_cbs;
   }
   uint32_t *lut_vector_indexes = (uint32_t *)cuda_malloc_async(
-      number_of_inputs * level_count_cbs * sizeof(uint32_t), *stream,
-      gpu_index);
-  cuda_memcpy_async_to_gpu(lut_vector_indexes, h_lut_vector_indexes,
-                           number_of_inputs * level_count_cbs *
-                               sizeof(uint32_t),
-                           v_stream, gpu_index);
+      number_of_inputs * level_count_cbs * sizeof(uint32_t), stream, gpu_index);
+  cuda_memcpy_async_to_gpu(
+      lut_vector_indexes, h_lut_vector_indexes,
+      number_of_inputs * level_count_cbs * sizeof(uint32_t), stream, gpu_index);
   checkCudaErrors(cudaGetLastError());

   uint32_t bits = sizeof(Torus) * 8;
@@ -898,17 +896,17 @@ __host__ void host_circuit_bootstrap_vertical_packing(
       base_log_cbs, number_of_inputs, max_shared_memory);
   checkCudaErrors(cudaGetLastError());
   // Free memory
-  cuda_drop_async(lwe_array_in_fp_ks_buffer, *stream, gpu_index);
-  cuda_drop_async(lwe_array_in_shifted_buffer, *stream, gpu_index);
-  cuda_drop_async(lwe_array_out_pbs_buffer, *stream, gpu_index);
-  cuda_drop_async(lut_vector_cbs, *stream, gpu_index);
-  cuda_drop_async(lut_vector_indexes, *stream, gpu_index);
+  cuda_drop_async(lwe_array_in_fp_ks_buffer, stream, gpu_index);
+  cuda_drop_async(lwe_array_in_shifted_buffer, stream, gpu_index);
+  cuda_drop_async(lwe_array_out_pbs_buffer, stream, gpu_index);
+  cuda_drop_async(lut_vector_cbs, stream, gpu_index);
+  cuda_drop_async(lut_vector_indexes, stream, gpu_index);
   free(h_lut_vector_indexes);

   // we need to expand the lut to fill the masks with zeros
   Torus *lut_vector_glwe = (Torus *)cuda_malloc_async(
       (glwe_dimension + 1) * lut_number * polynomial_size * sizeof(Torus),
-      *stream, gpu_index);
+      stream, gpu_index);
   int num_blocks = 0, num_threads = 0;
   int num_entries = glwe_dimension * polynomial_size * lut_number;
   getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
@@ -928,7 +926,7 @@ __host__ void host_circuit_bootstrap_vertical_packing(
           (ptrdiff_t)(i * (glwe_dimension + 1) * polynomial_size);
       // CMUX Tree
       Torus *glwe_array_out = (Torus *)cuda_malloc_async(
-          (glwe_dimension + 1) * polynomial_size * sizeof(Torus), *stream,
+          (glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream,
           gpu_index);
       checkCudaErrors(cudaGetLastError());
       // r = tau * p - log2(N)
@@ -947,7 +945,7 @@ __host__ void host_circuit_bootstrap_vertical_packing(
           number_of_inputs - r, 1, glwe_dimension, polynomial_size,
           base_log_cbs, level_count_cbs, max_shared_memory);

-      cuda_drop_async(glwe_array_out, *stream, gpu_index);
+      cuda_drop_async(glwe_array_out, stream, gpu_index);
     }

   } else {
@@ -963,7 +961,7 @@ __host__ void host_circuit_bootstrap_vertical_packing(
           max_shared_memory);
     }
   }
-  cuda_drop_async(ggsw_out, *stream, gpu_index);
+  cuda_drop_async(ggsw_out, stream, gpu_index);
 }

 #endif // WOP_PBS_H
diff --git a/src/crypto/bootstrapping_key.cuh b/src/crypto/bootstrapping_key.cuh
index 21b00141f..f26a4c2a7 100644
--- a/src/crypto/bootstrapping_key.cuh
+++ b/src/crypto/bootstrapping_key.cuh
@@ -117,7 +117,7 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
   switch (polynomial_size) {
   case 512:
     if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
-      buffer = (double2 *)cuda_malloc_async(0, *stream, gpu_index);
+      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
       checkCudaErrors(cudaFuncSetAttribute(
           batch_NSMFFT, ForwardFFT>, FULLSM>,
           cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -129,14 +129,14 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
                                                                   buffer);
     } else {
       buffer = (double2 *)cuda_malloc_async(
-          shared_memory_size * total_polynomials, *stream, gpu_index);
+          shared_memory_size * total_polynomials, stream, gpu_index);
       batch_NSMFFT, ForwardFFT>, NOSM>
           <<>>(d_bsk, dest, buffer);
     }
     break;
   case 1024:
     if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
-      buffer = (double2 *)cuda_malloc_async(0, *stream, gpu_index);
+      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
       checkCudaErrors(cudaFuncSetAttribute(
           batch_NSMFFT, ForwardFFT>, FULLSM>,
           cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -148,14 +148,14 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
                                                                   buffer);
     } else {
       buffer = (double2 *)cuda_malloc_async(
-          shared_memory_size * total_polynomials, *stream, gpu_index);
+          shared_memory_size * total_polynomials, stream, gpu_index);
       batch_NSMFFT, ForwardFFT>, NOSM>
           <<>>(d_bsk, dest, buffer);
     }
     break;
   case 2048:
     if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
-      buffer = (double2 *)cuda_malloc_async(0, *stream, gpu_index);
+      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
       checkCudaErrors(cudaFuncSetAttribute(
           batch_NSMFFT, ForwardFFT>, FULLSM>,
           cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -167,14 +167,14 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
                                                                   buffer);
     } else {
       buffer = (double2 *)cuda_malloc_async(
-          shared_memory_size * total_polynomials, *stream, gpu_index);
+          shared_memory_size * total_polynomials, stream, gpu_index);
       batch_NSMFFT, ForwardFFT>, NOSM>
           <<>>(d_bsk, dest, buffer);
     }
     break;
   case 4096:
     if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
-      buffer = (double2 *)cuda_malloc_async(0, *stream, gpu_index);
+      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
       checkCudaErrors(cudaFuncSetAttribute(
           batch_NSMFFT, ForwardFFT>, FULLSM>,
           cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -186,14 +186,14 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
                                                                   buffer);
     } else {
       buffer = (double2 *)cuda_malloc_async(
-          shared_memory_size * total_polynomials, *stream, gpu_index);
+          shared_memory_size * total_polynomials, stream, gpu_index);
       batch_NSMFFT, ForwardFFT>, NOSM>
           <<>>(d_bsk, dest, buffer);
     }
     break;
   case 8192:
     if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
-      buffer = (double2 *)cuda_malloc_async(0, *stream, gpu_index);
+      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
       checkCudaErrors(cudaFuncSetAttribute(
           batch_NSMFFT, ForwardFFT>, FULLSM>,
           cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -205,7 +205,7 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
                                                                   buffer);
     } else {
       buffer = (double2 *)cuda_malloc_async(
-          shared_memory_size * total_polynomials, *stream, gpu_index);
+          shared_memory_size * total_polynomials, stream, gpu_index);
       batch_NSMFFT, ForwardFFT>, NOSM>
           <<>>(d_bsk, dest, buffer);
     }
@@ -214,8 +214,8 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
     break;
   }

-  cuda_drop_async(d_bsk, *stream, gpu_index);
-  cuda_drop_async(buffer, *stream, gpu_index);
+  cuda_drop_async(d_bsk, stream, gpu_index);
+  cuda_drop_async(buffer, stream, gpu_index);
   free(h_bsk);
 }
diff --git a/src/crypto/ggsw.cuh b/src/crypto/ggsw.cuh
index 4f5e70825..d258b1e8e 100644
--- a/src/crypto/ggsw.cuh
+++ b/src/crypto/ggsw.cuh
@@ -49,12 +49,10 @@ __global__ void device_batch_fft_ggsw_vector(double2 *dest, T *src,
  * global memory
  */
 template
-void batch_fft_ggsw_vector(void *v_stream, double2 *dest, T *src, uint32_t r,
-                           uint32_t glwe_dim, uint32_t polynomial_size,
-                           uint32_t level_count, uint32_t gpu_index,
-                           uint32_t max_shared_memory) {
-
-  auto stream = static_cast<cudaStream_t *>(v_stream);
+void batch_fft_ggsw_vector(cudaStream_t *stream, double2 *dest, T *src,
+                           uint32_t r, uint32_t glwe_dim,
+                           uint32_t polynomial_size, uint32_t level_count,
+                           uint32_t gpu_index, uint32_t max_shared_memory) {

   int shared_memory_size = sizeof(double) * polynomial_size;

@@ -63,11 +61,11 @@ void batch_fft_ggsw_vector(void *v_stream, double2 *dest, T *src, uint32_t r,
   char *d_mem;
   if (max_shared_memory < shared_memory_size) {
-    d_mem = (char *)cuda_malloc_async(shared_memory_size, *stream, gpu_index);
+    d_mem = (char *)cuda_malloc_async(shared_memory_size, stream, gpu_index);
     device_batch_fft_ggsw_vector
         <<>>(dest, src, d_mem);
     checkCudaErrors(cudaGetLastError());
-    cuda_drop_async(d_mem, *stream, gpu_index);
+    cuda_drop_async(d_mem, stream, gpu_index);
   } else {
     device_batch_fft_ggsw_vector
         <<>>(dest, src,
diff --git a/src/device.cu b/src/device.cu
index e77fbf9e2..d890a49ca 100644
--- a/src/device.cu
+++ b/src/device.cu
@@ -5,7 +5,7 @@
 #include

 /// Unsafe function to create a CUDA stream, must check first that GPU exists
-void *cuda_create_stream(uint32_t gpu_index) {
+cudaStream_t *cuda_create_stream(uint32_t gpu_index) {
   cudaSetDevice(gpu_index);
   cudaStream_t *stream = new cudaStream_t;
   cudaStreamCreate(stream);
@@ -13,9 +13,8 @@ void *cuda_create_stream(uint32_t gpu_index) {
 }

 /// Unsafe function to destroy CUDA stream, must check first the GPU exists
-int cuda_destroy_stream(void *v_stream, uint32_t gpu_index) {
+int cuda_destroy_stream(cudaStream_t *stream, uint32_t gpu_index) {
   cudaSetDevice(gpu_index);
-  auto stream = static_cast<cudaStream_t *>(v_stream);
   cudaStreamDestroy(*stream);
   return 0;
 }
@@ -33,7 +32,7 @@ void *cuda_malloc(uint64_t size, uint32_t gpu_index) {

 /// Allocates a size-byte array at the device memory. Tries to do it
 /// asynchronously.
-void *cuda_malloc_async(uint64_t size, cudaStream_t stream,
+void *cuda_malloc_async(uint64_t size, cudaStream_t *stream,
                         uint32_t gpu_index) {
   void *ptr;

@@ -42,7 +41,7 @@ void *cuda_malloc_async(uint64_t size, cudaStream_t stream,
       &support_async_alloc, cudaDevAttrMemoryPoolsSupported, gpu_index));

   if (support_async_alloc)
-    checkCudaErrors(cudaMallocAsync((void **)&ptr, size, stream));
+    checkCudaErrors(cudaMallocAsync((void **)&ptr, size, *stream));
   else
     checkCudaErrors(cudaMalloc((void **)&ptr, size));
   return ptr;
@@ -74,7 +73,7 @@ int cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index) {
 /// -2: error, gpu index doesn't exist
 /// -3: error, zero copy size
 int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
-                             void *v_stream, uint32_t gpu_index) {
+                             cudaStream_t *stream, uint32_t gpu_index) {
   if (size == 0) {
     // error code: zero copy size
     return -3;
@@ -90,7 +89,7 @@ int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
     // error code: invalid device pointer
     return -1;
   }
-  auto stream = static_cast<cudaStream_t *>(v_stream);
+  cudaSetDevice(gpu_index);
   checkCudaErrors(
       cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice, *stream));
@@ -116,7 +115,7 @@ int cuda_synchronize_device(uint32_t gpu_index) {
 /// -2: error, gpu index doesn't exist
 /// -3: error, zero copy size
 int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
-                             void *v_stream, uint32_t gpu_index) {
+                             cudaStream_t *stream, uint32_t gpu_index) {
   if (size == 0) {
     // error code: zero copy size
     return -3;
@@ -132,7 +131,7 @@ int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
     // error code: invalid device pointer
     return -1;
   }
-  auto stream = static_cast<cudaStream_t *>(v_stream);
+  cudaSetDevice(gpu_index);
   checkCudaErrors(
       cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, *stream));
@@ -158,14 +157,14 @@ int cuda_drop(void *ptr, uint32_t gpu_index) {
 }

 /// Drop a cuda array. Tries to do it asynchronously
-int cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index) {
+int cuda_drop_async(void *ptr, cudaStream_t *stream, uint32_t gpu_index) {

   int support_async_alloc;
   checkCudaErrors(cudaDeviceGetAttribute(
       &support_async_alloc, cudaDevAttrMemoryPoolsSupported, gpu_index));

   if (support_async_alloc)
-    checkCudaErrors(cudaFreeAsync(ptr, stream));
+    checkCudaErrors(cudaFreeAsync(ptr, *stream));
   else
     checkCudaErrors(cudaFree(ptr));
   return 0;
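The snippet below is not part of the patch; it is a minimal, illustrative host-side caller sketching how the new pointer-based stream API declared in include/device.h is driven end to end. It assumes device.h is on the include path, ignores the integer error codes returned by the copy and drop functions, and uses only the signatures taken from the header as modified above.

// Sketch only: exercises the cudaStream_t * based entry points in order.
#include <cstdint>
#include <vector>

#include <cuda_runtime.h>

#include "device.h"

int main() {
  uint32_t gpu_index = 0;
  // The stream is created and passed around as cudaStream_t *,
  // so callers no longer cast to and from void *.
  cudaStream_t *stream = cuda_create_stream(gpu_index);

  std::vector<uint64_t> host(1024, 42);
  uint64_t size = host.size() * sizeof(uint64_t);

  // Allocate, copy in, copy back and free, all queued on the same stream.
  void *device_buf = cuda_malloc_async(size, stream, gpu_index);
  cuda_memcpy_async_to_gpu(device_buf, host.data(), size, stream, gpu_index);
  cuda_memcpy_async_to_cpu(host.data(), device_buf, size, stream, gpu_index);
  cuda_drop_async(device_buf, stream, gpu_index);

  // Wait for all work queued on the stream before tearing it down.
  cudaStreamSynchronize(*stream);
  cuda_destroy_stream(stream, gpu_index);
  return 0;
}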