diff --git a/include/device.h b/include/device.h
index 89ef42b6d..a993aaf5e 100644
--- a/include/device.h
+++ b/include/device.h
@@ -2,13 +2,14 @@
 #include

 extern "C" {
-void *cuda_create_stream(uint32_t gpu_index);
+cudaStream_t *cuda_create_stream(uint32_t gpu_index);

-int cuda_destroy_stream(void *v_stream, uint32_t gpu_index);
+int cuda_destroy_stream(cudaStream_t *stream, uint32_t gpu_index);

 void *cuda_malloc(uint64_t size, uint32_t gpu_index);

-void *cuda_malloc_async(uint64_t size, cudaStream_t stream, uint32_t gpu_index);
+void *cuda_malloc_async(uint64_t size, cudaStream_t *stream,
+                        uint32_t gpu_index);

 int cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);

@@ -16,20 +17,20 @@ int cuda_memcpy_to_cpu(void *dest, const void *src, uint64_t size,
                        uint32_t gpu_index);

 int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
-                             void *v_stream, uint32_t gpu_index);
+                             cudaStream_t *stream, uint32_t gpu_index);

 int cuda_memcpy_to_gpu(void *dest, void *src, uint64_t size,
                        uint32_t gpu_index);

 int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
-                             void *v_stream, uint32_t gpu_index);
+                             cudaStream_t *stream, uint32_t gpu_index);

 int cuda_get_number_of_gpus();

 int cuda_synchronize_device(uint32_t gpu_index);

 int cuda_drop(void *ptr, uint32_t gpu_index);

-int cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index);
+int cuda_drop_async(void *ptr, cudaStream_t *stream, uint32_t gpu_index);

 int cuda_get_max_shared_memory(uint32_t gpu_index);
 }
diff --git a/src/bootstrap_amortized.cuh b/src/bootstrap_amortized.cuh
index cdb0876b9..067f2a9c5 100644
--- a/src/bootstrap_amortized.cuh
+++ b/src/bootstrap_amortized.cuh
@@ -317,7 +317,7 @@ __host__ void host_bootstrap_amortized(
   // of shared memory)
   if (max_shared_memory < SM_PART) {
     d_mem = (char *)cuda_malloc_async(DM_FULL * input_lwe_ciphertext_count,
-                                      *stream, gpu_index);
+                                      stream, gpu_index);
     device_bootstrap_amortized<<>>(
         lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
         bootstrapping_key, d_mem, input_lwe_dimension, polynomial_size,
@@ -328,7 +328,7 @@ __host__ void host_bootstrap_amortized(
     cudaFuncSetCacheConfig(device_bootstrap_amortized,
                            cudaFuncCachePreferShared);
     d_mem = (char *)cuda_malloc_async(DM_PART * input_lwe_ciphertext_count,
-                                      *stream, gpu_index);
+                                      stream, gpu_index);
     device_bootstrap_amortized
         <<>>(
             lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
@@ -346,7 +346,7 @@ __host__ void host_bootstrap_amortized(
     checkCudaErrors(cudaFuncSetCacheConfig(
         device_bootstrap_amortized,
         cudaFuncCachePreferShared));
-    d_mem = (char *)cuda_malloc_async(0, *stream, gpu_index);
+    d_mem = (char *)cuda_malloc_async(0, stream, gpu_index);

     device_bootstrap_amortized
         <<>>(
@@ -359,7 +359,7 @@ __host__ void host_bootstrap_amortized(
   // Synchronize the streams before copying the result to lwe_array_out at the
   // right place
   cudaStreamSynchronize(*stream);
-  cuda_drop_async(d_mem, *stream, gpu_index);
+  cuda_drop_async(d_mem, stream, gpu_index);
 }

 template
diff --git a/src/bootstrap_low_latency.cuh b/src/bootstrap_low_latency.cuh
index 64c0cdce6..4c2cca490 100644
--- a/src/bootstrap_low_latency.cuh
+++ b/src/bootstrap_low_latency.cuh
@@ -268,9 +268,9 @@ __host__ void host_bootstrap_low_latency(
   int buffer_size_per_gpu = level_count * input_lwe_ciphertext_count *
                             polynomial_size / 2 * sizeof(double2);
   double2 *mask_buffer_fft =
-      (double2 *)cuda_malloc_async(buffer_size_per_gpu, *stream, gpu_index);
+      (double2 *)cuda_malloc_async(buffer_size_per_gpu, stream, gpu_index);
   double2 *body_buffer_fft =
-      (double2 *)cuda_malloc_async(buffer_size_per_gpu, *stream, gpu_index);
+      (double2 *)cuda_malloc_async(buffer_size_per_gpu, stream, gpu_index);

   // With SM each block corresponds to either the mask or body, no need to
   // duplicate data for each
@@ -308,7 +308,7 @@ __host__ void host_bootstrap_low_latency(
     checkCudaErrors(cudaGetLastError());
     d_mem = (char *)cuda_malloc_async(DM_FULL * input_lwe_ciphertext_count *
                                           level_count * 2,
-                                      *stream, gpu_index);
+                                      stream, gpu_index);
     checkCudaErrors(cudaGetLastError());
     checkCudaErrors(cudaLaunchCooperativeKernel(
         (void *)device_bootstrap_low_latency, grid, thds,
@@ -317,7 +317,7 @@ __host__ void host_bootstrap_low_latency(
     kernel_args[11] = &DM_PART;
     d_mem = (char *)cuda_malloc_async(DM_PART * input_lwe_ciphertext_count *
                                           level_count * 2,
-                                      *stream, gpu_index);
+                                      stream, gpu_index);
     checkCudaErrors(cudaFuncSetAttribute(
         device_bootstrap_low_latency,
         cudaFuncAttributeMaxDynamicSharedMemorySize, SM_PART));
@@ -332,7 +332,7 @@ __host__ void host_bootstrap_low_latency(
   } else {
     int DM_NONE = 0;
     kernel_args[11] = &DM_NONE;
-    d_mem = (char *)cuda_malloc_async(0, *stream, gpu_index);
+    d_mem = (char *)cuda_malloc_async(0, stream, gpu_index);
     checkCudaErrors(cudaFuncSetAttribute(
         device_bootstrap_low_latency,
         cudaFuncAttributeMaxDynamicSharedMemorySize, SM_FULL));
@@ -347,9 +347,9 @@ __host__ void host_bootstrap_low_latency(
   // Synchronize the streams before copying the result to lwe_array_out at the
   // right place
   cudaStreamSynchronize(*stream);
-  cuda_drop_async(mask_buffer_fft, *stream, gpu_index);
-  cuda_drop_async(body_buffer_fft, *stream, gpu_index);
-  cuda_drop_async(d_mem, *stream, gpu_index);
+  cuda_drop_async(mask_buffer_fft, stream, gpu_index);
+  cuda_drop_async(body_buffer_fft, stream, gpu_index);
+  cuda_drop_async(d_mem, stream, gpu_index);
 }

 #endif // LOWLAT_PBS_H
diff --git a/src/bootstrap_wop.cuh b/src/bootstrap_wop.cuh
index b626a6fed..8e0fe5486 100644
--- a/src/bootstrap_wop.cuh
+++ b/src/bootstrap_wop.cuh
@@ -296,10 +296,10 @@ void host_cmux_tree(void *v_stream, uint32_t gpu_index, Torus *glwe_array_out,
                   (glwe_dimension + 1) * level_count;

   double2 *d_ggsw_fft_in = (double2 *)cuda_malloc_async(
-      r * ggsw_size * sizeof(double), *stream, gpu_index);
+      r * ggsw_size * sizeof(double), stream, gpu_index);

   batch_fft_ggsw_vector(
-      v_stream, d_ggsw_fft_in, ggsw_in, r, glwe_dimension, polynomial_size,
+      stream, d_ggsw_fft_in, ggsw_in, r, glwe_dimension, polynomial_size,
       level_count, gpu_index, max_shared_memory);

   //////////////////////
@@ -308,7 +308,7 @@ void host_cmux_tree(void *v_stream, uint32_t gpu_index, Torus *glwe_array_out,
   char *d_mem;
   if (max_shared_memory < memory_needed_per_block) {
     d_mem = (char *)cuda_malloc_async(memory_needed_per_block * (1 << (r - 1)),
-                                      *stream, gpu_index);
+                                      stream, gpu_index);
   } else {
     checkCudaErrors(cudaFuncSetAttribute(
         device_batch_cmux,
@@ -322,9 +322,9 @@ void host_cmux_tree(void *v_stream, uint32_t gpu_index, Torus *glwe_array_out,
   int glwe_size = (glwe_dimension + 1) * polynomial_size;

   Torus *d_buffer1 = (Torus *)cuda_malloc_async(
-      num_lut * glwe_size * sizeof(Torus), *stream, gpu_index);
+      num_lut * glwe_size * sizeof(Torus), stream, gpu_index);
   Torus *d_buffer2 = (Torus *)cuda_malloc_async(
-      num_lut * glwe_size * sizeof(Torus), *stream, gpu_index);
+      num_lut * glwe_size * sizeof(Torus), stream, gpu_index);

   checkCudaErrors(cudaMemcpyAsync(d_buffer1, lut_vector,
                                   num_lut * glwe_size * sizeof(Torus),
@@ -368,11 +368,11 @@ void host_cmux_tree(void *v_stream, uint32_t gpu_index, Torus *glwe_array_out,
   checkCudaErrors(cudaStreamSynchronize(*stream));

   // Free memory
-  cuda_drop_async(d_ggsw_fft_in, *stream, gpu_index);
-  cuda_drop_async(d_buffer1, *stream, gpu_index);
-  cuda_drop_async(d_buffer2, *stream, gpu_index);
+  cuda_drop_async(d_ggsw_fft_in, stream, gpu_index);
+  cuda_drop_async(d_buffer1, stream, gpu_index);
+  cuda_drop_async(d_buffer2, stream, gpu_index);
   if (max_shared_memory < memory_needed_per_block)
-    cuda_drop_async(d_mem, *stream, gpu_index);
+    cuda_drop_async(d_mem, stream, gpu_index);
 }

 // only works for big lwe for ks+bs case
@@ -723,7 +723,7 @@ void host_blind_rotate_and_sample_extraction(

   char *d_mem;
   if (max_shared_memory < memory_needed_per_block)
-    d_mem = (char *)cuda_malloc_async(memory_needed_per_block * tau, *stream,
+    d_mem = (char *)cuda_malloc_async(memory_needed_per_block * tau, stream,
                                       gpu_index);
   else {
     checkCudaErrors(cudaFuncSetAttribute(
@@ -740,11 +740,11 @@ void host_blind_rotate_and_sample_extraction(
   int ggsw_size = polynomial_size * (glwe_dimension + 1) *
                   (glwe_dimension + 1) * l_gadget;
   double2 *d_ggsw_fft_in = (double2 *)cuda_malloc_async(
-      mbr_size * ggsw_size * sizeof(double), *stream, gpu_index);
+      mbr_size * ggsw_size * sizeof(double), stream, gpu_index);

   batch_fft_ggsw_vector(
-      v_stream, d_ggsw_fft_in, ggsw_in, mbr_size, glwe_dimension,
-      polynomial_size, l_gadget, gpu_index, max_shared_memory);
+      stream, d_ggsw_fft_in, ggsw_in, mbr_size, glwe_dimension, polynomial_size,
+      l_gadget, gpu_index, max_shared_memory);
   checkCudaErrors(cudaGetLastError());

   //
@@ -768,9 +768,9 @@ void host_blind_rotate_and_sample_extraction(
   checkCudaErrors(cudaGetLastError());

   //
-  cuda_drop_async(d_ggsw_fft_in, *stream, gpu_index);
+  cuda_drop_async(d_ggsw_fft_in, stream, gpu_index);
   if (max_shared_memory < memory_needed_per_block)
-    cuda_drop_async(d_mem, *stream, gpu_index);
+    cuda_drop_async(d_mem, stream, gpu_index);
 }

 template
@@ -852,25 +852,25 @@ __host__ void host_circuit_bootstrap_vertical_packing(
   int ggsw_size = level_count_cbs * (glwe_dimension + 1) *
                   (glwe_dimension + 1) * polynomial_size;
   Torus *ggsw_out = (Torus *)cuda_malloc_async(
-      number_of_inputs * ggsw_size * sizeof(Torus), *stream, gpu_index);
+      number_of_inputs * ggsw_size * sizeof(Torus), stream, gpu_index);
   // input lwe array for fp-ks
   Torus *lwe_array_in_fp_ks_buffer = (Torus *)cuda_malloc_async(
       number_of_inputs * level_count_cbs * (glwe_dimension + 1) *
           (polynomial_size + 1) * sizeof(Torus),
-      *stream, gpu_index);
+      stream, gpu_index);
   // buffer for pbs output
   Torus *lwe_array_out_pbs_buffer =
       (Torus *)cuda_malloc_async(number_of_inputs * level_count_cbs *
                                      (polynomial_size + 1) * sizeof(Torus),
-                                 *stream, gpu_index);
+                                 stream, gpu_index);
   // vector for shifted lwe input
   Torus *lwe_array_in_shifted_buffer = (Torus *)cuda_malloc_async(
       number_of_inputs * level_count_cbs * (lwe_dimension + 1) * sizeof(Torus),
-      *stream, gpu_index);
+      stream, gpu_index);
   // lut vector buffer for cbs
   Torus *lut_vector_cbs = (Torus *)cuda_malloc_async(
       level_count_cbs * (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
-      *stream, gpu_index);
+      stream, gpu_index);
   // indexes of lut vectors for cbs
   uint32_t *h_lut_vector_indexes =
       (uint32_t *)malloc(number_of_inputs * level_count_cbs * sizeof(uint32_t));
@@ -878,12 +878,10 @@ __host__ void host_circuit_bootstrap_vertical_packing(
     h_lut_vector_indexes[index] = index % level_count_cbs;
   }
   uint32_t *lut_vector_indexes = (uint32_t *)cuda_malloc_async(
-      number_of_inputs * level_count_cbs * sizeof(uint32_t), *stream,
-      gpu_index);
-  cuda_memcpy_async_to_gpu(lut_vector_indexes, h_lut_vector_indexes,
-                           number_of_inputs * level_count_cbs *
-                               sizeof(uint32_t),
-                           v_stream, gpu_index);
+      number_of_inputs * level_count_cbs * sizeof(uint32_t), stream, gpu_index);
+  cuda_memcpy_async_to_gpu(
+      lut_vector_indexes, h_lut_vector_indexes,
+      number_of_inputs * level_count_cbs * sizeof(uint32_t), stream, gpu_index);
   checkCudaErrors(cudaGetLastError());

   uint32_t bits = sizeof(Torus) * 8;
@@ -898,17 +896,17 @@ __host__ void host_circuit_bootstrap_vertical_packing(
       base_log_cbs, number_of_inputs, max_shared_memory);
   checkCudaErrors(cudaGetLastError());
   // Free memory
-  cuda_drop_async(lwe_array_in_fp_ks_buffer, *stream, gpu_index);
-  cuda_drop_async(lwe_array_in_shifted_buffer, *stream, gpu_index);
-  cuda_drop_async(lwe_array_out_pbs_buffer, *stream, gpu_index);
-  cuda_drop_async(lut_vector_cbs, *stream, gpu_index);
-  cuda_drop_async(lut_vector_indexes, *stream, gpu_index);
+  cuda_drop_async(lwe_array_in_fp_ks_buffer, stream, gpu_index);
+  cuda_drop_async(lwe_array_in_shifted_buffer, stream, gpu_index);
+  cuda_drop_async(lwe_array_out_pbs_buffer, stream, gpu_index);
+  cuda_drop_async(lut_vector_cbs, stream, gpu_index);
+  cuda_drop_async(lut_vector_indexes, stream, gpu_index);
   free(h_lut_vector_indexes);

   // we need to expand the lut to fill the masks with zeros
   Torus *lut_vector_glwe = (Torus *)cuda_malloc_async(
       (glwe_dimension + 1) * lut_number * polynomial_size * sizeof(Torus),
-      *stream, gpu_index);
+      stream, gpu_index);
   int num_blocks = 0, num_threads = 0;
   int num_entries = glwe_dimension * polynomial_size * lut_number;
   getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
@@ -928,7 +926,7 @@ __host__ void host_circuit_bootstrap_vertical_packing(
           (ptrdiff_t)(i * (glwe_dimension + 1) * polynomial_size);
       // CMUX Tree
       Torus *glwe_array_out = (Torus *)cuda_malloc_async(
-          (glwe_dimension + 1) * polynomial_size * sizeof(Torus), *stream,
+          (glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream,
           gpu_index);
       checkCudaErrors(cudaGetLastError());
       // r = tau * p - log2(N)
@@ -947,7 +945,7 @@ __host__ void host_circuit_bootstrap_vertical_packing(
           number_of_inputs - r, 1, glwe_dimension, polynomial_size,
           base_log_cbs, level_count_cbs, max_shared_memory);

-      cuda_drop_async(glwe_array_out, *stream, gpu_index);
+      cuda_drop_async(glwe_array_out, stream, gpu_index);
     }

   } else {
@@ -963,7 +961,7 @@ __host__ void host_circuit_bootstrap_vertical_packing(
           max_shared_memory);
     }
   }
-  cuda_drop_async(ggsw_out, *stream, gpu_index);
+  cuda_drop_async(ggsw_out, stream, gpu_index);
 }

 #endif // WOP_PBS_H
diff --git a/src/crypto/bootstrapping_key.cuh b/src/crypto/bootstrapping_key.cuh
index 21b00141f..f26a4c2a7 100644
--- a/src/crypto/bootstrapping_key.cuh
+++ b/src/crypto/bootstrapping_key.cuh
@@ -117,7 +117,7 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
   switch (polynomial_size) {
   case 512:
     if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
-      buffer = (double2 *)cuda_malloc_async(0, *stream, gpu_index);
+      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
       checkCudaErrors(cudaFuncSetAttribute(
           batch_NSMFFT, ForwardFFT>, FULLSM>,
           cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -129,14 +129,14 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
                                                                   buffer);
     } else {
       buffer = (double2 *)cuda_malloc_async(
-          shared_memory_size * total_polynomials, *stream, gpu_index);
+          shared_memory_size * total_polynomials, stream, gpu_index);
       batch_NSMFFT, ForwardFFT>, NOSM>
           <<>>(d_bsk, dest, buffer);
     }
     break;
   case 1024:
     if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
-      buffer = (double2 *)cuda_malloc_async(0, *stream, gpu_index);
+      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
       checkCudaErrors(cudaFuncSetAttribute(
           batch_NSMFFT, ForwardFFT>, FULLSM>,
           cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -148,14 +148,14 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
                                                                   buffer);
     } else {
       buffer = (double2 *)cuda_malloc_async(
-          shared_memory_size * total_polynomials, *stream, gpu_index);
+          shared_memory_size * total_polynomials, stream, gpu_index);
       batch_NSMFFT, ForwardFFT>, NOSM>
           <<>>(d_bsk, dest, buffer);
     }
     break;
   case 2048:
     if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
-      buffer = (double2 *)cuda_malloc_async(0, *stream, gpu_index);
+      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
       checkCudaErrors(cudaFuncSetAttribute(
           batch_NSMFFT, ForwardFFT>, FULLSM>,
           cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -167,14 +167,14 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
                                                                   buffer);
     } else {
       buffer = (double2 *)cuda_malloc_async(
-          shared_memory_size * total_polynomials, *stream, gpu_index);
+          shared_memory_size * total_polynomials, stream, gpu_index);
       batch_NSMFFT, ForwardFFT>, NOSM>
           <<>>(d_bsk, dest, buffer);
     }
     break;
   case 4096:
     if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
-      buffer = (double2 *)cuda_malloc_async(0, *stream, gpu_index);
+      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
       checkCudaErrors(cudaFuncSetAttribute(
           batch_NSMFFT, ForwardFFT>, FULLSM>,
           cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -186,14 +186,14 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
                                                                   buffer);
     } else {
       buffer = (double2 *)cuda_malloc_async(
-          shared_memory_size * total_polynomials, *stream, gpu_index);
+          shared_memory_size * total_polynomials, stream, gpu_index);
       batch_NSMFFT, ForwardFFT>, NOSM>
           <<>>(d_bsk, dest, buffer);
     }
     break;
   case 8192:
     if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
-      buffer = (double2 *)cuda_malloc_async(0, *stream, gpu_index);
+      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
       checkCudaErrors(cudaFuncSetAttribute(
           batch_NSMFFT, ForwardFFT>, FULLSM>,
           cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -205,7 +205,7 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
                                                                   buffer);
     } else {
       buffer = (double2 *)cuda_malloc_async(
-          shared_memory_size * total_polynomials, *stream, gpu_index);
+          shared_memory_size * total_polynomials, stream, gpu_index);
       batch_NSMFFT, ForwardFFT>, NOSM>
           <<>>(d_bsk, dest, buffer);
     }
@@ -214,8 +214,8 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
     break;
   }

-  cuda_drop_async(d_bsk, *stream, gpu_index);
-  cuda_drop_async(buffer, *stream, gpu_index);
+  cuda_drop_async(d_bsk, stream, gpu_index);
+  cuda_drop_async(buffer, stream, gpu_index);
   free(h_bsk);
 }
diff --git a/src/crypto/ggsw.cuh b/src/crypto/ggsw.cuh
index 4f5e70825..d258b1e8e 100644
--- a/src/crypto/ggsw.cuh
+++ b/src/crypto/ggsw.cuh
@@ -49,12 +49,10 @@ __global__ void device_batch_fft_ggsw_vector(double2 *dest, T *src,
  * global memory
  */
 template
-void batch_fft_ggsw_vector(void *v_stream, double2 *dest, T *src, uint32_t r,
-                           uint32_t glwe_dim, uint32_t polynomial_size,
-                           uint32_t level_count, uint32_t gpu_index,
-                           uint32_t max_shared_memory) {
-
-  auto stream = static_cast<cudaStream_t *>(v_stream);
+void batch_fft_ggsw_vector(cudaStream_t *stream, double2 *dest, T *src,
+                           uint32_t r, uint32_t glwe_dim,
+                           uint32_t polynomial_size, uint32_t level_count,
+                           uint32_t gpu_index, uint32_t max_shared_memory) {

   int shared_memory_size = sizeof(double) * polynomial_size;

@@ -63,11 +61,11 @@ void batch_fft_ggsw_vector(void *v_stream, double2 *dest, T *src, uint32_t r,
   char *d_mem;
   if (max_shared_memory < shared_memory_size) {
-    d_mem = (char *)cuda_malloc_async(shared_memory_size, *stream, gpu_index);
+    d_mem = (char *)cuda_malloc_async(shared_memory_size, stream, gpu_index);
     device_batch_fft_ggsw_vector
         <<>>(dest, src, d_mem);
     checkCudaErrors(cudaGetLastError());
-    cuda_drop_async(d_mem, *stream, gpu_index);
+    cuda_drop_async(d_mem, stream, gpu_index);
   } else {
     device_batch_fft_ggsw_vector
         <<>>(dest, src,
diff --git a/src/device.cu b/src/device.cu
index e77fbf9e2..d890a49ca 100644
--- a/src/device.cu
+++ b/src/device.cu
@@ -5,7 +5,7 @@
 #include

 /// Unsafe function to create a CUDA stream, must check first that GPU exists
-void *cuda_create_stream(uint32_t gpu_index) {
+cudaStream_t *cuda_create_stream(uint32_t gpu_index) {
   cudaSetDevice(gpu_index);
   cudaStream_t *stream = new cudaStream_t;
   cudaStreamCreate(stream);
@@ -13,9 +13,8 @@ void *cuda_create_stream(uint32_t gpu_index) {
 }

 /// Unsafe function to destroy CUDA stream, must check first the GPU exists
-int cuda_destroy_stream(void *v_stream, uint32_t gpu_index) {
+int cuda_destroy_stream(cudaStream_t *stream, uint32_t gpu_index) {
   cudaSetDevice(gpu_index);
-  auto stream = static_cast<cudaStream_t *>(v_stream);
   cudaStreamDestroy(*stream);
   return 0;
 }
@@ -33,7 +32,7 @@ void *cuda_malloc(uint64_t size, uint32_t gpu_index) {

 /// Allocates a size-byte array at the device memory. Tries to do it
 /// asynchronously.
-void *cuda_malloc_async(uint64_t size, cudaStream_t stream,
+void *cuda_malloc_async(uint64_t size, cudaStream_t *stream,
                         uint32_t gpu_index) {
   void *ptr;

@@ -42,7 +41,7 @@ void *cuda_malloc_async(uint64_t size, cudaStream_t stream,
       &support_async_alloc, cudaDevAttrMemoryPoolsSupported, gpu_index));

   if (support_async_alloc)
-    checkCudaErrors(cudaMallocAsync((void **)&ptr, size, stream));
+    checkCudaErrors(cudaMallocAsync((void **)&ptr, size, *stream));
   else
     checkCudaErrors(cudaMalloc((void **)&ptr, size));
   return ptr;
@@ -74,7 +73,7 @@ int cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index) {
 /// -2: error, gpu index doesn't exist
 /// -3: error, zero copy size
 int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
-                             void *v_stream, uint32_t gpu_index) {
+                             cudaStream_t *stream, uint32_t gpu_index) {
   if (size == 0) {
     // error code: zero copy size
     return -3;
@@ -90,7 +89,7 @@ int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
     // error code: invalid device pointer
     return -1;
   }
-  auto stream = static_cast<cudaStream_t *>(v_stream);
+  cudaSetDevice(gpu_index);
   checkCudaErrors(
       cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice, *stream));
@@ -116,7 +115,7 @@ int cuda_synchronize_device(uint32_t gpu_index) {
 /// -2: error, gpu index doesn't exist
 /// -3: error, zero copy size
 int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
-                             void *v_stream, uint32_t gpu_index) {
+                             cudaStream_t *stream, uint32_t gpu_index) {
   if (size == 0) {
     // error code: zero copy size
     return -3;
@@ -132,7 +131,7 @@ int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
     // error code: invalid device pointer
     return -1;
   }
-  auto stream = static_cast<cudaStream_t *>(v_stream);
+  cudaSetDevice(gpu_index);
   checkCudaErrors(
       cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, *stream));
@@ -158,14 +157,14 @@ int cuda_drop(void *ptr, uint32_t gpu_index) {
 }

 /// Drop a cuda array. Tries to do it asynchronously
-int cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index) {
+int cuda_drop_async(void *ptr, cudaStream_t *stream, uint32_t gpu_index) {

   int support_async_alloc;
   checkCudaErrors(cudaDeviceGetAttribute(
       &support_async_alloc, cudaDevAttrMemoryPoolsSupported, gpu_index));

   if (support_async_alloc)
-    checkCudaErrors(cudaFreeAsync(ptr, stream));
+    checkCudaErrors(cudaFreeAsync(ptr, *stream));
   else
     checkCudaErrors(cudaFree(ptr));
   return 0;
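The snippet below is not part of the patch; it is a minimal, illustrative host-side caller sketching how the new pointer-based stream API declared in include/device.h is driven end to end. It assumes device.h is on the include path, ignores the integer error codes returned by the copy and drop functions, and uses only the signatures taken from the header as modified above.

// Sketch only: exercises the cudaStream_t * based entry points in order.
#include <cstdint>
#include <vector>

#include <cuda_runtime.h>

#include "device.h"

int main() {
  uint32_t gpu_index = 0;
  // The stream is created and passed around as cudaStream_t *,
  // so callers no longer cast to and from void *.
  cudaStream_t *stream = cuda_create_stream(gpu_index);

  std::vector<uint64_t> host(1024, 42);
  uint64_t size = host.size() * sizeof(uint64_t);

  // Allocate, copy in, copy back and free, all queued on the same stream.
  void *device_buf = cuda_malloc_async(size, stream, gpu_index);
  cuda_memcpy_async_to_gpu(device_buf, host.data(), size, stream, gpu_index);
  cuda_memcpy_async_to_cpu(host.data(), device_buf, size, stream, gpu_index);
  cuda_drop_async(device_buf, stream, gpu_index);

  // Wait for all work queued on the stream before tearing it down.
  cudaStreamSynchronize(*stream);
  cuda_destroy_stream(stream, gpu_index);
  return 0;
}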