diff --git a/.github/workflows/concrete_cuda_benchmark.yml b/.github/workflows/concrete_cuda_benchmark.yml index 3070f8dcb..4edc2e9e6 100644 --- a/.github/workflows/concrete_cuda_benchmark.yml +++ b/.github/workflows/concrete_cuda_benchmark.yml @@ -80,8 +80,7 @@ jobs: - name: Benchmark concrete-cuda if: ${{ !cancelled() }} run: | - ${{ env.BENCHMARK_DIR }}/benchmark_concrete_cuda --benchmark_out=benchmarks_results.json - --benchmark_out_format=json + ${{ env.BENCHMARK_DIR }}/benchmark_concrete_cuda --benchmark_out=benchmarks_results.json --benchmark_out_format=json - name: Upload raw results artifact uses: actions/upload-artifact@v3 diff --git a/backends/concrete-cuda/implementation/include/bootstrap.h b/backends/concrete-cuda/implementation/include/bootstrap.h index c8e584000..d85170555 100644 --- a/backends/concrete-cuda/implementation/include/bootstrap.h +++ b/backends/concrete-cuda/implementation/include/bootstrap.h @@ -56,6 +56,12 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64( void cleanup_cuda_bootstrap_amortized(void *v_stream, uint32_t gpu_index, int8_t **pbs_buffer); +bool verify_cuda_bootstrap_low_latency_grid_size_64(int glwe_dimension, + int polynomial_size, + int level_count, + int num_samples, + uint32_t max_shared_memory); + void scratch_cuda_bootstrap_low_latency_32( void *v_stream, uint32_t gpu_index, int8_t **pbs_buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, diff --git a/backends/concrete-cuda/implementation/src/bootstrap_low_latency.cu b/backends/concrete-cuda/implementation/src/bootstrap_low_latency.cu index c05caeb13..42e1e07dd 100644 --- a/backends/concrete-cuda/implementation/src/bootstrap_low_latency.cu +++ b/backends/concrete-cuda/implementation/src/bootstrap_low_latency.cu @@ -11,6 +11,55 @@ uint64_t get_buffer_size_bootstrap_low_latency_64( max_shared_memory); } +// The number of samples should be lower than 4 * SM/((k + 1) * l) (the +// factor 4 being related to the occupancy of 50%). 
+bool verify_cuda_bootstrap_low_latency_grid_size_64( + int glwe_dimension, int polynomial_size, int level_count, int num_samples, + uint32_t max_shared_memory) { + + int ret = false; + switch (polynomial_size) { + case 256: + ret = verify_cuda_bootstrap_low_latency_grid_size<uint64_t, Degree<256>>( + glwe_dimension, polynomial_size, level_count, num_samples, + max_shared_memory); + break; + case 512: + ret = verify_cuda_bootstrap_low_latency_grid_size<uint64_t, Degree<512>>( + glwe_dimension, polynomial_size, level_count, num_samples, + max_shared_memory); + break; + case 1024: + ret = verify_cuda_bootstrap_low_latency_grid_size<uint64_t, Degree<1024>>( + glwe_dimension, polynomial_size, level_count, num_samples, + max_shared_memory); + break; + case 2048: + ret = verify_cuda_bootstrap_low_latency_grid_size<uint64_t, Degree<2048>>( + glwe_dimension, polynomial_size, level_count, num_samples, + max_shared_memory); + break; + case 4096: + ret = verify_cuda_bootstrap_low_latency_grid_size<uint64_t, Degree<4096>>( + glwe_dimension, polynomial_size, level_count, num_samples, + max_shared_memory); + break; + case 8192: + ret = verify_cuda_bootstrap_low_latency_grid_size<uint64_t, Degree<8192>>( + glwe_dimension, polynomial_size, level_count, num_samples, + max_shared_memory); + break; + case 16384: + ret = verify_cuda_bootstrap_low_latency_grid_size<uint64_t, Degree<16384>>( + glwe_dimension, polynomial_size, level_count, num_samples, + max_shared_memory); + break; + default: + break; + } + + return ret; +} /* * Runs standard checks to validate the inputs */ @@ -25,16 +74,14 @@ void checks_fast_bootstrap_low_latency(int glwe_dimension, int level_count, polynomial_size == 4096 || polynomial_size == 8192 || polynomial_size == 16384)); // The number of samples should be lower than 4 * SM/((k + 1) * l) (the - // factor 4 being related to the occupancy of 50%). The only supported - // value for k is 1, so k + 1 = 2 for now. + // factor 4 being related to the occupancy of 50%). 
int number_of_sm = 0; cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0); - assert( - ("Error (GPU low latency PBS): the number of input LWEs must be lower " - "or equal to the number of streaming multiprocessors on the device " - "divided by 4 * " - "(k + 1) * level_count", - num_samples <= number_of_sm * 4. / (glwe_dimension + 1) / level_count)); + assert(("Error (GPU low latency PBS): the number of input LWEs must be lower " + "or equal to the number of streaming multiprocessors on the device " + "divided by 4 * (k + 1) * level_count", + verify_cuda_bootstrap_low_latency_grid_size_64( + glwe_dimension, polynomial_size, level_count, num_samples, cuda_get_max_shared_memory(0)))); } /* @@ -43,7 +90,6 @@ void checks_bootstrap_low_latency(int nbits, int glwe_dimension, int level_count, int base_log, int polynomial_size, int num_samples) { - assert(("Error (GPU low latency PBS): base log should be <= nbits", base_log <= nbits)); checks_fast_bootstrap_low_latency(glwe_dimension, level_count, diff --git a/backends/concrete-cuda/implementation/src/bootstrap_low_latency.cuh b/backends/concrete-cuda/implementation/src/bootstrap_low_latency.cuh index 17fb66a6a..9951c5913 100644 --- a/backends/concrete-cuda/implementation/src/bootstrap_low_latency.cuh +++ b/backends/concrete-cuda/implementation/src/bootstrap_low_latency.cuh @@ -398,4 +398,44 @@ __host__ void host_bootstrap_low_latency( check_cuda_error(cudaGetLastError()); } +// Verify if the grid size for the low latency kernel satisfies the cooperative +// group constraints +template <typename Torus, class params> +__host__ bool verify_cuda_bootstrap_low_latency_grid_size( + int glwe_dimension, int polynomial_size, int level_count, int num_samples, + uint32_t max_shared_memory) { + // Calculate the dimension of the kernel + uint64_t full_sm = + get_buffer_size_full_sm_bootstrap_low_latency<Torus>(polynomial_size); + + uint64_t partial_sm = + get_buffer_size_partial_sm_bootstrap_low_latency<Torus>(polynomial_size); + + 
int thds = polynomial_size / params::opt; + + // Get the maximum number of active blocks per streaming multiprocessors + int number_of_blocks = level_count * (glwe_dimension + 1) * num_samples; + int max_active_blocks_per_sm; + + if (max_shared_memory < partial_sm) { + cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_per_sm, + (void *)device_bootstrap_low_latency<Torus, params, NOSM>, thds, 0); + } else if (max_shared_memory < full_sm) { + cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_per_sm, + (void *)device_bootstrap_low_latency<Torus, params, PARTIALSM>, thds, + 0); + } else { + cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks_per_sm, + (void *)device_bootstrap_low_latency<Torus, params, FULLSM>, thds, 0); + } + + // Get the number of streaming multiprocessors + int number_of_sm = 0; + cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0); + return number_of_blocks <= max_active_blocks_per_sm * number_of_sm; +} + #endif // LOWLAT_PBS_H diff --git a/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_bit_extraction.cpp b/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_bit_extraction.cpp index ef288dbaf..32d4812ef 100644 --- a/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_bit_extraction.cpp +++ b/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_bit_extraction.cpp @@ -72,7 +72,7 @@ public: gpu_index); } - void TearDown() { + void TearDown(const ::benchmark::State &state) { bit_extraction_teardown(stream, csprng, lwe_sk_in, lwe_sk_out, d_fourier_bsk, d_ksk, plaintexts, d_lwe_ct_in_array, d_lwe_ct_out_array, bit_extract_buffer, gpu_index); diff --git a/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_bootstrap.cpp b/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_bootstrap.cpp index a527e5eb0..55c843fe7 100644 --- 
a/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_bootstrap.cpp +++ b/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_bootstrap.cpp @@ -42,6 +42,7 @@ protected: public: void SetUp(const ::benchmark::State &state) { + cudaDeviceSynchronize(); stream = cuda_create_stream(0); lwe_dimension = state.range(0); @@ -64,12 +65,13 @@ public: (lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint64_t)); } - void TearDown() { + void TearDown(const ::benchmark::State &state) { bootstrap_teardown(stream, csprng, lwe_sk_in_array, lwe_sk_out_array, d_fourier_bsk_array, plaintexts, d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array, d_lwe_ct_out_array, gpu_index); free(lwe_ct_array); + cudaDeviceSynchronize(); cudaDeviceReset(); } }; @@ -151,12 +153,12 @@ BENCHMARK_DEFINE_F(Bootstrap_u64, ConcreteCuda_LowLatencyPBS) uint64_t buffer_size = get_buffer_size_bootstrap_low_latency_64( glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index)); + if (buffer_size > free) st.SkipWithError("Not enough free memory in the device. Skipping..."); - int number_of_sm = 0; - cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0); - if (input_lwe_ciphertext_count > - number_of_sm * 4 / (glwe_dimension + 1) / pbs_level) + if (!verify_cuda_bootstrap_low_latency_grid_size_64( + glwe_dimension, polynomial_size, pbs_level, + input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index))) st.SkipWithError( "Not enough SM on device to run this configuration. 
Skipping..."); diff --git a/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_circuit_bootstrap.cpp b/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_circuit_bootstrap.cpp index 750c87f5a..0efa08f3c 100644 --- a/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_circuit_bootstrap.cpp +++ b/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_circuit_bootstrap.cpp @@ -79,7 +79,7 @@ public: &delta, number_of_inputs, 1, 1, gpu_index); } - void TearDown() { + void TearDown(const ::benchmark::State &state) { circuit_bootstrap_teardown(stream, csprng, lwe_sk_in, lwe_sk_out, d_fourier_bsk, d_pksk, plaintexts, d_lwe_ct_in_array, d_lut_vector_indexes, diff --git a/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_cmux_tree.cpp b/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_cmux_tree.cpp index 75ebb8c1c..4e4d7b5d2 100644 --- a/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_cmux_tree.cpp +++ b/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_cmux_tree.cpp @@ -58,7 +58,7 @@ public: gpu_index); } - void TearDown() { + void TearDown(const ::benchmark::State &state) { cmux_tree_teardown(stream, &csprng, &glwe_sk, &d_lut_identity, &plaintexts, &d_ggsw_bit_array, &cmux_tree_buffer, &d_glwe_out, gpu_index); diff --git a/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_keyswitch.cpp b/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_keyswitch.cpp index fc9f38277..27ef4c080 100644 --- a/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_keyswitch.cpp +++ b/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_keyswitch.cpp @@ -54,7 +54,7 @@ public: &delta, number_of_inputs, 1, 1, gpu_index); } - void TearDown() { + void TearDown(const ::benchmark::State &state) { 
keyswitch_teardown(stream, csprng, lwe_sk_in_array, lwe_sk_out_array, d_ksk_array, plaintexts, d_lwe_in_ct_array, d_lwe_out_ct_array, gpu_index); @@ -77,6 +77,8 @@ BENCHMARK_DEFINE_F(Keyswitch_u64, ConcreteCuda_CopiesPlusKeyswitch) (benchmark::State &st) { uint64_t *lwe_in_ct = (uint64_t *)malloc( number_of_inputs * (input_lwe_dimension + 1) * sizeof(uint64_t)); + uint64_t *lwe_out_ct = (uint64_t *)malloc( + number_of_inputs * (output_lwe_dimension + 1) * sizeof(uint64_t)); void *v_stream = (void *)stream; for (auto _ : st) { cuda_memcpy_async_to_gpu(d_lwe_in_ct_array, lwe_in_ct, @@ -88,13 +90,14 @@ BENCHMARK_DEFINE_F(Keyswitch_u64, ConcreteCuda_CopiesPlusKeyswitch) stream, gpu_index, (void *)d_lwe_out_ct_array, (void *)d_lwe_in_ct_array, (void *)d_ksk_array, input_lwe_dimension, output_lwe_dimension, ksk_base_log, ksk_level, number_of_inputs); - cuda_memcpy_async_to_cpu(lwe_in_ct, d_lwe_out_ct_array, + cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct_array, number_of_inputs * (output_lwe_dimension + 1) * sizeof(uint64_t), stream, gpu_index); cuda_synchronize_stream(v_stream); } free(lwe_in_ct); + free(lwe_out_ct); } static void diff --git a/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_linear_algebra.cpp b/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_linear_algebra.cpp index d8f68da30..5f1ba10b8 100644 --- a/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_linear_algebra.cpp +++ b/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_linear_algebra.cpp @@ -54,7 +54,7 @@ public: noise_variance, payload_modulus, delta, num_samples, 1, 1, gpu_index); } - void TearDown() { + void TearDown(const ::benchmark::State &state) { linear_algebra_teardown( stream, &csprng, &lwe_sk_array, &d_lwe_in_1_ct, &d_lwe_in_2_ct, &d_lwe_out_ct, &lwe_in_1_ct, &lwe_in_2_ct, &lwe_out_ct, &plaintexts_1, diff --git 
a/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_wop_bootstrap.cpp b/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_wop_bootstrap.cpp index 28f67b2f2..9fcf7562b 100644 --- a/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_wop_bootstrap.cpp +++ b/backends/concrete-cuda/implementation/test_and_benchmark/benchmark/benchmark_wop_bootstrap.cpp @@ -88,6 +88,8 @@ public: // We keep the following for the benchmarks with copies lwe_ct_in_array = (uint64_t *)malloc( (glwe_dimension * polynomial_size + 1) * tau * sizeof(uint64_t)); + lwe_ct_out_array = (uint64_t *)malloc( + (glwe_dimension * polynomial_size + 1) * tau * sizeof(uint64_t)); for (int i = 0; i < tau; i++) { uint64_t plaintext = plaintexts[i]; uint64_t *lwe_ct_in = @@ -97,11 +99,9 @@ public: lwe_sk_in, lwe_ct_in, plaintext, glwe_dimension * polynomial_size, lwe_modular_variance, csprng, &CONCRETE_CSPRNG_VTABLE); } - lwe_ct_out_array = (uint64_t *)malloc( - (glwe_dimension * polynomial_size + 1) * tau * sizeof(uint64_t)); } - void TearDown() { + void TearDown(const ::benchmark::State &state) { wop_pbs_teardown(stream, csprng, lwe_sk_in, lwe_sk_out, d_ksk, d_fourier_bsk, d_pksk, plaintexts, d_lwe_ct_in_array, d_lut_vector, d_lwe_ct_out_array, wop_pbs_buffer, diff --git a/backends/concrete-cuda/implementation/test_and_benchmark/setup_and_teardown.cpp b/backends/concrete-cuda/implementation/test_and_benchmark/setup_and_teardown.cpp index 4af43829e..8e6d10940 100644 --- a/backends/concrete-cuda/implementation/test_and_benchmark/setup_and_teardown.cpp +++ b/backends/concrete-cuda/implementation/test_and_benchmark/setup_and_teardown.cpp @@ -15,7 +15,6 @@ void bootstrap_setup(cudaStream_t *stream, Csprng **csprng, int number_of_inputs, int repetitions, int samples, int gpu_index) { - void *v_stream = (void *)stream; *payload_modulus = message_modulus * carry_modulus; // Value of the shift we multiply our messages by *delta = 
((uint64_t)(1) << 63) / (uint64_t)(*payload_modulus); @@ -92,7 +91,7 @@ void bootstrap_setup(cudaStream_t *stream, Csprng **csprng, (lwe_dimension + 1) * sizeof(uint64_t), stream, gpu_index); - cuda_synchronize_stream(v_stream); + cuda_synchronize_stream(stream); free(lwe_ct_in_array); free(lut_pbs_identity); @@ -105,21 +104,21 @@ void bootstrap_teardown(cudaStream_t *stream, Csprng *csprng, uint64_t *d_lut_pbs_indexes, uint64_t *d_lwe_ct_in_array, uint64_t *d_lwe_ct_out_array, int gpu_index) { - void *v_stream = (void *)stream; - cuda_synchronize_stream(v_stream); + cuda_synchronize_stream(stream); concrete_cpu_destroy_concrete_csprng(csprng); free(csprng); free(lwe_sk_in_array); free(lwe_sk_out_array); free(plaintexts); + cuda_drop_async(d_fourier_bsk_array, stream, gpu_index); cuda_drop_async(d_lut_pbs_identity, stream, gpu_index); cuda_drop_async(d_lut_pbs_indexes, stream, gpu_index); cuda_drop_async(d_lwe_ct_in_array, stream, gpu_index); cuda_drop_async(d_lwe_ct_out_array, stream, gpu_index); + cuda_synchronize_stream(stream); cuda_destroy_stream(stream, gpu_index); - cudaDeviceSynchronize(); } void keyswitch_setup(cudaStream_t *stream, Csprng **csprng, @@ -137,7 +136,6 @@ void keyswitch_setup(cudaStream_t *stream, Csprng **csprng, // Value of the shift we multiply our messages by *delta = ((uint64_t)(1) << 63) / (uint64_t)(*payload_modulus); - void *v_stream = (void *)stream; // Create a Csprng *csprng = (Csprng *)aligned_alloc(CONCRETE_CSPRNG_ALIGN, CONCRETE_CSPRNG_SIZE); @@ -189,7 +187,7 @@ void keyswitch_setup(cudaStream_t *stream, Csprng **csprng, repetitions * samples * number_of_inputs * (input_lwe_dimension + 1) * sizeof(uint64_t), stream, gpu_index); - cuda_synchronize_stream(v_stream); + cuda_synchronize_stream(stream); free(lwe_ct_in_array); } @@ -198,18 +196,19 @@ void keyswitch_teardown(cudaStream_t *stream, Csprng *csprng, uint64_t *d_ksk_array, uint64_t *plaintexts, uint64_t *d_lwe_ct_in_array, uint64_t *d_lwe_ct_out_array, int gpu_index) { - 
void *v_stream = (void *)stream; - cuda_synchronize_stream(v_stream); + cuda_synchronize_stream(stream); + concrete_cpu_destroy_concrete_csprng(csprng); free(csprng); free(lwe_sk_in_array); free(lwe_sk_out_array); free(plaintexts); + cuda_drop_async(d_ksk_array, stream, gpu_index); cuda_drop_async(d_lwe_ct_in_array, stream, gpu_index); cuda_drop_async(d_lwe_ct_out_array, stream, gpu_index); + cuda_synchronize_stream(stream); cuda_destroy_stream(stream, gpu_index); - cudaDeviceSynchronize(); } void linear_algebra_setup(cudaStream_t *stream, Csprng **csprng, @@ -312,8 +311,7 @@ void linear_algebra_setup(cudaStream_t *stream, Csprng **csprng, sizeof(uint64_t), stream, gpu_index); - void *v_stream = (void *)stream; - cuda_synchronize_stream(v_stream); + cuda_synchronize_stream(stream); free(cleartext_2); } @@ -324,24 +322,25 @@ void linear_algebra_teardown(cudaStream_t *stream, Csprng **csprng, uint64_t **lwe_out_ct, uint64_t **plaintexts_1, uint64_t **plaintexts_2, uint64_t **d_plaintexts_2, uint64_t **d_cleartext_2, int gpu_index) { - void *v_stream = (void *)stream; - cuda_synchronize_stream(v_stream); + cuda_synchronize_stream(stream); + concrete_cpu_destroy_concrete_csprng(*csprng); free(*csprng); - cuda_drop_async(*d_lwe_in_1_ct, stream, gpu_index); - cuda_drop_async(*d_lwe_in_2_ct, stream, gpu_index); - cuda_drop_async(*d_plaintexts_2, stream, gpu_index); - cuda_drop_async(*d_cleartext_2, stream, gpu_index); - cuda_drop_async(*d_lwe_out_ct, stream, gpu_index); free(*lwe_out_ct); free(*lwe_sk_array); free(*plaintexts_1); free(*plaintexts_2); free(*lwe_in_1_ct); free(*lwe_in_2_ct); + + cuda_drop_async(*d_lwe_in_1_ct, stream, gpu_index); + cuda_drop_async(*d_lwe_in_2_ct, stream, gpu_index); + cuda_drop_async(*d_plaintexts_2, stream, gpu_index); + cuda_drop_async(*d_cleartext_2, stream, gpu_index); + cuda_drop_async(*d_lwe_out_ct, stream, gpu_index); + cuda_synchronize_stream(stream); cuda_destroy_stream(stream, gpu_index); - cudaDeviceSynchronize(); } void 
bit_extraction_setup( @@ -355,7 +354,6 @@ void bit_extraction_setup( int number_of_bits_of_message_including_padding, int number_of_bits_to_extract, int *delta_log, uint64_t *delta, int number_of_inputs, int repetitions, int samples, int gpu_index) { - void *v_stream = (void *)stream; *delta_log = 64 - number_of_bits_of_message_including_padding; *delta = (uint64_t)(1) << *delta_log; @@ -428,7 +426,7 @@ void bit_extraction_setup( pbs_level, number_of_inputs, cuda_get_max_shared_memory(gpu_index), true); - cuda_synchronize_stream(v_stream); + cuda_synchronize_stream(stream); free(lwe_ct_in_array); } @@ -439,20 +437,21 @@ void bit_extraction_teardown(cudaStream_t *stream, Csprng *csprng, uint64_t *plaintexts, uint64_t *d_lwe_ct_in_array, uint64_t *d_lwe_ct_out_array, int8_t *bit_extract_buffer, int gpu_index) { - void *v_stream = (void *)stream; - cuda_synchronize_stream(v_stream); + cuda_synchronize_stream(stream); + concrete_cpu_destroy_concrete_csprng(csprng); free(csprng); free(lwe_sk_in_array); free(lwe_sk_out_array); free(plaintexts); + cleanup_cuda_extract_bits(stream, gpu_index, &bit_extract_buffer); cuda_drop_async(d_fourier_bsk_array, stream, gpu_index); cuda_drop_async(d_ksk_array, stream, gpu_index); cuda_drop_async(d_lwe_ct_in_array, stream, gpu_index); cuda_drop_async(d_lwe_ct_out_array, stream, gpu_index); + cuda_synchronize_stream(stream); cuda_destroy_stream(stream, gpu_index); - cudaDeviceSynchronize(); } void circuit_bootstrap_setup( @@ -468,7 +467,6 @@ void circuit_bootstrap_setup( int *delta_log, uint64_t *delta, int number_of_inputs, int repetitions, int samples, int gpu_index) { - void *v_stream = (void *)stream; *delta_log = 60; *delta = (uint64_t)(1) << *delta_log; @@ -547,7 +545,7 @@ void circuit_bootstrap_setup( cuda_memcpy_async_to_gpu(*d_lut_vector_indexes, h_lut_vector_indexes, number_of_inputs * cbs_level * sizeof(uint64_t), stream, gpu_index); - cuda_synchronize_stream(v_stream); + cuda_synchronize_stream(stream); 
free(h_lut_vector_indexes); free(lwe_ct_in_array); } @@ -558,21 +556,23 @@ void circuit_bootstrap_teardown( uint64_t *d_pksk_array, uint64_t *plaintexts, uint64_t *d_lwe_ct_in_array, uint64_t *d_lut_vector_indexes, uint64_t *d_ggsw_ct_out_array, int8_t *cbs_buffer, int gpu_index) { - void *v_stream = (void *)stream; - cuda_synchronize_stream(v_stream); + + cuda_synchronize_stream(stream); + concrete_cpu_destroy_concrete_csprng(csprng); free(csprng); free(lwe_sk_in_array); free(lwe_sk_out_array); free(plaintexts); + cleanup_cuda_circuit_bootstrap(stream, gpu_index, &cbs_buffer); cuda_drop_async(d_fourier_bsk_array, stream, gpu_index); cuda_drop_async(d_pksk_array, stream, gpu_index); cuda_drop_async(d_lwe_ct_in_array, stream, gpu_index); cuda_drop_async(d_ggsw_ct_out_array, stream, gpu_index); cuda_drop_async(d_lut_vector_indexes, stream, gpu_index); + cuda_synchronize_stream(stream); cuda_destroy_stream(stream, gpu_index); - cudaDeviceSynchronize(); } void cmux_tree_setup(cudaStream_t *stream, Csprng **csprng, uint64_t **glwe_sk, @@ -583,7 +583,6 @@ void cmux_tree_setup(cudaStream_t *stream, Csprng **csprng, uint64_t **glwe_sk, double glwe_modular_variance, int r_lut, int tau, uint64_t delta_log, int repetitions, int samples, int gpu_index) { - void *v_stream = (void *)stream; int ggsw_size = polynomial_size * (glwe_dimension + 1) * (glwe_dimension + 1) * level_count; int glwe_size = (glwe_dimension + 1) * polynomial_size; @@ -647,7 +646,7 @@ void cmux_tree_setup(cudaStream_t *stream, Csprng **csprng, uint64_t **glwe_sk, scratch_cuda_cmux_tree_64(stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size, level_count, r_lut, tau, cuda_get_max_shared_memory(gpu_index), true); - cuda_synchronize_stream(v_stream); + cuda_synchronize_stream(stream); free(lut_cmux_tree_identity); free(ggsw_bit_array); } @@ -657,16 +656,18 @@ void cmux_tree_teardown(cudaStream_t *stream, Csprng **csprng, int8_t **cmux_tree_buffer, uint64_t **d_glwe_out, int gpu_index) { 
cuda_synchronize_stream(stream); + concrete_cpu_destroy_concrete_csprng(*csprng); free(*plaintexts); free(*csprng); free(*glwe_sk); + + cleanup_cuda_cmux_tree(stream, gpu_index, cmux_tree_buffer); cuda_drop_async(*d_lut_identity, stream, gpu_index); cuda_drop_async(*d_ggsw_bit_array, stream, gpu_index); cuda_drop_async(*d_glwe_out, stream, gpu_index); - cleanup_cuda_cmux_tree(stream, gpu_index, cmux_tree_buffer); + cuda_synchronize_stream(stream); cuda_destroy_stream(stream, gpu_index); - cudaDeviceSynchronize(); } void wop_pbs_setup(cudaStream_t *stream, Csprng **csprng, @@ -683,7 +684,6 @@ void wop_pbs_setup(cudaStream_t *stream, Csprng **csprng, int *delta_log_lut, uint64_t *delta, int tau, int repetitions, int samples, int gpu_index) { - void *v_stream = (void *)stream; int input_lwe_dimension = glwe_dimension * polynomial_size; *delta_log = 64 - p; *delta_log_lut = *delta_log; @@ -773,7 +773,7 @@ void wop_pbs_setup(cudaStream_t *stream, Csprng **csprng, cbs_level, pbs_level, p, p, tau, cuda_get_max_shared_memory(gpu_index), true); - cuda_synchronize_stream(v_stream); + cuda_synchronize_stream(stream); free(lwe_ct_in_array); free(big_lut); } @@ -785,23 +785,23 @@ void wop_pbs_teardown(cudaStream_t *stream, Csprng *csprng, uint64_t *d_lwe_ct_in_array, uint64_t *d_lut_vector, uint64_t *d_lwe_ct_out_array, int8_t *wop_pbs_buffer, int gpu_index) { - void *v_stream = (void *)stream; - cuda_synchronize_stream(v_stream); + cuda_synchronize_stream(stream); + concrete_cpu_destroy_concrete_csprng(csprng); free(csprng); free(lwe_sk_in_array); free(lwe_sk_out_array); free(plaintexts); - cleanup_cuda_circuit_bootstrap_vertical_packing(stream, gpu_index, - &wop_pbs_buffer); + + cleanup_cuda_wop_pbs(stream, gpu_index, &wop_pbs_buffer); cuda_drop_async(d_fourier_bsk_array, stream, gpu_index); cuda_drop_async(d_ksk_array, stream, gpu_index); cuda_drop_async(d_pksk_array, stream, gpu_index); cuda_drop_async(d_lwe_ct_in_array, stream, gpu_index); 
cuda_drop_async(d_lwe_ct_out_array, stream, gpu_index); cuda_drop_async(d_lut_vector, stream, gpu_index); + cuda_synchronize_stream(stream); cuda_destroy_stream(stream, gpu_index); - cudaDeviceSynchronize(); } void fft_setup(cudaStream_t *stream, double **_poly1, double **_poly2, @@ -816,7 +816,6 @@ void fft_setup(cudaStream_t *stream, double **_poly1, double **_poly2, auto &d_cpoly1 = *_d_cpoly1; auto &d_cpoly2 = *_d_cpoly2; - void *v_stream = (void *)stream; poly1 = (double *)malloc(polynomial_size * samples * sizeof(double)); poly2 = (double *)malloc(polynomial_size * samples * sizeof(double)); h_cpoly1 = (double2 *)malloc(polynomial_size / 2 * samples * sizeof(double2)); @@ -859,18 +858,21 @@ void fft_setup(cudaStream_t *stream, double **_poly1, double **_poly2, cuda_memcpy_async_to_gpu(d_cpoly2, h_cpoly2, polynomial_size / 2 * samples * sizeof(double2), stream, gpu_index); - cuda_synchronize_stream(v_stream); + cuda_synchronize_stream(stream); } void fft_teardown(cudaStream_t *stream, double *poly1, double *poly2, double2 *h_cpoly1, double2 *h_cpoly2, double2 *d_cpoly1, double2 *d_cpoly2, int gpu_index) { + cuda_synchronize_stream(stream); + free(poly1); free(poly2); free(h_cpoly1); free(h_cpoly2); + cuda_drop_async(d_cpoly1, stream, gpu_index); cuda_drop_async(d_cpoly2, stream, gpu_index); + cuda_synchronize_stream(stream); cuda_destroy_stream(stream, gpu_index); - cudaDeviceSynchronize(); }