chore(backend/cuda): reduces memory consumption in the bootstrap benchmark

This commit is contained in:
Pedro Alves
2023-03-28 18:19:02 -03:00
committed by Agnès Leroy
parent d9652b8936
commit c236dffdd8
7 changed files with 141 additions and 67 deletions

View File

@@ -147,6 +147,14 @@ void cleanup_cuda_wop_pbs(void *v_stream, uint32_t gpu_index,
void cleanup_cuda_circuit_bootstrap_vertical_packing(void *v_stream,
uint32_t gpu_index,
int8_t **cbs_vp_buffer);
uint64_t get_buffer_size_bootstrap_amortized_64(
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
uint64_t get_buffer_size_bootstrap_low_latency_64(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
}
#ifdef __CUDACC__

View File

@@ -1,5 +1,16 @@
#include "bootstrap_amortized.cuh"
/*
 * Returns the size (in bytes) of the scratch buffer needed by the 64-bit
 * amortized bootstrap for the given GLWE dimension, polynomial size and
 * number of input LWE ciphertexts. Thin wrapper forwarding to the
 * uint64_t instantiation of the templated size computation.
 */
uint64_t get_buffer_size_bootstrap_amortized_64(
    uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
  uint64_t buffer_size = get_buffer_size_bootstrap_amortized<uint64_t>(
      glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
      max_shared_memory);
  return buffer_size;
}
/*
* Runs standard checks to validate the inputs
*/

View File

@@ -1,5 +1,16 @@
#include "bootstrap_low_latency.cuh"
/*
 * Returns the size (in bytes) of the scratch buffer needed by the 64-bit
 * low-latency bootstrap for the given GLWE dimension, polynomial size,
 * decomposition level count and number of input LWE ciphertexts. Thin
 * wrapper forwarding to the uint64_t instantiation of the templated size
 * computation.
 */
uint64_t get_buffer_size_bootstrap_low_latency_64(
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
  uint64_t buffer_size = get_buffer_size_bootstrap_low_latency<uint64_t>(
      glwe_dimension, polynomial_size, level_count,
      input_lwe_ciphertext_count, max_shared_memory);
  return buffer_size;
}
/*
* Runs standard checks to validate the inputs
*/

View File

@@ -39,8 +39,6 @@ protected:
Csprng *csprng;
cudaStream_t *stream;
int gpu_index = 0;
int8_t *amortized_pbs_buffer;
int8_t *lowlat_pbs_buffer;
public:
void SetUp(const ::benchmark::State &state) {
@@ -56,11 +54,10 @@ public:
bootstrap_setup(stream, &csprng, &lwe_sk_in_array, &lwe_sk_out_array,
&d_fourier_bsk_array, &plaintexts, &d_lut_pbs_identity,
&d_lut_pbs_indexes, &d_lwe_ct_in_array, &d_lwe_ct_out_array,
&amortized_pbs_buffer, &lowlat_pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, lwe_modular_variance,
glwe_modular_variance, pbs_base_log, pbs_level,
message_modulus, carry_modulus, &payload_modulus, &delta,
input_lwe_ciphertext_count, 1, 1, gpu_index);
lwe_dimension, glwe_dimension, polynomial_size,
lwe_modular_variance, glwe_modular_variance, pbs_base_log,
pbs_level, message_modulus, carry_modulus, &payload_modulus,
&delta, input_lwe_ciphertext_count, 1, 1, gpu_index);
// We keep the following for the benchmarks with copies
lwe_ct_array = (uint64_t *)malloc(
@@ -71,31 +68,56 @@ public:
bootstrap_teardown(stream, csprng, lwe_sk_in_array, lwe_sk_out_array,
d_fourier_bsk_array, plaintexts, d_lut_pbs_identity,
d_lut_pbs_indexes, d_lwe_ct_in_array, d_lwe_ct_out_array,
amortized_pbs_buffer, lowlat_pbs_buffer, gpu_index);
gpu_index);
free(lwe_ct_array);
cudaDeviceReset();
}
};
BENCHMARK_DEFINE_F(Bootstrap_u64, ConcreteCuda_AmortizedPBS)
(benchmark::State &st) {
void *v_stream = (void *)stream;
size_t free, total;
cudaMemGetInfo(&free, &total);
uint64_t buffer_size = get_buffer_size_bootstrap_amortized_64(
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
cuda_get_max_shared_memory(gpu_index));
if (buffer_size > free)
st.SkipWithError("Not enough free memory in the device. Skipping...");
int8_t *pbs_buffer;
scratch_cuda_bootstrap_amortized_64(
stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index), true);
for (auto _ : st) {
// Execute PBS
cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
stream, gpu_index, (void *)d_lwe_ct_out_array,
(void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
(void *)d_lwe_ct_in_array, (void *)d_fourier_bsk_array,
amortized_pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
pbs_base_log, pbs_level, input_lwe_ciphertext_count,
input_lwe_ciphertext_count, 0, cuda_get_max_shared_memory(gpu_index));
cuda_synchronize_stream(v_stream);
(void *)d_lwe_ct_in_array, (void *)d_fourier_bsk_array, pbs_buffer,
lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
input_lwe_ciphertext_count, input_lwe_ciphertext_count, 0,
cuda_get_max_shared_memory(gpu_index));
cuda_synchronize_stream(stream);
}
cleanup_cuda_bootstrap_amortized(stream, gpu_index, &pbs_buffer);
cuda_synchronize_stream(stream);
}
BENCHMARK_DEFINE_F(Bootstrap_u64, ConcreteCuda_CopiesPlusAmortizedPBS)
(benchmark::State &st) {
void *v_stream = (void *)stream;
size_t free, total;
cudaMemGetInfo(&free, &total);
uint64_t buffer_size = get_buffer_size_bootstrap_amortized_64(
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
cuda_get_max_shared_memory(gpu_index));
if (buffer_size > free)
st.SkipWithError("Not enough free memory in the device. Skipping...");
int8_t *pbs_buffer;
scratch_cuda_bootstrap_amortized_64(
stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index), true);
for (auto _ : st) {
cuda_memcpy_async_to_gpu(d_lwe_ct_in_array, lwe_ct_array,
@@ -107,37 +129,66 @@ BENCHMARK_DEFINE_F(Bootstrap_u64, ConcreteCuda_CopiesPlusAmortizedPBS)
cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
stream, gpu_index, (void *)d_lwe_ct_out_array,
(void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
(void *)d_lwe_ct_in_array, (void *)d_fourier_bsk_array,
amortized_pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
pbs_base_log, pbs_level, input_lwe_ciphertext_count,
input_lwe_ciphertext_count, 0, cuda_get_max_shared_memory(gpu_index));
(void *)d_lwe_ct_in_array, (void *)d_fourier_bsk_array, pbs_buffer,
lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
input_lwe_ciphertext_count, input_lwe_ciphertext_count, 0,
cuda_get_max_shared_memory(gpu_index));
cuda_memcpy_async_to_cpu(lwe_ct_array, d_lwe_ct_out_array,
(lwe_dimension + 1) * input_lwe_ciphertext_count *
sizeof(uint64_t),
stream, gpu_index);
cuda_synchronize_stream(v_stream);
cuda_synchronize_stream(stream);
}
cleanup_cuda_bootstrap_amortized(stream, gpu_index, &pbs_buffer);
cuda_synchronize_stream(stream);
}
BENCHMARK_DEFINE_F(Bootstrap_u64, ConcreteCuda_LowLatencyPBS)
(benchmark::State &st) {
size_t free, total;
cudaMemGetInfo(&free, &total);
uint64_t buffer_size = get_buffer_size_bootstrap_low_latency_64(
glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count,
cuda_get_max_shared_memory(gpu_index));
if (buffer_size > free)
st.SkipWithError("Not enough free memory in the device. Skipping...");
int8_t *pbs_buffer;
scratch_cuda_bootstrap_low_latency_64(
stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
pbs_level, input_lwe_ciphertext_count,
cuda_get_max_shared_memory(gpu_index), true);
for (auto _ : st) {
// Execute PBS
cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
stream, gpu_index, (void *)d_lwe_ct_out_array,
(void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
(void *)d_lwe_ct_in_array, (void *)d_fourier_bsk_array,
lowlat_pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
pbs_base_log, pbs_level, 1, 1, 0,
cuda_get_max_shared_memory(gpu_index));
(void *)d_lwe_ct_in_array, (void *)d_fourier_bsk_array, pbs_buffer,
lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
1, 1, 0, cuda_get_max_shared_memory(gpu_index));
cuda_synchronize_stream(stream);
}
cleanup_cuda_bootstrap_low_latency(stream, gpu_index, &pbs_buffer);
cuda_synchronize_stream(stream);
}
BENCHMARK_DEFINE_F(Bootstrap_u64, ConcreteCuda_CopiesPlusLowLatencyPBS)
(benchmark::State &st) {
void *v_stream = (void *)stream;
size_t free, total;
cudaMemGetInfo(&free, &total);
uint64_t buffer_size = get_buffer_size_bootstrap_low_latency_64(
glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count,
cuda_get_max_shared_memory(gpu_index));
if (buffer_size > free)
st.SkipWithError("Not enough free memory in the device. Skipping...");
int8_t *pbs_buffer;
scratch_cuda_bootstrap_low_latency_64(
stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
pbs_level, input_lwe_ciphertext_count,
cuda_get_max_shared_memory(gpu_index), true);
for (auto _ : st) {
cuda_memcpy_async_to_gpu(d_lwe_ct_in_array, lwe_ct_array,
@@ -148,17 +199,18 @@ BENCHMARK_DEFINE_F(Bootstrap_u64, ConcreteCuda_CopiesPlusLowLatencyPBS)
cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
stream, gpu_index, (void *)d_lwe_ct_out_array,
(void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
(void *)d_lwe_ct_in_array, (void *)d_fourier_bsk_array,
lowlat_pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
pbs_base_log, pbs_level, 1, 1, 0,
cuda_get_max_shared_memory(gpu_index));
(void *)d_lwe_ct_in_array, (void *)d_fourier_bsk_array, pbs_buffer,
lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
1, 1, 0, cuda_get_max_shared_memory(gpu_index));
cuda_memcpy_async_to_cpu(lwe_ct_array, d_lwe_ct_out_array,
(lwe_dimension + 1) * input_lwe_ciphertext_count *
sizeof(uint64_t),
stream, gpu_index);
cuda_synchronize_stream(v_stream);
cuda_synchronize_stream(stream);
}
cleanup_cuda_bootstrap_low_latency(stream, gpu_index, &pbs_buffer);
cuda_synchronize_stream(stream);
}
static void

View File

@@ -16,9 +16,8 @@ void bootstrap_setup(cudaStream_t *stream, Csprng **csprng,
double **d_fourier_bsk_array, uint64_t **plaintexts,
uint64_t **d_lut_pbs_identity,
uint64_t **d_lut_pbs_indexes, uint64_t **d_lwe_ct_in_array,
uint64_t **d_lwe_ct_out_array,
int8_t **amortized_pbs_buffer, int8_t **lowlat_pbs_buffer,
int lwe_dimension, int glwe_dimension, int polynomial_size,
uint64_t **d_lwe_ct_out_array, int lwe_dimension,
int glwe_dimension, int polynomial_size,
double lwe_modular_variance, double glwe_modular_variance,
int pbs_base_log, int pbs_level, int message_modulus,
int carry_modulus, int *payload_modulus, uint64_t *delta,
@@ -30,9 +29,7 @@ void bootstrap_teardown(cudaStream_t *stream, Csprng *csprng,
uint64_t *d_lut_pbs_identity,
uint64_t *d_lut_pbs_indexes,
uint64_t *d_lwe_ct_in_array,
uint64_t *d_lwe_ct_out_array,
int8_t *amortized_pbs_buffer, int8_t *lowlat_pbs_buffer,
int gpu_index);
uint64_t *d_lwe_ct_out_array, int gpu_index);
void keyswitch_setup(cudaStream_t *stream, Csprng **csprng,
uint64_t **lwe_sk_in_array, uint64_t **lwe_sk_out_array,

View File

@@ -7,9 +7,8 @@ void bootstrap_setup(cudaStream_t *stream, Csprng **csprng,
double **d_fourier_bsk_array, uint64_t **plaintexts,
uint64_t **d_lut_pbs_identity,
uint64_t **d_lut_pbs_indexes, uint64_t **d_lwe_ct_in_array,
uint64_t **d_lwe_ct_out_array,
int8_t **amortized_pbs_buffer, int8_t **lowlat_pbs_buffer,
int lwe_dimension, int glwe_dimension, int polynomial_size,
uint64_t **d_lwe_ct_out_array, int lwe_dimension,
int glwe_dimension, int polynomial_size,
double lwe_modular_variance, double glwe_modular_variance,
int pbs_base_log, int pbs_level, int message_modulus,
int carry_modulus, int *payload_modulus, uint64_t *delta,
@@ -93,13 +92,6 @@ void bootstrap_setup(cudaStream_t *stream, Csprng **csprng,
(lwe_dimension + 1) * sizeof(uint64_t),
stream, gpu_index);
scratch_cuda_bootstrap_amortized_64(
stream, gpu_index, amortized_pbs_buffer, glwe_dimension, polynomial_size,
number_of_inputs, cuda_get_max_shared_memory(gpu_index), true);
scratch_cuda_bootstrap_low_latency_64(
stream, gpu_index, lowlat_pbs_buffer, glwe_dimension, polynomial_size,
pbs_level, number_of_inputs, cuda_get_max_shared_memory(gpu_index), true);
cuda_synchronize_stream(v_stream);
free(lwe_ct_in_array);
@@ -112,9 +104,7 @@ void bootstrap_teardown(cudaStream_t *stream, Csprng *csprng,
uint64_t *d_lut_pbs_identity,
uint64_t *d_lut_pbs_indexes,
uint64_t *d_lwe_ct_in_array,
uint64_t *d_lwe_ct_out_array,
int8_t *amortized_pbs_buffer, int8_t *lowlat_pbs_buffer,
int gpu_index) {
uint64_t *d_lwe_ct_out_array, int gpu_index) {
void *v_stream = (void *)stream;
cuda_synchronize_stream(v_stream);
@@ -128,8 +118,6 @@ void bootstrap_teardown(cudaStream_t *stream, Csprng *csprng,
cuda_drop_async(d_lut_pbs_indexes, stream, gpu_index);
cuda_drop_async(d_lwe_ct_in_array, stream, gpu_index);
cuda_drop_async(d_lwe_ct_out_array, stream, gpu_index);
cleanup_cuda_bootstrap_amortized(stream, gpu_index, &amortized_pbs_buffer);
cleanup_cuda_bootstrap_low_latency(stream, gpu_index, &lowlat_pbs_buffer);
cuda_destroy_stream(stream, gpu_index);
}

View File

@@ -50,8 +50,6 @@ protected:
uint64_t *d_lwe_ct_in_array;
uint64_t *d_lwe_ct_out_array;
uint64_t *lwe_ct_out_array;
int8_t *amortized_pbs_buffer;
int8_t *lowlat_pbs_buffer;
public:
// Test arithmetic functions
@@ -75,11 +73,10 @@ public:
bootstrap_setup(stream, &csprng, &lwe_sk_in_array, &lwe_sk_out_array,
&d_fourier_bsk_array, &plaintexts, &d_lut_pbs_identity,
&d_lut_pbs_indexes, &d_lwe_ct_in_array, &d_lwe_ct_out_array,
&amortized_pbs_buffer, &lowlat_pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, lwe_modular_variance,
glwe_modular_variance, pbs_base_log, pbs_level,
message_modulus, carry_modulus, &payload_modulus, &delta,
number_of_inputs, repetitions, samples, gpu_index);
lwe_dimension, glwe_dimension, polynomial_size,
lwe_modular_variance, glwe_modular_variance, pbs_base_log,
pbs_level, message_modulus, carry_modulus, &payload_modulus,
&delta, number_of_inputs, repetitions, samples, gpu_index);
lwe_ct_out_array =
(uint64_t *)malloc((glwe_dimension * polynomial_size + 1) *
@@ -91,11 +88,16 @@ public:
bootstrap_teardown(stream, csprng, lwe_sk_in_array, lwe_sk_out_array,
d_fourier_bsk_array, plaintexts, d_lut_pbs_identity,
d_lut_pbs_indexes, d_lwe_ct_in_array, d_lwe_ct_out_array,
amortized_pbs_buffer, lowlat_pbs_buffer, gpu_index);
gpu_index);
}
};
TEST_P(BootstrapTestPrimitives_u64, amortized_bootstrap) {
int8_t *pbs_buffer;
scratch_cuda_bootstrap_amortized_64(
stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
number_of_inputs, cuda_get_max_shared_memory(gpu_index), true);
int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
polynomial_size * (lwe_dimension + 1);
// Here execute the PBS
@@ -112,10 +114,9 @@ TEST_P(BootstrapTestPrimitives_u64, amortized_bootstrap) {
cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
stream, gpu_index, (void *)d_lwe_ct_out_array,
(void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
(void *)d_lwe_ct_in, (void *)d_fourier_bsk, amortized_pbs_buffer,
lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
pbs_level, number_of_inputs, 1, 0,
cuda_get_max_shared_memory(gpu_index));
(void *)d_lwe_ct_in, (void *)d_fourier_bsk, pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
number_of_inputs, 1, 0, cuda_get_max_shared_memory(gpu_index));
// Copy result back
cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
(glwe_dimension * polynomial_size + 1) *
@@ -147,9 +148,15 @@ TEST_P(BootstrapTestPrimitives_u64, amortized_bootstrap) {
}
}
}
cleanup_cuda_bootstrap_amortized(stream, gpu_index, &pbs_buffer);
}
TEST_P(BootstrapTestPrimitives_u64, low_latency_bootstrap) {
int8_t *pbs_buffer;
scratch_cuda_bootstrap_low_latency_64(
stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
pbs_level, number_of_inputs, cuda_get_max_shared_memory(gpu_index), true);
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
if (number_of_inputs > number_of_sm * 4 / (glwe_dimension + 1) / pbs_level)
@@ -170,10 +177,9 @@ TEST_P(BootstrapTestPrimitives_u64, low_latency_bootstrap) {
cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
stream, gpu_index, (void *)d_lwe_ct_out_array,
(void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
(void *)d_lwe_ct_in, (void *)d_fourier_bsk, lowlat_pbs_buffer,
lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
pbs_level, number_of_inputs, 1, 0,
cuda_get_max_shared_memory(gpu_index));
(void *)d_lwe_ct_in, (void *)d_fourier_bsk, pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
number_of_inputs, 1, 0, cuda_get_max_shared_memory(gpu_index));
// Copy result back
cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
(glwe_dimension * polynomial_size + 1) *
@@ -204,6 +210,7 @@ TEST_P(BootstrapTestPrimitives_u64, low_latency_bootstrap) {
}
}
}
cleanup_cuda_bootstrap_low_latency(stream, gpu_index, &pbs_buffer);
}
// Defines for which parameters set the PBS will be tested.