refactor(cuda): introduce scratch for amortized PBS

This commit is contained in:
Agnes Leroy
2023-02-16 10:04:50 +01:00
committed by Agnès Leroy
parent 2a487ffbfd
commit 5cd0cb5d19
10 changed files with 612 additions and 247 deletions

View File

@@ -17,19 +17,40 @@ void cuda_convert_lwe_bootstrap_key_64(void *dest, void *src, void *v_stream,
uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size);
void scratch_cuda_bootstrap_amortized_32(void *v_stream, uint32_t gpu_index,
int8_t **pbs_buffer,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory,
bool allocate_gpu_memory);
void scratch_cuda_bootstrap_amortized_64(void *v_stream, uint32_t gpu_index,
int8_t **pbs_buffer,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory,
bool allocate_gpu_memory);
void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *test_vector,
void *test_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_test_vectors, uint32_t lwe_idx, uint32_t max_shared_memory);
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector,
void *lut_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
uint32_t max_shared_memory);
void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *test_vector,
void *test_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_test_vectors, uint32_t lwe_idx, uint32_t max_shared_memory);
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector,
void *lut_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
uint32_t max_shared_memory);
void cleanup_cuda_bootstrap_amortized(void *v_stream, uint32_t gpu_index,
int8_t **pbs_buffer);
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *test_vector,
@@ -67,26 +88,6 @@ void cuda_extract_bits_64(
uint32_t base_log_ksk, uint32_t level_count_ksk, uint32_t number_of_samples,
uint32_t max_shared_memory);
void cuda_circuit_bootstrap_32(
void *v_stream, uint32_t gpu_index, void *ggsw_out, void *lwe_array_in,
void *fourier_bsk, void *fp_ksk_array, void *lwe_array_in_shifted_buffer,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_out_pbs_buffer,
void *lwe_array_in_fp_ks_buffer, uint32_t delta_log,
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t level_bsk, uint32_t base_log_bsk, uint32_t level_pksk,
uint32_t base_log_pksk, uint32_t level_cbs, uint32_t base_log_cbs,
uint32_t number_of_samples, uint32_t max_shared_memory);
void cuda_circuit_bootstrap_64(
void *v_stream, uint32_t gpu_index, void *ggsw_out, void *lwe_array_in,
void *fourier_bsk, void *fp_ksk_array, void *lwe_array_in_shifted_buffer,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_out_pbs_buffer,
void *lwe_array_in_fp_ks_buffer, uint32_t delta_log,
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t level_bsk, uint32_t base_log_bsk, uint32_t level_pksk,
uint32_t base_log_pksk, uint32_t level_cbs, uint32_t base_log_cbs,
uint32_t number_of_samples, uint32_t max_shared_memory);
void scratch_cuda_circuit_bootstrap_vertical_packing_32(
void *v_stream, uint32_t gpu_index, int8_t **cbs_vp_buffer,
uint32_t *cbs_delta_log, uint32_t glwe_dimension, uint32_t lwe_dimension,

View File

@@ -0,0 +1,42 @@
// C API of the GPU circuit bootstrap: scratch allocation, execution and
// cleanup entry points, callable from non-CUDA translation units.
#ifndef CUDA_CIRCUIT_BOOTSTRAP_H
#define CUDA_CIRCUIT_BOOTSTRAP_H
#include <cstdint>
extern "C" {
/* Allocates the device scratch buffer for the 32-bit circuit bootstrap into
 * `*cbs_buffer` (when `allocate_gpu_memory` is set) and configures GPU
 * shared-memory options for the selected parameters. */
void scratch_cuda_circuit_bootstrap_32(
void *v_stream, uint32_t gpu_index, int8_t **cbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t level_count_cbs, uint32_t number_of_inputs,
uint32_t max_shared_memory, bool allocate_gpu_memory);
/* Same as the 32-bit variant, for 64-bit inputs. */
void scratch_cuda_circuit_bootstrap_64(
void *v_stream, uint32_t gpu_index, int8_t **cbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t level_count_cbs, uint32_t number_of_inputs,
uint32_t max_shared_memory, bool allocate_gpu_memory);
/* Runs the circuit bootstrap on a batch of 32-bit LWE ciphertexts, writing
 * GGSW ciphertexts to `ggsw_out`. `cbs_buffer` must come from the matching
 * scratch function with the same parameters. */
void cuda_circuit_bootstrap_32(
void *v_stream, uint32_t gpu_index, void *ggsw_out, void *lwe_array_in,
void *fourier_bsk, void *fp_ksk_array, void *lut_vector_indexes,
int8_t *cbs_buffer, uint32_t delta_log, uint32_t polynomial_size,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t level_bsk,
uint32_t base_log_bsk, uint32_t level_pksk, uint32_t base_log_pksk,
uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs,
uint32_t max_shared_memory);
/* Same as the 32-bit variant, for 64-bit LWE ciphertexts. */
void cuda_circuit_bootstrap_64(
void *v_stream, uint32_t gpu_index, void *ggsw_out, void *lwe_array_in,
void *fourier_bsk, void *fp_ksk_array, void *lut_vector_indexes,
int8_t *cbs_buffer, uint32_t delta_log, uint32_t polynomial_size,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t level_bsk,
uint32_t base_log_bsk, uint32_t level_pksk, uint32_t base_log_pksk,
uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs,
uint32_t max_shared_memory);
/* Frees the scratch buffer allocated by the scratch functions above. */
void cleanup_cuda_circuit_bootstrap(void *v_stream, uint32_t gpu_index,
int8_t **cbs_buffer);
}
#endif // CUDA_CIRCUIT_BOOTSTRAP_H

View File

@@ -3,7 +3,8 @@ set(SOURCES
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/linear_algebra.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h)
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/circuit_bootstrap.h)
file(GLOB SOURCES
"*.cu"
"*.h"

View File

@@ -93,11 +93,16 @@ extern "C" void cuda_boolean_and_32(
stream, gpu_index);
check_cuda_error(cudaGetLastError());
int8_t *pbs_buffer = nullptr;
scratch_cuda_bootstrap_amortized_32(
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, true);
cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
lwe_buffer_2, bootstrapping_key, input_lwe_dimension, glwe_dimension,
polynomial_size, pbs_base_log, pbs_level_count,
lwe_buffer_2, bootstrapping_key, pbs_buffer, input_lwe_dimension,
glwe_dimension, polynomial_size, pbs_base_log, pbs_level_count,
input_lwe_ciphertext_count, 1, 0, max_shared_memory);
cleanup_cuda_bootstrap_amortized(v_stream, gpu_index, &pbs_buffer);
check_cuda_error(cudaGetLastError());
cuda_drop_async(lwe_buffer_2, stream, gpu_index);
@@ -196,11 +201,16 @@ extern "C" void cuda_boolean_nand_32(
stream, gpu_index);
check_cuda_error(cudaGetLastError());
int8_t *pbs_buffer = nullptr;
scratch_cuda_bootstrap_amortized_32(
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, true);
cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
lwe_buffer_3, bootstrapping_key, input_lwe_dimension, glwe_dimension,
polynomial_size, pbs_base_log, pbs_level_count,
lwe_buffer_3, bootstrapping_key, pbs_buffer, input_lwe_dimension,
glwe_dimension, polynomial_size, pbs_base_log, pbs_level_count,
input_lwe_ciphertext_count, 1, 0, max_shared_memory);
cleanup_cuda_bootstrap_amortized(v_stream, gpu_index, &pbs_buffer);
check_cuda_error(cudaGetLastError());
cuda_drop_async(lwe_buffer_3, stream, gpu_index);
@@ -299,11 +309,16 @@ extern "C" void cuda_boolean_nor_32(
stream, gpu_index);
check_cuda_error(cudaGetLastError());
int8_t *pbs_buffer = nullptr;
scratch_cuda_bootstrap_amortized_32(
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, true);
cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
lwe_buffer_3, bootstrapping_key, input_lwe_dimension, glwe_dimension,
polynomial_size, pbs_base_log, pbs_level_count,
lwe_buffer_3, bootstrapping_key, pbs_buffer, input_lwe_dimension,
glwe_dimension, polynomial_size, pbs_base_log, pbs_level_count,
input_lwe_ciphertext_count, 1, 0, max_shared_memory);
cleanup_cuda_bootstrap_amortized(v_stream, gpu_index, &pbs_buffer);
check_cuda_error(cudaGetLastError());
cuda_drop_async(lwe_buffer_3, stream, gpu_index);
@@ -394,11 +409,16 @@ extern "C" void cuda_boolean_or_32(
stream, gpu_index);
check_cuda_error(cudaGetLastError());
int8_t *pbs_buffer = nullptr;
scratch_cuda_bootstrap_amortized_32(
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, true);
cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
lwe_buffer_2, bootstrapping_key, input_lwe_dimension, glwe_dimension,
polynomial_size, pbs_base_log, pbs_level_count,
lwe_buffer_2, bootstrapping_key, pbs_buffer, input_lwe_dimension,
glwe_dimension, polynomial_size, pbs_base_log, pbs_level_count,
input_lwe_ciphertext_count, 1, 0, max_shared_memory);
cleanup_cuda_bootstrap_amortized(v_stream, gpu_index, &pbs_buffer);
check_cuda_error(cudaGetLastError());
cuda_drop_async(lwe_buffer_2, stream, gpu_index);
@@ -510,11 +530,16 @@ extern "C" void cuda_boolean_xor_32(
stream, gpu_index);
check_cuda_error(cudaGetLastError());
int8_t *pbs_buffer = nullptr;
scratch_cuda_bootstrap_amortized_32(
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, true);
cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
lwe_buffer_3, bootstrapping_key, input_lwe_dimension, glwe_dimension,
polynomial_size, pbs_base_log, pbs_level_count,
lwe_buffer_3, bootstrapping_key, pbs_buffer, input_lwe_dimension,
glwe_dimension, polynomial_size, pbs_base_log, pbs_level_count,
input_lwe_ciphertext_count, 1, 0, max_shared_memory);
cleanup_cuda_bootstrap_amortized(v_stream, gpu_index, &pbs_buffer);
check_cuda_error(cudaGetLastError());
cuda_drop_async(lwe_buffer_3, stream, gpu_index);
@@ -633,11 +658,16 @@ extern "C" void cuda_boolean_xnor_32(
stream, gpu_index);
check_cuda_error(cudaGetLastError());
int8_t *pbs_buffer = nullptr;
scratch_cuda_bootstrap_amortized_32(
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, true);
cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
lwe_buffer_4, bootstrapping_key, input_lwe_dimension, glwe_dimension,
polynomial_size, pbs_base_log, pbs_level_count,
lwe_buffer_4, bootstrapping_key, pbs_buffer, input_lwe_dimension,
glwe_dimension, polynomial_size, pbs_base_log, pbs_level_count,
input_lwe_ciphertext_count, 1, 0, max_shared_memory);
cleanup_cuda_bootstrap_amortized(v_stream, gpu_index, &pbs_buffer);
check_cuda_error(cudaGetLastError());
cuda_drop_async(lwe_buffer_4, stream, gpu_index);

View File

@@ -1,15 +1,113 @@
#include "bootstrap_amortized.cuh"
/*
 * This scratch function allocates the necessary amount of data on the GPU for
 * the amortized PBS on 32 bits inputs, into `pbs_buffer`. It also
 * configures SM options on the GPU in case FULLSM mode is going to be used.
 */
void scratch_cuda_bootstrap_amortized_32(void *v_stream, uint32_t gpu_index,
                                         int8_t **pbs_buffer,
                                         uint32_t glwe_dimension,
                                         uint32_t polynomial_size,
                                         uint32_t input_lwe_ciphertext_count,
                                         uint32_t max_shared_memory,
                                         bool allocate_gpu_memory) {
  // Instantiate the templated scratch routine for the requested polynomial
  // degree (a compile-time template parameter, hence the explicit dispatch).
  // Degrees outside the supported set are silently ignored, matching the
  // other polynomial-size dispatchers in this file.
  if (polynomial_size == 256)
    scratch_bootstrap_amortized<uint32_t, int32_t, Degree<256>>(
        v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
  else if (polynomial_size == 512)
    scratch_bootstrap_amortized<uint32_t, int32_t, Degree<512>>(
        v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
  else if (polynomial_size == 1024)
    scratch_bootstrap_amortized<uint32_t, int32_t, Degree<1024>>(
        v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
  else if (polynomial_size == 2048)
    scratch_bootstrap_amortized<uint32_t, int32_t, Degree<2048>>(
        v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
  else if (polynomial_size == 4096)
    scratch_bootstrap_amortized<uint32_t, int32_t, Degree<4096>>(
        v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
  else if (polynomial_size == 8192)
    scratch_bootstrap_amortized<uint32_t, int32_t, Degree<8192>>(
        v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
}
/*
 * This scratch function allocates the necessary amount of data on the GPU for
 * the amortized PBS on 64 bits inputs, into `pbs_buffer`. It also
 * configures SM options on the GPU in case FULLSM mode is going to be used.
 */
void scratch_cuda_bootstrap_amortized_64(void *v_stream, uint32_t gpu_index,
                                         int8_t **pbs_buffer,
                                         uint32_t glwe_dimension,
                                         uint32_t polynomial_size,
                                         uint32_t input_lwe_ciphertext_count,
                                         uint32_t max_shared_memory,
                                         bool allocate_gpu_memory) {
  // Instantiate the templated scratch routine for the requested polynomial
  // degree (a compile-time template parameter, hence the explicit dispatch).
  // Degrees outside the supported set are silently ignored, matching the
  // other polynomial-size dispatchers in this file.
  if (polynomial_size == 256)
    scratch_bootstrap_amortized<uint64_t, int64_t, Degree<256>>(
        v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
  else if (polynomial_size == 512)
    scratch_bootstrap_amortized<uint64_t, int64_t, Degree<512>>(
        v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
  else if (polynomial_size == 1024)
    scratch_bootstrap_amortized<uint64_t, int64_t, Degree<1024>>(
        v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
  else if (polynomial_size == 2048)
    scratch_bootstrap_amortized<uint64_t, int64_t, Degree<2048>>(
        v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
  else if (polynomial_size == 4096)
    scratch_bootstrap_amortized<uint64_t, int64_t, Degree<4096>>(
        v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
  else if (polynomial_size == 8192)
    scratch_bootstrap_amortized<uint64_t, int64_t, Degree<8192>>(
        v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
}
/* Perform the programmable bootstrapping on a batch of input u32 LWE
* ciphertexts. See the corresponding operation on 64 bits for more details.
*/
void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector,
void *lut_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory) {
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
uint32_t max_shared_memory) {
assert(
("Error (GPU amortized PBS): base log should be <= 32", base_log <= 32));
@@ -25,7 +123,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
host_bootstrap_amortized<uint32_t, Degree<256>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
@@ -33,7 +131,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
host_bootstrap_amortized<uint32_t, Degree<512>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
@@ -41,7 +139,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
host_bootstrap_amortized<uint32_t, Degree<1024>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
@@ -49,7 +147,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
host_bootstrap_amortized<uint32_t, Degree<2048>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
@@ -57,7 +155,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
host_bootstrap_amortized<uint32_t, Degree<4096>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
@@ -65,7 +163,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
host_bootstrap_amortized<uint32_t, Degree<8192>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
@@ -142,9 +240,10 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector,
void *lut_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory) {
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
uint32_t max_shared_memory) {
assert(
("Error (GPU amortized PBS): base log should be <= 64", base_log <= 64));
@@ -160,7 +259,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
host_bootstrap_amortized<uint64_t, Degree<256>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
@@ -168,7 +267,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
host_bootstrap_amortized<uint64_t, Degree<512>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
@@ -176,7 +275,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
host_bootstrap_amortized<uint64_t, Degree<1024>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
@@ -184,7 +283,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
host_bootstrap_amortized<uint64_t, Degree<2048>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
@@ -192,7 +291,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
host_bootstrap_amortized<uint64_t, Degree<4096>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
@@ -200,7 +299,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
host_bootstrap_amortized<uint64_t, Degree<8192>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
@@ -208,3 +307,14 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
break;
}
}
/*
* This cleanup function frees the data for the amortized PBS on GPU in
* pbs_buffer for 32 or 64 bits inputs.
*/
/*
 * Frees the device scratch buffer allocated by the amortized-PBS scratch
 * functions (32- or 64-bit variants). The free is stream-ordered on the
 * caller's stream. The caller's pointer is nulled afterwards so a stale
 * handle cannot be reused or freed twice.
 */
void cleanup_cuda_bootstrap_amortized(void *v_stream, uint32_t gpu_index,
                                      int8_t **pbs_buffer) {
  auto stream = static_cast<cudaStream_t *>(v_stream);
  // Free memory (asynchronously, ordered on the caller's stream)
  cuda_drop_async(*pbs_buffer, stream, gpu_index);
  // Prevent a dangling pointer in the caller: a later accidental reuse or a
  // second cleanup call now sees nullptr instead of freed memory.
  *pbs_buffer = nullptr;
}

View File

@@ -213,27 +213,100 @@ __global__ void device_bootstrap_amortized(
glwe_dimension);
}
template <typename Torus>
__host__ __device__ int
get_buffer_size_full_sm_bootstrap_amortized(uint32_t polynomial_size,
                                            uint32_t glwe_dimension) {
  // Per-sample scratch size when the whole working set fits in shared
  // memory: two Torus-valued buffers (accumulator and rotated accumulator)
  // plus two complex buffers (accumulator fft and the fft working buffer).
  size_t torus_buffer =
      sizeof(Torus) * polynomial_size * (glwe_dimension + 1);
  size_t complex_buffer =
      sizeof(double2) * polynomial_size / 2 * (glwe_dimension + 1);
  return static_cast<int>(2 * torus_buffer + 2 * complex_buffer);
}
template <typename Torus>
__host__ __device__ int
get_buffer_size_partial_sm_bootstrap_amortized(uint32_t polynomial_size,
                                               uint32_t glwe_dimension) {
  // In PARTIALSM mode only the fft working buffer is kept in shared memory;
  // this is its per-sample size in bytes. (Torus is unused here but kept so
  // the signature parallels the other buffer-size helpers.)
  size_t fft_working_buffer =
      sizeof(double2) * polynomial_size / 2 * (glwe_dimension + 1);
  return static_cast<int>(fft_working_buffer);
}
template <typename Torus>
__host__ __device__ int get_buffer_size_bootstrap_amortized(
    uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
  // Total global-memory scratch required for a batch, depending on which
  // shared-memory variant the available shared memory allows.
  int full_sm = get_buffer_size_full_sm_bootstrap_amortized<Torus>(
      polynomial_size, glwe_dimension);
  int partial_sm = get_buffer_size_partial_sm_bootstrap_amortized<Torus>(
      polynomial_size, glwe_dimension);
  // Per-sample device-memory need:
  //   NOSM      -> the whole working set spills to global memory,
  //   PARTIALSM -> everything except the fft buffer spills,
  //   FULLSM    -> nothing spills.
  int per_sample_dm;
  if (max_shared_memory < partial_sm)
    per_sample_dm = full_sm;
  else if (max_shared_memory < full_sm)
    per_sample_dm = full_sm - partial_sm;
  else
    per_sample_dm = 0;
  // NOTE(review): the int result may overflow for very large batches —
  // confirm upstream bounds on input_lwe_ciphertext_count.
  return per_sample_dm * input_lwe_ciphertext_count;
}
/*
 * Configures the shared-memory carve-out of the amortized-PBS kernel for the
 * variant that will be used (PARTIALSM or FULLSM) and, when
 * `allocate_gpu_memory` is set, allocates the device scratch buffer into
 * `*pbs_buffer` (stream-ordered allocation on the caller's stream).
 */
template <typename Torus, typename STorus, typename params>
__host__ void scratch_bootstrap_amortized(void *v_stream, uint32_t gpu_index,
                                          int8_t **pbs_buffer,
                                          uint32_t glwe_dimension,
                                          uint32_t polynomial_size,
                                          uint32_t input_lwe_ciphertext_count,
                                          uint32_t max_shared_memory,
                                          bool allocate_gpu_memory) {
  cudaSetDevice(gpu_index);
  auto stream = static_cast<cudaStream_t *>(v_stream);
  int full_sm = get_buffer_size_full_sm_bootstrap_amortized<Torus>(
      polynomial_size, glwe_dimension);
  int partial_sm = get_buffer_size_partial_sm_bootstrap_amortized<Torus>(
      polynomial_size, glwe_dimension);
  if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
    // PARTIALSM: only the fft working buffer lives in shared memory.
    // These calls are now error-checked, consistently with the FULLSM
    // branch below (they were previously unchecked only here).
    check_cuda_error(cudaFuncSetAttribute(
        device_bootstrap_amortized<Torus, params, PARTIALSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
    check_cuda_error(cudaFuncSetCacheConfig(
        device_bootstrap_amortized<Torus, params, PARTIALSM>,
        cudaFuncCachePreferShared));
  } else if (max_shared_memory >= partial_sm) {
    // FULLSM (max_shared_memory >= full_sm here): the whole per-sample
    // working set fits in shared memory. Devices with compute capability
    // 7.x+ require an explicit opt-in for >48 KB of dynamic shared memory.
    check_cuda_error(cudaFuncSetAttribute(
        device_bootstrap_amortized<Torus, params, FULLSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm));
    check_cuda_error(cudaFuncSetCacheConfig(
        device_bootstrap_amortized<Torus, params, FULLSM>,
        cudaFuncCachePreferShared));
  }
  if (allocate_gpu_memory) {
    int buffer_size = get_buffer_size_bootstrap_amortized<Torus>(
        glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
        max_shared_memory);
    *pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
    check_cuda_error(cudaGetLastError());
  }
}
template <typename Torus, class params>
__host__ void host_bootstrap_amortized(
void *v_stream, uint32_t gpu_index, Torus *lwe_array_out, Torus *lut_vector,
Torus *lut_vector_indexes, Torus *lwe_array_in, double2 *bootstrapping_key,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count,
int8_t *pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t num_lut_vectors,
uint32_t lwe_idx, uint32_t max_shared_memory) {
cudaSetDevice(gpu_index);
int SM_FULL =
sizeof(Torus) * polynomial_size * (glwe_dimension + 1) + // accumulator
sizeof(Torus) * polynomial_size *
(glwe_dimension + 1) + // accumulator rotated
sizeof(double2) * polynomial_size / 2 *
(glwe_dimension + 1) + // accumulator fft
sizeof(double2) * polynomial_size / 2 *
(glwe_dimension + 1); // calculate buffer fft
int SM_FULL = get_buffer_size_full_sm_bootstrap_amortized<Torus>(
polynomial_size, glwe_dimension);
int SM_PART = sizeof(double2) * polynomial_size / 2 *
(glwe_dimension + 1); // calculate buffer fft
int SM_PART = get_buffer_size_partial_sm_bootstrap_amortized<Torus>(
polynomial_size, glwe_dimension);
int DM_PART = SM_FULL - SM_PART;
@@ -241,8 +314,6 @@ __host__ void host_bootstrap_amortized(
auto stream = static_cast<cudaStream_t *>(v_stream);
int8_t *d_mem;
// Create a 1-dimensional grid of threads
// where each block handles 1 sample and each thread
// handles opt polynomial coefficients
@@ -257,23 +328,15 @@ __host__ void host_bootstrap_amortized(
// from one of three templates (no use, partial use or full use
// of shared memory)
if (max_shared_memory < SM_PART) {
d_mem = (int8_t *)cuda_malloc_async(DM_FULL * input_lwe_ciphertext_count,
stream, gpu_index);
device_bootstrap_amortized<Torus, params, NOSM><<<grid, thds, 0, *stream>>>(
lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
bootstrapping_key, d_mem, glwe_dimension, lwe_dimension,
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, lwe_idx, DM_FULL);
} else if (max_shared_memory < SM_FULL) {
cudaFuncSetAttribute(device_bootstrap_amortized<Torus, params, PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, SM_PART);
cudaFuncSetCacheConfig(device_bootstrap_amortized<Torus, params, PARTIALSM>,
cudaFuncCachePreferShared);
d_mem = (int8_t *)cuda_malloc_async(DM_PART * input_lwe_ciphertext_count,
stream, gpu_index);
device_bootstrap_amortized<Torus, params, PARTIALSM>
<<<grid, thds, SM_PART, *stream>>>(
lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
bootstrapping_key, d_mem, glwe_dimension, lwe_dimension,
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, lwe_idx, DM_PART);
} else {
// For devices with compute capability 7.x a single thread block can
@@ -281,22 +344,13 @@ __host__ void host_bootstrap_amortized(
// device then has to be allocated dynamically.
// For lower compute capabilities, this call
// just does nothing and the amount of shared memory used is 48 KB
check_cuda_error(cudaFuncSetAttribute(
device_bootstrap_amortized<Torus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, SM_FULL));
check_cuda_error(cudaFuncSetCacheConfig(
device_bootstrap_amortized<Torus, params, FULLSM>,
cudaFuncCachePreferShared));
d_mem = (int8_t *)cuda_malloc_async(0, stream, gpu_index);
device_bootstrap_amortized<Torus, params, FULLSM>
<<<grid, thds, SM_FULL, *stream>>>(
lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
bootstrapping_key, d_mem, glwe_dimension, lwe_dimension,
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, lwe_idx, 0);
}
check_cuda_error(cudaGetLastError());
cuda_drop_async(d_mem, stream, gpu_index);
}
template <typename Torus, class params>

View File

@@ -1,4 +1,99 @@
#include "circuit_bootstrap.cuh"
#include "circuit_bootstrap.h"
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the circuit bootstrap on 32 bits inputs, into `cbs_buffer`. It also
* configures SM options on the GPU in case FULLSM mode is going to be used.
*/
void scratch_cuda_circuit_bootstrap_32(
    void *v_stream, uint32_t gpu_index, int8_t **cbs_buffer,
    uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
    uint32_t level_count_cbs, uint32_t number_of_inputs,
    uint32_t max_shared_memory, bool allocate_gpu_memory) {
  // Instantiate the templated scratch routine for the requested polynomial
  // degree (a compile-time template parameter, hence the explicit dispatch).
  // Unsupported degrees are silently ignored, as in the other dispatchers.
  if (polynomial_size == 512)
    scratch_circuit_bootstrap<uint32_t, int32_t, Degree<512>>(
        v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, level_count_cbs, number_of_inputs, max_shared_memory,
        allocate_gpu_memory);
  else if (polynomial_size == 1024)
    scratch_circuit_bootstrap<uint32_t, int32_t, Degree<1024>>(
        v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, level_count_cbs, number_of_inputs, max_shared_memory,
        allocate_gpu_memory);
  else if (polynomial_size == 2048)
    scratch_circuit_bootstrap<uint32_t, int32_t, Degree<2048>>(
        v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, level_count_cbs, number_of_inputs, max_shared_memory,
        allocate_gpu_memory);
  else if (polynomial_size == 4096)
    scratch_circuit_bootstrap<uint32_t, int32_t, Degree<4096>>(
        v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, level_count_cbs, number_of_inputs, max_shared_memory,
        allocate_gpu_memory);
  else if (polynomial_size == 8192)
    scratch_circuit_bootstrap<uint32_t, int32_t, Degree<8192>>(
        v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, level_count_cbs, number_of_inputs, max_shared_memory,
        allocate_gpu_memory);
}
/*
 * This scratch function allocates the necessary amount of data on the GPU for
 * the circuit bootstrap on 64 bits inputs, into `cbs_buffer`. It also
 * configures SM options on the GPU in case FULLSM mode is going to be used.
 */
void scratch_cuda_circuit_bootstrap_64(
void *v_stream, uint32_t gpu_index, int8_t **cbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t level_count_cbs, uint32_t number_of_inputs,
uint32_t max_shared_memory, bool allocate_gpu_memory) {
switch (polynomial_size) {
case 512:
scratch_circuit_bootstrap<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_count_cbs, number_of_inputs, max_shared_memory,
allocate_gpu_memory);
break;
case 1024:
scratch_circuit_bootstrap<uint64_t, int64_t, Degree<1024>>(
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_count_cbs, number_of_inputs, max_shared_memory,
allocate_gpu_memory);
break;
case 2048:
scratch_circuit_bootstrap<uint64_t, int64_t, Degree<2048>>(
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_count_cbs, number_of_inputs, max_shared_memory,
allocate_gpu_memory);
break;
case 4096:
scratch_circuit_bootstrap<uint64_t, int64_t, Degree<4096>>(
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_count_cbs, number_of_inputs, max_shared_memory,
allocate_gpu_memory);
break;
case 8192:
scratch_circuit_bootstrap<uint64_t, int64_t, Degree<8192>>(
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_count_cbs, number_of_inputs, max_shared_memory,
allocate_gpu_memory);
break;
default:
break;
}
}
/*
* Perform circuit bootstrapping for the batch of 32 bit LWE ciphertexts.
@@ -6,13 +101,12 @@
*/
void cuda_circuit_bootstrap_32(
void *v_stream, uint32_t gpu_index, void *ggsw_out, void *lwe_array_in,
void *fourier_bsk, void *fp_ksk_array, void *lwe_array_in_shifted_buffer,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_out_pbs_buffer,
void *lwe_array_in_fp_ks_buffer, uint32_t delta_log,
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t level_bsk, uint32_t base_log_bsk, uint32_t level_pksk,
uint32_t base_log_pksk, uint32_t level_cbs, uint32_t base_log_cbs,
uint32_t number_of_samples, uint32_t max_shared_memory) {
void *fourier_bsk, void *fp_ksk_array, void *lut_vector_indexes,
int8_t *cbs_buffer, uint32_t delta_log, uint32_t polynomial_size,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t level_bsk,
uint32_t base_log_bsk, uint32_t level_pksk, uint32_t base_log_pksk,
uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs,
uint32_t max_shared_memory) {
assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
"512, 1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 ||
@@ -28,61 +122,51 @@ void cuda_circuit_bootstrap_32(
"equal to the "
"number of streaming multiprocessors on the device divided by 8 * "
"level_count_bsk",
number_of_samples <= number_of_sm / 4. / 2. / level_bsk));
number_of_inputs <= number_of_sm / 4. / 2. / level_bsk));
switch (polynomial_size) {
case 512:
host_circuit_bootstrap<uint32_t, Degree<512>>(
v_stream, gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint32_t *)fp_ksk_array,
(uint32_t *)lwe_array_in_shifted_buffer, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_out_pbs_buffer,
(uint32_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size,
(uint32_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_samples,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
case 1024:
host_circuit_bootstrap<uint32_t, Degree<1024>>(
v_stream, gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint32_t *)fp_ksk_array,
(uint32_t *)lwe_array_in_shifted_buffer, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_out_pbs_buffer,
(uint32_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size,
(uint32_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_samples,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
case 2048:
host_circuit_bootstrap<uint32_t, Degree<2048>>(
v_stream, gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint32_t *)fp_ksk_array,
(uint32_t *)lwe_array_in_shifted_buffer, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_out_pbs_buffer,
(uint32_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size,
(uint32_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_samples,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
case 4096:
host_circuit_bootstrap<uint32_t, Degree<4096>>(
v_stream, gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint32_t *)fp_ksk_array,
(uint32_t *)lwe_array_in_shifted_buffer, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_out_pbs_buffer,
(uint32_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size,
(uint32_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_samples,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
case 8192:
host_circuit_bootstrap<uint32_t, Degree<8192>>(
v_stream, gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint32_t *)fp_ksk_array,
(uint32_t *)lwe_array_in_shifted_buffer, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_out_pbs_buffer,
(uint32_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size,
(uint32_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_samples,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
default:
@@ -96,31 +180,20 @@ void cuda_circuit_bootstrap_32(
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - 'ggsw_out' output batch of ggsw with size:
* 'number_of_samples' * 'level_cbs' * ('glwe_dimension' + 1)^2 *
* 'number_of_inputs' * 'level_cbs' * ('glwe_dimension' + 1)^2 *
* polynomial_size * sizeof(u64)
* - 'lwe_array_in' input batch of lwe ciphertexts, with size:
* 'number_of_samples' * '(lwe_dimension' + 1) * sizeof(u64)
* 'number_of_inputs' * '(lwe_dimension' + 1) * sizeof(u64)
* - 'fourier_bsk' bootstrapping key in fourier domain with size:
* 'lwe_dimension' * 'level_bsk' * ('glwe_dimension' + 1)^2 *
* 'polynomial_size' / 2 * sizeof(double2)
* - 'fp_ksk_array' batch of fp-keyswitch keys with size:
* ('polynomial_size' + 1) * 'level_pksk' * ('glwe_dimension' + 1)^2 *
* 'polynomial_size' * sizeof(u64)
* The following 5 parameters are used during calculations, they are not actual
* inputs of the function, they are just allocated memory for calculation
* - 'cbs_buffer': buffer used during calculations, it is not an actual
* inputs of the function, just allocated memory for calculation
* process, like this, memory can be allocated once and can be used as much
* as needed for different calls of circuit_bootstrap function
* - 'lwe_array_in_shifted_buffer' with size:
* 'number_of_samples' * 'level_cbs' * ('lwe_dimension' + 1) * sizeof(u64)
* - 'lut_vector' with size:
* 'level_cbs' * ('glwe_dimension' + 1) * 'polynomial_size' * sizeof(u64)
* - 'lut_vector_indexes' stores the index corresponding to which test
* vector to use
* - 'lwe_array_out_pbs_buffer' with size
* 'number_of_samples' * 'level_cbs' * ('polynomial_size' + 1) * sizeof(u64)
* - 'lwe_array_in_fp_ks_buffer' with size
* 'number_of_samples' * 'level_cbs' * ('glwe_dimension' + 1) *
* ('polynomial_size' + 1) * sizeof(u64)
*
* This function calls a wrapper to a device kernel that performs the
* circuit bootstrap. The kernel is templatized based on integer discretization
@@ -128,13 +201,12 @@ void cuda_circuit_bootstrap_32(
*/
void cuda_circuit_bootstrap_64(
void *v_stream, uint32_t gpu_index, void *ggsw_out, void *lwe_array_in,
void *fourier_bsk, void *fp_ksk_array, void *lwe_array_in_shifted_buffer,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_out_pbs_buffer,
void *lwe_array_in_fp_ks_buffer, uint32_t delta_log,
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t level_bsk, uint32_t base_log_bsk, uint32_t level_pksk,
uint32_t base_log_pksk, uint32_t level_cbs, uint32_t base_log_cbs,
uint32_t number_of_samples, uint32_t max_shared_memory) {
void *fourier_bsk, void *fp_ksk_array, void *lut_vector_indexes,
int8_t *cbs_buffer, uint32_t delta_log, uint32_t polynomial_size,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t level_bsk,
uint32_t base_log_bsk, uint32_t level_pksk, uint32_t base_log_pksk,
uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs,
uint32_t max_shared_memory) {
assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
"512, 1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 ||
@@ -150,65 +222,66 @@ void cuda_circuit_bootstrap_64(
"equal to the "
"number of streaming multiprocessors on the device divided by 8 * "
"level_count_bsk",
number_of_samples <= number_of_sm / 4. / 2. / level_bsk));
number_of_inputs <= number_of_sm / 4. / 2. / level_bsk));
// The number of samples should be lower than the number of streaming
switch (polynomial_size) {
case 512:
host_circuit_bootstrap<uint64_t, Degree<512>>(
v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint64_t *)fp_ksk_array,
(uint64_t *)lwe_array_in_shifted_buffer, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_out_pbs_buffer,
(uint64_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size,
(uint64_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_samples,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
case 1024:
host_circuit_bootstrap<uint64_t, Degree<1024>>(
v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint64_t *)fp_ksk_array,
(uint64_t *)lwe_array_in_shifted_buffer, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_out_pbs_buffer,
(uint64_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size,
(uint64_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_samples,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
case 2048:
host_circuit_bootstrap<uint64_t, Degree<2048>>(
v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint64_t *)fp_ksk_array,
(uint64_t *)lwe_array_in_shifted_buffer, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_out_pbs_buffer,
(uint64_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size,
(uint64_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_samples,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
case 4096:
host_circuit_bootstrap<uint64_t, Degree<4096>>(
v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint64_t *)fp_ksk_array,
(uint64_t *)lwe_array_in_shifted_buffer, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_out_pbs_buffer,
(uint64_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size,
(uint64_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_samples,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
case 8192:
host_circuit_bootstrap<uint64_t, Degree<8192>>(
v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint64_t *)fp_ksk_array,
(uint64_t *)lwe_array_in_shifted_buffer, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_out_pbs_buffer,
(uint64_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size,
(uint64_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_samples,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
default:
break;
}
}
/*
* This cleanup function frees the data for the circuit bootstrap on GPU in
* cbs_buffer for 32 or 64 bits inputs.
*/
void cleanup_cuda_circuit_bootstrap(void *v_stream, uint32_t gpu_index,
int8_t **cbs_buffer) {
auto stream = static_cast<cudaStream_t *>(v_stream);
// Free memory
cuda_drop_async(*cbs_buffer, stream, gpu_index);
}

View File

@@ -1,5 +1,5 @@
#ifndef CBS_H
#define CBS_H
#ifndef CBS_CUH
#define CBS_CUH
#include "bit_extraction.cuh"
#include "bootstrap.h"
@@ -96,6 +96,51 @@ __global__ void copy_add_lwe_cbs(Torus *lwe_dst, Torus *lwe_src,
}
}
template <typename Torus>
__host__ __device__ int
get_buffer_size_cbs(uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t level_count_cbs,
uint32_t number_of_inputs) {
return number_of_inputs * level_count_cbs * (glwe_dimension + 1) *
(polynomial_size + 1) *
sizeof(Torus) + // lwe_array_in_fp_ks_buffer
number_of_inputs * level_count_cbs * (polynomial_size + 1) *
sizeof(Torus) + // lwe_array_out_pbs_buffer
number_of_inputs * level_count_cbs * (lwe_dimension + 1) *
sizeof(Torus) + // lwe_array_in_shifted_buffer
level_count_cbs * (glwe_dimension + 1) * polynomial_size *
sizeof(Torus); // lut_vector_cbs
}
template <typename Torus, typename STorus, typename params>
__host__ void scratch_circuit_bootstrap(
void *v_stream, uint32_t gpu_index, int8_t **cbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t level_count_cbs, uint32_t number_of_inputs,
uint32_t max_shared_memory, bool allocate_gpu_memory) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
int pbs_count = number_of_inputs * level_count_cbs;
// allocate and initialize device pointers for circuit bootstrap and vertical
// packing
if (allocate_gpu_memory) {
int buffer_size =
get_buffer_size_cbs<Torus>(glwe_dimension, lwe_dimension,
polynomial_size, level_count_cbs,
number_of_inputs) +
get_buffer_size_bootstrap_amortized<Torus>(
glwe_dimension, polynomial_size, pbs_count, max_shared_memory);
*cbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
}
scratch_bootstrap_amortized<Torus, STorus, params>(
v_stream, gpu_index, cbs_buffer, glwe_dimension, polynomial_size,
pbs_count, max_shared_memory, false);
}
/*
* Host function for cuda circuit bootstrap.
* It executes device functions in specific order and manages
@@ -104,24 +149,37 @@ __global__ void copy_add_lwe_cbs(Torus *lwe_dst, Torus *lwe_src,
template <typename Torus, class params>
__host__ void host_circuit_bootstrap(
void *v_stream, uint32_t gpu_index, Torus *ggsw_out, Torus *lwe_array_in,
double2 *fourier_bsk, Torus *fp_ksk_array,
Torus *lwe_array_in_shifted_buffer, Torus *lut_vector,
Torus *lut_vector_indexes, Torus *lwe_array_out_pbs_buffer,
Torus *lwe_array_in_fp_ks_buffer, uint32_t delta_log,
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t level_bsk, uint32_t base_log_bsk, uint32_t level_pksk,
uint32_t base_log_pksk, uint32_t level_cbs, uint32_t base_log_cbs,
uint32_t number_of_samples, uint32_t max_shared_memory) {
double2 *fourier_bsk, Torus *fp_ksk_array, Torus *lut_vector_indexes,
int8_t *cbs_buffer, uint32_t delta_log, uint32_t polynomial_size,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t level_bsk,
uint32_t base_log_bsk, uint32_t level_pksk, uint32_t base_log_pksk,
uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs,
uint32_t max_shared_memory) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
uint32_t ciphertext_n_bits = sizeof(Torus) * 8;
uint32_t lwe_size = lwe_dimension + 1;
int pbs_count = number_of_samples * level_cbs;
int pbs_count = number_of_inputs * level_cbs;
dim3 blocks(level_cbs, number_of_samples, 1);
dim3 blocks(level_cbs, number_of_inputs, 1);
int threads = 256;
Torus *lwe_array_in_fp_ks_buffer = (Torus *)cbs_buffer;
Torus *lwe_array_out_pbs_buffer =
(Torus *)lwe_array_in_fp_ks_buffer +
(ptrdiff_t)(number_of_inputs * level_cbs * (glwe_dimension + 1) *
(polynomial_size + 1));
Torus *lwe_array_in_shifted_buffer =
(Torus *)lwe_array_out_pbs_buffer +
(ptrdiff_t)(number_of_inputs * level_cbs * (polynomial_size + 1));
Torus *lut_vector =
(Torus *)lwe_array_in_shifted_buffer +
(ptrdiff_t)(number_of_inputs * level_cbs * (lwe_dimension + 1));
int8_t *pbs_buffer =
(int8_t *)lut_vector + (ptrdiff_t)(level_cbs * (glwe_dimension + 1) *
polynomial_size * sizeof(Torus));
// Shift message LSB on padding bit, at this point we expect to have messages
// with only 1 bit of information
shift_lwe_cbs<Torus, params><<<blocks, threads, 0, *stream>>>(
@@ -143,7 +201,7 @@ __host__ void host_circuit_bootstrap(
// MSB and no bit of padding
host_bootstrap_amortized<Torus, params>(
v_stream, gpu_index, lwe_array_out_pbs_buffer, lut_vector,
lut_vector_indexes, lwe_array_in_shifted_buffer, fourier_bsk,
lut_vector_indexes, lwe_array_in_shifted_buffer, fourier_bsk, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log_bsk, level_bsk,
pbs_count, level_cbs, 0, max_shared_memory);
@@ -161,4 +219,4 @@ __host__ void host_circuit_bootstrap(
level_pksk, pbs_count * (glwe_dimension + 1), glwe_dimension + 1);
}
#endif // CBS_H
#endif // CBS_CUH

View File

@@ -1,7 +1,6 @@
#ifndef VERTICAL_PACKING_CUH
#define VERTICAL_PACKING_CUH
#include "../include/vertical_packing.h"
#include "bootstrap.h"
#include "complex/operations.cuh"
#include "crypto/gadget.cuh"

View File

@@ -28,26 +28,17 @@ __global__ void device_build_lut(Torus *lut_out, Torus *lut_in,
template <typename Torus>
__host__ __device__ int
get_buffer_size_cbs_vp(uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t level_count_cbs,
uint32_t number_of_inputs, uint32_t tau) {
get_buffer_size_cbs_vp(uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count_cbs, uint32_t tau,
uint32_t number_of_inputs) {
int ggsw_size = level_count_cbs * (glwe_dimension + 1) *
(glwe_dimension + 1) * polynomial_size;
return number_of_inputs * level_count_cbs *
sizeof(Torus) // lut_vector_indexes
+ number_of_inputs * ggsw_size * sizeof(Torus) // ggsw_out
+
number_of_inputs * level_count_cbs * (glwe_dimension + 1) *
(polynomial_size + 1) * sizeof(Torus) // lwe_array_in_fp_ks_buffer
+ number_of_inputs * level_count_cbs * (polynomial_size + 1) *
sizeof(Torus) // lwe_array_out_pbs_buffer
+ number_of_inputs * level_count_cbs * (lwe_dimension + 1) *
sizeof(Torus) // lwe_array_in_shifted_buffer
+ level_count_cbs * (glwe_dimension + 1) * polynomial_size *
sizeof(Torus) // lut_vector_cbs
+ tau * (glwe_dimension + 1) * polynomial_size *
sizeof(Torus); // glwe_array_out
sizeof(Torus) + // lut_vector_indexes
number_of_inputs * ggsw_size * sizeof(Torus) + // ggsw_out_cbs
tau * (glwe_dimension + 1) * polynomial_size *
sizeof(Torus); // glwe_array_out_cmux_tree
}
template <typename Torus, typename STorus, typename params>
@@ -70,9 +61,14 @@ __host__ void scratch_circuit_bootstrap_vertical_packing(
// packing
if (allocate_gpu_memory) {
int buffer_size =
get_buffer_size_cbs_vp<Torus>(glwe_dimension, lwe_dimension,
polynomial_size, level_count_cbs,
number_of_inputs, tau) +
get_buffer_size_cbs_vp<Torus>(glwe_dimension, polynomial_size,
level_count_cbs, tau, number_of_inputs) +
get_buffer_size_cbs<Torus>(glwe_dimension, lwe_dimension,
polynomial_size, level_count_cbs,
number_of_inputs) +
get_buffer_size_bootstrap_amortized<Torus>(
glwe_dimension, polynomial_size, number_of_inputs * level_count_cbs,
max_shared_memory) +
get_buffer_size_cmux_tree<Torus>(glwe_dimension, polynomial_size,
level_count_cbs, r, tau,
max_shared_memory) +
@@ -96,6 +92,10 @@ __host__ void scratch_circuit_bootstrap_vertical_packing(
uint32_t bits = sizeof(Torus) * 8;
*cbs_delta_log = (bits - 1);
scratch_circuit_bootstrap<Torus, STorus, params>(
v_stream, gpu_index, cbs_vp_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_count_cbs, number_of_inputs, max_shared_memory,
false);
scratch_cmux_tree<Torus, STorus, params>(
v_stream, gpu_index, cbs_vp_buffer, glwe_dimension, polynomial_size,
level_count_cbs, r, tau, max_shared_memory, false);
@@ -107,13 +107,8 @@ __host__ void scratch_circuit_bootstrap_vertical_packing(
/*
* Cleanup functions free the necessary data on the GPU and on the CPU.
* Data that lives on the CPU is prefixed with `h_`. This cleanup function thus
* frees the data for the circuit bootstrap and vertical packing on GPU:
* - ggsw_out
* - lwe_array_in_fp_ks_buffer
* - lwe_array_out_pbs_buffer
* - lwe_array_in_shifted buffer
* - lut_vector_cbs
* - lut_vector_indexes
* frees the data for the circuit bootstrap and vertical packing on GPU
* contained in cbs_vp_buffer
*/
__host__ void
cleanup_circuit_bootstrap_vertical_packing(void *v_stream, uint32_t gpu_index,
@@ -139,32 +134,25 @@ __host__ void host_circuit_bootstrap_vertical_packing(
int ggsw_size = level_count_cbs * (glwe_dimension + 1) *
(glwe_dimension + 1) * polynomial_size;
Torus *lut_vector_indexes = (Torus *)cbs_vp_buffer;
Torus *ggsw_out = (Torus *)lut_vector_indexes +
(ptrdiff_t)(number_of_inputs * level_count_cbs);
Torus *lwe_array_in_fp_ks_buffer =
(Torus *)ggsw_out + (ptrdiff_t)(number_of_inputs * ggsw_size);
Torus *lwe_array_out_pbs_buffer =
(Torus *)lwe_array_in_fp_ks_buffer +
(ptrdiff_t)(number_of_inputs * level_count_cbs * (glwe_dimension + 1) *
(polynomial_size + 1));
Torus *lwe_array_in_shifted_buffer =
(Torus *)lwe_array_out_pbs_buffer +
(ptrdiff_t)(number_of_inputs * level_count_cbs * (polynomial_size + 1));
Torus *lut_vector_cbs =
(Torus *)lwe_array_in_shifted_buffer +
(ptrdiff_t)(number_of_inputs * level_count_cbs * (lwe_dimension + 1));
Torus *glwe_array_out =
(Torus *)lut_vector_cbs +
(ptrdiff_t)(level_count_cbs * (glwe_dimension + 1) * polynomial_size);
Torus *lut_vector_indexes = (Torus *)cbs_vp_buffer;
int8_t *cbs_buffer =
(int8_t *)lut_vector_indexes +
(ptrdiff_t)(number_of_inputs * level_count_cbs * sizeof(Torus));
int8_t *ggsw_out_cbs =
cbs_buffer +
(ptrdiff_t)(get_buffer_size_cbs<Torus>(glwe_dimension, lwe_dimension,
polynomial_size, level_count_cbs,
number_of_inputs) +
get_buffer_size_bootstrap_amortized<Torus>(
glwe_dimension, polynomial_size,
number_of_inputs * level_count_cbs, max_shared_memory));
host_circuit_bootstrap<Torus, params>(
v_stream, gpu_index, ggsw_out, lwe_array_in, fourier_bsk, cbs_fpksk,
lwe_array_in_shifted_buffer, lut_vector_cbs, lut_vector_indexes,
lwe_array_out_pbs_buffer, lwe_array_in_fp_ks_buffer, cbs_delta_log,
polynomial_size, glwe_dimension, lwe_dimension, level_count_bsk,
base_log_bsk, level_count_pksk, base_log_pksk, level_count_cbs,
base_log_cbs, number_of_inputs, max_shared_memory);
v_stream, gpu_index, (Torus *)ggsw_out_cbs, lwe_array_in, fourier_bsk,
cbs_fpksk, lut_vector_indexes, cbs_buffer, cbs_delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_count_bsk, base_log_bsk,
level_count_pksk, base_log_pksk, level_count_cbs, base_log_cbs,
number_of_inputs, max_shared_memory);
check_cuda_error(cudaGetLastError());
// number_of_inputs = tau * p is the total number of GGSWs
@@ -172,29 +160,33 @@ __host__ void host_circuit_bootstrap_vertical_packing(
// lsb GGSW is for the last blind rotation.
uint32_t r = number_of_inputs - params::log2_degree;
int8_t *cmux_tree_buffer =
(int8_t *)glwe_array_out +
(ptrdiff_t)(tau * (glwe_dimension + 1) * polynomial_size * sizeof(Torus));
ggsw_out_cbs + (ptrdiff_t)(number_of_inputs * ggsw_size * sizeof(Torus));
int8_t *glwe_array_out_cmux_tree =
cmux_tree_buffer + (ptrdiff_t)(get_buffer_size_cmux_tree<Torus>(
glwe_dimension, polynomial_size, level_count_cbs,
r, tau, max_shared_memory));
// CMUX Tree
// r = tau * p - log2(N)
host_cmux_tree<Torus, STorus, params>(
v_stream, gpu_index, glwe_array_out, ggsw_out, lut_vector,
cmux_tree_buffer, glwe_dimension, polynomial_size, base_log_cbs,
level_count_cbs, r, tau, max_shared_memory);
v_stream, gpu_index, (Torus *)glwe_array_out_cmux_tree,
(Torus *)ggsw_out_cbs, lut_vector, cmux_tree_buffer, glwe_dimension,
polynomial_size, base_log_cbs, level_count_cbs, r, tau,
max_shared_memory);
check_cuda_error(cudaGetLastError());
// Blind rotation + sample extraction
// mbr = tau * p - r = log2(N)
Torus *br_ggsw = (Torus *)ggsw_out +
Torus *br_ggsw = (Torus *)ggsw_out_cbs +
(ptrdiff_t)(r * level_count_cbs * (glwe_dimension + 1) *
(glwe_dimension + 1) * polynomial_size);
int8_t *br_se_buffer =
cmux_tree_buffer + (ptrdiff_t)(get_buffer_size_cmux_tree<Torus>(
glwe_dimension, polynomial_size, level_count_cbs,
r, tau, max_shared_memory));
glwe_array_out_cmux_tree +
(ptrdiff_t)(tau * (glwe_dimension + 1) * polynomial_size * sizeof(Torus));
host_blind_rotate_and_sample_extraction<Torus, STorus, params>(
v_stream, gpu_index, lwe_array_out, br_ggsw, glwe_array_out, br_se_buffer,
number_of_inputs - r, tau, glwe_dimension, polynomial_size, base_log_cbs,
level_count_cbs, max_shared_memory);
v_stream, gpu_index, lwe_array_out, br_ggsw,
(Torus *)glwe_array_out_cmux_tree, br_se_buffer, number_of_inputs - r,
tau, glwe_dimension, polynomial_size, base_log_cbs, level_count_cbs,
max_shared_memory);
}
template <typename Torus>
@@ -238,9 +230,14 @@ scratch_wop_pbs(void *v_stream, uint32_t gpu_index, int8_t **wop_pbs_buffer,
uint32_t r = cbs_vp_number_of_inputs - params::log2_degree;
uint32_t mbr_size = cbs_vp_number_of_inputs - r;
int buffer_size =
get_buffer_size_cbs_vp<Torus>(glwe_dimension, lwe_dimension,
polynomial_size, level_count_cbs,
cbs_vp_number_of_inputs, tau) +
get_buffer_size_cbs_vp<Torus>(glwe_dimension, polynomial_size,
level_count_cbs, tau,
cbs_vp_number_of_inputs) +
get_buffer_size_cbs<Torus>(glwe_dimension, lwe_dimension, polynomial_size,
level_count_cbs, cbs_vp_number_of_inputs) +
get_buffer_size_bootstrap_amortized<Torus>(
glwe_dimension, polynomial_size,
cbs_vp_number_of_inputs * level_count_cbs, max_shared_memory) +
get_buffer_size_cmux_tree<Torus>(glwe_dimension, polynomial_size,
level_count_cbs, r, tau,
max_shared_memory) +