From 3c616af622ab2b37e5f6dadc3dca15adaed77066 Mon Sep 17 00:00:00 2001 From: Quentin Bourgerie Date: Tue, 29 Nov 2022 14:36:24 +0100 Subject: [PATCH] feat(compiler): Handle batched operators for gpu codegen --- .../include/concretelang/Runtime/wrappers.h | 154 +++--------- .../BConcreteToCAPI/BConcreteToCAPI.cpp | 20 +- compiler/lib/Runtime/wrappers.cpp | 226 +++++++++++------- 3 files changed, 194 insertions(+), 206 deletions(-) diff --git a/compiler/include/concretelang/Runtime/wrappers.h b/compiler/include/concretelang/Runtime/wrappers.h index 8242b1307..8bf522368 100644 --- a/compiler/include/concretelang/Runtime/wrappers.h +++ b/compiler/include/concretelang/Runtime/wrappers.h @@ -149,33 +149,24 @@ void memref_copy_one_rank(uint64_t *src_allocated, uint64_t *src_aligned, uint64_t *dst_aligned, uint64_t dst_offset, uint64_t dst_size, uint64_t dst_stride); +// Single ciphertext CUDA functions /////////////////////////////////////////// + +/// \brief Run Keyswitch on GPU. +/// +/// It handles memory copy of the different arguments from CPU to GPU, and +/// freeing memory. +void memref_keyswitch_lwe_cuda_u64( + uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset, + uint64_t out_size, uint64_t out_stride, uint64_t *ct0_allocated, + uint64_t *ct0_aligned, uint64_t ct0_offset, uint64_t ct0_size, + uint64_t ct0_stride, uint32_t level, uint32_t base_log, + uint32_t input_lwe_dim, uint32_t output_lwe_dim, + mlir::concretelang::RuntimeContext *context); + /// \brief Run bootstrapping on GPU. /// /// It handles memory copy of the different arguments from CPU to GPU, and /// freeing memory. -/// -/// \param out_allocated -/// \param out_aligned -/// \param out_offset -/// \param out_size -/// \param out_stride -/// \param ct0_allocated -/// \param ct0_aligned -/// \param ct0_offset -/// \param ct0_size -/// \param ct0_stride -/// \param tlu_allocated -/// \param tlu_aligned -/// \param tlu_offset -/// \param tlu_size -/// \param tlu_stride -/// \param input_lwe_dim LWE input dimension -/// \param poly_size polynomial size -/// \param level level -/// \param base_log base log -/// \param glwe_dim -/// \param precision -/// \param context void memref_bootstrap_lwe_cuda_u64( uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset, uint64_t out_size, uint64_t out_stride, uint64_t *ct0_allocated, @@ -186,107 +177,26 @@ void memref_bootstrap_lwe_cuda_u64( uint32_t base_log, uint32_t glwe_dim, uint32_t precision, mlir::concretelang::RuntimeContext *context); -/// \brief Run Keyswitch on GPU. -/// -/// It handles memory copy of the different arguments from CPU to GPU, and -/// freeing memory. 
-/// -/// \param out_allocated -/// \param out_aligned -/// \param out_offset -/// \param out_size -/// \param out_stride -/// \param ct0_allocated -/// \param ct0_aligned -/// \param ct0_offset -/// \param ct0_size -/// \param ct0_stride -/// \param level -/// \param base_log -/// \param input_lwe_dim LWE input dimension -/// \param output_lwe_dim LWE output dimension -/// \param context -void memref_keyswitch_lwe_cuda_u64( +// Batched CUDA function ////////////////////////////////////////////////////// + +void memref_batched_keyswitch_lwe_cuda_u64( uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset, - uint64_t out_size, uint64_t out_stride, uint64_t *ct0_allocated, - uint64_t *ct0_aligned, uint64_t ct0_offset, uint64_t ct0_size, - uint64_t ct0_stride, uint32_t level, uint32_t base_log, - uint32_t input_lwe_dim, uint32_t output_lwe_dim, + uint64_t out_size0, uint64_t out_size1, uint64_t out_stride0, + uint64_t out_stride1, uint64_t *ct0_allocated, uint64_t *ct0_aligned, + uint64_t ct0_offset, uint64_t ct0_size0, uint64_t ct0_size1, + uint64_t ct0_stride0, uint64_t ct0_stride1, uint32_t level, + uint32_t base_log, uint32_t input_lwe_dim, uint32_t output_lwe_dim, mlir::concretelang::RuntimeContext *context); -/// \brief Copy ciphertext from CPU to GPU. -/// -/// It handles memory allocation on GPU. -/// -/// \param ct_allocated -/// \param ct_aligned -/// \param ct_offset -/// \param ct_size -/// \param ct_stride -/// \param gpu_idx index of the GPU to use -/// \param stream cuda stream to use for the copy -/// \return void* pointer to the GPU ciphertext -void *memcpy_async_ct_to_gpu(uint64_t *ct_allocated, uint64_t *ct_aligned, - uint64_t ct_offset, uint64_t ct_size, - uint64_t ct_stride, uint32_t gpu_idx, - void *stream); - -/// \brief Copy ciphertext from GPU to CPU. -/// -/// Memory on GPU won't be freed after the copy. -/// -/// \param out_allocated -/// \param out_aligned -/// \param out_offset -/// \param out_size -/// \param out_stride -/// \param ct_gpu -/// \param size -/// \param gpu_idx index of the GPU to use -/// \param stream cuda stream to use for the copy -void memcpy_async_ct_to_cpu(uint64_t *out_allocated, uint64_t *out_aligned, - uint64_t out_offset, uint64_t out_size, - uint64_t out_stride, void *ct_gpu, size_t size, - uint32_t gpu_idx, void *stream); - -/// \brief Copy bootstrapping key from CPU to GPU. -/// -/// It handles memory allocation on GPU, as well as conversion to the Fourier -/// domain. -/// -/// \param context -/// \param input_lwe_dim -/// \param poly_size -/// \param level -/// \param glwe_dim -/// \param gpu_idx index of the GPU to use -/// \param stream cuda stream to use for the copy -/// \return void* pointer to the GPU bsk -void *memcpy_async_bsk_to_gpu(mlir::concretelang::RuntimeContext *context, - uint32_t input_lwe_dim, uint32_t poly_size, - uint32_t level, uint32_t glwe_dim, - uint32_t gpu_idx, void *stream); - -/// \brief Copy keyswitching key from CPU to GPU. -/// -/// It handles memory allocation on GPU. -/// -/// \param context -/// \param level -/// \param input_lwe_dim -/// \param output_lwe_dim -/// \param gpu_idx index of the GPU to use -/// \param stream cuda stream to use for the copy -/// \return void* pointer to the GPU ksk -void *memcpy_async_ksk_to_gpu(mlir::concretelang::RuntimeContext *context, - uint32_t level, uint32_t input_lwe_dim, - uint32_t output_lwe_dim, uint32_t gpu_idx, - void *stream); - -/// \brief Free gpu memory. 
-/// -/// \param gpu_ptr pointer to the GPU memory to free -/// \param gpu_idx index of the GPU to use -void free_from_gpu(void *gpu_ptr, uint32_t gpu_idx); +void memref_batched_bootstrap_lwe_cuda_u64( + uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset, + uint64_t out_size0, uint64_t out_size1, uint64_t out_stride0, + uint64_t out_stride1, uint64_t *ct0_allocated, uint64_t *ct0_aligned, + uint64_t ct0_offset, uint64_t ct0_size0, uint64_t ct0_size1, + uint64_t ct0_stride0, uint64_t ct0_stride1, uint64_t *tlu_allocated, + uint64_t *tlu_aligned, uint64_t tlu_offset, uint64_t tlu_size, + uint64_t tlu_stride, uint32_t input_lwe_dim, uint32_t poly_size, + uint32_t level, uint32_t base_log, uint32_t glwe_dim, uint32_t precision, + mlir::concretelang::RuntimeContext *context); } #endif diff --git a/compiler/lib/Conversion/BConcreteToCAPI/BConcreteToCAPI.cpp b/compiler/lib/Conversion/BConcreteToCAPI/BConcreteToCAPI.cpp index e940b238a..fa2d632af 100644 --- a/compiler/lib/Conversion/BConcreteToCAPI/BConcreteToCAPI.cpp +++ b/compiler/lib/Conversion/BConcreteToCAPI/BConcreteToCAPI.cpp @@ -34,6 +34,10 @@ char memref_bootstrap_async_lwe_u64[] = "memref_bootstrap_async_lwe_u64"; char memref_await_future[] = "memref_await_future"; char memref_keyswitch_lwe_cuda_u64[] = "memref_keyswitch_lwe_cuda_u64"; char memref_bootstrap_lwe_cuda_u64[] = "memref_bootstrap_lwe_cuda_u64"; +char memref_batched_keyswitch_lwe_cuda_u64[] = + "memref_batched_keyswitch_lwe_cuda_u64"; +char memref_batched_bootstrap_lwe_cuda_u64[] = + "memref_batched_bootstrap_lwe_cuda_u64"; char memref_expand_lut_in_trivial_glwe_ct_u64[] = "memref_expand_lut_in_trivial_glwe_ct_u64"; @@ -116,12 +120,14 @@ mlir::LogicalResult insertForwardDeclarationOfTheCAPI( memref1DType, i32Type, i32Type, i32Type, i32Type, i32Type, i32Type, contextType}, {futureType}); - } else if (funcName == memref_batched_keyswitch_lwe_u64) { + } else if (funcName == memref_batched_keyswitch_lwe_u64 || + funcName == memref_batched_keyswitch_lwe_cuda_u64) { funcType = mlir::FunctionType::get(rewriter.getContext(), {memref2DType, memref2DType, i32Type, i32Type, i32Type, i32Type, contextType}, {}); - } else if (funcName == memref_batched_bootstrap_lwe_u64) { + } else if (funcName == memref_batched_bootstrap_lwe_u64 || + funcName == memref_batched_bootstrap_lwe_cuda_u64) { funcType = mlir::FunctionType::get(rewriter.getContext(), {memref2DType, memref2DType, memref1DType, i32Type, i32Type, i32Type, @@ -335,6 +341,16 @@ struct BConcreteToCAPIPass : public BConcreteToCAPIBase { patterns.add>( &getContext(), bootstrapAddOperands); + patterns.add< + BConcreteToCAPICallPattern>( + &getContext(), + keyswitchAddOperands); + patterns.add< + BConcreteToCAPICallPattern>( + &getContext(), + bootstrapAddOperands); } else { patterns.add>( diff --git a/compiler/lib/Runtime/wrappers.cpp b/compiler/lib/Runtime/wrappers.cpp index 1abc9138c..84d7f1173 100644 --- a/compiler/lib/Runtime/wrappers.cpp +++ b/compiler/lib/Runtime/wrappers.cpp @@ -56,50 +56,7 @@ void encode_and_expand_lut(uint64_t *output, size_t output_size, #ifdef CONCRETELANG_CUDA_SUPPORT -void memref_keyswitch_lwe_cuda_u64( - uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset, - uint64_t out_size, uint64_t out_stride, uint64_t *ct0_allocated, - uint64_t *ct0_aligned, uint64_t ct0_offset, uint64_t ct0_size, - uint64_t ct0_stride, uint32_t level, uint32_t base_log, - uint32_t input_lwe_dim, uint32_t output_lwe_dim, - mlir::concretelang::RuntimeContext *context) { - // we currently just use the first 
GPU, but this should be decided - dynamically, or during compilation, in the future - uint32_t gpu_idx = 0; - uint32_t num_samples = 1; - void *stream = cuda_create_stream(gpu_idx); - // move input ciphertext into gpu - void *ct0_gpu = memcpy_async_ct_to_gpu(ct0_allocated, ct0_aligned, ct0_offset, - ct0_size, ct0_stride, gpu_idx, stream); - // move output ciphertext into gpu - void *out_gpu = memcpy_async_ct_to_gpu(out_allocated, out_aligned, out_offset, - out_size, out_stride, gpu_idx, stream); - void *ksk_gpu = memcpy_async_ksk_to_gpu(context, level, input_lwe_dim, - output_lwe_dim, gpu_idx, stream); - cuda_keyswitch_lwe_ciphertext_vector_64(stream, out_gpu, ct0_gpu, ksk_gpu, - input_lwe_dim, output_lwe_dim, - base_log, level, num_samples); - // copy output ciphertext back to cpu - memcpy_async_ct_to_cpu(out_allocated, out_aligned, out_offset, out_size, - out_stride, out_gpu, out_size, gpu_idx, stream); - cuda_synchronize_device(gpu_idx); - // free memory that we allocated on gpu - cuda_drop(ct0_gpu, gpu_idx); - cuda_drop(out_gpu, gpu_idx); - cuda_drop(ksk_gpu, gpu_idx); - cuda_destroy_stream(stream, gpu_idx); -} - -void *memcpy_async_ct_to_gpu(uint64_t *ct_allocated, uint64_t *ct_aligned, - uint64_t ct_offset, uint64_t ct_size, - uint64_t ct_stride, uint32_t gpu_idx, - void *stream) { - size_t buf_size = ct_size * sizeof(uint64_t); - void *ct_gpu = cuda_malloc(buf_size, gpu_idx); - cuda_memcpy_async_to_gpu(ct_gpu, ct_aligned + ct_offset, buf_size, stream, - gpu_idx); - return ct_gpu; -} +// CUDA memory utils function ////////////////////////////////////////////////// void *memcpy_async_bsk_to_gpu(mlir::concretelang::RuntimeContext *context, uint32_t input_lwe_dim, uint32_t poly_size, @@ -117,18 +74,47 @@ void *memcpy_async_ksk_to_gpu(mlir::concretelang::RuntimeContext *context, stream); } -void memcpy_async_ct_to_cpu(uint64_t *out_allocated, uint64_t *out_aligned, - uint64_t out_offset, uint64_t out_size, - uint64_t out_stride, void *ct_gpu, size_t size, - uint32_t gpu_idx, void *stream) { - cuda_memcpy_async_to_cpu(out_aligned + out_offset, ct_gpu, - size * sizeof(uint64_t), stream, gpu_idx); +void *alloc_and_memcpy_async_to_gpu(uint64_t *buf_ptr, uint64_t buf_offset, + uint64_t buf_size, uint32_t gpu_idx, + void *stream) { + size_t buf_size_ = buf_size * sizeof(uint64_t); + void *ct_gpu = cuda_malloc(buf_size_, gpu_idx); + cuda_memcpy_async_to_gpu(ct_gpu, buf_ptr + buf_offset, buf_size_, stream, + gpu_idx); + return ct_gpu; +} + +void memcpy_async_to_cpu(uint64_t *buf_ptr, uint64_t buf_offset, + uint64_t buf_size, void *buf_gpu, uint32_t gpu_idx, + void *stream) { + cuda_memcpy_async_to_cpu(buf_ptr + buf_offset, buf_gpu, + buf_size * sizeof(uint64_t), stream, gpu_idx); } void free_from_gpu(void *gpu_ptr, uint32_t gpu_idx = 0) { cuda_drop(gpu_ptr, gpu_idx); } +// Single ciphertext CUDA functions /////////////////////////////////////////// + +void memref_keyswitch_lwe_cuda_u64( + uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset, + uint64_t out_size, uint64_t out_stride, uint64_t *ct0_allocated, + uint64_t *ct0_aligned, uint64_t ct0_offset, uint64_t ct0_size, + uint64_t ct0_stride, uint32_t level, uint32_t base_log, + uint32_t input_lwe_dim, uint32_t output_lwe_dim, + mlir::concretelang::RuntimeContext *context) { + assert(out_stride == 1); + assert(ct0_stride == 1); + memref_batched_keyswitch_lwe_cuda_u64( + // Output 1D memref as 2D memref + out_allocated, out_aligned, out_offset, 1, out_size, out_size, out_stride, + // Input 1D memref as 2D memref + 
ct0_allocated, ct0_aligned, ct0_offset, 1, ct0_size, ct0_size, ct0_stride, + // Keyswitch additional arguments + level, base_log, input_lwe_dim, output_lwe_dim, context); +} + void memref_bootstrap_lwe_cuda_u64( uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset, uint64_t out_size, uint64_t out_stride, uint64_t *ct0_allocated, @@ -138,20 +124,96 @@ void memref_bootstrap_lwe_cuda_u64( uint32_t input_lwe_dim, uint32_t poly_size, uint32_t level, uint32_t base_log, uint32_t glwe_dim, uint32_t precision, mlir::concretelang::RuntimeContext *context) { - // we currently just use the first GPU, but this should be decided - // dynamically, or during compilation, in the future + memref_batched_bootstrap_lwe_cuda_u64( + // Output 1D memref as 2D memref + out_allocated, out_aligned, out_offset, 1, out_size, out_size, out_stride, + // Input 1D memref as 2D memref + ct0_allocated, ct0_aligned, ct0_offset, 1, ct0_size, ct0_size, ct0_stride, + // Table lookup memref + tlu_allocated, tlu_aligned, tlu_offset, tlu_size, tlu_stride, + // Bootstrap additional arguments + input_lwe_dim, poly_size, level, base_log, glwe_dim, precision, context); +} + +// Batched CUDA function ////////////////////////////////////////////////////// + +void memref_batched_keyswitch_lwe_cuda_u64( + uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset, + uint64_t out_size0, uint64_t out_size1, uint64_t out_stride0, + uint64_t out_stride1, uint64_t *ct0_allocated, uint64_t *ct0_aligned, + uint64_t ct0_offset, uint64_t ct0_size0, uint64_t ct0_size1, + uint64_t ct0_stride0, uint64_t ct0_stride1, uint32_t level, + uint32_t base_log, uint32_t input_lwe_dim, uint32_t output_lwe_dim, + mlir::concretelang::RuntimeContext *context) { + assert(out_size0 == ct0_size0); + assert(out_size1 == output_lwe_dim+1); + assert(ct0_size1 == input_lwe_dim+1); + // TODO: Multi GPU uint32_t gpu_idx = 0; + uint32_t num_samples = out_size0; + uint64_t ct0_batch_size = ct0_size0 * ct0_size1; + uint64_t out_batch_size = out_size0 * out_size1; + + // Create the cuda stream + // TODO: Should be created by the compiler codegen void *stream = cuda_create_stream(gpu_idx); - // move bsk to gpu + // Get the pointer to the keyswitching key on the GPU + void *ksk_gpu = memcpy_async_ksk_to_gpu(context, level, input_lwe_dim, + output_lwe_dim, gpu_idx, stream); + // Move the input and output batch of ciphertexts to the GPU + // TODO: The allocation should be done by the compiler codegen + void *ct0_gpu = alloc_and_memcpy_async_to_gpu( + ct0_aligned, ct0_offset, ct0_batch_size, gpu_idx, stream); + void *out_gpu = alloc_and_memcpy_async_to_gpu( + out_aligned, out_offset, out_batch_size, gpu_idx, stream); + // Run the keyswitch kernel on the GPU + cuda_keyswitch_lwe_ciphertext_vector_64(stream, out_gpu, ct0_gpu, ksk_gpu, + input_lwe_dim, output_lwe_dim, + base_log, level, num_samples); + // Copy the output batch of ciphertexts back to CPU + memcpy_async_to_cpu(out_aligned, out_offset, out_batch_size, out_gpu, gpu_idx, + stream); + cuda_synchronize_device(gpu_idx); + // free memory that we allocated on gpu + cuda_drop(ct0_gpu, gpu_idx); + cuda_drop(out_gpu, gpu_idx); + cuda_drop(ksk_gpu, gpu_idx); + cuda_destroy_stream(stream, gpu_idx); +} + +void memref_batched_bootstrap_lwe_cuda_u64( + uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset, + uint64_t out_size0, uint64_t out_size1, uint64_t out_stride0, + uint64_t out_stride1, uint64_t *ct0_allocated, uint64_t *ct0_aligned, + uint64_t ct0_offset, uint64_t ct0_size0, uint64_t 
ct0_size1, + uint64_t ct0_stride0, uint64_t ct0_stride1, uint64_t *tlu_allocated, + uint64_t *tlu_aligned, uint64_t tlu_offset, uint64_t tlu_size, + uint64_t tlu_stride, uint32_t input_lwe_dim, uint32_t poly_size, + uint32_t level, uint32_t base_log, uint32_t glwe_dim, uint32_t precision, + mlir::concretelang::RuntimeContext *context) { + assert(out_size0 == ct0_size0); + // TODO: Multi GPU + uint32_t gpu_idx = 0; + uint32_t num_samples = out_size0; + uint64_t ct0_batch_size = ct0_size0 * ct0_size1; + uint64_t out_batch_size = out_size0 * out_size1; + + // Create the cuda stream + // TODO: Should be created by the compiler codegen + void *stream = cuda_create_stream(gpu_idx); + // Get the pointer to the bootstrapping key on the GPU void *fbsk_gpu = memcpy_async_bsk_to_gpu(context, input_lwe_dim, poly_size, level, glwe_dim, gpu_idx, stream); - // move input ciphertext into gpu - void *ct0_gpu = memcpy_async_ct_to_gpu(ct0_allocated, ct0_aligned, ct0_offset, - ct0_size, ct0_stride, gpu_idx, stream); - // move output ciphertext into gpu - void *out_gpu = memcpy_async_ct_to_gpu(out_allocated, out_aligned, out_offset, - out_size, out_stride, gpu_idx, stream); - // construct LUT GLWE ciphertext + // Move the input and output batch of ciphertexts to the GPU + // TODO: The allocation should be done by the compiler codegen + void *ct0_gpu = alloc_and_memcpy_async_to_gpu( + ct0_aligned, ct0_offset, ct0_batch_size, gpu_idx, stream); + void *out_gpu = alloc_and_memcpy_async_to_gpu( + out_aligned, out_offset, out_batch_size, gpu_idx, stream); + + // Construct the glwe accumulator (on CPU) + // TODO: Should be done outside of the bootstrap call, at compile time if + // possible. Refactoring in progress uint64_t glwe_ct_len = poly_size * (glwe_dim + 1); uint64_t glwe_ct_size = glwe_ct_len * sizeof(uint64_t); uint64_t *glwe_ct = (uint64_t *)malloc(glwe_ct_size); @@ -162,35 +224,35 @@ void memref_bootstrap_lwe_cuda_u64( default_engine_discard_trivially_encrypt_glwe_ciphertext_u64_raw_ptr_buffers( get_levelled_engine(), glwe_ct, glwe_ct_len, expanded_tabulated_function_array.data(), poly_size)); - // move test vector into gpu - void *test_vector_gpu = - cuda_malloc(poly_size * (glwe_dim + 1) * sizeof(uint64_t), gpu_idx); - cuda_memcpy_async_to_gpu(test_vector_gpu, (void *)glwe_ct, glwe_ct_size, - stream, gpu_idx); - // free LUT ciphertext (CPU) + + // Move the glwe accumulator to the GPU + void *glwe_ct_gpu = + alloc_and_memcpy_async_to_gpu(glwe_ct, 0, glwe_ct_len, gpu_idx, stream); + + // Free the glwe accumulator (on CPU) free(glwe_ct); - // move test vector indexes into gpu - uint32_t num_samples = 1, num_test_vectors = 1, lwe_idx = 0; - void *test_vector_idxes = malloc(num_samples * sizeof(uint32_t)); - ((uint32_t *)test_vector_idxes)[0] = 0; - void *test_vector_idxes_gpu = - cuda_malloc(num_samples * sizeof(uint32_t), gpu_idx); + + // Move the test vector indexes to the GPU; they are all set to 0 + uint32_t num_test_vectors = 1, lwe_idx = 0, + test_vector_idxes_size = num_samples * sizeof(uint32_t); + void *test_vector_idxes = malloc(test_vector_idxes_size); + memset(test_vector_idxes, 0, test_vector_idxes_size); + void *test_vector_idxes_gpu = cuda_malloc(test_vector_idxes_size, gpu_idx); cuda_memcpy_async_to_gpu(test_vector_idxes_gpu, test_vector_idxes, - num_samples * sizeof(uint32_t), stream, gpu_idx); - // run gpu bootstrap + test_vector_idxes_size, stream, gpu_idx); + // Run the bootstrap kernel on the GPU cuda_bootstrap_amortized_lwe_ciphertext_vector_64( - stream, out_gpu, test_vector_gpu, 
test_vector_idxes_gpu, ct0_gpu, - fbsk_gpu, input_lwe_dim, glwe_dim, poly_size, base_log, level, - num_samples, num_test_vectors, lwe_idx, - cuda_get_max_shared_memory(gpu_idx)); - // copy output ciphertext back to cpu - memcpy_async_ct_to_cpu(out_allocated, out_aligned, out_offset, out_size, - out_stride, out_gpu, out_size, gpu_idx, stream); + stream, out_gpu, glwe_ct_gpu, test_vector_idxes_gpu, ct0_gpu, fbsk_gpu, + input_lwe_dim, glwe_dim, poly_size, base_log, level, num_samples, + num_test_vectors, lwe_idx, cuda_get_max_shared_memory(gpu_idx)); + // Copy the output batch of ciphertexts back to CPU + memcpy_async_to_cpu(out_aligned, out_offset, out_batch_size, out_gpu, + gpu_idx, stream); cuda_synchronize_device(gpu_idx); // free memory that we allocated on gpu cuda_drop(ct0_gpu, gpu_idx); cuda_drop(out_gpu, gpu_idx); - cuda_drop(test_vector_gpu, gpu_idx); + cuda_drop(glwe_ct_gpu, gpu_idx); cuda_drop(test_vector_idxes_gpu, gpu_idx); cuda_destroy_stream(stream, gpu_idx);
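Illustrative usage sketch (hypothetical caller, not part of the patch): the batched entry points take each batch of LWE ciphertexts as a rank-2 memref expanded into (allocated, aligned, offset, size0, size1, stride0, stride1) arguments, and the single-ciphertext wrappers above reuse them by viewing a rank-1 memref as a batch of size 1. The wrapper name, parameter order, and size asserts below come from this patch; the helper keyswitch_lwe_batch, the contiguous row-major layout, and the assumption that the caller already holds a RuntimeContext with the keyswitching key are made up for the example.

#include <cstdint>
#include <vector>

#include "concretelang/Runtime/wrappers.h"

// Hypothetical caller: keyswitch `num_samples` LWE ciphertexts stored
// contiguously, one ciphertext of `input_lwe_dim + 1` words per row, into an
// output batch of `output_lwe_dim + 1` words per row.
void keyswitch_lwe_batch(std::vector<uint64_t> &in, std::vector<uint64_t> &out,
                         uint64_t num_samples, uint32_t input_lwe_dim,
                         uint32_t output_lwe_dim, uint32_t level,
                         uint32_t base_log,
                         mlir::concretelang::RuntimeContext *context) {
  uint64_t in_width = input_lwe_dim + 1;   // checked: ct0_size1 == input_lwe_dim + 1
  uint64_t out_width = output_lwe_dim + 1; // checked: out_size1 == output_lwe_dim + 1
  out.resize(num_samples * out_width);
  memref_batched_keyswitch_lwe_cuda_u64(
      // Output batch as a rank-2 memref: allocated, aligned, offset,
      // sizes (num_samples x out_width), strides (row-major, contiguous)
      out.data(), out.data(), 0, num_samples, out_width, out_width, 1,
      // Input batch as a rank-2 memref, same layout convention
      in.data(), in.data(), 0, num_samples, in_width, in_width, 1,
      // Keyswitch parameters and the runtime context holding the keys
      level, base_log, input_lwe_dim, output_lwe_dim, context);
}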