feat(compiler): Handle batched operators for gpu codegen

Quentin Bourgerie
2022-11-29 14:36:24 +01:00
parent 312c9281eb
commit 3c616af622
3 changed files with 194 additions and 206 deletions

View File

@@ -149,33 +149,24 @@ void memref_copy_one_rank(uint64_t *src_allocated, uint64_t *src_aligned,
uint64_t *dst_aligned, uint64_t dst_offset,
uint64_t dst_size, uint64_t dst_stride);
// Single ciphertext CUDA functions ///////////////////////////////////////////
/// \brief Run Keyswitch on GPU.
///
/// It handles copying the arguments from CPU to GPU and freeing the GPU
/// memory.
void memref_keyswitch_lwe_cuda_u64(
uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset,
uint64_t out_size, uint64_t out_stride, uint64_t *ct0_allocated,
uint64_t *ct0_aligned, uint64_t ct0_offset, uint64_t ct0_size,
uint64_t ct0_stride, uint32_t level, uint32_t base_log,
uint32_t input_lwe_dim, uint32_t output_lwe_dim,
mlir::concretelang::RuntimeContext *context);
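For reference, a minimal caller-side sketch (not part of this commit) of how a contiguous ciphertext buffer maps onto the memref-style arguments of memref_keyswitch_lwe_cuda_u64. The std::vector storage, the helper name and the already-initialized context are assumptions for illustration; only the parameter order follows the declaration above.

#include <cstdint>
#include <vector>

void keyswitch_one_ct(std::vector<uint64_t> &out, std::vector<uint64_t> &ct0,
                      uint32_t level, uint32_t base_log,
                      uint32_t input_lwe_dim, uint32_t output_lwe_dim,
                      mlir::concretelang::RuntimeContext *context) {
  // A contiguous buffer maps to a 1D memref descriptor:
  // allocated == aligned == data pointer, offset 0, stride 1.
  memref_keyswitch_lwe_cuda_u64(
      out.data(), out.data(), /*out_offset=*/0, out.size(), /*out_stride=*/1,
      ct0.data(), ct0.data(), /*ct0_offset=*/0, ct0.size(), /*ct0_stride=*/1,
      level, base_log, input_lwe_dim, output_lwe_dim, context);
}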
/// \brief Run bootstrapping on GPU.
///
/// It handles copying the arguments from CPU to GPU and freeing the GPU
/// memory.
///
/// \param out_allocated
/// \param out_aligned
/// \param out_offset
/// \param out_size
/// \param out_stride
/// \param ct0_allocated
/// \param ct0_aligned
/// \param ct0_offset
/// \param ct0_size
/// \param ct0_stride
/// \param tlu_allocated
/// \param tlu_aligned
/// \param tlu_offset
/// \param tlu_size
/// \param tlu_stride
/// \param input_lwe_dim LWE input dimension
/// \param poly_size polynomial size
/// \param level level
/// \param base_log base log
/// \param glwe_dim
/// \param precision
/// \param context
void memref_bootstrap_lwe_cuda_u64(
uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset,
uint64_t out_size, uint64_t out_stride, uint64_t *ct0_allocated,
@@ -186,107 +177,26 @@ void memref_bootstrap_lwe_cuda_u64(
uint32_t base_log, uint32_t glwe_dim, uint32_t precision,
mlir::concretelang::RuntimeContext *context);
/// \brief Run Keyswitch on GPU.
///
/// It handles copying the arguments from CPU to GPU and freeing the GPU
/// memory.
///
/// \param out_allocated
/// \param out_aligned
/// \param out_offset
/// \param out_size
/// \param out_stride
/// \param ct0_allocated
/// \param ct0_aligned
/// \param ct0_offset
/// \param ct0_size
/// \param ct0_stride
/// \param level
/// \param base_log
/// \param input_lwe_dim LWE input dimension
/// \param output_lwe_dim LWE output dimension
/// \param context
void memref_keyswitch_lwe_cuda_u64(
// Batched CUDA functions /////////////////////////////////////////////////////
void memref_batched_keyswitch_lwe_cuda_u64(
uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset,
uint64_t out_size, uint64_t out_stride, uint64_t *ct0_allocated,
uint64_t *ct0_aligned, uint64_t ct0_offset, uint64_t ct0_size,
uint64_t ct0_stride, uint32_t level, uint32_t base_log,
uint32_t input_lwe_dim, uint32_t output_lwe_dim,
uint64_t out_size0, uint64_t out_size1, uint64_t out_stride0,
uint64_t out_stride1, uint64_t *ct0_allocated, uint64_t *ct0_aligned,
uint64_t ct0_offset, uint64_t ct0_size0, uint64_t ct0_size1,
uint64_t ct0_stride0, uint64_t ct0_stride1, uint32_t level,
uint32_t base_log, uint32_t input_lwe_dim, uint32_t output_lwe_dim,
mlir::concretelang::RuntimeContext *context);
/// \brief Copy ciphertext from CPU to GPU.
///
/// It handles memory allocation on GPU.
///
/// \param ct_allocated
/// \param ct_aligned
/// \param ct_offset
/// \param ct_size
/// \param ct_stride
/// \param gpu_idx index of the GPU to use
/// \param stream cuda stream to use for the copy
/// \return void* pointer to the GPU ciphertext
void *memcpy_async_ct_to_gpu(uint64_t *ct_allocated, uint64_t *ct_aligned,
uint64_t ct_offset, uint64_t ct_size,
uint64_t ct_stride, uint32_t gpu_idx,
void *stream);
/// \brief Copy ciphertext from GPU to CPU.
///
/// Memory on GPU won't be freed after the copy.
///
/// \param out_allocated
/// \param out_aligned
/// \param out_offset
/// \param out_size
/// \param out_stride
/// \param ct_gpu
/// \param size
/// \param gpu_idx index of the GPU to use
/// \param stream cuda stream to use for the copy
void memcpy_async_ct_to_cpu(uint64_t *out_allocated, uint64_t *out_aligned,
uint64_t out_offset, uint64_t out_size,
uint64_t out_stride, void *ct_gpu, size_t size,
uint32_t gpu_idx, void *stream);
/// \brief Copy bootstrapping key from CPU to GPU.
///
/// It handles memory allocation on GPU, as well as conversion to the Fourier
/// domain.
///
/// \param context
/// \param input_lwe_dim
/// \param poly_size
/// \param level
/// \param glwe_dim
/// \param gpu_idx index of the GPU to use
/// \param stream cuda stream to use for the copy
/// \return void* pointer to the GPU bsk
void *memcpy_async_bsk_to_gpu(mlir::concretelang::RuntimeContext *context,
uint32_t input_lwe_dim, uint32_t poly_size,
uint32_t level, uint32_t glwe_dim,
uint32_t gpu_idx, void *stream);
/// \brief Copy keyswitching key from CPU to GPU.
///
/// It handles memory allocation on GPU.
///
/// \param context
/// \param level
/// \param input_lwe_dim
/// \param output_lwe_dim
/// \param gpu_idx index of the GPU to use
/// \param stream cuda stream to use for the copy
/// \return void* pointer to the GPU ksk
void *memcpy_async_ksk_to_gpu(mlir::concretelang::RuntimeContext *context,
uint32_t level, uint32_t input_lwe_dim,
uint32_t output_lwe_dim, uint32_t gpu_idx,
void *stream);
/// \brief Free gpu memory.
///
/// \param gpu_ptr pointer to the GPU memory to free
/// \param gpu_idx index of the GPU to use
void free_from_gpu(void *gpu_ptr, uint32_t gpu_idx);
void memref_batched_bootstrap_lwe_cuda_u64(
uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset,
uint64_t out_size0, uint64_t out_size1, uint64_t out_stride0,
uint64_t out_stride1, uint64_t *ct0_allocated, uint64_t *ct0_aligned,
uint64_t ct0_offset, uint64_t ct0_size0, uint64_t ct0_size1,
uint64_t ct0_stride0, uint64_t ct0_stride1, uint64_t *tlu_allocated,
uint64_t *tlu_aligned, uint64_t tlu_offset, uint64_t tlu_size,
uint64_t tlu_stride, uint32_t input_lwe_dim, uint32_t poly_size,
uint32_t level, uint32_t base_log, uint32_t glwe_dim, uint32_t precision,
mlir::concretelang::RuntimeContext *context);
}
#endif
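The batched entry points receive each 2D memref as expanded scalar arguments (allocated and aligned pointers, offset, size0/size1, stride0/stride1). Below is a minimal sketch (not part of this commit) of a caller holding a batch of ciphertexts row-major, one ciphertext per row; the std::vector storage and helper name are assumptions, and the sizes follow the checks in the runtime implementation (out_size1 == output_lwe_dim + 1, ct0_size1 == input_lwe_dim + 1).

#include <cstdint>
#include <vector>

// num_cts ciphertexts of (input_lwe_dim + 1) words each, stored contiguously.
void keyswitch_batch(std::vector<uint64_t> &out, std::vector<uint64_t> &cts,
                     uint64_t num_cts, uint32_t level, uint32_t base_log,
                     uint32_t input_lwe_dim, uint32_t output_lwe_dim,
                     mlir::concretelang::RuntimeContext *context) {
  uint64_t in_width = input_lwe_dim + 1;   // ct0_size1
  uint64_t out_width = output_lwe_dim + 1; // out_size1
  memref_batched_keyswitch_lwe_cuda_u64(
      out.data(), out.data(), /*out_offset=*/0,
      /*out_size0=*/num_cts, /*out_size1=*/out_width,
      /*out_stride0=*/out_width, /*out_stride1=*/1,
      cts.data(), cts.data(), /*ct0_offset=*/0,
      /*ct0_size0=*/num_cts, /*ct0_size1=*/in_width,
      /*ct0_stride0=*/in_width, /*ct0_stride1=*/1,
      level, base_log, input_lwe_dim, output_lwe_dim, context);
}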

View File

@@ -34,6 +34,10 @@ char memref_bootstrap_async_lwe_u64[] = "memref_bootstrap_async_lwe_u64";
char memref_await_future[] = "memref_await_future";
char memref_keyswitch_lwe_cuda_u64[] = "memref_keyswitch_lwe_cuda_u64";
char memref_bootstrap_lwe_cuda_u64[] = "memref_bootstrap_lwe_cuda_u64";
char memref_batched_keyswitch_lwe_cuda_u64[] =
"memref_batched_keyswitch_lwe_cuda_u64";
char memref_batched_bootstrap_lwe_cuda_u64[] =
"memref_batched_bootstrap_lwe_cuda_u64";
char memref_expand_lut_in_trivial_glwe_ct_u64[] =
"memref_expand_lut_in_trivial_glwe_ct_u64";
@@ -116,12 +120,14 @@ mlir::LogicalResult insertForwardDeclarationOfTheCAPI(
memref1DType, i32Type, i32Type, i32Type,
i32Type, i32Type, i32Type, contextType},
{futureType});
} else if (funcName == memref_batched_keyswitch_lwe_u64) {
} else if (funcName == memref_batched_keyswitch_lwe_u64 ||
funcName == memref_batched_keyswitch_lwe_cuda_u64) {
funcType = mlir::FunctionType::get(rewriter.getContext(),
{memref2DType, memref2DType, i32Type,
i32Type, i32Type, i32Type, contextType},
{});
} else if (funcName == memref_batched_bootstrap_lwe_u64) {
} else if (funcName == memref_batched_bootstrap_lwe_u64 ||
funcName == memref_batched_bootstrap_lwe_cuda_u64) {
funcType = mlir::FunctionType::get(rewriter.getContext(),
{memref2DType, memref2DType,
memref1DType, i32Type, i32Type, i32Type,
@@ -335,6 +341,16 @@ struct BConcreteToCAPIPass : public BConcreteToCAPIBase<BConcreteToCAPIPass> {
patterns.add<BConcreteToCAPICallPattern<BConcrete::BootstrapLweBufferOp,
memref_bootstrap_lwe_cuda_u64>>(
&getContext(), bootstrapAddOperands<BConcrete::BootstrapLweBufferOp>);
patterns.add<
BConcreteToCAPICallPattern<BConcrete::BatchedKeySwitchLweBufferOp,
memref_batched_keyswitch_lwe_cuda_u64>>(
&getContext(),
keyswitchAddOperands<BConcrete::BatchedKeySwitchLweBufferOp>);
patterns.add<
BConcreteToCAPICallPattern<BConcrete::BatchedBootstrapLweBufferOp,
memref_batched_bootstrap_lwe_cuda_u64>>(
&getContext(),
bootstrapAddOperands<BConcrete::BatchedBootstrapLweBufferOp>);
} else {
patterns.add<BConcreteToCAPICallPattern<BConcrete::KeySwitchLweBufferOp,
memref_keyswitch_lwe_u64>>(

View File

@@ -56,50 +56,7 @@ void encode_and_expand_lut(uint64_t *output, size_t output_size,
#ifdef CONCRETELANG_CUDA_SUPPORT
void memref_keyswitch_lwe_cuda_u64(
uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset,
uint64_t out_size, uint64_t out_stride, uint64_t *ct0_allocated,
uint64_t *ct0_aligned, uint64_t ct0_offset, uint64_t ct0_size,
uint64_t ct0_stride, uint32_t level, uint32_t base_log,
uint32_t input_lwe_dim, uint32_t output_lwe_dim,
mlir::concretelang::RuntimeContext *context) {
// we currently just use the first GPU, but this should be decided
// dynamically, or during compilation, in the future
uint32_t gpu_idx = 0;
uint32_t num_samples = 1;
void *stream = cuda_create_stream(gpu_idx);
// move input ciphertext into gpu
void *ct0_gpu = memcpy_async_ct_to_gpu(ct0_allocated, ct0_aligned, ct0_offset,
ct0_size, ct0_stride, gpu_idx, stream);
// move output ciphertext into gpu
void *out_gpu = memcpy_async_ct_to_gpu(out_allocated, out_aligned, out_offset,
out_size, out_stride, gpu_idx, stream);
void *ksk_gpu = memcpy_async_ksk_to_gpu(context, level, input_lwe_dim,
output_lwe_dim, gpu_idx, stream);
cuda_keyswitch_lwe_ciphertext_vector_64(stream, out_gpu, ct0_gpu, ksk_gpu,
input_lwe_dim, output_lwe_dim,
base_log, level, num_samples);
// copy output ciphertext back to cpu
memcpy_async_ct_to_cpu(out_allocated, out_aligned, out_offset, out_size,
out_stride, out_gpu, out_size, gpu_idx, stream);
cuda_synchronize_device(gpu_idx);
// free memory that we allocated on gpu
cuda_drop(ct0_gpu, gpu_idx);
cuda_drop(out_gpu, gpu_idx);
cuda_drop(ksk_gpu, gpu_idx);
cuda_destroy_stream(stream, gpu_idx);
}
void *memcpy_async_ct_to_gpu(uint64_t *ct_allocated, uint64_t *ct_aligned,
uint64_t ct_offset, uint64_t ct_size,
uint64_t ct_stride, uint32_t gpu_idx,
void *stream) {
size_t buf_size = ct_size * sizeof(uint64_t);
void *ct_gpu = cuda_malloc(buf_size, gpu_idx);
cuda_memcpy_async_to_gpu(ct_gpu, ct_aligned + ct_offset, buf_size, stream,
gpu_idx);
return ct_gpu;
}
// CUDA memory utils function /////////////////////////////////////////////////
void *memcpy_async_bsk_to_gpu(mlir::concretelang::RuntimeContext *context,
uint32_t input_lwe_dim, uint32_t poly_size,
@@ -117,18 +74,47 @@ void *memcpy_async_ksk_to_gpu(mlir::concretelang::RuntimeContext *context,
stream);
}
void memcpy_async_ct_to_cpu(uint64_t *out_allocated, uint64_t *out_aligned,
uint64_t out_offset, uint64_t out_size,
uint64_t out_stride, void *ct_gpu, size_t size,
uint32_t gpu_idx, void *stream) {
cuda_memcpy_async_to_cpu(out_aligned + out_offset, ct_gpu,
size * sizeof(uint64_t), stream, gpu_idx);
void *alloc_and_memcpy_async_to_gpu(uint64_t *buf_ptr, uint64_t buf_offset,
uint64_t buf_size, uint32_t gpu_idx,
void *stream) {
size_t buf_size_ = buf_size * sizeof(uint64_t);
void *ct_gpu = cuda_malloc(buf_size_, gpu_idx);
cuda_memcpy_async_to_gpu(ct_gpu, buf_ptr + buf_offset, buf_size_, stream,
gpu_idx);
return ct_gpu;
}
void memcpy_async_to_cpu(uint64_t *buf_ptr, uint64_t buf_offset,
uint64_t buf_size, void *buf_gpu, uint32_t gpu_idx,
void *stream) {
cuda_memcpy_async_to_cpu(buf_ptr + buf_offset, buf_gpu,
buf_size * sizeof(uint64_t), stream, gpu_idx);
}
void free_from_gpu(void *gpu_ptr, uint32_t gpu_idx = 0) {
cuda_drop(gpu_ptr, gpu_idx);
}
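A standalone sketch (not part of this commit) of the round trip implemented by the helpers above: allocate and copy a host buffer to the GPU, copy it back, synchronize, and free. The function name and std::vector storage are assumptions for illustration; the calls mirror how the batched operators below use these helpers.

#include <cstdint>
#include <vector>

void gpu_roundtrip_example(std::vector<uint64_t> &buf) {
  uint32_t gpu_idx = 0;
  void *stream = cuda_create_stream(gpu_idx);
  // Allocate device memory and start the asynchronous host-to-device copy
  void *buf_gpu = alloc_and_memcpy_async_to_gpu(buf.data(), /*buf_offset=*/0,
                                                buf.size(), gpu_idx, stream);
  // ... a kernel operating on buf_gpu would be launched here ...
  // Start the asynchronous device-to-host copy, then wait for completion
  memcpy_async_to_cpu(buf.data(), /*buf_offset=*/0, buf.size(), buf_gpu,
                      gpu_idx, stream);
  cuda_synchronize_device(gpu_idx);
  free_from_gpu(buf_gpu, gpu_idx);
  cuda_destroy_stream(stream, gpu_idx);
}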
// Single ciphertext CUDA functions ///////////////////////////////////////////
void memref_keyswitch_lwe_cuda_u64(
uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset,
uint64_t out_size, uint64_t out_stride, uint64_t *ct0_allocated,
uint64_t *ct0_aligned, uint64_t ct0_offset, uint64_t ct0_size,
uint64_t ct0_stride, uint32_t level, uint32_t base_log,
uint32_t input_lwe_dim, uint32_t output_lwe_dim,
mlir::concretelang::RuntimeContext *context) {
assert(out_stride == 1);
assert(ct0_stride == 1);
memref_batched_keyswitch_lwe_cuda_u64(
// Output 1D memref as 2D memref
out_allocated, out_aligned, out_offset, 1, out_size, out_size, out_stride,
// Input 1D memref as 2D memref
ct0_allocated, ct0_aligned, ct0_offset, 1, ct0_size, ct0_size, ct0_stride,
// Keyswitch additional arguments
level, base_log, input_lwe_dim, output_lwe_dim, context);
}
void memref_bootstrap_lwe_cuda_u64(
uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset,
uint64_t out_size, uint64_t out_stride, uint64_t *ct0_allocated,
@@ -138,20 +124,96 @@ void memref_bootstrap_lwe_cuda_u64(
uint32_t input_lwe_dim, uint32_t poly_size, uint32_t level,
uint32_t base_log, uint32_t glwe_dim, uint32_t precision,
mlir::concretelang::RuntimeContext *context) {
// we currently just use the first GPU, but this should be decided
// dynamically, or during compilation, in the future
memref_batched_bootstrap_lwe_cuda_u64(
// Output 1D memref as 2D memref
out_allocated, out_aligned, out_offset, 1, out_size, out_size, out_stride,
// Input 1D memref as 2D memref
ct0_allocated, ct0_aligned, ct0_offset, 1, ct0_size, ct0_size, ct0_stride,
// Table lookup memref
tlu_allocated, tlu_aligned, tlu_offset, tlu_size, tlu_stride,
// Bootstrap additional arguments
input_lwe_dim, poly_size, level, base_log, glwe_dim, precision, context);
}
// Batched CUDA functions /////////////////////////////////////////////////////
void memref_batched_keyswitch_lwe_cuda_u64(
uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset,
uint64_t out_size0, uint64_t out_size1, uint64_t out_stride0,
uint64_t out_stride1, uint64_t *ct0_allocated, uint64_t *ct0_aligned,
uint64_t ct0_offset, uint64_t ct0_size0, uint64_t ct0_size1,
uint64_t ct0_stride0, uint64_t ct0_stride1, uint32_t level,
uint32_t base_log, uint32_t input_lwe_dim, uint32_t output_lwe_dim,
mlir::concretelang::RuntimeContext *context) {
assert(out_size0 == ct0_size0);
assert(out_size1 == output_lwe_dim + 1);
assert(ct0_size1 == input_lwe_dim + 1);
// TODO: Multi GPU
uint32_t gpu_idx = 0;
uint32_t num_samples = out_size0;
uint64_t ct0_batch_size = ct0_size0 * ct0_size1;
uint64_t out_batch_size = out_size0 * out_size1;
// Create the cuda stream
// TODO: Should be created by the compiler codegen
void *stream = cuda_create_stream(gpu_idx);
// Get the pointer to the keyswitching key on the GPU
void *ksk_gpu = memcpy_async_ksk_to_gpu(context, level, input_lwe_dim,
output_lwe_dim, gpu_idx, stream);
// Move the input and output batch of ciphertexts to the GPU
// TODO: The allocation should be done by the compiler codegen
void *ct0_gpu = alloc_and_memcpy_async_to_gpu(
ct0_aligned, ct0_offset, ct0_batch_size, gpu_idx, stream);
void *out_gpu = alloc_and_memcpy_async_to_gpu(
out_aligned, out_offset, out_batch_size, gpu_idx, stream);
// Run the keyswitch kernel on the GPU
cuda_keyswitch_lwe_ciphertext_vector_64(stream, out_gpu, ct0_gpu, ksk_gpu,
input_lwe_dim, output_lwe_dim,
base_log, level, num_samples);
// Copy the output batch of ciphertexts back to CPU
memcpy_async_to_cpu(out_aligned, out_offset, out_batch_size, out_gpu, gpu_idx,
stream);
cuda_synchronize_device(gpu_idx);
// free memory that we allocated on gpu
cuda_drop(ct0_gpu, gpu_idx);
cuda_drop(out_gpu, gpu_idx);
cuda_drop(ksk_gpu, gpu_idx);
cuda_destroy_stream(stream, gpu_idx);
}
void memref_batched_bootstrap_lwe_cuda_u64(
uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset,
uint64_t out_size0, uint64_t out_size1, uint64_t out_stride0,
uint64_t out_stride1, uint64_t *ct0_allocated, uint64_t *ct0_aligned,
uint64_t ct0_offset, uint64_t ct0_size0, uint64_t ct0_size1,
uint64_t ct0_stride0, uint64_t ct0_stride1, uint64_t *tlu_allocated,
uint64_t *tlu_aligned, uint64_t tlu_offset, uint64_t tlu_size,
uint64_t tlu_stride, uint32_t input_lwe_dim, uint32_t poly_size,
uint32_t level, uint32_t base_log, uint32_t glwe_dim, uint32_t precision,
mlir::concretelang::RuntimeContext *context) {
assert(out_size0 == ct0_size0);
// TODO: Multi GPU
uint32_t gpu_idx = 0;
uint32_t num_samples = out_size0;
uint64_t ct0_batch_size = ct0_size0 * ct0_size1;
uint64_t out_batch_size = out_size0 * out_size1;
// Create the cuda stream
// TODO: Should be created by the compiler codegen
void *stream = cuda_create_stream(gpu_idx);
// Get the pointer to the bootstrapping key on the GPU
void *fbsk_gpu = memcpy_async_bsk_to_gpu(context, input_lwe_dim, poly_size,
level, glwe_dim, gpu_idx, stream);
// move input ciphertext into gpu
void *ct0_gpu = memcpy_async_ct_to_gpu(ct0_allocated, ct0_aligned, ct0_offset,
ct0_size, ct0_stride, gpu_idx, stream);
// move output ciphertext into gpu
void *out_gpu = memcpy_async_ct_to_gpu(out_allocated, out_aligned, out_offset,
out_size, out_stride, gpu_idx, stream);
// construct LUT GLWE ciphertext
// Move the input and output batch of ciphertexts to the GPU
// TODO: The allocation should be done by the compiler codegen
void *ct0_gpu = alloc_and_memcpy_async_to_gpu(
ct0_aligned, ct0_offset, ct0_batch_size, gpu_idx, stream);
void *out_gpu = alloc_and_memcpy_async_to_gpu(
out_aligned, out_offset, out_batch_size, gpu_idx, stream);
// Construct the glwe accumulator (on CPU)
// TODO: Should be done outside of the bootstrap call, at compile time if
// possible. Refactoring in progress
uint64_t glwe_ct_len = poly_size * (glwe_dim + 1);
uint64_t glwe_ct_size = glwe_ct_len * sizeof(uint64_t);
uint64_t *glwe_ct = (uint64_t *)malloc(glwe_ct_size);
@@ -162,35 +224,35 @@ void memref_bootstrap_lwe_cuda_u64(
default_engine_discard_trivially_encrypt_glwe_ciphertext_u64_raw_ptr_buffers(
get_levelled_engine(), glwe_ct, glwe_ct_len,
expanded_tabulated_function_array.data(), poly_size));
// move test vector into gpu
void *test_vector_gpu =
cuda_malloc(poly_size * (glwe_dim + 1) * sizeof(uint64_t), gpu_idx);
cuda_memcpy_async_to_gpu(test_vector_gpu, (void *)glwe_ct, glwe_ct_size,
stream, gpu_idx);
// free LUT ciphertext (CPU)
// Move the glwe accumulator to the GPU
void *glwe_ct_gpu =
alloc_and_memcpy_async_to_gpu(glwe_ct, 0, glwe_ct_size, gpu_idx, stream);
// Free the glwe accumulator (on CPU)
free(glwe_ct);
// move test vector indexes into gpu
uint32_t num_samples = 1, num_test_vectors = 1, lwe_idx = 0;
void *test_vector_idxes = malloc(num_samples * sizeof(uint32_t));
((uint32_t *)test_vector_idxes)[0] = 0;
void *test_vector_idxes_gpu =
cuda_malloc(num_samples * sizeof(uint32_t), gpu_idx);
// Move the test vector indexes to the GPU; all indexes are set to 0
uint32_t num_test_vectors = 1, lwe_idx = 0,
test_vector_idxes_size = num_samples * sizeof(uint32_t);
void *test_vector_idxes = malloc(test_vector_idxes_size);
memset(test_vector_idxes, 0, test_vector_idxes_size);
void *test_vector_idxes_gpu = cuda_malloc(test_vector_idxes_size, gpu_idx);
cuda_memcpy_async_to_gpu(test_vector_idxes_gpu, test_vector_idxes,
num_samples * sizeof(uint32_t), stream, gpu_idx);
// run gpu bootstrap
test_vector_idxes_size, stream, gpu_idx);
// Run the bootstrap kernel on the GPU
cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
stream, out_gpu, test_vector_gpu, test_vector_idxes_gpu, ct0_gpu,
fbsk_gpu, input_lwe_dim, glwe_dim, poly_size, base_log, level,
num_samples, num_test_vectors, lwe_idx,
cuda_get_max_shared_memory(gpu_idx));
// copy output ciphertext back to cpu
memcpy_async_ct_to_cpu(out_allocated, out_aligned, out_offset, out_size,
out_stride, out_gpu, out_size, gpu_idx, stream);
stream, out_gpu, glwe_ct_gpu, test_vector_idxes_gpu, ct0_gpu, fbsk_gpu,
input_lwe_dim, glwe_dim, poly_size, base_log, level, num_samples,
num_test_vectors, lwe_idx, cuda_get_max_shared_memory(gpu_idx));
// Copy the output batch of ciphertexts back to CPU
memcpy_async_to_cpu(out_aligned, out_offset, out_batch_size, out_gpu,
gpu_idx, stream);
cuda_synchronize_device(gpu_idx);
// free memory that we allocated on gpu
cuda_drop(ct0_gpu, gpu_idx);
cuda_drop(out_gpu, gpu_idx);
cuda_drop(test_vector_gpu, gpu_idx);
cuda_drop(glwe_ct_gpu, gpu_idx);
cuda_drop(test_vector_idxes_gpu, gpu_idx);
cuda_destroy_stream(stream, gpu_idx);