From 3c616af622ab2b37e5f6dadc3dca15adaed77066 Mon Sep 17 00:00:00 2001 From: Quentin Bourgerie Date: Tue, 29 Nov 2022 14:36:24 +0100 Subject: [PATCH] feat(compiler): Handle batched operators for gpu codegen --- .../include/concretelang/Runtime/wrappers.h | 154 +++--------- .../BConcreteToCAPI/BConcreteToCAPI.cpp | 20 +- compiler/lib/Runtime/wrappers.cpp | 226 +++++++++++------- 3 files changed, 194 insertions(+), 206 deletions(-) diff --git a/compiler/include/concretelang/Runtime/wrappers.h b/compiler/include/concretelang/Runtime/wrappers.h index 8242b1307..8bf522368 100644 --- a/compiler/include/concretelang/Runtime/wrappers.h +++ b/compiler/include/concretelang/Runtime/wrappers.h @@ -149,33 +149,24 @@ void memref_copy_one_rank(uint64_t *src_allocated, uint64_t *src_aligned, uint64_t *dst_aligned, uint64_t dst_offset, uint64_t dst_size, uint64_t dst_stride); +// Single ciphertext CUDA functions /////////////////////////////////////////// + +/// \brief Run Keyswitch on GPU. +/// +/// It handles memory copy of the different arguments from CPU to GPU, and +/// freeing memory. +void memref_keyswitch_lwe_cuda_u64( + uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset, + uint64_t out_size, uint64_t out_stride, uint64_t *ct0_allocated, + uint64_t *ct0_aligned, uint64_t ct0_offset, uint64_t ct0_size, + uint64_t ct0_stride, uint32_t level, uint32_t base_log, + uint32_t input_lwe_dim, uint32_t output_lwe_dim, + mlir::concretelang::RuntimeContext *context); + /// \brief Run bootstrapping on GPU. /// /// It handles memory copy of the different arguments from CPU to GPU, and /// freeing memory. -/// -/// \param out_allocated -/// \param out_aligned -/// \param out_offset -/// \param out_size -/// \param out_stride -/// \param ct0_allocated -/// \param ct0_aligned -/// \param ct0_offset -/// \param ct0_size -/// \param ct0_stride -/// \param tlu_allocated -/// \param tlu_aligned -/// \param tlu_offset -/// \param tlu_size -/// \param tlu_stride -/// \param input_lwe_dim LWE input dimension -/// \param poly_size polynomial size -/// \param level level -/// \param base_log base log -/// \param glwe_dim -/// \param precision -/// \param context void memref_bootstrap_lwe_cuda_u64( uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset, uint64_t out_size, uint64_t out_stride, uint64_t *ct0_allocated, @@ -186,107 +177,26 @@ void memref_bootstrap_lwe_cuda_u64( uint32_t base_log, uint32_t glwe_dim, uint32_t precision, mlir::concretelang::RuntimeContext *context); -/// \brief Run Keyswitch on GPU. -/// -/// It handles memory copy of the different arguments from CPU to GPU, and -/// freeing memory. 
-/// -/// \param out_allocated -/// \param out_aligned -/// \param out_offset -/// \param out_size -/// \param out_stride -/// \param ct0_allocated -/// \param ct0_aligned -/// \param ct0_offset -/// \param ct0_size -/// \param ct0_stride -/// \param level -/// \param base_log -/// \param input_lwe_dim LWE input dimension -/// \param output_lwe_dim LWE output dimension -/// \param context -void memref_keyswitch_lwe_cuda_u64( +// Batched CUDA function ////////////////////////////////////////////////////// + +void memref_batched_keyswitch_lwe_cuda_u64( uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset, - uint64_t out_size, uint64_t out_stride, uint64_t *ct0_allocated, - uint64_t *ct0_aligned, uint64_t ct0_offset, uint64_t ct0_size, - uint64_t ct0_stride, uint32_t level, uint32_t base_log, - uint32_t input_lwe_dim, uint32_t output_lwe_dim, + uint64_t out_size0, uint64_t out_size1, uint64_t out_stride0, + uint64_t out_stride1, uint64_t *ct0_allocated, uint64_t *ct0_aligned, + uint64_t ct0_offset, uint64_t ct0_size0, uint64_t ct0_size1, + uint64_t ct0_stride0, uint64_t ct0_stride1, uint32_t level, + uint32_t base_log, uint32_t input_lwe_dim, uint32_t output_lwe_dim, mlir::concretelang::RuntimeContext *context); -/// \brief Copy ciphertext from CPU to GPU. -/// -/// It handles memory allocation on GPU. -/// -/// \param ct_allocated -/// \param ct_aligned -/// \param ct_offset -/// \param ct_size -/// \param ct_stride -/// \param gpu_idx index of the GPU to use -/// \param stream cuda stream to use for the copy -/// \return void* pointer to the GPU ciphertext -void *memcpy_async_ct_to_gpu(uint64_t *ct_allocated, uint64_t *ct_aligned, - uint64_t ct_offset, uint64_t ct_size, - uint64_t ct_stride, uint32_t gpu_idx, - void *stream); - -/// \brief Copy ciphertext from GPU to CPU. -/// -/// Memory on GPU won't be freed after the copy. -/// -/// \param out_allocated -/// \param out_aligned -/// \param out_offset -/// \param out_size -/// \param out_stride -/// \param ct_gpu -/// \param size -/// \param gpu_idx index of the GPU to use -/// \param stream cuda stream to use for the copy -void memcpy_async_ct_to_cpu(uint64_t *out_allocated, uint64_t *out_aligned, - uint64_t out_offset, uint64_t out_size, - uint64_t out_stride, void *ct_gpu, size_t size, - uint32_t gpu_idx, void *stream); - -/// \brief Copy bootstrapping key from CPU to GPU. -/// -/// It handles memory allocation on GPU, as well as conversion to the Fourier -/// domain. -/// -/// \param context -/// \param input_lwe_dim -/// \param poly_size -/// \param level -/// \param glwe_dim -/// \param gpu_idx index of the GPU to use -/// \param stream cuda stream to use for the copy -/// \return void* pointer to the GPU bsk -void *memcpy_async_bsk_to_gpu(mlir::concretelang::RuntimeContext *context, - uint32_t input_lwe_dim, uint32_t poly_size, - uint32_t level, uint32_t glwe_dim, - uint32_t gpu_idx, void *stream); - -/// \brief Copy keyswitching key from CPU to GPU. -/// -/// It handles memory allocation on GPU. -/// -/// \param context -/// \param level -/// \param input_lwe_dim -/// \param output_lwe_dim -/// \param gpu_idx index of the GPU to use -/// \param stream cuda stream to use for the copy -/// \return void* pointer to the GPU ksk -void *memcpy_async_ksk_to_gpu(mlir::concretelang::RuntimeContext *context, - uint32_t level, uint32_t input_lwe_dim, - uint32_t output_lwe_dim, uint32_t gpu_idx, - void *stream); - -/// \brief Free gpu memory. 
-/// -/// \param gpu_ptr pointer to the GPU memory to free -/// \param gpu_idx index of the GPU to use -void free_from_gpu(void *gpu_ptr, uint32_t gpu_idx); +void memref_batched_bootstrap_lwe_cuda_u64( + uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset, + uint64_t out_size0, uint64_t out_size1, uint64_t out_stride0, + uint64_t out_stride1, uint64_t *ct0_allocated, uint64_t *ct0_aligned, + uint64_t ct0_offset, uint64_t ct0_size0, uint64_t ct0_size1, + uint64_t ct0_stride0, uint64_t ct0_stride1, uint64_t *tlu_allocated, + uint64_t *tlu_aligned, uint64_t tlu_offset, uint64_t tlu_size, + uint64_t tlu_stride, uint32_t input_lwe_dim, uint32_t poly_size, + uint32_t level, uint32_t base_log, uint32_t glwe_dim, uint32_t precision, + mlir::concretelang::RuntimeContext *context); } #endif diff --git a/compiler/lib/Conversion/BConcreteToCAPI/BConcreteToCAPI.cpp b/compiler/lib/Conversion/BConcreteToCAPI/BConcreteToCAPI.cpp index e940b238a..fa2d632af 100644 --- a/compiler/lib/Conversion/BConcreteToCAPI/BConcreteToCAPI.cpp +++ b/compiler/lib/Conversion/BConcreteToCAPI/BConcreteToCAPI.cpp @@ -34,6 +34,10 @@ char memref_bootstrap_async_lwe_u64[] = "memref_bootstrap_async_lwe_u64"; char memref_await_future[] = "memref_await_future"; char memref_keyswitch_lwe_cuda_u64[] = "memref_keyswitch_lwe_cuda_u64"; char memref_bootstrap_lwe_cuda_u64[] = "memref_bootstrap_lwe_cuda_u64"; +char memref_batched_keyswitch_lwe_cuda_u64[] = + "memref_batched_keyswitch_lwe_cuda_u64"; +char memref_batched_bootstrap_lwe_cuda_u64[] = + "memref_batched_bootstrap_lwe_cuda_u64"; char memref_expand_lut_in_trivial_glwe_ct_u64[] = "memref_expand_lut_in_trivial_glwe_ct_u64"; @@ -116,12 +120,14 @@ mlir::LogicalResult insertForwardDeclarationOfTheCAPI( memref1DType, i32Type, i32Type, i32Type, i32Type, i32Type, i32Type, contextType}, {futureType}); - } else if (funcName == memref_batched_keyswitch_lwe_u64) { + } else if (funcName == memref_batched_keyswitch_lwe_u64 || + funcName == memref_batched_keyswitch_lwe_cuda_u64) { funcType = mlir::FunctionType::get(rewriter.getContext(), {memref2DType, memref2DType, i32Type, i32Type, i32Type, i32Type, contextType}, {}); - } else if (funcName == memref_batched_bootstrap_lwe_u64) { + } else if (funcName == memref_batched_bootstrap_lwe_u64 || + funcName == memref_batched_bootstrap_lwe_cuda_u64) { funcType = mlir::FunctionType::get(rewriter.getContext(), {memref2DType, memref2DType, memref1DType, i32Type, i32Type, i32Type, @@ -335,6 +341,16 @@ struct BConcreteToCAPIPass : public BConcreteToCAPIBase { patterns.add>( &getContext(), bootstrapAddOperands); + patterns.add< + BConcreteToCAPICallPattern>( + &getContext(), + keyswitchAddOperands); + patterns.add< + BConcreteToCAPICallPattern>( + &getContext(), + bootstrapAddOperands); } else { patterns.add>( diff --git a/compiler/lib/Runtime/wrappers.cpp b/compiler/lib/Runtime/wrappers.cpp index 1abc9138c..84d7f1173 100644 --- a/compiler/lib/Runtime/wrappers.cpp +++ b/compiler/lib/Runtime/wrappers.cpp @@ -56,50 +56,7 @@ void encode_and_expand_lut(uint64_t *output, size_t output_size, #ifdef CONCRETELANG_CUDA_SUPPORT -void memref_keyswitch_lwe_cuda_u64( - uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset, - uint64_t out_size, uint64_t out_stride, uint64_t *ct0_allocated, - uint64_t *ct0_aligned, uint64_t ct0_offset, uint64_t ct0_size, - uint64_t ct0_stride, uint32_t level, uint32_t base_log, - uint32_t input_lwe_dim, uint32_t output_lwe_dim, - mlir::concretelang::RuntimeContext *context) { - // we currently just use the first 
GPU, but this should be decided - dynamically, or during compilation, in the future - uint32_t gpu_idx = 0; - uint32_t num_samples = 1; - void *stream = cuda_create_stream(gpu_idx); - // move input ciphertext into gpu - void *ct0_gpu = memcpy_async_ct_to_gpu(ct0_allocated, ct0_aligned, ct0_offset, - ct0_size, ct0_stride, gpu_idx, stream); - // move output ciphertext into gpu - void *out_gpu = memcpy_async_ct_to_gpu(out_allocated, out_aligned, out_offset, - out_size, out_stride, gpu_idx, stream); - void *ksk_gpu = memcpy_async_ksk_to_gpu(context, level, input_lwe_dim, - output_lwe_dim, gpu_idx, stream); - cuda_keyswitch_lwe_ciphertext_vector_64(stream, out_gpu, ct0_gpu, ksk_gpu, - input_lwe_dim, output_lwe_dim, - base_log, level, num_samples); - // copy output ciphertext back to cpu - memcpy_async_ct_to_cpu(out_allocated, out_aligned, out_offset, out_size, - out_stride, out_gpu, out_size, gpu_idx, stream); - cuda_synchronize_device(gpu_idx); - // free memory that we allocated on gpu - cuda_drop(ct0_gpu, gpu_idx); - cuda_drop(out_gpu, gpu_idx); - cuda_drop(ksk_gpu, gpu_idx); - cuda_destroy_stream(stream, gpu_idx); -} - -void *memcpy_async_ct_to_gpu(uint64_t *ct_allocated, uint64_t *ct_aligned, - uint64_t ct_offset, uint64_t ct_size, - uint64_t ct_stride, uint32_t gpu_idx, - void *stream) { - size_t buf_size = ct_size * sizeof(uint64_t); - void *ct_gpu = cuda_malloc(buf_size, gpu_idx); - cuda_memcpy_async_to_gpu(ct_gpu, ct_aligned + ct_offset, buf_size, stream, - gpu_idx); - return ct_gpu; -} +// CUDA memory utils function ////////////////////////////////////////////////// void *memcpy_async_bsk_to_gpu(mlir::concretelang::RuntimeContext *context, uint32_t input_lwe_dim, uint32_t poly_size, @@ -117,18 +74,47 @@ void *memcpy_async_ksk_to_gpu(mlir::concretelang::RuntimeContext *context, stream); } -void memcpy_async_ct_to_cpu(uint64_t *out_allocated, uint64_t *out_aligned, - uint64_t out_offset, uint64_t out_size, - uint64_t out_stride, void *ct_gpu, size_t size, - uint32_t gpu_idx, void *stream) { - cuda_memcpy_async_to_cpu(out_aligned + out_offset, ct_gpu, - size * sizeof(uint64_t), stream, gpu_idx); +void *alloc_and_memcpy_async_to_gpu(uint64_t *buf_ptr, uint64_t buf_offset, + uint64_t buf_size, uint32_t gpu_idx, + void *stream) { + size_t buf_size_ = buf_size * sizeof(uint64_t); + void *ct_gpu = cuda_malloc(buf_size_, gpu_idx); + cuda_memcpy_async_to_gpu(ct_gpu, buf_ptr + buf_offset, buf_size_, stream, + gpu_idx); + return ct_gpu; +} + +void memcpy_async_to_cpu(uint64_t *buf_ptr, uint64_t buf_offset, + uint64_t buf_size, void *buf_gpu, uint32_t gpu_idx, + void *stream) { + cuda_memcpy_async_to_cpu(buf_ptr + buf_offset, buf_gpu, + buf_size * sizeof(uint64_t), stream, gpu_idx); } void free_from_gpu(void *gpu_ptr, uint32_t gpu_idx = 0) { cuda_drop(gpu_ptr, gpu_idx); } +// Single ciphertext CUDA functions /////////////////////////////////////////// + +void memref_keyswitch_lwe_cuda_u64( + uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset, + uint64_t out_size, uint64_t out_stride, uint64_t *ct0_allocated, + uint64_t *ct0_aligned, uint64_t ct0_offset, uint64_t ct0_size, + uint64_t ct0_stride, uint32_t level, uint32_t base_log, + uint32_t input_lwe_dim, uint32_t output_lwe_dim, + mlir::concretelang::RuntimeContext *context) { + assert(out_stride == 1); + assert(ct0_stride == 1); + memref_batched_keyswitch_lwe_cuda_u64( + // Output 1D memref as 2D memref + out_allocated, out_aligned, out_offset, 1, out_size, out_size, out_stride, + // Input 1D memref as 2D memref + 
ct0_allocated, ct0_aligned, ct0_offset, 1, ct0_size, ct0_size, ct0_stride, + // Keyswitch additional arguments + level, base_log, input_lwe_dim, output_lwe_dim, context); +} + void memref_bootstrap_lwe_cuda_u64( uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset, uint64_t out_size, uint64_t out_stride, uint64_t *ct0_allocated, @@ -138,20 +124,96 @@ void memref_bootstrap_lwe_cuda_u64( uint32_t input_lwe_dim, uint32_t poly_size, uint32_t level, uint32_t base_log, uint32_t glwe_dim, uint32_t precision, mlir::concretelang::RuntimeContext *context) { - // we currently just use the first GPU, but this should be decided - // dynamically, or during compilation, in the future + memref_batched_bootstrap_lwe_cuda_u64( + // Output 1D memref as 2D memref + out_allocated, out_aligned, out_offset, 1, out_size, out_size, out_stride, + // Input 1D memref as 2D memref + ct0_allocated, ct0_aligned, ct0_offset, 1, ct0_size, ct0_size, ct0_stride, + // Table lookup memref + tlu_allocated, tlu_aligned, tlu_offset, tlu_size, tlu_stride, + // Bootstrap additional arguments + input_lwe_dim, poly_size, level, base_log, glwe_dim, precision, context); +} + +// Batched CUDA function ////////////////////////////////////////////////////// + +void memref_batched_keyswitch_lwe_cuda_u64( + uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset, + uint64_t out_size0, uint64_t out_size1, uint64_t out_stride0, + uint64_t out_stride1, uint64_t *ct0_allocated, uint64_t *ct0_aligned, + uint64_t ct0_offset, uint64_t ct0_size0, uint64_t ct0_size1, + uint64_t ct0_stride0, uint64_t ct0_stride1, uint32_t level, + uint32_t base_log, uint32_t input_lwe_dim, uint32_t output_lwe_dim, + mlir::concretelang::RuntimeContext *context) { + assert(out_size0 == ct0_size0); + assert(out_size1 == output_lwe_dim+1); + assert(ct0_size1 == input_lwe_dim+1); + // TODO: Multi GPU uint32_t gpu_idx = 0; + uint32_t num_samples = out_size0; + uint64_t ct0_batch_size = ct0_size0 * ct0_size1; + uint64_t out_batch_size = out_size0 * out_size1; + + // Create the cuda stream + // TODO: Should be created by the compiler codegen void *stream = cuda_create_stream(gpu_idx); - // move bsk to gpu + // Get the pointer to the keyswitching key on the GPU + void *ksk_gpu = memcpy_async_ksk_to_gpu(context, level, input_lwe_dim, + output_lwe_dim, gpu_idx, stream); + // Move the input and output batch of ciphertexts to the GPU + // TODO: The allocation should be done by the compiler codegen + void *ct0_gpu = alloc_and_memcpy_async_to_gpu( + ct0_aligned, ct0_offset, ct0_batch_size, gpu_idx, stream); + void *out_gpu = alloc_and_memcpy_async_to_gpu( + out_aligned, out_offset, out_batch_size, gpu_idx, stream); + // Run the keyswitch kernel on the GPU + cuda_keyswitch_lwe_ciphertext_vector_64(stream, out_gpu, ct0_gpu, ksk_gpu, + input_lwe_dim, output_lwe_dim, + base_log, level, num_samples); + // Copy the output batch of ciphertexts back to CPU + memcpy_async_to_cpu(out_aligned, out_offset, out_batch_size, out_gpu, gpu_idx, + stream); + cuda_synchronize_device(gpu_idx); + // free memory that we allocated on gpu + cuda_drop(ct0_gpu, gpu_idx); + cuda_drop(out_gpu, gpu_idx); + cuda_drop(ksk_gpu, gpu_idx); + cuda_destroy_stream(stream, gpu_idx); +} + +void memref_batched_bootstrap_lwe_cuda_u64( + uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset, + uint64_t out_size0, uint64_t out_size1, uint64_t out_stride0, + uint64_t out_stride1, uint64_t *ct0_allocated, uint64_t *ct0_aligned, + uint64_t ct0_offset, uint64_t ct0_size0, uint64_t 
ct0_size1, + uint64_t ct0_stride0, uint64_t ct0_stride1, uint64_t *tlu_allocated, + uint64_t *tlu_aligned, uint64_t tlu_offset, uint64_t tlu_size, + uint64_t tlu_stride, uint32_t input_lwe_dim, uint32_t poly_size, + uint32_t level, uint32_t base_log, uint32_t glwe_dim, uint32_t precision, + mlir::concretelang::RuntimeContext *context) { + assert(out_size0 == ct0_size0); + // TODO: Multi GPU + uint32_t gpu_idx = 0; + uint32_t num_samples = out_size0; + uint64_t ct0_batch_size = ct0_size0 * ct0_size1; + uint64_t out_batch_size = out_size0 * out_size1; + + // Create the cuda stream + // TODO: Should be created by the compiler codegen + void *stream = cuda_create_stream(gpu_idx); + // Get the pointer to the bootstrapping key on the GPU void *fbsk_gpu = memcpy_async_bsk_to_gpu(context, input_lwe_dim, poly_size, level, glwe_dim, gpu_idx, stream); - // move input ciphertext into gpu - void *ct0_gpu = memcpy_async_ct_to_gpu(ct0_allocated, ct0_aligned, ct0_offset, - ct0_size, ct0_stride, gpu_idx, stream); - // move output ciphertext into gpu - void *out_gpu = memcpy_async_ct_to_gpu(out_allocated, out_aligned, out_offset, - out_size, out_stride, gpu_idx, stream); - // construct LUT GLWE ciphertext + // Move the input and output batch of ciphertexts to the GPU + // TODO: The allocation should be done by the compiler codegen + void *ct0_gpu = alloc_and_memcpy_async_to_gpu( + ct0_aligned, ct0_offset, ct0_batch_size, gpu_idx, stream); + void *out_gpu = alloc_and_memcpy_async_to_gpu( + out_aligned, out_offset, out_batch_size, gpu_idx, stream); + + // Construct the glwe accumulator (on CPU) + // TODO: Should be done outside of the bootstrap call, at compile time if + // possible. Refactoring in progress uint64_t glwe_ct_len = poly_size * (glwe_dim + 1); uint64_t glwe_ct_size = glwe_ct_len * sizeof(uint64_t); uint64_t *glwe_ct = (uint64_t *)malloc(glwe_ct_size); @@ -162,35 +224,35 @@ void memref_bootstrap_lwe_cuda_u64( default_engine_discard_trivially_encrypt_glwe_ciphertext_u64_raw_ptr_buffers( get_levelled_engine(), glwe_ct, glwe_ct_len, expanded_tabulated_function_array.data(), poly_size)); - // move test vector into gpu - void *test_vector_gpu = - cuda_malloc(poly_size * (glwe_dim + 1) * sizeof(uint64_t), gpu_idx); - cuda_memcpy_async_to_gpu(test_vector_gpu, (void *)glwe_ct, glwe_ct_size, - stream, gpu_idx); - // free LUT ciphertext (CPU) + + // Move the glwe accumulator to the GPU + void *glwe_ct_gpu = + alloc_and_memcpy_async_to_gpu(glwe_ct, 0, glwe_ct_len, gpu_idx, stream); + + // Free the glwe accumulator (on CPU) free(glwe_ct); - // move test vector indexes into gpu - uint32_t num_samples = 1, num_test_vectors = 1, lwe_idx = 0; - void *test_vector_idxes = malloc(num_samples * sizeof(uint32_t)); - ((uint32_t *)test_vector_idxes)[0] = 0; - void *test_vector_idxes_gpu = - cuda_malloc(num_samples * sizeof(uint32_t), gpu_idx); + + // Move the test vector indexes to the GPU; they are all set to 0 + uint32_t num_test_vectors = 1, lwe_idx = 0, + test_vector_idxes_size = num_samples * sizeof(uint32_t); + void *test_vector_idxes = malloc(test_vector_idxes_size); + memset(test_vector_idxes, 0, test_vector_idxes_size); + void *test_vector_idxes_gpu = cuda_malloc(test_vector_idxes_size, gpu_idx); cuda_memcpy_async_to_gpu(test_vector_idxes_gpu, test_vector_idxes, - num_samples * sizeof(uint32_t), stream, gpu_idx); - // run gpu bootstrap + test_vector_idxes_size, stream, gpu_idx); + // Run the bootstrap kernel on the GPU cuda_bootstrap_amortized_lwe_ciphertext_vector_64( - stream, out_gpu, test_vector_gpu, 
test_vector_idxes_gpu, ct0_gpu, - fbsk_gpu, input_lwe_dim, glwe_dim, poly_size, base_log, level, - num_samples, num_test_vectors, lwe_idx, - cuda_get_max_shared_memory(gpu_idx)); - // copy output ciphertext back to cpu - memcpy_async_ct_to_cpu(out_allocated, out_aligned, out_offset, out_size, - out_stride, out_gpu, out_size, gpu_idx, stream); + stream, out_gpu, glwe_ct_gpu, test_vector_idxes_gpu, ct0_gpu, fbsk_gpu, + input_lwe_dim, glwe_dim, poly_size, base_log, level, num_samples, + num_test_vectors, lwe_idx, cuda_get_max_shared_memory(gpu_idx)); + // Copy the output batch of ciphertexts back to CPU + memcpy_async_to_cpu(out_aligned, out_offset, out_batch_size, out_gpu, + gpu_idx, stream); cuda_synchronize_device(gpu_idx); // free memory that we allocated on gpu cuda_drop(ct0_gpu, gpu_idx); cuda_drop(out_gpu, gpu_idx); - cuda_drop(test_vector_gpu, gpu_idx); + cuda_drop(glwe_ct_gpu, gpu_idx); cuda_drop(test_vector_idxes_gpu, gpu_idx); cuda_destroy_stream(stream, gpu_idx);
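Illustrative usage sketch (hypothetical caller, not part of the patch): the batched entry points take each batch of LWE ciphertexts as a rank-2 memref expanded into (allocated, aligned, offset, size0, size1, stride0, stride1) arguments, and the single-ciphertext wrappers above reuse them by viewing a rank-1 memref as a batch of size 1. The wrapper name, parameter order, and size asserts below come from this patch; the helper keyswitch_lwe_batch, the contiguous row-major layout, and the assumption that the caller already holds a RuntimeContext with the keyswitching key are made up for the example.

#include <cstdint>
#include <vector>

#include "concretelang/Runtime/wrappers.h"

// Hypothetical caller: keyswitch `num_samples` LWE ciphertexts stored
// contiguously, one ciphertext of `input_lwe_dim + 1` words per row, into an
// output batch of `output_lwe_dim + 1` words per row.
void keyswitch_lwe_batch(std::vector<uint64_t> &in, std::vector<uint64_t> &out,
                         uint64_t num_samples, uint32_t input_lwe_dim,
                         uint32_t output_lwe_dim, uint32_t level,
                         uint32_t base_log,
                         mlir::concretelang::RuntimeContext *context) {
  uint64_t in_width = input_lwe_dim + 1;   // checked: ct0_size1 == input_lwe_dim + 1
  uint64_t out_width = output_lwe_dim + 1; // checked: out_size1 == output_lwe_dim + 1
  out.resize(num_samples * out_width);
  memref_batched_keyswitch_lwe_cuda_u64(
      // Output batch as a rank-2 memref: allocated, aligned, offset,
      // sizes (num_samples x out_width), strides (row-major, contiguous)
      out.data(), out.data(), 0, num_samples, out_width, out_width, 1,
      // Input batch as a rank-2 memref, same layout convention
      in.data(), in.data(), 0, num_samples, in_width, in_width, 1,
      // Keyswitch parameters and the runtime context holding the keys
      level, base_log, input_lwe_dim, output_lwe_dim, context);
}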