feat(compiler): Handle batched operators for gpu codegen
@@ -149,33 +149,24 @@ void memref_copy_one_rank(uint64_t *src_allocated, uint64_t *src_aligned,
                          uint64_t *dst_aligned, uint64_t dst_offset,
                          uint64_t dst_size, uint64_t dst_stride);

// Single ciphertext CUDA functions ///////////////////////////////////////////

/// \brief Run Keyswitch on GPU.
///
/// It handles memory copy of the different arguments from CPU to GPU, and
/// freeing memory.
void memref_keyswitch_lwe_cuda_u64(
    uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset,
    uint64_t out_size, uint64_t out_stride, uint64_t *ct0_allocated,
    uint64_t *ct0_aligned, uint64_t ct0_offset, uint64_t ct0_size,
    uint64_t ct0_stride, uint32_t level, uint32_t base_log,
    uint32_t input_lwe_dim, uint32_t output_lwe_dim,
    mlir::concretelang::RuntimeContext *context);

/// \brief Run bootstrapping on GPU.
///
/// It handles memory copy of the different arguments from CPU to GPU, and
/// freeing memory.
///
/// \param out_allocated
/// \param out_aligned
/// \param out_offset
/// \param out_size
/// \param out_stride
/// \param ct0_allocated
/// \param ct0_aligned
/// \param ct0_offset
/// \param ct0_size
/// \param ct0_stride
/// \param tlu_allocated
/// \param tlu_aligned
/// \param tlu_offset
/// \param tlu_size
/// \param tlu_stride
/// \param input_lwe_dim LWE input dimension
/// \param poly_size polynomial size
/// \param level level
/// \param base_log base log
/// \param glwe_dim
/// \param precision
/// \param context
void memref_bootstrap_lwe_cuda_u64(
    uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset,
    uint64_t out_size, uint64_t out_stride, uint64_t *ct0_allocated,
@@ -186,107 +177,26 @@ void memref_bootstrap_lwe_cuda_u64(
    uint32_t base_log, uint32_t glwe_dim, uint32_t precision,
    mlir::concretelang::RuntimeContext *context);

/// \brief Run Keyswitch on GPU.
///
/// It handles memory copy of the different arguments from CPU to GPU, and
/// freeing memory.
///
/// \param out_allocated
/// \param out_aligned
/// \param out_offset
/// \param out_size
/// \param out_stride
/// \param ct0_allocated
/// \param ct0_aligned
/// \param ct0_offset
/// \param ct0_size
/// \param ct0_stride
/// \param level
/// \param base_log
/// \param input_lwe_dim LWE input dimension
/// \param output_lwe_dim LWE output dimension
/// \param context
void memref_keyswitch_lwe_cuda_u64(
// Batched CUDA function //////////////////////////////////////////////////////

void memref_batched_keyswitch_lwe_cuda_u64(
    uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset,
    uint64_t out_size, uint64_t out_stride, uint64_t *ct0_allocated,
    uint64_t *ct0_aligned, uint64_t ct0_offset, uint64_t ct0_size,
    uint64_t ct0_stride, uint32_t level, uint32_t base_log,
    uint32_t input_lwe_dim, uint32_t output_lwe_dim,
    uint64_t out_size0, uint64_t out_size1, uint64_t out_stride0,
    uint64_t out_stride1, uint64_t *ct0_allocated, uint64_t *ct0_aligned,
    uint64_t ct0_offset, uint64_t ct0_size0, uint64_t ct0_size1,
    uint64_t ct0_stride0, uint64_t ct0_stride1, uint32_t level,
    uint32_t base_log, uint32_t input_lwe_dim, uint32_t output_lwe_dim,
    mlir::concretelang::RuntimeContext *context);

/// \brief Copy ciphertext from CPU to GPU.
///
/// It handles memory allocation on GPU.
///
/// \param ct_allocated
/// \param ct_aligned
/// \param ct_offset
/// \param ct_size
/// \param ct_stride
/// \param gpu_idx index of the GPU to use
/// \param stream cuda stream to use for the copy
/// \return void* pointer to the GPU ciphertext
void *memcpy_async_ct_to_gpu(uint64_t *ct_allocated, uint64_t *ct_aligned,
                             uint64_t ct_offset, uint64_t ct_size,
                             uint64_t ct_stride, uint32_t gpu_idx,
                             void *stream);

/// \brief Copy ciphertext from GPU to CPU.
///
/// Memory on GPU won't be freed after the copy.
///
/// \param out_allocated
/// \param out_aligned
/// \param out_offset
/// \param out_size
/// \param out_stride
/// \param ct_gpu
/// \param size
/// \param gpu_idx index of the GPU to use
/// \param stream cuda stream to use for the copy
void memcpy_async_ct_to_cpu(uint64_t *out_allocated, uint64_t *out_aligned,
                            uint64_t out_offset, uint64_t out_size,
                            uint64_t out_stride, void *ct_gpu, size_t size,
                            uint32_t gpu_idx, void *stream);

/// \brief Copy bootstrapping key from CPU to GPU.
///
/// It handles memory allocation on GPU, as well as conversion to the Fourier
/// domain.
///
/// \param context
/// \param input_lwe_dim
/// \param poly_size
/// \param level
/// \param glwe_dim
/// \param gpu_idx index of the GPU to use
/// \param stream cuda stream to use for the copy
/// \return void* pointer to the GPU bsk
void *memcpy_async_bsk_to_gpu(mlir::concretelang::RuntimeContext *context,
                              uint32_t input_lwe_dim, uint32_t poly_size,
                              uint32_t level, uint32_t glwe_dim,
                              uint32_t gpu_idx, void *stream);

/// \brief Copy keyswitching key from CPU to GPU.
///
/// It handles memory allocation on GPU.
///
/// \param context
/// \param level
/// \param input_lwe_dim
/// \param output_lwe_dim
/// \param gpu_idx index of the GPU to use
/// \param stream cuda stream to use for the copy
/// \return void* pointer to the GPU ksk
void *memcpy_async_ksk_to_gpu(mlir::concretelang::RuntimeContext *context,
                              uint32_t level, uint32_t input_lwe_dim,
                              uint32_t output_lwe_dim, uint32_t gpu_idx,
                              void *stream);

/// \brief Free gpu memory.
///
/// \param gpu_ptr pointer to the GPU memory to free
/// \param gpu_idx index of the GPU to use
void free_from_gpu(void *gpu_ptr, uint32_t gpu_idx);
void memref_batched_bootstrap_lwe_cuda_u64(
    uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset,
    uint64_t out_size0, uint64_t out_size1, uint64_t out_stride0,
    uint64_t out_stride1, uint64_t *ct0_allocated, uint64_t *ct0_aligned,
    uint64_t ct0_offset, uint64_t ct0_size0, uint64_t ct0_size1,
    uint64_t ct0_stride0, uint64_t ct0_stride1, uint64_t *tlu_allocated,
    uint64_t *tlu_aligned, uint64_t tlu_offset, uint64_t tlu_size,
    uint64_t tlu_stride, uint32_t input_lwe_dim, uint32_t poly_size,
    uint32_t level, uint32_t base_log, uint32_t glwe_dim, uint32_t precision,
    mlir::concretelang::RuntimeContext *context);
}

#endif
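// Illustrative sketch (not part of the commit): one way a caller could drive
// the batched keyswitch entry point declared above. The batch size, LWE
// dimensions, level/base_log values and the `ctx` pointer are placeholder
// assumptions, not values taken from the repository.
#include <cstdint>
#include <vector>

void example_batched_keyswitch(mlir::concretelang::RuntimeContext *ctx) {
  const uint64_t n = 8;                 // number of ciphertexts in the batch
  const uint32_t input_lwe_dim = 1024;  // placeholder dimension
  const uint32_t output_lwe_dim = 600;  // placeholder dimension
  const uint32_t level = 3, base_log = 4;

  // Row-major batches: one ciphertext of (dim + 1) words per row.
  std::vector<uint64_t> in(n * (input_lwe_dim + 1));
  std::vector<uint64_t> out(n * (output_lwe_dim + 1));

  memref_batched_keyswitch_lwe_cuda_u64(
      // output as a 2D memref: offset 0, sizes {n, output_lwe_dim + 1},
      // strides {output_lwe_dim + 1, 1}
      out.data(), out.data(), 0, n, output_lwe_dim + 1, output_lwe_dim + 1, 1,
      // input as a 2D memref: same layout with the input dimension
      in.data(), in.data(), 0, n, input_lwe_dim + 1, input_lwe_dim + 1, 1,
      // keyswitch parameters and runtime context
      level, base_log, input_lwe_dim, output_lwe_dim, ctx);
}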
@@ -34,6 +34,10 @@ char memref_bootstrap_async_lwe_u64[] = "memref_bootstrap_async_lwe_u64";
char memref_await_future[] = "memref_await_future";
char memref_keyswitch_lwe_cuda_u64[] = "memref_keyswitch_lwe_cuda_u64";
char memref_bootstrap_lwe_cuda_u64[] = "memref_bootstrap_lwe_cuda_u64";
char memref_batched_keyswitch_lwe_cuda_u64[] =
    "memref_batched_keyswitch_lwe_cuda_u64";
char memref_batched_bootstrap_lwe_cuda_u64[] =
    "memref_batched_bootstrap_lwe_cuda_u64";
char memref_expand_lut_in_trivial_glwe_ct_u64[] =
    "memref_expand_lut_in_trivial_glwe_ct_u64";
@@ -116,12 +120,14 @@ mlir::LogicalResult insertForwardDeclarationOfTheCAPI(
                                        memref1DType, i32Type, i32Type, i32Type,
                                        i32Type, i32Type, i32Type, contextType},
                                       {futureType});
  } else if (funcName == memref_batched_keyswitch_lwe_u64) {
  } else if (funcName == memref_batched_keyswitch_lwe_u64 ||
             funcName == memref_batched_keyswitch_lwe_cuda_u64) {
    funcType = mlir::FunctionType::get(rewriter.getContext(),
                                       {memref2DType, memref2DType, i32Type,
                                        i32Type, i32Type, i32Type, contextType},
                                       {});
  } else if (funcName == memref_batched_bootstrap_lwe_u64) {
  } else if (funcName == memref_batched_bootstrap_lwe_u64 ||
             funcName == memref_batched_bootstrap_lwe_cuda_u64) {
    funcType = mlir::FunctionType::get(rewriter.getContext(),
                                       {memref2DType, memref2DType,
                                        memref1DType, i32Type, i32Type, i32Type,
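// Illustrative note (not from the commit): when these forward declarations are
// later lowered to the C ABI, each memref argument expands into the usual MLIR
// descriptor fields, which is why the C entry points take long scalar argument
// lists. A rank-2 memref of i64 roughly corresponds to a descriptor like the
// hypothetical struct below (names chosen for illustration only):
//
//   struct MemRef2DU64 {
//     uint64_t *allocated; // pointer returned by the allocator
//     uint64_t *aligned;   // aligned pointer actually used for access
//     uint64_t offset;     // element offset of the first element
//     uint64_t sizes[2];   // {size0, size1}
//     uint64_t strides[2]; // {stride0, stride1}
//   };
//
// Flattened, this gives the (allocated, aligned, offset, size0, size1,
// stride0, stride1) sequence used by memref_batched_keyswitch_lwe_cuda_u64 and
// memref_batched_bootstrap_lwe_cuda_u64.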
@@ -335,6 +341,16 @@ struct BConcreteToCAPIPass : public BConcreteToCAPIBase<BConcreteToCAPIPass> {
      patterns.add<BConcreteToCAPICallPattern<BConcrete::BootstrapLweBufferOp,
                                              memref_bootstrap_lwe_cuda_u64>>(
          &getContext(), bootstrapAddOperands<BConcrete::BootstrapLweBufferOp>);
      patterns.add<
          BConcreteToCAPICallPattern<BConcrete::BatchedKeySwitchLweBufferOp,
                                     memref_batched_keyswitch_lwe_cuda_u64>>(
          &getContext(),
          keyswitchAddOperands<BConcrete::BatchedKeySwitchLweBufferOp>);
      patterns.add<
          BConcreteToCAPICallPattern<BConcrete::BatchedBootstrapLweBufferOp,
                                     memref_batched_bootstrap_lwe_cuda_u64>>(
          &getContext(),
          bootstrapAddOperands<BConcrete::BatchedBootstrapLweBufferOp>);
    } else {
      patterns.add<BConcreteToCAPICallPattern<BConcrete::KeySwitchLweBufferOp,
                                              memref_keyswitch_lwe_u64>>(
@@ -56,50 +56,7 @@ void encode_and_expand_lut(uint64_t *output, size_t output_size,

#ifdef CONCRETELANG_CUDA_SUPPORT

void memref_keyswitch_lwe_cuda_u64(
    uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset,
    uint64_t out_size, uint64_t out_stride, uint64_t *ct0_allocated,
    uint64_t *ct0_aligned, uint64_t ct0_offset, uint64_t ct0_size,
    uint64_t ct0_stride, uint32_t level, uint32_t base_log,
    uint32_t input_lwe_dim, uint32_t output_lwe_dim,
    mlir::concretelang::RuntimeContext *context) {
  // we currently just use the first GPU, but this should be decided
  // dynamically, or during compilation, in the future
  uint32_t gpu_idx = 0;
  uint32_t num_samples = 1;
  void *stream = cuda_create_stream(gpu_idx);
  // move input ciphertext into gpu
  void *ct0_gpu = memcpy_async_ct_to_gpu(ct0_allocated, ct0_aligned, ct0_offset,
                                         ct0_size, ct0_stride, gpu_idx, stream);
  // move output ciphertext into gpu
  void *out_gpu = memcpy_async_ct_to_gpu(out_allocated, out_aligned, out_offset,
                                         out_size, out_stride, gpu_idx, stream);
  void *ksk_gpu = memcpy_async_ksk_to_gpu(context, level, input_lwe_dim,
                                          output_lwe_dim, gpu_idx, stream);
  cuda_keyswitch_lwe_ciphertext_vector_64(stream, out_gpu, ct0_gpu, ksk_gpu,
                                          input_lwe_dim, output_lwe_dim,
                                          base_log, level, num_samples);
  // copy output ciphertext back to cpu
  memcpy_async_ct_to_cpu(out_allocated, out_aligned, out_offset, out_size,
                         out_stride, out_gpu, out_size, gpu_idx, stream);
  cuda_synchronize_device(gpu_idx);
  // free memory that we allocated on gpu
  cuda_drop(ct0_gpu, gpu_idx);
  cuda_drop(out_gpu, gpu_idx);
  cuda_drop(ksk_gpu, gpu_idx);
  cuda_destroy_stream(stream, gpu_idx);
}

void *memcpy_async_ct_to_gpu(uint64_t *ct_allocated, uint64_t *ct_aligned,
                             uint64_t ct_offset, uint64_t ct_size,
                             uint64_t ct_stride, uint32_t gpu_idx,
                             void *stream) {
  size_t buf_size = ct_size * sizeof(uint64_t);
  void *ct_gpu = cuda_malloc(buf_size, gpu_idx);
  cuda_memcpy_async_to_gpu(ct_gpu, ct_aligned + ct_offset, buf_size, stream,
                           gpu_idx);
  return ct_gpu;
}
// CUDA memory utils function /////////////////////////////////////////////////

void *memcpy_async_bsk_to_gpu(mlir::concretelang::RuntimeContext *context,
                              uint32_t input_lwe_dim, uint32_t poly_size,
@@ -117,18 +74,47 @@ void *memcpy_async_ksk_to_gpu(mlir::concretelang::RuntimeContext *context,
                              stream);
}

void memcpy_async_ct_to_cpu(uint64_t *out_allocated, uint64_t *out_aligned,
                            uint64_t out_offset, uint64_t out_size,
                            uint64_t out_stride, void *ct_gpu, size_t size,
                            uint32_t gpu_idx, void *stream) {
  cuda_memcpy_async_to_cpu(out_aligned + out_offset, ct_gpu,
                           size * sizeof(uint64_t), stream, gpu_idx);
void *alloc_and_memcpy_async_to_gpu(uint64_t *buf_ptr, uint64_t buf_offset,
                                    uint64_t buf_size, uint32_t gpu_idx,
                                    void *stream) {
  size_t buf_size_ = buf_size * sizeof(uint64_t);
  void *ct_gpu = cuda_malloc(buf_size_, gpu_idx);
  cuda_memcpy_async_to_gpu(ct_gpu, buf_ptr + buf_offset, buf_size_, stream,
                           gpu_idx);
  return ct_gpu;
}

void memcpy_async_to_cpu(uint64_t *buf_ptr, uint64_t buf_offset,
                         uint64_t buf_size, void *buf_gpu, uint32_t gpu_idx,
                         void *stream) {
  cuda_memcpy_async_to_cpu(buf_ptr + buf_offset, buf_gpu,
                           buf_size * sizeof(uint64_t), stream, gpu_idx);
}

void free_from_gpu(void *gpu_ptr, uint32_t gpu_idx = 0) {
  cuda_drop(gpu_ptr, gpu_idx);
}

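// Illustrative sketch (not part of the commit): a minimal host-to-GPU round
// trip built only from the helpers defined above and the cuda_* calls already
// used in this file. The buffer length and GPU index are placeholder
// assumptions.
static void example_gpu_round_trip(uint64_t *buf, uint64_t len) {
  uint32_t gpu_idx = 0;
  void *stream = cuda_create_stream(gpu_idx);
  // Allocate a GPU buffer of `len` words and schedule the copy to the GPU.
  void *buf_gpu = alloc_and_memcpy_async_to_gpu(buf, /*buf_offset=*/0, len,
                                                gpu_idx, stream);
  // ... a kernel operating on buf_gpu would be launched here ...
  // Schedule the copy back to the host buffer, then wait for completion.
  memcpy_async_to_cpu(buf, /*buf_offset=*/0, len, buf_gpu, gpu_idx, stream);
  cuda_synchronize_device(gpu_idx);
  free_from_gpu(buf_gpu, gpu_idx);
  cuda_destroy_stream(stream, gpu_idx);
}
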
// Single ciphertext CUDA functions ///////////////////////////////////////////

void memref_keyswitch_lwe_cuda_u64(
    uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset,
    uint64_t out_size, uint64_t out_stride, uint64_t *ct0_allocated,
    uint64_t *ct0_aligned, uint64_t ct0_offset, uint64_t ct0_size,
    uint64_t ct0_stride, uint32_t level, uint32_t base_log,
    uint32_t input_lwe_dim, uint32_t output_lwe_dim,
    mlir::concretelang::RuntimeContext *context) {
  assert(out_stride == 1);
  assert(ct0_stride == 1);
  memref_batched_keyswitch_lwe_cuda_u64(
      // Output 1D memref as 2D memref
      out_allocated, out_aligned, out_offset, 1, out_size, out_size, out_stride,
      // Input 1D memref as 2D memref
      ct0_allocated, ct0_aligned, ct0_offset, 1, ct0_size, ct0_size, ct0_stride,
      // Keyswitch additional arguments
      level, base_log, input_lwe_dim, output_lwe_dim, context);
}

void memref_bootstrap_lwe_cuda_u64(
    uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset,
    uint64_t out_size, uint64_t out_stride, uint64_t *ct0_allocated,
@@ -138,20 +124,96 @@ void memref_bootstrap_lwe_cuda_u64(
    uint32_t input_lwe_dim, uint32_t poly_size, uint32_t level,
    uint32_t base_log, uint32_t glwe_dim, uint32_t precision,
    mlir::concretelang::RuntimeContext *context) {
  // we currently just use the first GPU, but this should be decided
  // dynamically, or during compilation, in the future
  memref_batched_bootstrap_lwe_cuda_u64(
      // Output 1D memref as 2D memref
      out_allocated, out_aligned, out_offset, 1, out_size, out_size, out_stride,
      // Input 1D memref as 2D memref
      ct0_allocated, ct0_aligned, ct0_offset, 1, ct0_size, ct0_size, ct0_stride,
      // Table lookup memref
      tlu_allocated, tlu_aligned, tlu_offset, tlu_size, tlu_stride,
      // Bootstrap additional arguments
      input_lwe_dim, poly_size, level, base_log, glwe_dim, precision, context);
}

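// Illustrative note (not from the commit): both wrappers above reuse the
// batched entry points by viewing a 1D memref of `size` elements as a 2D
// memref with sizes {1, size} and strides {size, stride}, i.e. a batch that
// contains a single ciphertext. The keyswitch wrapper asserts
// `out_stride == 1` and `ct0_stride == 1`, which suggests the batched path
// assumes contiguous element strides.
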
// Batched CUDA function //////////////////////////////////////////////////////

void memref_batched_keyswitch_lwe_cuda_u64(
    uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset,
    uint64_t out_size0, uint64_t out_size1, uint64_t out_stride0,
    uint64_t out_stride1, uint64_t *ct0_allocated, uint64_t *ct0_aligned,
    uint64_t ct0_offset, uint64_t ct0_size0, uint64_t ct0_size1,
    uint64_t ct0_stride0, uint64_t ct0_stride1, uint32_t level,
    uint32_t base_log, uint32_t input_lwe_dim, uint32_t output_lwe_dim,
    mlir::concretelang::RuntimeContext *context) {
  assert(out_size0 == ct0_size0);
  assert(out_size1 == output_lwe_dim + 1);
  assert(ct0_size1 == input_lwe_dim + 1);
  // TODO: Multi GPU
  uint32_t gpu_idx = 0;
  uint32_t num_samples = out_size0;
  uint64_t ct0_batch_size = ct0_size0 * ct0_size1;
  uint64_t out_batch_size = out_size0 * out_size1;

  // Create the cuda stream
  // TODO: Should be created by the compiler codegen
  void *stream = cuda_create_stream(gpu_idx);
  // move bsk to gpu
  // Get the pointer to the keyswitching key on the GPU
  void *ksk_gpu = memcpy_async_ksk_to_gpu(context, level, input_lwe_dim,
                                          output_lwe_dim, gpu_idx, stream);
  // Move the input and output batch of ciphertexts to the GPU
  // TODO: The allocation should be done by the compiler codegen
  void *ct0_gpu = alloc_and_memcpy_async_to_gpu(
      ct0_aligned, ct0_offset, ct0_batch_size, gpu_idx, stream);
  void *out_gpu = alloc_and_memcpy_async_to_gpu(
      out_aligned, out_offset, out_batch_size, gpu_idx, stream);
  // Run the keyswitch kernel on the GPU
  cuda_keyswitch_lwe_ciphertext_vector_64(stream, out_gpu, ct0_gpu, ksk_gpu,
                                          input_lwe_dim, output_lwe_dim,
                                          base_log, level, num_samples);
  // Copy the output batch of ciphertext back to CPU
  memcpy_async_to_cpu(out_aligned, out_offset, out_batch_size, out_gpu, gpu_idx,
                      stream);
  cuda_synchronize_device(gpu_idx);
  // free memory that we allocated on gpu
  cuda_drop(ct0_gpu, gpu_idx);
  cuda_drop(out_gpu, gpu_idx);
  cuda_drop(ksk_gpu, gpu_idx);
  cuda_destroy_stream(stream, gpu_idx);
}

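// Illustrative arithmetic (placeholder parameters, not from the repository):
// with a batch of out_size0 = 8 ciphertexts, input_lwe_dim = 1024 and
// output_lwe_dim = 600, the asserts above require ct0_size1 = 1025 and
// out_size1 = 601, so the kernel runs with num_samples = 8 and the helpers
// copy ct0_batch_size = 8 * 1025 = 8200 and out_batch_size = 8 * 601 = 4808
// words to and from the GPU.
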
void memref_batched_bootstrap_lwe_cuda_u64(
    uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset,
    uint64_t out_size0, uint64_t out_size1, uint64_t out_stride0,
    uint64_t out_stride1, uint64_t *ct0_allocated, uint64_t *ct0_aligned,
    uint64_t ct0_offset, uint64_t ct0_size0, uint64_t ct0_size1,
    uint64_t ct0_stride0, uint64_t ct0_stride1, uint64_t *tlu_allocated,
    uint64_t *tlu_aligned, uint64_t tlu_offset, uint64_t tlu_size,
    uint64_t tlu_stride, uint32_t input_lwe_dim, uint32_t poly_size,
    uint32_t level, uint32_t base_log, uint32_t glwe_dim, uint32_t precision,
    mlir::concretelang::RuntimeContext *context) {
  assert(out_size0 == ct0_size0);
  // TODO: Multi GPU
  uint32_t gpu_idx = 0;
  uint32_t num_samples = out_size0;
  uint64_t ct0_batch_size = ct0_size0 * ct0_size1;
  uint64_t out_batch_size = out_size0 * out_size1;

  // Create the cuda stream
  // TODO: Should be created by the compiler codegen
  void *stream = cuda_create_stream(gpu_idx);
  // Get the pointer to the bootstrapping key on the GPU
  void *fbsk_gpu = memcpy_async_bsk_to_gpu(context, input_lwe_dim, poly_size,
                                           level, glwe_dim, gpu_idx, stream);
  // move input ciphertext into gpu
  void *ct0_gpu = memcpy_async_ct_to_gpu(ct0_allocated, ct0_aligned, ct0_offset,
                                         ct0_size, ct0_stride, gpu_idx, stream);
  // move output ciphertext into gpu
  void *out_gpu = memcpy_async_ct_to_gpu(out_allocated, out_aligned, out_offset,
                                         out_size, out_stride, gpu_idx, stream);
  // construct LUT GLWE ciphertext
  // Move the input and output batch of ciphertext to the GPU
  // TODO: The allocation should be done by the compiler codegen
  void *ct0_gpu = alloc_and_memcpy_async_to_gpu(
      ct0_aligned, ct0_offset, ct0_batch_size, gpu_idx, stream);
  void *out_gpu = alloc_and_memcpy_async_to_gpu(
      out_aligned, out_offset, out_batch_size, gpu_idx, stream);

  // Construct the glwe accumulator (on CPU)
  // TODO: Should be done outside of the bootstrap call, at compile time if
  // possible. Refactor in progress
  uint64_t glwe_ct_len = poly_size * (glwe_dim + 1);
  uint64_t glwe_ct_size = glwe_ct_len * sizeof(uint64_t);
  uint64_t *glwe_ct = (uint64_t *)malloc(glwe_ct_size);
@@ -162,35 +224,35 @@ void memref_bootstrap_lwe_cuda_u64(
  default_engine_discard_trivially_encrypt_glwe_ciphertext_u64_raw_ptr_buffers(
      get_levelled_engine(), glwe_ct, glwe_ct_len,
      expanded_tabulated_function_array.data(), poly_size));
  // move test vector into gpu
  void *test_vector_gpu =
      cuda_malloc(poly_size * (glwe_dim + 1) * sizeof(uint64_t), gpu_idx);
  cuda_memcpy_async_to_gpu(test_vector_gpu, (void *)glwe_ct, glwe_ct_size,
                           stream, gpu_idx);
  // free LUT ciphertext (CPU)

  // Move the glwe accumulator to the GPU
  void *glwe_ct_gpu =
      alloc_and_memcpy_async_to_gpu(glwe_ct, 0, glwe_ct_size, gpu_idx, stream);

  // Free the glwe accumulator (on CPU)
  free(glwe_ct);
  // move test vector indexes into gpu
  uint32_t num_samples = 1, num_test_vectors = 1, lwe_idx = 0;
  void *test_vector_idxes = malloc(num_samples * sizeof(uint32_t));
  ((uint32_t *)test_vector_idxes)[0] = 0;
  void *test_vector_idxes_gpu =
      cuda_malloc(num_samples * sizeof(uint32_t), gpu_idx);

  // Move the test vector indexes to the GPU; the indexes are all set to 0
  uint32_t num_test_vectors = 1, lwe_idx = 0,
           test_vector_idxes_size = num_samples * sizeof(uint32_t);
  void *test_vector_idxes = malloc(test_vector_idxes_size);
  memset(test_vector_idxes, 0, test_vector_idxes_size);
  void *test_vector_idxes_gpu = cuda_malloc(test_vector_idxes_size, gpu_idx);
  cuda_memcpy_async_to_gpu(test_vector_idxes_gpu, test_vector_idxes,
                           num_samples * sizeof(uint32_t), stream, gpu_idx);
  // run gpu bootstrap
                           test_vector_idxes_size, stream, gpu_idx);
  // Run the bootstrap kernel on the GPU
  cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
      stream, out_gpu, test_vector_gpu, test_vector_idxes_gpu, ct0_gpu,
      fbsk_gpu, input_lwe_dim, glwe_dim, poly_size, base_log, level,
      num_samples, num_test_vectors, lwe_idx,
      cuda_get_max_shared_memory(gpu_idx));
  // copy output ciphertext back to cpu
  memcpy_async_ct_to_cpu(out_allocated, out_aligned, out_offset, out_size,
                         out_stride, out_gpu, out_size, gpu_idx, stream);
      stream, out_gpu, glwe_ct_gpu, test_vector_idxes_gpu, ct0_gpu, fbsk_gpu,
      input_lwe_dim, glwe_dim, poly_size, base_log, level, num_samples,
      num_test_vectors, lwe_idx, cuda_get_max_shared_memory(gpu_idx));
  // Copy the output batch of ciphertext back to CPU
  memcpy_async_to_cpu(out_aligned, out_offset, out_batch_size, out_gpu,
                      gpu_idx, stream);
  cuda_synchronize_device(gpu_idx);
  // free memory that we allocated on gpu
  cuda_drop(ct0_gpu, gpu_idx);
  cuda_drop(out_gpu, gpu_idx);
  cuda_drop(test_vector_gpu, gpu_idx);
  cuda_drop(glwe_ct_gpu, gpu_idx);
  cuda_drop(test_vector_idxes_gpu, gpu_idx);

  cuda_destroy_stream(stream, gpu_idx);