mirror of https://github.com/zama-ai/concrete.git
feat: support GPU keyswitching
@@ -170,4 +170,18 @@ def BConcrete_BootstrapLweGPUBufferOp : BConcrete_Op<"bootstrap_lwe_gpu_buffer">
  let results = (outs 1DTensorOf<[I64]>:$result);
}

// This is a different op in BConcrete just because of the way we are lowering to CAPI
// When the CAPI lowering is detached from bufferization, we can remove this op, and lower
// to the appropriate CAPI (gpu or cpu) depending on the useGPU compilation option
def BConcrete_KeySwitchLweGPUBufferOp : BConcrete_Op<"keyswitch_lwe_gpu_buffer"> {
  let arguments = (ins
    1DTensorOf<[I64]>:$ciphertext,
    I32:$level,
    I32:$baseLog,
    I32:$lwe_dim_in,
    I32:$lwe_dim_out
  );
  let results = (outs 1DTensorOf<[I64]>:$result);
}

#endif

@@ -129,8 +129,7 @@ void memref_copy_one_rank(uint64_t *src_allocated, uint64_t *src_aligned,
/// \brief Run bootstrapping on GPU.
///
/// It handles memory copy of the different arguments from CPU to GPU, and
/// freeing memory, except for the bootstrapping key, which should already be in
/// GPU.
/// freeing memory.
///
/// \param out_allocated
/// \param out_aligned

@@ -164,7 +163,35 @@ void memref_bootstrap_lwe_cuda_u64(
    uint32_t base_log, uint32_t glwe_dim, uint32_t precision,
    mlir::concretelang::RuntimeContext *context);

/// \brief Copy ciphertext from CPU to GPU using a single stream.
/// \brief Run Keyswitch on GPU.
///
/// It handles memory copy of the different arguments from CPU to GPU, and
/// freeing memory.
///
/// \param out_allocated
/// \param out_aligned
/// \param out_offset
/// \param out_size
/// \param out_stride
/// \param ct0_allocated
/// \param ct0_aligned
/// \param ct0_offset
/// \param ct0_size
/// \param ct0_stride
/// \param level
/// \param base_log
/// \param input_lwe_dim LWE input dimension
/// \param output_lwe_dim LWE output dimension
/// \param context
void memref_keyswitch_lwe_cuda_u64(
    uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset,
    uint64_t out_size, uint64_t out_stride, uint64_t *ct0_allocated,
    uint64_t *ct0_aligned, uint64_t ct0_offset, uint64_t ct0_size,
    uint64_t ct0_stride, uint32_t level, uint32_t base_log,
    uint32_t input_lwe_dim, uint32_t output_lwe_dim,
    mlir::concretelang::RuntimeContext *context);
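For illustration, a minimal caller sketch, assuming the declarations above are in scope; the helper name keyswitch_example is hypothetical, the dimensions (LWE 1024 -> 575, level 5, base_log 2) are taken from the test case at the end of this diff, and the runtime context would normally come from the compiled program rather than being built by hand.

// Hypothetical usage sketch (not part of this change): drive the GPU keyswitch
// wrapper on flat memref-style buffers (allocated/aligned pointers, offset,
// size, stride), followed by the keyswitch parameters.
#include <cstdint>
#include <vector>

void keyswitch_example(mlir::concretelang::RuntimeContext *ctx,
                       std::vector<uint64_t> &ct_in /* 1024 + 1 words */) {
  const uint32_t level = 5, base_log = 2;
  const uint32_t input_lwe_dim = 1024, output_lwe_dim = 575;
  std::vector<uint64_t> out(output_lwe_dim + 1, 0);

  memref_keyswitch_lwe_cuda_u64(
      /*out_allocated=*/out.data(), /*out_aligned=*/out.data(),
      /*out_offset=*/0, /*out_size=*/out.size(), /*out_stride=*/1,
      /*ct0_allocated=*/ct_in.data(), /*ct0_aligned=*/ct_in.data(),
      /*ct0_offset=*/0, /*ct0_size=*/ct_in.size(), /*ct0_stride=*/1,
      level, base_log, input_lwe_dim, output_lwe_dim, ctx);
}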

/// \brief Copy ciphertext from CPU to GPU.
///
/// It handles memory allocation on GPU.
///

@@ -174,12 +201,14 @@ void memref_bootstrap_lwe_cuda_u64(
/// \param ct_size
/// \param ct_stride
/// \param gpu_idx index of the GPU to use
/// \param stream cuda stream to use for the copy
/// \return void* pointer to the GPU ciphertext
void *move_ct_to_gpu(uint64_t *ct_allocated, uint64_t *ct_aligned,
                     uint64_t ct_offset, uint64_t ct_size, uint64_t ct_stride,
                     uint32_t gpu_idx);
void *memcpy_async_ct_to_gpu(uint64_t *ct_allocated, uint64_t *ct_aligned,
                             uint64_t ct_offset, uint64_t ct_size,
                             uint64_t ct_stride, uint32_t gpu_idx,
                             void *stream);

/// \brief Copy ciphertext from GPU to CPU using a single stream.
/// \brief Copy ciphertext from GPU to CPU.
///
/// Memory on GPU won't be freed after the copy.
///

@@ -191,11 +220,13 @@ void *move_ct_to_gpu(uint64_t *ct_allocated, uint64_t *ct_aligned,
/// \param ct_gpu
/// \param size
/// \param gpu_idx index of the GPU to use
void move_ct_to_cpu(uint64_t *out_allocated, uint64_t *out_aligned,
                    uint64_t out_offset, uint64_t out_size, uint64_t out_stride,
                    void *ct_gpu, size_t size, uint32_t gpu_idx);
/// \param stream cuda stream to use for the copy
void memcpy_async_ct_to_cpu(uint64_t *out_allocated, uint64_t *out_aligned,
                            uint64_t out_offset, uint64_t out_size,
                            uint64_t out_stride, void *ct_gpu, size_t size,
                            uint32_t gpu_idx, void *stream);
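For illustration, a minimal round-trip sketch, assuming only the async helpers declared here plus the cuda_* utilities used later in this diff; roundtrip_ct_example and its arguments are hypothetical.

// Hypothetical helper (not part of this change) showing the intended lifetime:
// the caller owns the stream, the async helpers allocate the GPU buffer and
// enqueue copies, and the caller synchronizes and releases everything after.
void roundtrip_ct_example(uint64_t *ct, uint64_t ct_len) {
  const uint32_t gpu_idx = 0; // single-GPU assumption, as in the wrappers later in this diff
  void *stream = cuda_create_stream(gpu_idx);
  void *ct_gpu = memcpy_async_ct_to_gpu(ct, ct, /*ct_offset=*/0,
                                        /*ct_size=*/ct_len, /*ct_stride=*/1,
                                        gpu_idx, stream);
  // ... enqueue GPU work reading/writing ct_gpu on the same stream here ...
  memcpy_async_ct_to_cpu(ct, ct, /*out_offset=*/0, /*out_size=*/ct_len,
                         /*out_stride=*/1, ct_gpu, /*size=*/ct_len, gpu_idx,
                         stream);
  cuda_synchronize_device(gpu_idx);
  cuda_drop(ct_gpu, gpu_idx);
  cuda_destroy_stream(stream, gpu_idx);
}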

/// \brief Copy bootstrapping key from CPU to GPU using a single stream.
/// \brief Copy bootstrapping key from CPU to GPU.
///
/// It handles memory allocation on GPU, as well as conversion to the Fourier
/// domain.

@@ -206,10 +237,28 @@ void move_ct_to_cpu(uint64_t *out_allocated, uint64_t *out_aligned,
/// \param level
/// \param glwe_dim
/// \param gpu_idx index of the GPU to use
/// \param stream cuda stream to use for the copy
/// \return void* pointer to the GPU bsk
void *move_bsk_to_gpu(mlir::concretelang::RuntimeContext *context,
                      uint32_t input_lwe_dim, uint32_t poly_size,
                      uint32_t level, uint32_t glwe_dim, uint32_t gpu_idx);
void *memcpy_async_bsk_to_gpu(mlir::concretelang::RuntimeContext *context,
                              uint32_t input_lwe_dim, uint32_t poly_size,
                              uint32_t level, uint32_t glwe_dim,
                              uint32_t gpu_idx, void *stream);

/// \brief Copy keyswitching key from CPU to GPU.
///
/// It handles memory allocation on GPU.
///
/// \param context
/// \param level
/// \param input_lwe_dim
/// \param output_lwe_dim
/// \param gpu_idx index of the GPU to use
/// \param stream cuda stream to use for the copy
/// \return void* pointer to the GPU ksk
void *memcpy_async_ksk_to_gpu(mlir::concretelang::RuntimeContext *context,
                              uint32_t level, uint32_t input_lwe_dim,
                              uint32_t output_lwe_dim, uint32_t gpu_idx,
                              void *stream);

/// \brief Free gpu memory.
///

@@ -201,6 +201,50 @@ struct LowToBConcrete : public mlir::OpRewritePattern<ConcreteOp> {
  };
};

struct KeySwitchToGPU : public mlir::OpRewritePattern<
                            mlir::concretelang::Concrete::KeySwitchLweOp> {
  KeySwitchToGPU(::mlir::MLIRContext *context, mlir::PatternBenefit benefit = 1)
      : ::mlir::OpRewritePattern<mlir::concretelang::Concrete::KeySwitchLweOp>(
            context, benefit) {}

  ::mlir::LogicalResult
  matchAndRewrite(mlir::concretelang::Concrete::KeySwitchLweOp keySwitchOp,
                  ::mlir::PatternRewriter &rewriter) const override {
    ConcreteToBConcreteTypeConverter converter;

    mlir::Value levelCst = rewriter.create<mlir::arith::ConstantIntOp>(
        keySwitchOp.getLoc(), keySwitchOp.level(), 32);
    mlir::Value baseLogCst = rewriter.create<mlir::arith::ConstantIntOp>(
        keySwitchOp.getLoc(), keySwitchOp.baseLog(), 32);

    // construct operands for in/out dimensions
    mlir::concretelang::Concrete::LweCiphertextType outType =
        keySwitchOp.getType();
    auto outDim = outType.getDimension();
    mlir::Value outDimCst = rewriter.create<mlir::arith::ConstantIntOp>(
        keySwitchOp.getLoc(), outDim, 32);
    auto inputType =
        keySwitchOp.ciphertext()
            .getType()
            .cast<mlir::concretelang::Concrete::LweCiphertextType>();
    auto inputDim = inputType.getDimension();
    mlir::Value inputDimCst = rewriter.create<mlir::arith::ConstantIntOp>(
        keySwitchOp.getLoc(), inputDim, 32);

    mlir::Operation *bKeySwitchGPUOp = rewriter.replaceOpWithNewOp<
        mlir::concretelang::BConcrete::KeySwitchLweGPUBufferOp>(
        keySwitchOp, outType, keySwitchOp.ciphertext(), levelCst, baseLogCst,
        inputDimCst, outDimCst);

    mlir::concretelang::convertOperandAndResultTypes(
        rewriter, bKeySwitchGPUOp, [&](mlir::MLIRContext *, mlir::Type t) {
          return converter.convertType(t);
        });

    return ::mlir::success();
  };
};

struct AddPlaintextLweCiphertextOpPattern
    : public mlir::OpRewritePattern<Concrete::AddPlaintextLweCiphertextOp> {
  AddPlaintextLweCiphertextOpPattern(::mlir::MLIRContext *context,

@@ -872,23 +916,24 @@ void ConcreteToBConcretePass::runOnOperation() {
      LowToBConcrete<mlir::concretelang::Concrete::NegateLweCiphertextOp,
                     mlir::concretelang::BConcrete::NegateLweBufferOp,
                     BConcrete::NegateCRTLweBufferOp>,
      LowToBConcrete<mlir::concretelang::Concrete::KeySwitchLweOp,
                     mlir::concretelang::BConcrete::KeySwitchLweBufferOp,
                     mlir::concretelang::BConcrete::KeySwitchLweBufferOp>,
      LowToBConcrete<Concrete::WopPBSLweOp, BConcrete::WopPBSCRTLweBufferOp,
                     BConcrete::WopPBSCRTLweBufferOp>>(&getContext());

  if (this->useGPU) {
    patterns.insert<LowToBConcrete<
        mlir::concretelang::Concrete::BootstrapLweOp,
        mlir::concretelang::BConcrete::BootstrapLweGPUBufferOp,
        mlir::concretelang::BConcrete::BootstrapLweGPUBufferOp>>(
        &getContext());
    patterns
        .insert<LowToBConcrete<
                    mlir::concretelang::Concrete::BootstrapLweOp,
                    mlir::concretelang::BConcrete::BootstrapLweGPUBufferOp,
                    mlir::concretelang::BConcrete::BootstrapLweGPUBufferOp>,
                KeySwitchToGPU>(&getContext());
  } else {
    patterns.insert<
        LowToBConcrete<mlir::concretelang::Concrete::BootstrapLweOp,
                       mlir::concretelang::BConcrete::BootstrapLweBufferOp,
                       mlir::concretelang::BConcrete::BootstrapLweBufferOp>>(
                       mlir::concretelang::BConcrete::BootstrapLweBufferOp>,
        LowToBConcrete<mlir::concretelang::Concrete::KeySwitchLweOp,
                       mlir::concretelang::BConcrete::KeySwitchLweBufferOp,
                       mlir::concretelang::BConcrete::KeySwitchLweBufferOp>>(
        &getContext());
  }

@@ -76,6 +76,7 @@ char memref_bootstrap_lwe_u64[] = "memref_bootstrap_lwe_u64";
char memref_keyswitch_async_lwe_u64[] = "memref_keyswitch_async_lwe_u64";
char memref_bootstrap_async_lwe_u64[] = "memref_bootstrap_async_lwe_u64";
char memref_await_future[] = "memref_await_future";
char memref_keyswitch_lwe_cuda_u64[] = "memref_keyswitch_lwe_cuda_u64";
char memref_bootstrap_lwe_cuda_u64[] = "memref_bootstrap_lwe_cuda_u64";
char memref_expand_lut_in_trivial_glwe_ct_u64[] =
    "memref_expand_lut_in_trivial_glwe_ct_u64";

@@ -112,6 +113,11 @@ mlir::LogicalResult insertForwardDeclarationOfTheCAPI(
  } else if (funcName == memref_keyswitch_lwe_u64) {
    funcType = mlir::FunctionType::get(
        rewriter.getContext(), {memref1DType, memref1DType, contextType}, {});
  } else if (funcName == memref_keyswitch_lwe_cuda_u64) {
    funcType = mlir::FunctionType::get(rewriter.getContext(),
                                       {memref1DType, memref1DType, i32Type,
                                        i32Type, i32Type, i32Type, contextType},
                                       {});
  } else if (funcName == memref_bootstrap_lwe_u64) {
    funcType = mlir::FunctionType::get(rewriter.getContext(),
                                       {memref1DType, memref1DType,

@@ -482,6 +488,10 @@ void mlir::concretelang::BConcrete::
      BufferizableWithCallOpInterface<BConcrete::NegateLweBufferOp,
                                      memref_negate_lwe_ciphertext_u64>>(
      *ctx);
  BConcrete::KeySwitchLweGPUBufferOp::attachInterface<
      BufferizableWithCallOpInterface<BConcrete::KeySwitchLweGPUBufferOp,
                                      memref_keyswitch_lwe_cuda_u64, true>>(
      *ctx);
  BConcrete::BootstrapLweGPUBufferOp::attachInterface<
      BufferizableWithCallOpInterface<BConcrete::BootstrapLweGPUBufferOp,
                                      memref_bootstrap_lwe_cuda_u64, true>>(

@@ -65,32 +65,57 @@ typedef struct double2 {
// From concrete-cuda
#include "bootstrap.h"
#include "device.h"
#include "keyswitch.h"

void memref_keyswitch_lwe_cuda_u64(
    uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset,
    uint64_t out_size, uint64_t out_stride, uint64_t *ct0_allocated,
    uint64_t *ct0_aligned, uint64_t ct0_offset, uint64_t ct0_size,
    uint64_t ct0_stride, mlir::concretelang::RuntimeContext *context) {
  // TODO: GPU implementation
    uint64_t ct0_stride, uint32_t level, uint32_t base_log,
    uint32_t input_lwe_dim, uint32_t output_lwe_dim,
    mlir::concretelang::RuntimeContext *context) {
  // we currently just use the first GPU, but this should be decided
  // dynamically, or during compilation, in the future
  uint32_t gpu_idx = 0;
  uint32_t num_samples = 1;
  void *stream = cuda_create_stream(gpu_idx);
  // move input ciphertext into gpu
  void *ct0_gpu = memcpy_async_ct_to_gpu(ct0_allocated, ct0_aligned, ct0_offset,
                                         ct0_size, ct0_stride, gpu_idx, stream);
  // move output ciphertext into gpu
  void *out_gpu = memcpy_async_ct_to_gpu(out_allocated, out_aligned, out_offset,
                                         out_size, out_stride, gpu_idx, stream);
  void *ksk_gpu = memcpy_async_ksk_to_gpu(context, level, input_lwe_dim,
                                          output_lwe_dim, gpu_idx, stream);
  cuda_keyswitch_lwe_ciphertext_vector_64(stream, out_gpu, ct0_gpu, ksk_gpu,
                                          input_lwe_dim, output_lwe_dim,
                                          base_log, level, num_samples);
  // copy output ciphertext back to cpu
  memcpy_async_ct_to_cpu(out_allocated, out_aligned, out_offset, out_size,
                         out_stride, out_gpu, out_size, gpu_idx, stream);
  cuda_synchronize_device(gpu_idx);
  // free memory that we allocated on gpu
  cuda_drop(ct0_gpu, gpu_idx);
  cuda_drop(out_gpu, gpu_idx);
  cuda_drop(ksk_gpu, gpu_idx);
  cuda_destroy_stream(stream, gpu_idx);
}

void *move_ct_to_gpu(uint64_t *ct_allocated, uint64_t *ct_aligned,
                     uint64_t ct_offset, uint64_t ct_size, uint64_t ct_stride,
                     uint32_t gpu_idx) {
  void *stream = cuda_create_stream(gpu_idx);
void *memcpy_async_ct_to_gpu(uint64_t *ct_allocated, uint64_t *ct_aligned,
                             uint64_t ct_offset, uint64_t ct_size,
                             uint64_t ct_stride, uint32_t gpu_idx,
                             void *stream) {
  size_t buf_size = ct_size * sizeof(uint64_t);
  void *ct_gpu = cuda_malloc(buf_size, gpu_idx);
  cuda_memcpy_async_to_gpu(ct_gpu, ct_aligned + ct_offset, buf_size, stream,
                           gpu_idx);
  cuda_synchronize_device(gpu_idx);
  cuda_destroy_stream(stream, gpu_idx);
  return ct_gpu;
}

void *move_bsk_to_gpu(mlir::concretelang::RuntimeContext *context,
                      uint32_t input_lwe_dim, uint32_t poly_size,
                      uint32_t level, uint32_t glwe_dim, uint32_t gpu_idx = 0) {
  void *stream = cuda_create_stream(gpu_idx);
void *memcpy_async_bsk_to_gpu(mlir::concretelang::RuntimeContext *context,
                              uint32_t input_lwe_dim, uint32_t poly_size,
                              uint32_t level, uint32_t glwe_dim,
                              uint32_t gpu_idx, void *stream) {
  LweBootstrapKey64 *bsk = get_bootstrap_key_u64(context);
  size_t bsk_buffer_len =
      input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * poly_size * level;

@@ -105,20 +130,41 @@ void *move_bsk_to_gpu(mlir::concretelang::RuntimeContext *context,
  cuda_initialize_twiddles(poly_size, gpu_idx);
  cuda_convert_lwe_bootstrap_key_64(fbsk_gpu, bsk_buffer, stream, gpu_idx,
                                    input_lwe_dim, glwe_dim, level, poly_size);
  // This is currently not 100% async as we have to free CPU memory after
  // conversion
  cuda_synchronize_device(gpu_idx);
  cuda_destroy_stream(stream, gpu_idx);
  free(bsk_buffer);
  return fbsk_gpu;
}

void move_ct_to_cpu(uint64_t *out_allocated, uint64_t *out_aligned,
                    uint64_t out_offset, uint64_t out_size, uint64_t out_stride,
                    void *ct_gpu, size_t size, uint32_t gpu_idx) {
  void *stream = cuda_create_stream(gpu_idx);
void *memcpy_async_ksk_to_gpu(mlir::concretelang::RuntimeContext *context,
                              uint32_t level, uint32_t input_lwe_dim,
                              uint32_t output_lwe_dim, uint32_t gpu_idx,
                              void *stream) {
  LweKeyswitchKey64 *ksk = get_keyswitch_key_u64(context);
  size_t ksk_buffer_len = input_lwe_dim * (output_lwe_dim + 1) * level;
  size_t ksk_buffer_size = sizeof(uint64_t) * ksk_buffer_len;
  uint64_t *ksk_buffer =
      (uint64_t *)aligned_alloc(U64_ALIGNMENT, ksk_buffer_size);
  void *ksk_gpu = cuda_malloc(ksk_buffer_size, gpu_idx);
  CAPI_ASSERT_ERROR(
      default_engine_discard_convert_lwe_keyswitch_key_to_lwe_keyswitch_key_mut_view_u64_raw_ptr_buffers(
          get_levelled_engine(), ksk, ksk_buffer));
  cuda_memcpy_async_to_gpu(ksk_gpu, ksk_buffer, ksk_buffer_size, stream,
                           gpu_idx);
  // This is currently not 100% async as we have to free CPU memory after
  // conversion
  cuda_synchronize_device(gpu_idx);
  free(ksk_buffer);
  return ksk_gpu;
}
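For illustration, a quick sizing check of that staging buffer, assuming the keyswitch parameters from the test case at the end of this diff (level 5, LWE 1024 -> 575); the small program below is not part of the change.

// Hypothetical sanity check of the KSK staging size computed by the formula
// above: input_lwe_dim * (output_lwe_dim + 1) * level 64-bit words.
#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t level = 5, input_lwe_dim = 1024, output_lwe_dim = 575;
  const uint64_t ksk_buffer_len = input_lwe_dim * (output_lwe_dim + 1) * level;
  const uint64_t ksk_buffer_size = ksk_buffer_len * sizeof(uint64_t);
  // Prints: 2949120 words, 23592960 bytes (about 22.5 MiB staged on CPU and GPU).
  std::printf("%llu words, %llu bytes\n",
              static_cast<unsigned long long>(ksk_buffer_len),
              static_cast<unsigned long long>(ksk_buffer_size));
  return 0;
}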

void memcpy_async_ct_to_cpu(uint64_t *out_allocated, uint64_t *out_aligned,
                            uint64_t out_offset, uint64_t out_size,
                            uint64_t out_stride, void *ct_gpu, size_t size,
                            uint32_t gpu_idx, void *stream) {
  cuda_memcpy_async_to_cpu(out_aligned + out_offset, ct_gpu,
                           size * sizeof(uint64_t), stream, gpu_idx);
  cuda_synchronize_device(gpu_idx);
  cuda_destroy_stream(stream, gpu_idx);
}

void free_from_gpu(void *gpu_ptr, uint32_t gpu_idx = 0) {

@@ -139,14 +185,14 @@ void memref_bootstrap_lwe_cuda_u64(
  uint32_t gpu_idx = 0;
  void *stream = cuda_create_stream(gpu_idx);
  // move bsk to gpu
  void *fbsk_gpu = move_bsk_to_gpu(context, input_lwe_dim, poly_size, level,
                                   glwe_dim, gpu_idx);
  void *fbsk_gpu = memcpy_async_bsk_to_gpu(context, input_lwe_dim, poly_size,
                                           level, glwe_dim, gpu_idx, stream);
  // move input ciphertext into gpu
  void *ct0_gpu = move_ct_to_gpu(ct0_allocated, ct0_aligned, ct0_offset,
                                 ct0_size, ct0_stride, gpu_idx);
  void *ct0_gpu = memcpy_async_ct_to_gpu(ct0_allocated, ct0_aligned, ct0_offset,
                                         ct0_size, ct0_stride, gpu_idx, stream);
  // move output ciphertext into gpu
  void *out_gpu = move_ct_to_gpu(out_allocated, out_aligned, out_offset,
                                 out_size, out_stride, gpu_idx);
  void *out_gpu = memcpy_async_ct_to_gpu(out_allocated, out_aligned, out_offset,
                                         out_size, out_stride, gpu_idx, stream);
  // construct LUT GLWE ciphertext
  uint64_t glwe_ct_len = poly_size * (glwe_dim + 1);
  uint64_t glwe_ct_size = glwe_ct_len * sizeof(uint64_t);

@@ -179,8 +225,8 @@ void memref_bootstrap_lwe_cuda_u64(
      fbsk_gpu, input_lwe_dim, poly_size, base_log, level, num_samples,
      num_test_vectors, lwe_idx, cuda_get_max_shared_memory(gpu_idx));
  // copy output ciphertext back to cpu
  move_ct_to_cpu(out_allocated, out_aligned, out_offset, out_size, out_stride,
                 out_gpu, out_size, gpu_idx);
  memcpy_async_ct_to_cpu(out_allocated, out_aligned, out_offset, out_size,
                         out_stride, out_gpu, out_size, gpu_idx, stream);
  cuda_synchronize_device(gpu_idx);
  // free memory that we allocated on gpu
  cuda_drop(fbsk_gpu, gpu_idx);

@@ -0,0 +1,31 @@
// RUN: concretecompiler --passes concrete-to-bconcrete --action=dump-bconcrete --use-gpu %s 2>&1| FileCheck %s


//CHECK: func.func @main(%arg0: tensor<1025xi64>) -> tensor<1025xi64> {
//CHECK: %c1_i32 = arith.constant 1 : i32
//CHECK: %c8_i32 = arith.constant 8 : i32
//CHECK: %c2_i32 = arith.constant 2 : i32
//CHECK: %c1024_i32 = arith.constant 1024 : i32
//CHECK: %c575_i32 = arith.constant 575 : i32
//CHECK: %cst = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi64>
//CHECK: %c5_i32 = arith.constant 5 : i32
//CHECK: %c2_i32_0 = arith.constant 2 : i32
//CHECK: %c575_i32_1 = arith.constant 575 : i32
//CHECK: %c1024_i32_2 = arith.constant 1024 : i32
//CHECK: %0 = "BConcrete.keyswitch_lwe_gpu_buffer"(%arg0, %c5_i32, %c2_i32_0, %c1024_i32_2, %c575_i32_1) : (tensor<1025xi64>, i32, i32, i32, i32) -> tensor<576xi64>
//CHECK: %1 = "BConcrete.bootstrap_lwe_gpu_buffer"(%0, %cst, %c575_i32, %c1024_i32, %c2_i32, %c8_i32, %c1_i32, %c2_i32) : (tensor<576xi64>, tensor<4xi64>, i32, i32, i32, i32, i32, i32) -> tensor<1025xi64>
//CHECK: return %1 : tensor<1025xi64>
//CHECK: }
func.func @main(%arg0: !Concrete.lwe_ciphertext<1024,2>) -> !Concrete.lwe_ciphertext<1024,2> {
  %c1_i32 = arith.constant 1 : i32
  %c8_i32 = arith.constant 8 : i32
  %c2_i32 = arith.constant 2 : i32
  %c1024_i32 = arith.constant 1024 : i32
  %c575_i32 = arith.constant 575 : i32
  %cst = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi64>
  %0 = "Concrete.keyswitch_lwe"(%arg0) {baseLog = 2 : i32, level = 5 : i32} : (!Concrete.lwe_ciphertext<1024,2>) -> !Concrete.lwe_ciphertext<575,2>
  %1 = "Concrete.bootstrap_lwe"(%0, %cst, %c575_i32, %c1024_i32, %c2_i32, %c8_i32, %c1_i32, %c2_i32) : (!Concrete.lwe_ciphertext<575,2>, tensor<4xi64>, i32, i32, i32, i32, i32, i32) -> !Concrete.lwe_ciphertext<1024,2>
  return %1 : !Concrete.lwe_ciphertext<1024,2>
}