feat: support GPU keyswitching

Author: youben11
Date: 2022-09-20 09:25:26 +01:00
Committed by: Ayoub Benaissa
Parent: 7e08614e6c
Commit: d615ff47f2
6 changed files with 245 additions and 50 deletions

View File

@@ -170,4 +170,18 @@ def BConcrete_BootstrapLweGPUBufferOp : BConcrete_Op<"bootstrap_lwe_gpu_buffer">
let results = (outs 1DTensorOf<[I64]>:$result);
}
// This op is separate in BConcrete only because of the way we currently lower to the CAPI.
// Once the CAPI lowering is decoupled from bufferization, we can remove this op and lower
// to the appropriate CAPI (GPU or CPU) depending on the useGPU compilation option.
def BConcrete_KeySwitchLweGPUBufferOp : BConcrete_Op<"keyswitch_lwe_gpu_buffer"> {
let arguments = (ins
1DTensorOf<[I64]>:$ciphertext,
I32:$level,
I32:$baseLog,
I32:$lwe_dim_in,
I32:$lwe_dim_out
);
let results = (outs 1DTensorOf<[I64]>:$result);
}
#endif
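
For reference, the textual form of the new op, as exercised by the FileCheck test added at the
end of this commit (SSA value names simplified), is:

%0 = "BConcrete.keyswitch_lwe_gpu_buffer"(%arg0, %c5_i32, %c2_i32, %c1024_i32, %c575_i32)
       : (tensor<1025xi64>, i32, i32, i32, i32) -> tensor<576xi64>

The operands follow the argument order above: input ciphertext, level, baseLog, input LWE
dimension, output LWE dimension.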

View File

@@ -129,8 +129,7 @@ void memref_copy_one_rank(uint64_t *src_allocated, uint64_t *src_aligned,
/// \brief Run bootstrapping on GPU.
///
/// It handles memory copy of the different arguments from CPU to GPU, and
/// freeing memory, except for the bootstrapping key, which should already be in
/// GPU.
/// freeing memory.
///
/// \param out_allocated
/// \param out_aligned
@@ -164,7 +163,35 @@ void memref_bootstrap_lwe_cuda_u64(
uint32_t base_log, uint32_t glwe_dim, uint32_t precision,
mlir::concretelang::RuntimeContext *context);
/// \brief Copy ciphertext from CPU to GPU using a single stream.
/// \brief Run keyswitching on GPU.
///
/// It handles memory copy of the different arguments from CPU to GPU, and
/// freeing memory.
///
/// \param out_allocated
/// \param out_aligned
/// \param out_offset
/// \param out_size
/// \param out_stride
/// \param ct0_allocated
/// \param ct0_aligned
/// \param ct0_offset
/// \param ct0_size
/// \param ct0_stride
/// \param level
/// \param base_log
/// \param input_lwe_dim LWE input dimension
/// \param output_lwe_dim LWE output dimension
/// \param context
void memref_keyswitch_lwe_cuda_u64(
uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset,
uint64_t out_size, uint64_t out_stride, uint64_t *ct0_allocated,
uint64_t *ct0_aligned, uint64_t ct0_offset, uint64_t ct0_size,
uint64_t ct0_stride, uint32_t level, uint32_t base_log,
uint32_t input_lwe_dim, uint32_t output_lwe_dim,
mlir::concretelang::RuntimeContext *context);
/// \brief Copy ciphertext from CPU to GPU.
///
/// It handles memory allocation on GPU.
///
@@ -174,12 +201,14 @@ void memref_bootstrap_lwe_cuda_u64(
/// \param ct_size
/// \param ct_stride
/// \param gpu_idx index of the GPU to use
/// \param stream cuda stream to use for the copy
/// \return void* pointer to the GPU ciphertext
void *move_ct_to_gpu(uint64_t *ct_allocated, uint64_t *ct_aligned,
uint64_t ct_offset, uint64_t ct_size, uint64_t ct_stride,
uint32_t gpu_idx);
void *memcpy_async_ct_to_gpu(uint64_t *ct_allocated, uint64_t *ct_aligned,
uint64_t ct_offset, uint64_t ct_size,
uint64_t ct_stride, uint32_t gpu_idx,
void *stream);
/// \brief Copy ciphertext from GPU to CPU using a single stream.
/// \brief Copy ciphertext from GPU to CPU.
///
/// Memory on GPU won't be freed after the copy.
///
@@ -191,11 +220,13 @@ void *move_ct_to_gpu(uint64_t *ct_allocated, uint64_t *ct_aligned,
/// \param ct_gpu
/// \param size
/// \param gpu_idx index of the GPU to use
void move_ct_to_cpu(uint64_t *out_allocated, uint64_t *out_aligned,
uint64_t out_offset, uint64_t out_size, uint64_t out_stride,
void *ct_gpu, size_t size, uint32_t gpu_idx);
/// \param stream cuda stream to use for the copy
void memcpy_async_ct_to_cpu(uint64_t *out_allocated, uint64_t *out_aligned,
uint64_t out_offset, uint64_t out_size,
uint64_t out_stride, void *ct_gpu, size_t size,
uint32_t gpu_idx, void *stream);
/// \brief Copy bootstrapping key from CPU to GPU using a single stream.
/// \brief Copy bootstrapping key from CPU to GPU.
///
/// It handles memory allocation on GPU, as well as conversion to the Fourier
/// domain.
@@ -206,10 +237,28 @@ void move_ct_to_cpu(uint64_t *out_allocated, uint64_t *out_aligned,
/// \param level
/// \param glwe_dim
/// \param gpu_idx index of the GPU to use
/// \param stream cuda stream to use for the copy
/// \return void* pointer to the GPU bsk
void *move_bsk_to_gpu(mlir::concretelang::RuntimeContext *context,
uint32_t input_lwe_dim, uint32_t poly_size,
uint32_t level, uint32_t glwe_dim, uint32_t gpu_idx);
void *memcpy_async_bsk_to_gpu(mlir::concretelang::RuntimeContext *context,
uint32_t input_lwe_dim, uint32_t poly_size,
uint32_t level, uint32_t glwe_dim,
uint32_t gpu_idx, void *stream);
/// \brief Copy keyswitching key from CPU to GPU.
///
/// It handles memory allocation on GPU.
///
/// \param context
/// \param level
/// \param input_lwe_dim
/// \param output_lwe_dim
/// \param gpu_idx index of the GPU to use
/// \param stream cuda stream to use for the copy
/// \return void* pointer to the GPU ksk
void *memcpy_async_ksk_to_gpu(mlir::concretelang::RuntimeContext *context,
uint32_t level, uint32_t input_lwe_dim,
uint32_t output_lwe_dim, uint32_t gpu_idx,
void *stream);
/// \brief Free gpu memory.
///
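
Taken together, these memcpy_async_* entry points replace the earlier move_* helpers, which
created and destroyed a CUDA stream on every call; the new variants take a caller-owned stream so
that all transfers and the kernel launch can be queued on the same stream and synchronized once.
A condensed sketch of the intended call sequence, mirroring memref_keyswitch_lwe_cuda_u64 as
implemented later in this commit (the buffer and parameter names are the wrapper's arguments):

void *stream = cuda_create_stream(gpu_idx);
// stage the input/output ciphertexts and the keyswitching key on the GPU
void *ct0_gpu = memcpy_async_ct_to_gpu(ct0_allocated, ct0_aligned, ct0_offset,
                                       ct0_size, ct0_stride, gpu_idx, stream);
void *out_gpu = memcpy_async_ct_to_gpu(out_allocated, out_aligned, out_offset,
                                       out_size, out_stride, gpu_idx, stream);
void *ksk_gpu = memcpy_async_ksk_to_gpu(context, level, input_lwe_dim,
                                        output_lwe_dim, gpu_idx, stream);
// ... enqueue the keyswitch kernel on the same stream ...
// copy the result back, then synchronize and release the GPU resources
memcpy_async_ct_to_cpu(out_allocated, out_aligned, out_offset, out_size,
                       out_stride, out_gpu, out_size, gpu_idx, stream);
cuda_synchronize_device(gpu_idx);
cuda_drop(ct0_gpu, gpu_idx);
cuda_drop(out_gpu, gpu_idx);
cuda_drop(ksk_gpu, gpu_idx);
cuda_destroy_stream(stream, gpu_idx);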

View File

@@ -201,6 +201,50 @@ struct LowToBConcrete : public mlir::OpRewritePattern<ConcreteOp> {
};
};
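// Lower Concrete.keyswitch_lwe to BConcrete.keyswitch_lwe_gpu_buffer, materializing the
// level and baseLog attributes, as well as the input/output LWE dimensions (read from the
// operand and result ciphertext types), as i32 constant operands. On the IR of the
// FileCheck test added below, this rewrites (SSA names abbreviated):
//   %0 = "Concrete.keyswitch_lwe"(%arg0) {baseLog = 2 : i32, level = 5 : i32}
//          : (!Concrete.lwe_ciphertext<1024,2>) -> !Concrete.lwe_ciphertext<575,2>
// into
//   %0 = "BConcrete.keyswitch_lwe_gpu_buffer"(%arg0, %c5_i32, %c2_i32, %c1024_i32, %c575_i32)
//          : (tensor<1025xi64>, i32, i32, i32, i32) -> tensor<576xi64>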
struct KeySwitchToGPU : public mlir::OpRewritePattern<
mlir::concretelang::Concrete::KeySwitchLweOp> {
KeySwitchToGPU(::mlir::MLIRContext *context, mlir::PatternBenefit benefit = 1)
: ::mlir::OpRewritePattern<mlir::concretelang::Concrete::KeySwitchLweOp>(
context, benefit) {}
::mlir::LogicalResult
matchAndRewrite(mlir::concretelang::Concrete::KeySwitchLweOp keySwitchOp,
::mlir::PatternRewriter &rewriter) const override {
ConcreteToBConcreteTypeConverter converter;
mlir::Value levelCst = rewriter.create<mlir::arith::ConstantIntOp>(
keySwitchOp.getLoc(), keySwitchOp.level(), 32);
mlir::Value baseLogCst = rewriter.create<mlir::arith::ConstantIntOp>(
keySwitchOp.getLoc(), keySwitchOp.baseLog(), 32);
// construct operands for in/out dimensions
mlir::concretelang::Concrete::LweCiphertextType outType =
keySwitchOp.getType();
auto outDim = outType.getDimension();
mlir::Value outDimCst = rewriter.create<mlir::arith::ConstantIntOp>(
keySwitchOp.getLoc(), outDim, 32);
auto inputType =
keySwitchOp.ciphertext()
.getType()
.cast<mlir::concretelang::Concrete::LweCiphertextType>();
auto inputDim = inputType.getDimension();
mlir::Value inputDimCst = rewriter.create<mlir::arith::ConstantIntOp>(
keySwitchOp.getLoc(), inputDim, 32);
mlir::Operation *bKeySwitchGPUOp = rewriter.replaceOpWithNewOp<
mlir::concretelang::BConcrete::KeySwitchLweGPUBufferOp>(
keySwitchOp, outType, keySwitchOp.ciphertext(), levelCst, baseLogCst,
inputDimCst, outDimCst);
mlir::concretelang::convertOperandAndResultTypes(
rewriter, bKeySwitchGPUOp, [&](mlir::MLIRContext *, mlir::Type t) {
return converter.convertType(t);
});
return ::mlir::success();
};
};
struct AddPlaintextLweCiphertextOpPattern
: public mlir::OpRewritePattern<Concrete::AddPlaintextLweCiphertextOp> {
AddPlaintextLweCiphertextOpPattern(::mlir::MLIRContext *context,
@@ -872,23 +916,24 @@ void ConcreteToBConcretePass::runOnOperation() {
LowToBConcrete<mlir::concretelang::Concrete::NegateLweCiphertextOp,
mlir::concretelang::BConcrete::NegateLweBufferOp,
BConcrete::NegateCRTLweBufferOp>,
LowToBConcrete<mlir::concretelang::Concrete::KeySwitchLweOp,
mlir::concretelang::BConcrete::KeySwitchLweBufferOp,
mlir::concretelang::BConcrete::KeySwitchLweBufferOp>,
LowToBConcrete<Concrete::WopPBSLweOp, BConcrete::WopPBSCRTLweBufferOp,
BConcrete::WopPBSCRTLweBufferOp>>(&getContext());
if (this->useGPU) {
patterns.insert<LowToBConcrete<
mlir::concretelang::Concrete::BootstrapLweOp,
mlir::concretelang::BConcrete::BootstrapLweGPUBufferOp,
mlir::concretelang::BConcrete::BootstrapLweGPUBufferOp>>(
&getContext());
patterns
.insert<LowToBConcrete<
mlir::concretelang::Concrete::BootstrapLweOp,
mlir::concretelang::BConcrete::BootstrapLweGPUBufferOp,
mlir::concretelang::BConcrete::BootstrapLweGPUBufferOp>,
KeySwitchToGPU>(&getContext());
} else {
patterns.insert<
LowToBConcrete<mlir::concretelang::Concrete::BootstrapLweOp,
mlir::concretelang::BConcrete::BootstrapLweBufferOp,
mlir::concretelang::BConcrete::BootstrapLweBufferOp>>(
mlir::concretelang::BConcrete::BootstrapLweBufferOp>,
LowToBConcrete<mlir::concretelang::Concrete::KeySwitchLweOp,
mlir::concretelang::BConcrete::KeySwitchLweBufferOp,
mlir::concretelang::BConcrete::KeySwitchLweBufferOp>>(
&getContext());
}
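
With this split, the GPU lowering is selected only when the pass is created with useGPU, which
the new FileCheck test drives through the compiler flag (the input path here is a placeholder):

concretecompiler --passes concrete-to-bconcrete --action=dump-bconcrete --use-gpu keyswitch_gpu.mlir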

View File

@@ -76,6 +76,7 @@ char memref_bootstrap_lwe_u64[] = "memref_bootstrap_lwe_u64";
char memref_keyswitch_async_lwe_u64[] = "memref_keyswitch_async_lwe_u64";
char memref_bootstrap_async_lwe_u64[] = "memref_bootstrap_async_lwe_u64";
char memref_await_future[] = "memref_await_future";
char memref_keyswitch_lwe_cuda_u64[] = "memref_keyswitch_lwe_cuda_u64";
char memref_bootstrap_lwe_cuda_u64[] = "memref_bootstrap_lwe_cuda_u64";
char memref_expand_lut_in_trivial_glwe_ct_u64[] =
"memref_expand_lut_in_trivial_glwe_ct_u64";
@@ -112,6 +113,11 @@ mlir::LogicalResult insertForwardDeclarationOfTheCAPI(
} else if (funcName == memref_keyswitch_lwe_u64) {
funcType = mlir::FunctionType::get(
rewriter.getContext(), {memref1DType, memref1DType, contextType}, {});
} else if (funcName == memref_keyswitch_lwe_cuda_u64) {
funcType = mlir::FunctionType::get(rewriter.getContext(),
{memref1DType, memref1DType, i32Type,
i32Type, i32Type, i32Type, contextType},
{});
} else if (funcName == memref_bootstrap_lwe_u64) {
funcType = mlir::FunctionType::get(rewriter.getContext(),
{memref1DType, memref1DType,
@@ -482,6 +488,10 @@ void mlir::concretelang::BConcrete::
BufferizableWithCallOpInterface<BConcrete::NegateLweBufferOp,
memref_negate_lwe_ciphertext_u64>>(
*ctx);
BConcrete::KeySwitchLweGPUBufferOp::attachInterface<
BufferizableWithCallOpInterface<BConcrete::KeySwitchLweGPUBufferOp,
memref_keyswitch_lwe_cuda_u64, true>>(
*ctx);
BConcrete::BootstrapLweGPUBufferOp::attachInterface<
BufferizableWithCallOpInterface<BConcrete::BootstrapLweGPUBufferOp,
memref_bootstrap_lwe_cuda_u64, true>>(

View File

@@ -65,32 +65,57 @@ typedef struct double2 {
// From concrete-cuda
#include "bootstrap.h"
#include "device.h"
#include "keyswitch.h"
void memref_keyswitch_lwe_cuda_u64(
uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset,
uint64_t out_size, uint64_t out_stride, uint64_t *ct0_allocated,
uint64_t *ct0_aligned, uint64_t ct0_offset, uint64_t ct0_size,
uint64_t ct0_stride, mlir::concretelang::RuntimeContext *context) {
// TODO: GPU implementation
uint64_t ct0_stride, uint32_t level, uint32_t base_log,
uint32_t input_lwe_dim, uint32_t output_lwe_dim,
mlir::concretelang::RuntimeContext *context) {
// We currently just use the first GPU, but this should be decided
// dynamically or at compile time in the future.
uint32_t gpu_idx = 0;
uint32_t num_samples = 1;
void *stream = cuda_create_stream(gpu_idx);
// move input ciphertext into gpu
void *ct0_gpu = memcpy_async_ct_to_gpu(ct0_allocated, ct0_aligned, ct0_offset,
ct0_size, ct0_stride, gpu_idx, stream);
// move output ciphertext into gpu
void *out_gpu = memcpy_async_ct_to_gpu(out_allocated, out_aligned, out_offset,
out_size, out_stride, gpu_idx, stream);
void *ksk_gpu = memcpy_async_ksk_to_gpu(context, level, input_lwe_dim,
output_lwe_dim, gpu_idx, stream);
cuda_keyswitch_lwe_ciphertext_vector_64(stream, out_gpu, ct0_gpu, ksk_gpu,
input_lwe_dim, output_lwe_dim,
base_log, level, num_samples);
// copy output ciphertext back to cpu
memcpy_async_ct_to_cpu(out_allocated, out_aligned, out_offset, out_size,
out_stride, out_gpu, out_size, gpu_idx, stream);
cuda_synchronize_device(gpu_idx);
// free memory that we allocated on gpu
cuda_drop(ct0_gpu, gpu_idx);
cuda_drop(out_gpu, gpu_idx);
cuda_drop(ksk_gpu, gpu_idx);
cuda_destroy_stream(stream, gpu_idx);
}
void *move_ct_to_gpu(uint64_t *ct_allocated, uint64_t *ct_aligned,
uint64_t ct_offset, uint64_t ct_size, uint64_t ct_stride,
uint32_t gpu_idx) {
void *stream = cuda_create_stream(gpu_idx);
void *memcpy_async_ct_to_gpu(uint64_t *ct_allocated, uint64_t *ct_aligned,
uint64_t ct_offset, uint64_t ct_size,
uint64_t ct_stride, uint32_t gpu_idx,
void *stream) {
size_t buf_size = ct_size * sizeof(uint64_t);
void *ct_gpu = cuda_malloc(buf_size, gpu_idx);
cuda_memcpy_async_to_gpu(ct_gpu, ct_aligned + ct_offset, buf_size, stream,
gpu_idx);
cuda_synchronize_device(gpu_idx);
cuda_destroy_stream(stream, gpu_idx);
return ct_gpu;
}
void *move_bsk_to_gpu(mlir::concretelang::RuntimeContext *context,
uint32_t input_lwe_dim, uint32_t poly_size,
uint32_t level, uint32_t glwe_dim, uint32_t gpu_idx = 0) {
void *stream = cuda_create_stream(gpu_idx);
void *memcpy_async_bsk_to_gpu(mlir::concretelang::RuntimeContext *context,
uint32_t input_lwe_dim, uint32_t poly_size,
uint32_t level, uint32_t glwe_dim,
uint32_t gpu_idx, void *stream) {
LweBootstrapKey64 *bsk = get_bootstrap_key_u64(context);
size_t bsk_buffer_len =
input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * poly_size * level;
@@ -105,20 +130,41 @@ void *move_bsk_to_gpu(mlir::concretelang::RuntimeContext *context,
cuda_initialize_twiddles(poly_size, gpu_idx);
cuda_convert_lwe_bootstrap_key_64(fbsk_gpu, bsk_buffer, stream, gpu_idx,
input_lwe_dim, glwe_dim, level, poly_size);
// This is currently not 100% async as we have to free CPU memory after
// conversion
cuda_synchronize_device(gpu_idx);
cuda_destroy_stream(stream, gpu_idx);
free(bsk_buffer);
return fbsk_gpu;
}
void move_ct_to_cpu(uint64_t *out_allocated, uint64_t *out_aligned,
uint64_t out_offset, uint64_t out_size, uint64_t out_stride,
void *ct_gpu, size_t size, uint32_t gpu_idx) {
void *stream = cuda_create_stream(gpu_idx);
void *memcpy_async_ksk_to_gpu(mlir::concretelang::RuntimeContext *context,
uint32_t level, uint32_t input_lwe_dim,
uint32_t output_lwe_dim, uint32_t gpu_idx,
void *stream) {
LweKeyswitchKey64 *ksk = get_keyswitch_key_u64(context);
size_t ksk_buffer_len = input_lwe_dim * (output_lwe_dim + 1) * level;
size_t ksk_buffer_size = sizeof(uint64_t) * ksk_buffer_len;
uint64_t *ksk_buffer =
(uint64_t *)aligned_alloc(U64_ALIGNMENT, ksk_buffer_size);
void *ksk_gpu = cuda_malloc(ksk_buffer_size, gpu_idx);
CAPI_ASSERT_ERROR(
default_engine_discard_convert_lwe_keyswitch_key_to_lwe_keyswitch_key_mut_view_u64_raw_ptr_buffers(
get_levelled_engine(), ksk, ksk_buffer));
cuda_memcpy_async_to_gpu(ksk_gpu, ksk_buffer, ksk_buffer_size, stream,
gpu_idx);
// This is currently not 100% async as we have to free CPU memory after
// conversion
cuda_synchronize_device(gpu_idx);
free(ksk_buffer);
return ksk_gpu;
}
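// As a size sanity check for memcpy_async_ksk_to_gpu: with the dimensions exercised by the
// FileCheck test added in this commit (input_lwe_dim = 1024, output_lwe_dim = 575, level = 5),
// ksk_buffer_len is 1024 * (575 + 1) * 5 = 2,949,120 64-bit words, i.e. about 22.5 MiB staged
// on the host and the same amount allocated on the GPU.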
void memcpy_async_ct_to_cpu(uint64_t *out_allocated, uint64_t *out_aligned,
uint64_t out_offset, uint64_t out_size,
uint64_t out_stride, void *ct_gpu, size_t size,
uint32_t gpu_idx, void *stream) {
cuda_memcpy_async_to_cpu(out_aligned + out_offset, ct_gpu,
size * sizeof(uint64_t), stream, gpu_idx);
cuda_synchronize_device(gpu_idx);
cuda_destroy_stream(stream, gpu_idx);
}
void free_from_gpu(void *gpu_ptr, uint32_t gpu_idx = 0) {
@@ -139,14 +185,14 @@ void memref_bootstrap_lwe_cuda_u64(
uint32_t gpu_idx = 0;
void *stream = cuda_create_stream(gpu_idx);
// move bsk to gpu
void *fbsk_gpu = move_bsk_to_gpu(context, input_lwe_dim, poly_size, level,
glwe_dim, gpu_idx);
void *fbsk_gpu = memcpy_async_bsk_to_gpu(context, input_lwe_dim, poly_size,
level, glwe_dim, gpu_idx, stream);
// move input ciphertext into gpu
void *ct0_gpu = move_ct_to_gpu(ct0_allocated, ct0_aligned, ct0_offset,
ct0_size, ct0_stride, gpu_idx);
void *ct0_gpu = memcpy_async_ct_to_gpu(ct0_allocated, ct0_aligned, ct0_offset,
ct0_size, ct0_stride, gpu_idx, stream);
// move output ciphertext into gpu
void *out_gpu = move_ct_to_gpu(out_allocated, out_aligned, out_offset,
out_size, out_stride, gpu_idx);
void *out_gpu = memcpy_async_ct_to_gpu(out_allocated, out_aligned, out_offset,
out_size, out_stride, gpu_idx, stream);
// construct LUT GLWE ciphertext
uint64_t glwe_ct_len = poly_size * (glwe_dim + 1);
uint64_t glwe_ct_size = glwe_ct_len * sizeof(uint64_t);
@@ -179,8 +225,8 @@ void memref_bootstrap_lwe_cuda_u64(
fbsk_gpu, input_lwe_dim, poly_size, base_log, level, num_samples,
num_test_vectors, lwe_idx, cuda_get_max_shared_memory(gpu_idx));
// copy output ciphertext back to cpu
move_ct_to_cpu(out_allocated, out_aligned, out_offset, out_size, out_stride,
out_gpu, out_size, gpu_idx);
memcpy_async_ct_to_cpu(out_allocated, out_aligned, out_offset, out_size,
out_stride, out_gpu, out_size, gpu_idx, stream);
cuda_synchronize_device(gpu_idx);
// free memory that we allocated on gpu
cuda_drop(fbsk_gpu, gpu_idx);

View File

@@ -0,0 +1,31 @@
// RUN: concretecompiler --passes concrete-to-bconcrete --action=dump-bconcrete --use-gpu %s 2>&1| FileCheck %s
//CHECK: func.func @main(%arg0: tensor<1025xi64>) -> tensor<1025xi64> {
//CHECK: %c1_i32 = arith.constant 1 : i32
//CHECK: %c8_i32 = arith.constant 8 : i32
//CHECK: %c2_i32 = arith.constant 2 : i32
//CHECK: %c1024_i32 = arith.constant 1024 : i32
//CHECK: %c575_i32 = arith.constant 575 : i32
//CHECK: %cst = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi64>
//CHECK: %c5_i32 = arith.constant 5 : i32
//CHECK: %c2_i32_0 = arith.constant 2 : i32
//CHECK: %c575_i32_1 = arith.constant 575 : i32
//CHECK: %c1024_i32_2 = arith.constant 1024 : i32
//CHECK: %0 = "BConcrete.keyswitch_lwe_gpu_buffer"(%arg0, %c5_i32, %c2_i32_0, %c1024_i32_2, %c575_i32_1) : (tensor<1025xi64>, i32, i32, i32, i32) -> tensor<576xi64>
//CHECK: %1 = "BConcrete.bootstrap_lwe_gpu_buffer"(%0, %cst, %c575_i32, %c1024_i32, %c2_i32, %c8_i32, %c1_i32, %c2_i32) : (tensor<576xi64>, tensor<4xi64>, i32, i32, i32, i32, i32, i32) -> tensor<1025xi64>
//CHECK: return %1 : tensor<1025xi64>
//CHECK: }
func.func @main(%arg0: !Concrete.lwe_ciphertext<1024,2>) -> !Concrete.lwe_ciphertext<1024,2> {
%c1_i32 = arith.constant 1 : i32
%c8_i32 = arith.constant 8 : i32
%c2_i32 = arith.constant 2 : i32
%c1024_i32 = arith.constant 1024 : i32
%c575_i32 = arith.constant 575 : i32
%cst = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi64>
%0 = "Concrete.keyswitch_lwe"(%arg0) {baseLog = 2 : i32, level = 5 : i32} : (!Concrete.lwe_ciphertext<1024,2>) -> !Concrete.lwe_ciphertext<575,2>
%1 = "Concrete.bootstrap_lwe"(%0, %cst, %c575_i32, %c1024_i32, %c2_i32, %c8_i32, %c1_i32, %c2_i32) : (!Concrete.lwe_ciphertext<575,2>, tensor<4xi64>, i32, i32, i32, i32, i32, i32) -> !Concrete.lwe_ciphertext<1024,2>
return %1 : !Concrete.lwe_ciphertext<1024,2>
}