diff --git a/compiler/include/concretelang/Dialect/BConcrete/IR/BConcreteOps.td b/compiler/include/concretelang/Dialect/BConcrete/IR/BConcreteOps.td
index 5b3f720ef..bb2217e80 100644
--- a/compiler/include/concretelang/Dialect/BConcrete/IR/BConcreteOps.td
+++ b/compiler/include/concretelang/Dialect/BConcrete/IR/BConcreteOps.td
@@ -170,4 +170,18 @@ def BConcrete_BootstrapLweGPUBufferOp : BConcrete_Op<"bootstrap_lwe_gpu_buffer">
   let results = (outs 1DTensorOf<[I64]>:$result);
 }
 
+// This is a separate op in BConcrete only because of the way we lower to the
+// CAPI. Once the CAPI lowering is detached from bufferization, we can remove
+// this op and lower to the appropriate CAPI (GPU or CPU) depending on the
+// useGPU compilation option.
+def BConcrete_KeySwitchLweGPUBufferOp : BConcrete_Op<"keyswitch_lwe_gpu_buffer"> {
+  let arguments = (ins
+    1DTensorOf<[I64]>:$ciphertext,
+    I32:$level,
+    I32:$baseLog,
+    I32:$lwe_dim_in,
+    I32:$lwe_dim_out
+  );
+  let results = (outs 1DTensorOf<[I64]>:$result);
+}
+
 #endif
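A sizing note for the 1DTensorOf<[I64]> types used above and throughout this patch: the buffer for an LWE ciphertext of dimension dim holds the dim mask coefficients plus one body, i.e. dim + 1 words. A minimal sketch of that rule (the helper is hypothetical, not part of the patch); the tensor<1025xi64> and tensor<576xi64> buffers in the FileCheck test at the end of this patch follow from it:

    #include <cstdint>

    // Hypothetical helper restating the sizing rule: an lwe_ciphertext<dim, p>
    // value is stored as tensor<(dim + 1) x i64> (mask coefficients + body).
    constexpr int64_t lweBufferLen(int64_t lweDimension) {
      return lweDimension + 1;
    }
    static_assert(lweBufferLen(1024) == 1025, "keyswitch input in the test below");
    static_assert(lweBufferLen(575) == 576, "keyswitch output in the test below");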
diff --git a/compiler/include/concretelang/Runtime/wrappers.h b/compiler/include/concretelang/Runtime/wrappers.h
index 2650b2fed..720640015 100644
--- a/compiler/include/concretelang/Runtime/wrappers.h
+++ b/compiler/include/concretelang/Runtime/wrappers.h
@@ -129,8 +129,7 @@ void memref_copy_one_rank(uint64_t *src_allocated, uint64_t *src_aligned,
 /// \brief Run bootstrapping on GPU.
 ///
 /// It handles memory copy of the different arguments from CPU to GPU, and
-/// freeing memory, except for the bootstrapping key, which should already be in
-/// GPU.
+/// freeing memory.
 ///
 /// \param out_allocated
 /// \param out_aligned
@@ -164,7 +163,35 @@ void memref_bootstrap_lwe_cuda_u64(
     uint32_t base_log, uint32_t glwe_dim, uint32_t precision,
     mlir::concretelang::RuntimeContext *context);
 
-/// \brief Copy ciphertext from CPU to GPU using a single stream.
+/// \brief Run keyswitch on GPU.
+///
+/// It handles memory copy of the different arguments from CPU to GPU, and
+/// freeing memory.
+///
+/// \param out_allocated
+/// \param out_aligned
+/// \param out_offset
+/// \param out_size
+/// \param out_stride
+/// \param ct0_allocated
+/// \param ct0_aligned
+/// \param ct0_offset
+/// \param ct0_size
+/// \param ct0_stride
+/// \param level
+/// \param base_log
+/// \param input_lwe_dim LWE input dimension
+/// \param output_lwe_dim LWE output dimension
+/// \param context
+void memref_keyswitch_lwe_cuda_u64(
+    uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset,
+    uint64_t out_size, uint64_t out_stride, uint64_t *ct0_allocated,
+    uint64_t *ct0_aligned, uint64_t ct0_offset, uint64_t ct0_size,
+    uint64_t ct0_stride, uint32_t level, uint32_t base_log,
+    uint32_t input_lwe_dim, uint32_t output_lwe_dim,
+    mlir::concretelang::RuntimeContext *context);
+
+/// \brief Copy ciphertext from CPU to GPU.
 ///
 /// It handles memory allocation on GPU.
 ///
@@ -174,12 +201,14 @@ void memref_bootstrap_lwe_cuda_u64(
 /// \param ct_size
 /// \param ct_stride
 /// \param gpu_idx index of the GPU to use
+/// \param stream CUDA stream to use for the copy
 /// \return void* pointer to the GPU ciphertext
-void *move_ct_to_gpu(uint64_t *ct_allocated, uint64_t *ct_aligned,
-                     uint64_t ct_offset, uint64_t ct_size, uint64_t ct_stride,
-                     uint32_t gpu_idx);
+void *memcpy_async_ct_to_gpu(uint64_t *ct_allocated, uint64_t *ct_aligned,
+                             uint64_t ct_offset, uint64_t ct_size,
+                             uint64_t ct_stride, uint32_t gpu_idx,
+                             void *stream);
 
-/// \brief Copy ciphertext from GPU to CPU using a single stream.
+/// \brief Copy ciphertext from GPU to CPU.
 ///
 /// Memory on GPU won't be freed after the copy.
 ///
@@ -191,11 +220,13 @@ void *move_ct_to_gpu(uint64_t *ct_allocated, uint64_t *ct_aligned,
 /// \param ct_gpu
 /// \param size
 /// \param gpu_idx index of the GPU to use
-void move_ct_to_cpu(uint64_t *out_allocated, uint64_t *out_aligned,
-                    uint64_t out_offset, uint64_t out_size, uint64_t out_stride,
-                    void *ct_gpu, size_t size, uint32_t gpu_idx);
+/// \param stream CUDA stream to use for the copy
+void memcpy_async_ct_to_cpu(uint64_t *out_allocated, uint64_t *out_aligned,
+                            uint64_t out_offset, uint64_t out_size,
+                            uint64_t out_stride, void *ct_gpu, size_t size,
+                            uint32_t gpu_idx, void *stream);
 
-/// \brief Copy bootstrapping key from CPU to GPU using a single stream.
+/// \brief Copy bootstrapping key from CPU to GPU.
 ///
 /// It handles memory allocation on GPU, as well as conversion to the Fourier
 /// domain.
 ///
@@ -206,10 +237,28 @@ void move_ct_to_cpu(uint64_t *out_allocated, uint64_t *out_aligned,
 /// \param level
 /// \param glwe_dim
 /// \param gpu_idx index of the GPU to use
+/// \param stream CUDA stream to use for the copy
 /// \return void* pointer to the GPU bsk
-void *move_bsk_to_gpu(mlir::concretelang::RuntimeContext *context,
-                      uint32_t input_lwe_dim, uint32_t poly_size,
-                      uint32_t level, uint32_t glwe_dim, uint32_t gpu_idx);
+void *memcpy_async_bsk_to_gpu(mlir::concretelang::RuntimeContext *context,
+                              uint32_t input_lwe_dim, uint32_t poly_size,
+                              uint32_t level, uint32_t glwe_dim,
+                              uint32_t gpu_idx, void *stream);
+
+/// \brief Copy keyswitching key from CPU to GPU.
+///
+/// It handles memory allocation on GPU.
+///
+/// \param context
+/// \param level
+/// \param input_lwe_dim
+/// \param output_lwe_dim
+/// \param gpu_idx index of the GPU to use
+/// \param stream CUDA stream to use for the copy
+/// \return void* pointer to the GPU ksk
+void *memcpy_async_ksk_to_gpu(mlir::concretelang::RuntimeContext *context,
+                              uint32_t level, uint32_t input_lwe_dim,
+                              uint32_t output_lwe_dim, uint32_t gpu_idx,
+                              void *stream);
 
 /// \brief Free gpu memory.
 ///
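A reading aid for the signatures above: every (allocated, aligned, offset, size, stride) group is one rank-1 memref descriptor, passed field by field at the C ABI. A sketch of the grouping as a struct (hypothetical; this type does not appear in the codebase):

    #include <cstdint>

    // Hypothetical regrouping of the five arguments each tensor<Nxi64> value
    // expands to at the wrapper boundary.
    struct MemRef1D {
      uint64_t *allocated; // pointer returned by the allocator, used to free
      uint64_t *aligned;   // aligned pointer actually used for element access
      uint64_t offset;     // index of the first element within `aligned`
      uint64_t size;       // number of elements, e.g. 1025 for tensor<1025xi64>
      uint64_t stride;     // distance between consecutive elements, in elements
    };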
diff --git a/compiler/lib/Conversion/ConcreteToBConcrete/ConcreteToBConcrete.cpp b/compiler/lib/Conversion/ConcreteToBConcrete/ConcreteToBConcrete.cpp
index 0e467be69..a0d580165 100644
--- a/compiler/lib/Conversion/ConcreteToBConcrete/ConcreteToBConcrete.cpp
+++ b/compiler/lib/Conversion/ConcreteToBConcrete/ConcreteToBConcrete.cpp
@@ -201,6 +201,50 @@ struct LowToBConcrete : public mlir::OpRewritePattern<...> {
   };
 };
 
+struct KeySwitchToGPU : public mlir::OpRewritePattern<
+                            mlir::concretelang::Concrete::KeySwitchLweOp> {
+  KeySwitchToGPU(::mlir::MLIRContext *context, mlir::PatternBenefit benefit = 1)
+      : ::mlir::OpRewritePattern<mlir::concretelang::Concrete::KeySwitchLweOp>(
+            context, benefit) {}
+
+  ::mlir::LogicalResult
+  matchAndRewrite(mlir::concretelang::Concrete::KeySwitchLweOp keySwitchOp,
+                  ::mlir::PatternRewriter &rewriter) const override {
+    ConcreteToBConcreteTypeConverter converter;
+
+    mlir::Value levelCst = rewriter.create<mlir::arith::ConstantIntOp>(
+        keySwitchOp.getLoc(), keySwitchOp.level(), 32);
+    mlir::Value baseLogCst = rewriter.create<mlir::arith::ConstantIntOp>(
+        keySwitchOp.getLoc(), keySwitchOp.baseLog(), 32);
+
+    // construct operands for in/out dimensions
+    mlir::concretelang::Concrete::LweCiphertextType outType =
+        keySwitchOp.getType();
+    auto outDim = outType.getDimension();
+    mlir::Value outDimCst = rewriter.create<mlir::arith::ConstantIntOp>(
+        keySwitchOp.getLoc(), outDim, 32);
+    auto inputType =
+        keySwitchOp.ciphertext()
+            .getType()
+            .cast<mlir::concretelang::Concrete::LweCiphertextType>();
+    auto inputDim = inputType.getDimension();
+    mlir::Value inputDimCst = rewriter.create<mlir::arith::ConstantIntOp>(
+        keySwitchOp.getLoc(), inputDim, 32);
+
+    mlir::Operation *bKeySwitchGPUOp = rewriter.replaceOpWithNewOp<
+        mlir::concretelang::BConcrete::KeySwitchLweGPUBufferOp>(
+        keySwitchOp, outType, keySwitchOp.ciphertext(), levelCst, baseLogCst,
+        inputDimCst, outDimCst);
+
+    mlir::concretelang::convertOperandAndResultTypes(
+        rewriter, bKeySwitchGPUOp, [&](mlir::MLIRContext *, mlir::Type t) {
+          return converter.convertType(t);
+        });
+
+    return ::mlir::success();
+  };
+};
+
 struct AddPlaintextLweCiphertextOpPattern
     : public mlir::OpRewritePattern<
           mlir::concretelang::Concrete::AddPlaintextLweCiphertextOp> {
   AddPlaintextLweCiphertextOpPattern(::mlir::MLIRContext *context,
@@ -872,23 +916,24 @@ void ConcreteToBConcretePass::runOnOperation() {
       LowToBConcrete<...>,
-      LowToBConcrete<mlir::concretelang::Concrete::KeySwitchLweOp,
-                     mlir::concretelang::BConcrete::KeySwitchLweBufferOp>,
       LowToBConcrete<...>>(&getContext());
 
   if (this->useGPU) {
-    patterns.insert<LowToBConcrete<
-        mlir::concretelang::Concrete::BootstrapLweOp,
-        mlir::concretelang::BConcrete::BootstrapLweGPUBufferOp>>(
-        &getContext());
+    patterns
+        .insert<LowToBConcrete<mlir::concretelang::Concrete::BootstrapLweOp,
+                               mlir::concretelang::BConcrete::BootstrapLweGPUBufferOp>,
+                KeySwitchToGPU>(&getContext());
   } else {
     patterns.insert<
         LowToBConcrete<mlir::concretelang::Concrete::BootstrapLweOp,
-                       mlir::concretelang::BConcrete::BootstrapLweBufferOp>>(
+                       mlir::concretelang::BConcrete::BootstrapLweBufferOp>,
+        LowToBConcrete<mlir::concretelang::Concrete::KeySwitchLweOp,
+                       mlir::concretelang::BConcrete::KeySwitchLweBufferOp>>(
         &getContext());
   }
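The pattern above turns the op's static parameters into arith.constant operands: level and baseLog come from the Concrete.keyswitch_lwe attributes, and the two LWE dimensions from the input and result ciphertext types. A hedged restatement of the operand values for the 1024 -> 575 test case at the end of this patch (the struct is illustrative only):

    #include <cstdint>

    // Hypothetical mirror of the operands KeySwitchToGPU emits for
    //   "Concrete.keyswitch_lwe"(%arg0) {baseLog = 2 : i32, level = 5 : i32}
    //     : (!Concrete.lwe_ciphertext<1024,2>) -> !Concrete.lwe_ciphertext<575,2>
    struct KeySwitchGpuOperands {
      int32_t level, baseLog, lweDimIn, lweDimOut;
    };
    constexpr KeySwitchGpuOperands kTestCaseOperands{5, 2, 1024, 575};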
diff --git a/compiler/lib/Dialect/BConcrete/Transforms/BufferizableOpInterfaceImpl.cpp b/compiler/lib/Dialect/BConcrete/Transforms/BufferizableOpInterfaceImpl.cpp
index 14ffc2e94..fcdd84b57 100644
--- a/compiler/lib/Dialect/BConcrete/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/compiler/lib/Dialect/BConcrete/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -76,6 +76,7 @@ char memref_bootstrap_lwe_u64[] = "memref_bootstrap_lwe_u64";
 char memref_keyswitch_async_lwe_u64[] = "memref_keyswitch_async_lwe_u64";
 char memref_bootstrap_async_lwe_u64[] = "memref_bootstrap_async_lwe_u64";
 char memref_await_future[] = "memref_await_future";
+char memref_keyswitch_lwe_cuda_u64[] = "memref_keyswitch_lwe_cuda_u64";
 char memref_bootstrap_lwe_cuda_u64[] = "memref_bootstrap_lwe_cuda_u64";
 char memref_expand_lut_in_trivial_glwe_ct_u64[] =
     "memref_expand_lut_in_trivial_glwe_ct_u64";
@@ -112,6 +113,11 @@ mlir::LogicalResult insertForwardDeclarationOfTheCAPI(
   } else if (funcName == memref_keyswitch_lwe_u64) {
     funcType = mlir::FunctionType::get(
         rewriter.getContext(), {memref1DType, memref1DType, contextType}, {});
+  } else if (funcName == memref_keyswitch_lwe_cuda_u64) {
+    funcType = mlir::FunctionType::get(rewriter.getContext(),
+                                       {memref1DType, memref1DType, i32Type,
+                                        i32Type, i32Type, i32Type, contextType},
+                                       {});
   } else if (funcName == memref_bootstrap_lwe_u64) {
     funcType = mlir::FunctionType::get(rewriter.getContext(),
                                        {memref1DType, memref1DType,
@@ -482,6 +488,10 @@ void mlir::concretelang::BConcrete::
       BufferizableWithCallOpInterface<...>>(
       *ctx);
+  BConcrete::KeySwitchLweGPUBufferOp::attachInterface<
+      BufferizableWithCallOpInterface<BConcrete::KeySwitchLweGPUBufferOp,
+                                      memref_keyswitch_lwe_cuda_u64>>(
+      *ctx);
   BConcrete::BootstrapLweGPUBufferOp::attachInterface<
       BufferizableWithCallOpInterface<BConcrete::BootstrapLweGPUBufferOp,
                                       memref_bootstrap_lwe_cuda_u64>>(
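The funcType assembled above ({memref1DType, memref1DType, i32Type x4, contextType}) lines up with the wrapper implemented in wrappers.cpp below, once each memref is expanded into its five descriptor fields. A compile-time restatement of that correspondence (the alias and check are hypothetical, purely illustrative):

    #include "concretelang/Runtime/wrappers.h"

    #include <cstdint>

    // Hypothetical function-pointer alias regrouping the wrapper's parameters
    // to mirror the forward declaration's operand types.
    using KeySwitchCudaWrapperTy = void (*)(
        uint64_t *, uint64_t *, uint64_t, uint64_t, uint64_t, // memref1DType (out)
        uint64_t *, uint64_t *, uint64_t, uint64_t, uint64_t, // memref1DType (ct0)
        uint32_t, uint32_t, uint32_t, uint32_t,               // i32Type x4
        mlir::concretelang::RuntimeContext *);                // contextType
    // Fails to compile if the wrapper's signature drifts from the declaration.
    constexpr KeySwitchCudaWrapperTy kKeySwitchCheck = memref_keyswitch_lwe_cuda_u64;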
diff --git a/compiler/lib/Runtime/wrappers.cpp b/compiler/lib/Runtime/wrappers.cpp
index 4ccf8233f..98f05c3ed 100644
--- a/compiler/lib/Runtime/wrappers.cpp
+++ b/compiler/lib/Runtime/wrappers.cpp
@@ -65,32 +65,57 @@ typedef struct double2 {
 // From concrete-cuda
 #include "bootstrap.h"
 #include "device.h"
+#include "keyswitch.h"
 
 void memref_keyswitch_lwe_cuda_u64(
     uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset,
     uint64_t out_size, uint64_t out_stride, uint64_t *ct0_allocated,
     uint64_t *ct0_aligned, uint64_t ct0_offset, uint64_t ct0_size,
-    uint64_t ct0_stride, mlir::concretelang::RuntimeContext *context) {
-  // TODO: GPU implementation
+    uint64_t ct0_stride, uint32_t level, uint32_t base_log,
+    uint32_t input_lwe_dim, uint32_t output_lwe_dim,
+    mlir::concretelang::RuntimeContext *context) {
+  // We currently just use the first GPU, but this should be decided
+  // dynamically, or during compilation, in the future.
+  uint32_t gpu_idx = 0;
+  uint32_t num_samples = 1;
+  void *stream = cuda_create_stream(gpu_idx);
+  // move input ciphertext into gpu
+  void *ct0_gpu = memcpy_async_ct_to_gpu(ct0_allocated, ct0_aligned, ct0_offset,
+                                         ct0_size, ct0_stride, gpu_idx, stream);
+  // move output ciphertext into gpu
+  void *out_gpu = memcpy_async_ct_to_gpu(out_allocated, out_aligned, out_offset,
+                                         out_size, out_stride, gpu_idx, stream);
+  // move keyswitch key into gpu
+  void *ksk_gpu = memcpy_async_ksk_to_gpu(context, level, input_lwe_dim,
+                                          output_lwe_dim, gpu_idx, stream);
+  cuda_keyswitch_lwe_ciphertext_vector_64(stream, out_gpu, ct0_gpu, ksk_gpu,
+                                          input_lwe_dim, output_lwe_dim,
+                                          base_log, level, num_samples);
+  // copy output ciphertext back to cpu
+  memcpy_async_ct_to_cpu(out_allocated, out_aligned, out_offset, out_size,
+                         out_stride, out_gpu, out_size, gpu_idx, stream);
+  cuda_synchronize_device(gpu_idx);
+  // free memory that we allocated on gpu
+  cuda_drop(ct0_gpu, gpu_idx);
+  cuda_drop(out_gpu, gpu_idx);
+  cuda_drop(ksk_gpu, gpu_idx);
+  cuda_destroy_stream(stream, gpu_idx);
 }
 
-void *move_ct_to_gpu(uint64_t *ct_allocated, uint64_t *ct_aligned,
-                     uint64_t ct_offset, uint64_t ct_size, uint64_t ct_stride,
-                     uint32_t gpu_idx) {
-  void *stream = cuda_create_stream(gpu_idx);
+void *memcpy_async_ct_to_gpu(uint64_t *ct_allocated, uint64_t *ct_aligned,
+                             uint64_t ct_offset, uint64_t ct_size,
+                             uint64_t ct_stride, uint32_t gpu_idx,
+                             void *stream) {
   size_t buf_size = ct_size * sizeof(uint64_t);
   void *ct_gpu = cuda_malloc(buf_size, gpu_idx);
   cuda_memcpy_async_to_gpu(ct_gpu, ct_aligned + ct_offset, buf_size, stream,
                            gpu_idx);
-  cuda_synchronize_device(gpu_idx);
-  cuda_destroy_stream(stream, gpu_idx);
   return ct_gpu;
 }
 
-void *move_bsk_to_gpu(mlir::concretelang::RuntimeContext *context,
-                      uint32_t input_lwe_dim, uint32_t poly_size,
-                      uint32_t level, uint32_t glwe_dim, uint32_t gpu_idx = 0) {
-  void *stream = cuda_create_stream(gpu_idx);
+void *memcpy_async_bsk_to_gpu(mlir::concretelang::RuntimeContext *context,
+                              uint32_t input_lwe_dim, uint32_t poly_size,
+                              uint32_t level, uint32_t glwe_dim,
+                              uint32_t gpu_idx, void *stream) {
   LweBootstrapKey64 *bsk = get_bootstrap_key_u64(context);
   size_t bsk_buffer_len =
       input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * poly_size * level;
@@ -105,20 +130,41 @@ void *move_bsk_to_gpu(mlir::concretelang::RuntimeContext *context,
   cuda_initialize_twiddles(poly_size, gpu_idx);
   cuda_convert_lwe_bootstrap_key_64(fbsk_gpu, bsk_buffer, stream, gpu_idx,
                                     input_lwe_dim, glwe_dim, level, poly_size);
+  // This is currently not 100% async, as we have to free CPU memory after
+  // the conversion.
   cuda_synchronize_device(gpu_idx);
-  cuda_destroy_stream(stream, gpu_idx);
   free(bsk_buffer);
   return fbsk_gpu;
 }
 
-void move_ct_to_cpu(uint64_t *out_allocated, uint64_t *out_aligned,
-                    uint64_t out_offset, uint64_t out_size, uint64_t out_stride,
-                    void *ct_gpu, size_t size, uint32_t gpu_idx) {
-  void *stream = cuda_create_stream(gpu_idx);
+void *memcpy_async_ksk_to_gpu(mlir::concretelang::RuntimeContext *context,
+                              uint32_t level, uint32_t input_lwe_dim,
+                              uint32_t output_lwe_dim, uint32_t gpu_idx,
+                              void *stream) {
+  LweKeyswitchKey64 *ksk = get_keyswitch_key_u64(context);
+  size_t ksk_buffer_len = input_lwe_dim * (output_lwe_dim + 1) * level;
+  size_t ksk_buffer_size = sizeof(uint64_t) * ksk_buffer_len;
+  uint64_t *ksk_buffer =
+      (uint64_t *)aligned_alloc(U64_ALIGNMENT, ksk_buffer_size);
+  void *ksk_gpu = cuda_malloc(ksk_buffer_size, gpu_idx);
+  CAPI_ASSERT_ERROR(
+      default_engine_discard_convert_lwe_keyswitch_key_to_lwe_keyswitch_key_mut_view_u64_raw_ptr_buffers(
+          get_levelled_engine(), ksk, ksk_buffer));
+  cuda_memcpy_async_to_gpu(ksk_gpu, ksk_buffer, ksk_buffer_size, stream,
+                           gpu_idx);
+  // This is currently not 100% async, as we have to free CPU memory after
+  // the conversion.
+  cuda_synchronize_device(gpu_idx);
+  free(ksk_buffer);
+  return ksk_gpu;
+}
+
+void memcpy_async_ct_to_cpu(uint64_t *out_allocated, uint64_t *out_aligned,
+                            uint64_t out_offset, uint64_t out_size,
+                            uint64_t out_stride, void *ct_gpu, size_t size,
+                            uint32_t gpu_idx, void *stream) {
   cuda_memcpy_async_to_cpu(out_aligned + out_offset, ct_gpu,
                            size * sizeof(uint64_t), stream, gpu_idx);
-  cuda_synchronize_device(gpu_idx);
-  cuda_destroy_stream(stream, gpu_idx);
 }
 
 void free_from_gpu(void *gpu_ptr, uint32_t gpu_idx = 0) {
@@ -139,14 +185,14 @@ void memref_bootstrap_lwe_cuda_u64(
   uint32_t gpu_idx = 0;
   void *stream = cuda_create_stream(gpu_idx);
   // move bsk to gpu
-  void *fbsk_gpu = move_bsk_to_gpu(context, input_lwe_dim, poly_size, level,
-                                   glwe_dim, gpu_idx);
+  void *fbsk_gpu = memcpy_async_bsk_to_gpu(context, input_lwe_dim, poly_size,
+                                           level, glwe_dim, gpu_idx, stream);
   // move input ciphertext into gpu
-  void *ct0_gpu = move_ct_to_gpu(ct0_allocated, ct0_aligned, ct0_offset,
-                                 ct0_size, ct0_stride, gpu_idx);
+  void *ct0_gpu = memcpy_async_ct_to_gpu(ct0_allocated, ct0_aligned, ct0_offset,
+                                         ct0_size, ct0_stride, gpu_idx, stream);
   // move output ciphertext into gpu
-  void *out_gpu = move_ct_to_gpu(out_allocated, out_aligned, out_offset,
-                                 out_size, out_stride, gpu_idx);
+  void *out_gpu = memcpy_async_ct_to_gpu(out_allocated, out_aligned, out_offset,
+                                         out_size, out_stride, gpu_idx, stream);
   // construct LUT GLWE ciphertext
   uint64_t glwe_ct_len = poly_size * (glwe_dim + 1);
   uint64_t glwe_ct_size = glwe_ct_len * sizeof(uint64_t);
@@ -179,8 +225,8 @@ void memref_bootstrap_lwe_cuda_u64(
       fbsk_gpu, input_lwe_dim, poly_size, base_log, level, num_samples,
       num_test_vectors, lwe_idx, cuda_get_max_shared_memory(gpu_idx));
   // copy output ciphertext back to cpu
-  move_ct_to_cpu(out_allocated, out_aligned, out_offset, out_size, out_stride,
-                 out_gpu, out_size, gpu_idx);
+  memcpy_async_ct_to_cpu(out_allocated, out_aligned, out_offset, out_size,
+                         out_stride, out_gpu, out_size, gpu_idx, stream);
   cuda_synchronize_device(gpu_idx);
   // free memory that we allocated on gpu
   cuda_drop(fbsk_gpu, gpu_idx);
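To make the calling convention concrete, a hedged sketch of invoking the new wrapper directly with the shapes used by the FileCheck test below; it assumes a RuntimeContext already holding a 1024 -> 575 keyswitch key with level = 5 and base_log = 2, and is not part of the patch:

    #include "concretelang/Runtime/wrappers.h"

    #include <cstdint>
    #include <vector>

    // Hypothetical harness: keyswitch a single LWE ciphertext on the GPU.
    void runKeySwitchOnGpu(mlir::concretelang::RuntimeContext *context,
                           std::vector<uint64_t> &ct /* 1025 = 1024 + 1 words */) {
      std::vector<uint64_t> out(576); // output LWE dimension 575, plus the body
      memref_keyswitch_lwe_cuda_u64(
          /*out*/ out.data(), out.data(), 0, out.size(), 1,
          /*ct0*/ ct.data(), ct.data(), 0, ct.size(), 1,
          /*level=*/5, /*base_log=*/2,
          /*input_lwe_dim=*/1024, /*output_lwe_dim=*/575, context);
    }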
diff --git a/compiler/tests/check_tests/Conversion/ConcreteToBConcrete/gpu_ops.mlir b/compiler/tests/check_tests/Conversion/ConcreteToBConcrete/gpu_ops.mlir
new file mode 100644
index 000000000..42622555c
--- /dev/null
+++ b/compiler/tests/check_tests/Conversion/ConcreteToBConcrete/gpu_ops.mlir
@@ -0,0 +1,31 @@
+// RUN: concretecompiler --passes concrete-to-bconcrete --action=dump-bconcrete --use-gpu %s 2>&1 | FileCheck %s
+
+
+//CHECK: func.func @main(%arg0: tensor<1025xi64>) -> tensor<1025xi64> {
+//CHECK:   %c1_i32 = arith.constant 1 : i32
+//CHECK:   %c8_i32 = arith.constant 8 : i32
+//CHECK:   %c2_i32 = arith.constant 2 : i32
+//CHECK:   %c1024_i32 = arith.constant 1024 : i32
+//CHECK:   %c575_i32 = arith.constant 575 : i32
+//CHECK:   %cst = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi64>
+//CHECK:   %c5_i32 = arith.constant 5 : i32
+//CHECK:   %c2_i32_0 = arith.constant 2 : i32
+//CHECK:   %c575_i32_1 = arith.constant 575 : i32
+//CHECK:   %c1024_i32_2 = arith.constant 1024 : i32
+//CHECK:   %0 = "BConcrete.keyswitch_lwe_gpu_buffer"(%arg0, %c5_i32, %c2_i32_0, %c1024_i32_2, %c575_i32_1) : (tensor<1025xi64>, i32, i32, i32, i32) -> tensor<576xi64>
+//CHECK:   %1 = "BConcrete.bootstrap_lwe_gpu_buffer"(%0, %cst, %c575_i32, %c1024_i32, %c2_i32, %c8_i32, %c1_i32, %c2_i32) : (tensor<576xi64>, tensor<4xi64>, i32, i32, i32, i32, i32, i32) -> tensor<1025xi64>
+//CHECK:   return %1 : tensor<1025xi64>
+//CHECK: }
+func.func @main(%arg0: !Concrete.lwe_ciphertext<1024,2>) -> !Concrete.lwe_ciphertext<1024,2> {
+  %c1_i32 = arith.constant 1 : i32
+  %c8_i32 = arith.constant 8 : i32
+  %c2_i32 = arith.constant 2 : i32
+  %c1024_i32 = arith.constant 1024 : i32
+  %c575_i32 = arith.constant 575 : i32
+  %cst = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi64>
+  %0 = "Concrete.keyswitch_lwe"(%arg0) {baseLog = 2 : i32, level = 5 : i32} : (!Concrete.lwe_ciphertext<1024,2>) -> !Concrete.lwe_ciphertext<575,2>
+  %1 = "Concrete.bootstrap_lwe"(%0, %cst, %c575_i32, %c1024_i32, %c2_i32, %c8_i32, %c1_i32, %c2_i32) : (!Concrete.lwe_ciphertext<575,2>, tensor<4xi64>, i32, i32, i32, i32, i32, i32) -> !Concrete.lwe_ciphertext<1024,2>
+  return %1 : !Concrete.lwe_ciphertext<1024,2>
+}
+