diff --git a/compiler/CMakeLists.txt b/compiler/CMakeLists.txt index fac7be971..c25367986 100644 --- a/compiler/CMakeLists.txt +++ b/compiler/CMakeLists.txt @@ -55,7 +55,19 @@ include_directories(${CONCRETE_FFI_RELEASE}) add_library(Concrete STATIC IMPORTED) set_target_properties(Concrete PROPERTIES IMPORTED_LOCATION ${CONCRETE_FFI_RELEASE}/libconcrete_core_ffi.a) -# ------------------------------------------------------------------------------- +#-------------------------------------------------------------------------------- +# Concrete Cuda Configuration +#-------------------------------------------------------------------------------- +option(CONCRETELANG_CUDA_SUPPORT "Support Concrete CUDA Execution." OFF) +if(CONCRETELANG_CUDA_SUPPORT) + message(STATUS "Building with Concrete CUDA execution support") + include_directories(${CONCRETE_CORE_PATH}/concrete-cuda/cuda/include) + add_library(ConcreteCUDA STATIC IMPORTED) + set_target_properties(ConcreteCUDA PROPERTIES IMPORTED_LOCATION ${CONCRETE_CORE_PATH}/concrete-cuda/cuda/build/src/libconcrete_cuda.a ) + add_compile_options(-DCONCRETELANG_CUDA_SUPPORT) +endif() + +#-------------------------------------------------------------------------------- # Python Configuration # ------------------------------------------------------------------------------- option(CONCRETELANG_BINDINGS_PYTHON_ENABLED "Enables ConcreteLang Python bindings." ON) diff --git a/compiler/include/concretelang/Conversion/BConcreteToCAPI/Pass.h b/compiler/include/concretelang/Conversion/BConcreteToCAPI/Pass.h new file mode 100644 index 000000000..0b28a4692 --- /dev/null +++ b/compiler/include/concretelang/Conversion/BConcreteToCAPI/Pass.h @@ -0,0 +1,18 @@ +// Part of the Concrete Compiler Project, under the BSD3 License with Zama +// Exceptions. See +// https://github.com/zama-ai/concrete-compiler-internal/blob/main/LICENSE.txt +// for license information. 
+ +#ifndef ZAMALANG_CONVERSION_BCONCRETETOCAPI_PASS_H_ +#define ZAMALANG_CONVERSION_BCONCRETETOCAPI_PASS_H_ + +#include "mlir/Pass/Pass.h" + +namespace mlir { +namespace concretelang { +/// Create a pass to convert `BConcrete` dialect to CAPI calls. +std::unique_ptr> createConvertBConcreteToCAPIPass(); +} // namespace concretelang +} // namespace mlir + +#endif diff --git a/compiler/include/concretelang/Conversion/ConcreteToGPU/Pass.h b/compiler/include/concretelang/Conversion/ConcreteToGPU/Pass.h new file mode 100644 index 000000000..cafb2ae02 --- /dev/null +++ b/compiler/include/concretelang/Conversion/ConcreteToGPU/Pass.h @@ -0,0 +1,18 @@ +// Part of the Concrete Compiler Project, under the BSD3 License with Zama +// Exceptions. See +// https://github.com/zama-ai/concrete-compiler-internal/blob/main/LICENSE.txt +// for license information. + +#ifndef ZAMALANG_CONVERSION_CONCRETETOGPU_PASS_H_ +#define ZAMALANG_CONVERSION_CONCRETETOGPU_PASS_H_ + +#include "mlir/Pass/Pass.h" + +namespace mlir { +namespace concretelang { +/// Create a pass to convert `Concrete` operations to GPU. 
+std::unique_ptr> createConvertConcreteToGPUPass(); +} // namespace concretelang +} // namespace mlir + +#endif diff --git a/compiler/include/concretelang/Conversion/Passes.h b/compiler/include/concretelang/Conversion/Passes.h index 0ab381ae3..a0de4a566 100644 --- a/compiler/include/concretelang/Conversion/Passes.h +++ b/compiler/include/concretelang/Conversion/Passes.h @@ -13,7 +13,9 @@ #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/SCF/IR/SCF.h" +#include "concretelang/Conversion/BConcreteToCAPI/Pass.h" #include "concretelang/Conversion/ConcreteToBConcrete/Pass.h" +#include "concretelang/Conversion/ConcreteToGPU/Pass.h" #include "concretelang/Conversion/FHETensorOpsToLinalg/Pass.h" #include "concretelang/Conversion/FHEToTFHE/Pass.h" #include "concretelang/Conversion/LinalgExtras/Passes.h" diff --git a/compiler/include/concretelang/Conversion/Passes.td b/compiler/include/concretelang/Conversion/Passes.td index 1cfaf3183..ebc3c930d 100644 --- a/compiler/include/concretelang/Conversion/Passes.td +++ b/compiler/include/concretelang/Conversion/Passes.td @@ -47,6 +47,20 @@ def ConcreteToBConcrete : Pass<"concrete-to-bconcrete", "mlir::ModuleOp"> { let dependentDialects = ["mlir::linalg::LinalgDialect", "mlir::concretelang::Concrete::ConcreteDialect", "mlir::concretelang::BConcrete::BConcreteDialect"]; } +def BConcreteToCAPI : Pass<"bconcrete-to-capi", "mlir::ModuleOp"> { + let summary = "Lowers operations from the BConcrete dialect to CAPI calls"; + let description = [{ Lowers operations from the BConcrete dialect to CAPI calls }]; + let constructor = "mlir::concretelang::createConvertBConcreteToCAPIPass()"; + let dependentDialects = ["mlir::concretelang::BConcrete::BConcreteDialect"]; +} + +def ConcreteToGPU : Pass<"concrete-to-gpu", "mlir::ModuleOp"> { + let summary = "Transforms operations in the Concrete dialect to GPU"; + let description = [{ Transforms operations in the Concrete dialect to GPU }]; + let constructor = 
"mlir::concretelang::createConvertConcreteToGPUPass()"; + let dependentDialects = ["mlir::concretelang::Concrete::ConcreteDialect"]; +} + def MLIRLowerableDialectsToLLVM : Pass<"mlir-lowerable-dialects-to-llvm", "mlir::ModuleOp"> { let summary = "Lowers operations from MLIR lowerable dialects to LLVM"; let constructor = "mlir::concretelang::createConvertMLIRLowerableDialectsToLLVMPass()"; diff --git a/compiler/include/concretelang/Conversion/Tools.h b/compiler/include/concretelang/Conversion/Tools.h index 46ab11d98..41d841151 100644 --- a/compiler/include/concretelang/Conversion/Tools.h +++ b/compiler/include/concretelang/Conversion/Tools.h @@ -9,3 +9,9 @@ mlir::LogicalResult insertForwardDeclaration(mlir::Operation *op, mlir::OpBuilder &rewriter, llvm::StringRef funcName, mlir::FunctionType funcType); + +/// \brief Returns the value of the context argument from the enclosing func +/// +/// \param op initial operation to start the search from +/// \return mlir::Value the context value +mlir::Value getContextArgument(mlir::Operation *op); diff --git a/compiler/include/concretelang/Dialect/BConcrete/IR/BConcreteOps.h b/compiler/include/concretelang/Dialect/BConcrete/IR/BConcreteOps.h index 06cc85d02..568843db0 100644 --- a/compiler/include/concretelang/Dialect/BConcrete/IR/BConcreteOps.h +++ b/compiler/include/concretelang/Dialect/BConcrete/IR/BConcreteOps.h @@ -6,6 +6,7 @@ #ifndef ZAMALANG_DIALECT_BConcrete_BConcrete_OPS_H #define ZAMALANG_DIALECT_BConcrete_BConcrete_OPS_H +#include #include #include #include diff --git a/compiler/include/concretelang/Dialect/BConcrete/IR/BConcreteOps.td b/compiler/include/concretelang/Dialect/BConcrete/IR/BConcreteOps.td index 5ce17e652..d18c226c4 100644 --- a/compiler/include/concretelang/Dialect/BConcrete/IR/BConcreteOps.td +++ b/compiler/include/concretelang/Dialect/BConcrete/IR/BConcreteOps.td @@ -5,6 +5,7 @@ include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/Interfaces/ControlFlowInterfaces.td" include 
"mlir/IR/BuiltinTypes.td" include "mlir/Dialect/MemRef/IR/MemRefBase.td" +include "mlir/Dialect/LLVMIR/LLVMOpBase.td" include "concretelang/Dialect/BConcrete/IR/BConcreteDialect.td" include "concretelang/Dialect/Concrete/IR/ConcreteTypes.td" @@ -157,4 +158,27 @@ def BConcrete_AwaitFutureOp : let results = (outs 1DTensorOf<[I64]>:$result); } +def BConcrete_BootstrapLweGPUBufferOp : BConcrete_Op<"bootstrap_lwe_gpu_buffer"> { + let arguments = (ins + 1DTensorOf<[I64]>:$input_ciphertext, + 1DTensorOf<[I64]>:$table, + I32:$inputLweDim, + I32:$polySize, + I32:$level, + I32:$baseLog, + LLVM_PointerTo:$bsk + ); + let results = (outs 1DTensorOf<[I64]>:$result); +} + +def BConcrete_MoveBskToGPUOp : BConcrete_Op<"move_bsk_to_gpu"> { + let arguments = (ins); + let results = (outs LLVM_PointerTo:$bsk); +} + +def BConcrete_FreeBskFromGPUOp : BConcrete_Op<"free_bsk_from_gpu"> { + let arguments = (ins LLVM_PointerTo:$bsk); + let results = (outs); +} + #endif diff --git a/compiler/include/concretelang/Dialect/Concrete/IR/ConcreteOps.td b/compiler/include/concretelang/Dialect/Concrete/IR/ConcreteOps.td index 46d90af16..0b72e042b 100644 --- a/compiler/include/concretelang/Dialect/Concrete/IR/ConcreteOps.td +++ b/compiler/include/concretelang/Dialect/Concrete/IR/ConcreteOps.td @@ -52,7 +52,7 @@ def Concrete_NegateLweCiphertextOp : Concrete_Op<"negate_lwe_ciphertext"> { let results = (outs Concrete_LweCiphertextType:$result); } -def Concrete_GlweFromTable : Concrete_Op<"glwe_from_table"> { +def Concrete_GlweFromTable : Concrete_Op<"glwe_from_table", [NoSideEffect]> { let summary = "Creates a GLWE ciphertext which is the trivial encrytion of a the input table interpreted as a polynomial (to use later in a bootstrap)"; let arguments = (ins 1DTensorOf<[I64]>:$table); @@ -71,6 +71,35 @@ def Concrete_BootstrapLweOp : Concrete_Op<"bootstrap_lwe"> { let results = (outs Concrete_LweCiphertextType:$result); } +def Concrete_BootstrapLweGPUOp : Concrete_Op<"bootstrap_lwe_gpu"> { + let summary = 
"Bootstrap an LWE ciphertext in GPU using a lookup table"; + + let arguments = (ins + Concrete_LweCiphertextType:$input_ciphertext, + 1DTensorOf<[I64]>:$table, + I32:$inputLweDim, + I32:$polySize, + I32:$level, + I32:$baseLog, + Concrete_GPUBsk:$bsk + ); + let results = (outs Concrete_LweCiphertextType:$result); +} + +def Concrete_MoveBskToGPUOp : Concrete_Op<"move_bsk_to_gpu"> { + let summary = "Move bsk to GPU"; + + let arguments = (ins); + let results = (outs Concrete_GPUBsk:$bsk); +} + +def Concrete_FreeBskFromGPUOp : Concrete_Op<"free_bsk_from_gpu"> { + let summary = "Free bsk memory from GPU"; + + let arguments = (ins Concrete_GPUBsk:$bsk); + let results = (outs); +} + def Concrete_KeySwitchLweOp : Concrete_Op<"keyswitch_lwe"> { let summary = "Keyswitches a LWE ciphertext"; diff --git a/compiler/include/concretelang/Dialect/Concrete/IR/ConcreteTypes.td b/compiler/include/concretelang/Dialect/Concrete/IR/ConcreteTypes.td index 12c64fefb..5c597b527 100644 --- a/compiler/include/concretelang/Dialect/Concrete/IR/ConcreteTypes.td +++ b/compiler/include/concretelang/Dialect/Concrete/IR/ConcreteTypes.td @@ -93,4 +93,14 @@ def Concrete_Context : Concrete_Type<"Context"> { }]; } +def Concrete_GPUBsk : Concrete_Type<"GPUBsk"> { + let mnemonic = "gpu_bsk"; + + let summary = "A bsk in GPU"; + + let description = [{ + A bootstrapping key in GPU memory + }]; +} + #endif diff --git a/compiler/include/concretelang/Runtime/wrappers.h b/compiler/include/concretelang/Runtime/wrappers.h index a9a85cf9a..cdaea01d8 100644 --- a/compiler/include/concretelang/Runtime/wrappers.h +++ b/compiler/include/concretelang/Runtime/wrappers.h @@ -105,6 +105,87 @@ void memref_copy_one_rank(uint64_t *src_allocated, uint64_t *src_aligned, uint64_t src_stride, uint64_t *dst_allocated, uint64_t *dst_aligned, uint64_t dst_offset, uint64_t dst_size, uint64_t dst_stride); -} +/// \brief Run bootstrapping on GPU. 
+/// +/// It handles memory copy of the different arguments from CPU to GPU, and +/// freeing memory, except for the bootstrapping key, which should already be in +/// GPU. +/// +/// \param out_allocated +/// \param out_aligned +/// \param out_offset +/// \param out_size +/// \param out_stride +/// \param ct0_allocated +/// \param ct0_aligned +/// \param ct0_offset +/// \param ct0_size +/// \param ct0_stride +/// \param tlu_allocated +/// \param tlu_aligned +/// \param tlu_offset +/// \param tlu_size +/// \param tlu_stride +/// \param input_lwe_dim LWE input dimension +/// \param poly_size polynomial size +/// \param level level +/// \param base_log base log +/// \param bsk pointer to bsk on GPU +void memref_bootstrap_lwe_cuda_u64( + uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset, + uint64_t out_size, uint64_t out_stride, uint64_t *ct0_allocated, + uint64_t *ct0_aligned, uint64_t ct0_offset, uint64_t ct0_size, + uint64_t ct0_stride, uint64_t *tlu_allocated, uint64_t *tlu_aligned, + uint64_t tlu_offset, uint64_t tlu_size, uint64_t tlu_stride, + uint32_t input_lwe_dim, uint32_t poly_size, uint32_t level, + uint32_t base_log, void *bsk); + +/// \brief Copy ciphertext from CPU to GPU using a single stream. +/// +/// It handles memory allocation on GPU. +/// +/// \param ct_allocated +/// \param ct_aligned +/// \param ct_offset +/// \param ct_size +/// \param ct_stride +/// \param gpu_idx index of the GPU to use +/// \return void* pointer to the GPU ciphertext +void *move_ct_to_gpu(uint64_t *ct_allocated, uint64_t *ct_aligned, + uint64_t ct_offset, uint64_t ct_size, uint64_t ct_stride, + uint32_t gpu_idx); + +/// \brief Copy ciphertext from GPU to CPU using a single stream. +/// +/// Memory on GPU won't be freed after the copy. 
+/// +/// \param out_allocated +/// \param out_aligned +/// \param out_offset +/// \param out_size +/// \param out_stride +/// \param ct_gpu +/// \param size +/// \param gpu_idx index of the GPU to use +void move_ct_to_cpu(uint64_t *out_allocated, uint64_t *out_aligned, + uint64_t out_offset, uint64_t out_size, uint64_t out_stride, + void *ct_gpu, size_t size, uint32_t gpu_idx); + +/// \brief Copy bootstrapping key from CPU to GPU using a single stream. +/// +/// It handles memory allocation on GPU. +/// +/// \param context +/// \param gpu_idx index of the GPU to use +/// \return void* pointer to the GPU bsk +void *move_bsk_to_gpu(mlir::concretelang::RuntimeContext *context, + uint32_t gpu_idx); + +/// \brief Free gpu memory. +/// +/// \param gpu_ptr pointer to the GPU memory to free +/// \param gpu_idx index of the GPU to use +void free_from_gpu(void *gpu_ptr, uint32_t gpu_idx); +} #endif diff --git a/compiler/include/concretelang/Support/CompilerEngine.h b/compiler/include/concretelang/Support/CompilerEngine.h index 541f062d7..e50c321e3 100644 --- a/compiler/include/concretelang/Support/CompilerEngine.h +++ b/compiler/include/concretelang/Support/CompilerEngine.h @@ -54,6 +54,8 @@ struct CompilationOptions { bool dataflowParallelize; bool asyncOffload; bool optimizeConcrete; + /// use GPU during execution by generating GPU operations if possible + bool useGPU; llvm::Optional> fhelinalgTileSizes; llvm::Optional clientParametersFuncName; @@ -64,7 +66,7 @@ struct CompilationOptions { : v0FHEConstraints(llvm::None), verifyDiagnostics(false), autoParallelize(false), loopParallelize(false), dataflowParallelize(false), asyncOffload(false), optimizeConcrete(true), - clientParametersFuncName(llvm::None), + useGPU(false), clientParametersFuncName(llvm::None), optimizerConfig(optimizer::DEFAULT_CONFIG){}; CompilationOptions(std::string funcname) : CompilationOptions() { diff --git a/compiler/include/concretelang/Support/Pipeline.h 
b/compiler/include/concretelang/Support/Pipeline.h index a5ef12ffb..68838e7ef 100644 --- a/compiler/include/concretelang/Support/Pipeline.h +++ b/compiler/include/concretelang/Support/Pipeline.h @@ -57,6 +57,10 @@ mlir::LogicalResult asyncOffload(mlir::MLIRContext &context, mlir::ModuleOp &module, std::function enablePass); +mlir::LogicalResult +transformsConcreteToGPU(mlir::MLIRContext &context, mlir::ModuleOp &module, + std::function enablePass); + mlir::LogicalResult lowerBConcreteToStd(mlir::MLIRContext &context, mlir::ModuleOp &module, std::function enablePass); diff --git a/compiler/lib/Conversion/BConcreteToCAPI/BConcreteToCAPI.cpp b/compiler/lib/Conversion/BConcreteToCAPI/BConcreteToCAPI.cpp new file mode 100644 index 000000000..8a94654cf --- /dev/null +++ b/compiler/lib/Conversion/BConcreteToCAPI/BConcreteToCAPI.cpp @@ -0,0 +1,119 @@ +// Part of the Concrete Compiler Project, under the BSD3 License with Zama +// Exceptions. See +// https://github.com/zama-ai/concrete-compiler-internal/blob/main/LICENSE.txt +// for license information. 
+ +#include +#include +#include + +#include "concretelang/Conversion/Passes.h" +#include "concretelang/Conversion/Tools.h" +#include "concretelang/Dialect/BConcrete/IR/BConcreteDialect.h" +#include "concretelang/Dialect/BConcrete/IR/BConcreteOps.h" + +char move_bsk_to_gpu[] = "move_bsk_to_gpu"; +char free_from_gpu[] = "free_from_gpu"; + +/// \brief Rewrites `BConcrete.move_bsk_to_gpu` into a CAPI call to +/// `move_bsk_to_gpu` +/// +/// Also insert the forward declaration of `move_bsk_to_gpu` +struct MoveBskOpPattern : public mlir::OpRewritePattern< + mlir::concretelang::BConcrete::MoveBskToGPUOp> { + MoveBskOpPattern(::mlir::MLIRContext *context, + mlir::PatternBenefit benefit = 1) + : ::mlir::OpRewritePattern( + context, benefit) {} + + ::mlir::LogicalResult + matchAndRewrite(mlir::concretelang::BConcrete::MoveBskToGPUOp moveBskOp, + ::mlir::PatternRewriter &rewriter) const override { + + auto ctx = getContextArgument(moveBskOp); + + mlir::SmallVector operands{ctx}; + + // Insert forward declaration of the function + auto contextType = + mlir::concretelang::Concrete::ContextType::get(rewriter.getContext()); + auto funcType = mlir::FunctionType::get( + rewriter.getContext(), {contextType}, + {mlir::LLVM::LLVMPointerType::get(rewriter.getI64Type())}); + if (insertForwardDeclaration(moveBskOp, rewriter, move_bsk_to_gpu, funcType) + .failed()) { + return mlir::failure(); + } + + rewriter.replaceOpWithNewOp( + moveBskOp, move_bsk_to_gpu, moveBskOp.getResult().getType(), operands); + + return ::mlir::success(); + }; +}; + +/// \brief Rewrites `BConcrete.free_bsk_from_gpu` into a CAPI call to +/// `free_from_gpu` +/// +/// Also insert the forward declaration of `free_from_gpu` +struct FreeBskOpPattern : public mlir::OpRewritePattern< + mlir::concretelang::BConcrete::FreeBskFromGPUOp> { + FreeBskOpPattern(::mlir::MLIRContext *context, + mlir::PatternBenefit benefit = 1) + : ::mlir::OpRewritePattern< + mlir::concretelang::BConcrete::FreeBskFromGPUOp>(context, benefit) { + 
} + + ::mlir::LogicalResult + matchAndRewrite(mlir::concretelang::BConcrete::FreeBskFromGPUOp freeBskOp, + ::mlir::PatternRewriter &rewriter) const override { + + mlir::SmallVector operands{freeBskOp.bsk()}; + + // Insert forward declaration of the function + auto funcType = mlir::FunctionType::get( + rewriter.getContext(), + {mlir::LLVM::LLVMPointerType::get(rewriter.getI64Type())}, {}); + if (insertForwardDeclaration(freeBskOp, rewriter, free_from_gpu, funcType) + .failed()) { + return mlir::failure(); + } + + rewriter.replaceOpWithNewOp( + freeBskOp, free_from_gpu, mlir::TypeRange({}), operands); + + return ::mlir::success(); + }; +}; + +namespace { +struct BConcreteToCAPIPass : public BConcreteToCAPIBase { + void runOnOperation() final; +}; +} // namespace + +void BConcreteToCAPIPass::runOnOperation() { + auto op = this->getOperation(); + + mlir::ConversionTarget target(getContext()); + mlir::RewritePatternSet patterns(&getContext()); + + target.addIllegalOp(); + target.addLegalDialect(); + + patterns.insert(&getContext()); + patterns.insert(&getContext()); + + // Apply conversion + if (mlir::applyPartialConversion(op, target, std::move(patterns)).failed()) { + this->signalPassFailure(); + } +} + +namespace mlir { +namespace concretelang { +std::unique_ptr> createConvertBConcreteToCAPIPass() { + return std::make_unique(); +} +} // namespace concretelang +} // namespace mlir diff --git a/compiler/lib/Conversion/BConcreteToCAPI/CMakeLists.txt b/compiler/lib/Conversion/BConcreteToCAPI/CMakeLists.txt new file mode 100644 index 000000000..f1728e48e --- /dev/null +++ b/compiler/lib/Conversion/BConcreteToCAPI/CMakeLists.txt @@ -0,0 +1,15 @@ +add_mlir_dialect_library(BConcreteToCAPI + BConcreteToCAPI.cpp + + ADDITIONAL_HEADER_DIRS + ${PROJECT_SOURCE_DIR}/include/concretelang/Dialect/BConcrete + + DEPENDS + BConcreteDialect + mlir-headers + + LINK_LIBS PUBLIC + MLIRIR + MLIRTransforms) + +target_link_libraries(BConcreteToCAPI PUBLIC BConcreteDialect MLIRIR) diff --git 
a/compiler/lib/Conversion/CMakeLists.txt b/compiler/lib/Conversion/CMakeLists.txt index 428de75c1..135514075 100644 --- a/compiler/lib/Conversion/CMakeLists.txt +++ b/compiler/lib/Conversion/CMakeLists.txt @@ -3,6 +3,8 @@ add_subdirectory(TFHEGlobalParametrization) add_subdirectory(TFHEToConcrete) add_subdirectory(FHETensorOpsToLinalg) add_subdirectory(ConcreteToBConcrete) +add_subdirectory(ConcreteToGPU) +add_subdirectory(BConcreteToCAPI) add_subdirectory(MLIRLowerableDialectsToLLVM) add_subdirectory(LinalgExtras) diff --git a/compiler/lib/Conversion/ConcreteToBConcrete/ConcreteToBConcrete.cpp b/compiler/lib/Conversion/ConcreteToBConcrete/ConcreteToBConcrete.cpp index 361196184..3a1a0d45c 100644 --- a/compiler/lib/Conversion/ConcreteToBConcrete/ConcreteToBConcrete.cpp +++ b/compiler/lib/Conversion/ConcreteToBConcrete/ConcreteToBConcrete.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -64,6 +65,10 @@ class ConcreteToBConcreteTypeConverter : public mlir::TypeConverter { public: ConcreteToBConcreteTypeConverter() { addConversion([](mlir::Type type) { return type; }); + addConversion([&](mlir::concretelang::Concrete::GPUBskType type) { + return mlir::LLVM::LLVMPointerType::get( + mlir::IntegerType::get(type.getContext(), 64)); + }); addConversion([&](mlir::concretelang::Concrete::PlaintextType type) { return mlir::IntegerType::get(type.getContext(), 64); }); @@ -160,28 +165,34 @@ struct LowToBConcrete : public mlir::OpRewritePattern { matchAndRewrite(ConcreteOp concreteOp, ::mlir::PatternRewriter &rewriter) const override { ConcreteToBConcreteTypeConverter converter; - mlir::concretelang::Concrete::LweCiphertextType resultTy = - ((mlir::Type)concreteOp->getResult(0).getType()) - .cast(); - auto newResultTy = - converter.convertType(resultTy).cast(); + mlir::TypeRange resultTyRange = concreteOp->getResultTypes(); llvm::ArrayRef<::mlir::NamedAttribute> attributes = concreteOp.getOperation()->getAttrs(); - auto crt = 
resultTy.getCrtDecomposition(); mlir::Operation *bConcreteOp; - if (crt.empty()) { - bConcreteOp = rewriter.replaceOpWithNewOp( - concreteOp, newResultTy, concreteOp.getOperation()->getOperands(), - attributes); + if (resultTyRange.size() == 1 && + resultTyRange.front() + .isa()) { + auto crt = resultTyRange.front() + .cast() + .getCrtDecomposition(); + if (crt.empty()) { + bConcreteOp = rewriter.replaceOpWithNewOp( + concreteOp, resultTyRange, concreteOp.getOperation()->getOperands(), + attributes); + } else { + auto newAttributes = attributes.vec(); + newAttributes.push_back(rewriter.getNamedAttr( + "crtDecomposition", rewriter.getI64ArrayAttr(crt))); + bConcreteOp = rewriter.replaceOpWithNewOp( + concreteOp, resultTyRange, concreteOp.getOperation()->getOperands(), + newAttributes); + } } else { - auto newAttributes = attributes.vec(); - newAttributes.push_back(rewriter.getNamedAttr( - "crtDecomposition", rewriter.getI64ArrayAttr(crt))); - bConcreteOp = rewriter.replaceOpWithNewOp( - concreteOp, newResultTy, concreteOp.getOperation()->getOperands(), - newAttributes); + bConcreteOp = rewriter.replaceOpWithNewOp( + concreteOp, resultTyRange, concreteOp.getOperation()->getOperands(), + attributes); } mlir::concretelang::convertOperandAndResultTypes( @@ -906,7 +917,16 @@ void ConcreteToBConcretePass::runOnOperation() { mlir::concretelang::BConcrete::KeySwitchLweBufferOp>, LowToBConcrete, + mlir::concretelang::BConcrete::BootstrapLweBufferOp>, + LowToBConcrete, + LowToBConcrete, + LowToBConcrete, LowToBConcrete>(&getContext()); diff --git a/compiler/lib/Conversion/ConcreteToGPU/CMakeLists.txt b/compiler/lib/Conversion/ConcreteToGPU/CMakeLists.txt new file mode 100644 index 000000000..cecf498e0 --- /dev/null +++ b/compiler/lib/Conversion/ConcreteToGPU/CMakeLists.txt @@ -0,0 +1,16 @@ +add_mlir_dialect_library(ConcreteToGPU + ConcreteToGPU.cpp + + ADDITIONAL_HEADER_DIRS + ${PROJECT_SOURCE_DIR}/include/concretelang/Dialect/Concrete + + DEPENDS + ConcreteDialect + 
mlir-headers + + LINK_LIBS PUBLIC + MLIRIR + MLIRTransforms + ) + +target_link_libraries(ConcreteToGPU PUBLIC ConcreteDialect MLIRIR) diff --git a/compiler/lib/Conversion/ConcreteToGPU/ConcreteToGPU.cpp b/compiler/lib/Conversion/ConcreteToGPU/ConcreteToGPU.cpp new file mode 100644 index 000000000..17d0c90ee --- /dev/null +++ b/compiler/lib/Conversion/ConcreteToGPU/ConcreteToGPU.cpp @@ -0,0 +1,108 @@ +// Part of the Concrete Compiler Project, under the BSD3 License with Zama +// Exceptions. See +// https://github.com/zama-ai/concrete-compiler-internal/blob/main/LICENSE.txt +// for license information. + +#include +#include + +#include "concretelang/Conversion/Passes.h" +#include "concretelang/Dialect/Concrete/IR/ConcreteDialect.h" +#include "concretelang/Dialect/Concrete/IR/ConcreteOps.h" +#include "concretelang/Dialect/Concrete/IR/ConcreteTypes.h" + +/// This rewrite pattern transforms any instance of `Concrete.bootstrap_lwe` +/// into `Concrete.bootstrap_lwe_gpu`. It also inserts operations to allocate +/// memory, copy bsk into GPU, and free memory after bootstrapping. 
+struct BstOpPattern : public mlir::OpRewritePattern< + mlir::concretelang::Concrete::BootstrapLweOp> { + BstOpPattern(::mlir::MLIRContext *context, mlir::PatternBenefit benefit = 1) + : ::mlir::OpRewritePattern( + context, benefit) {} + + ::mlir::LogicalResult + matchAndRewrite(mlir::concretelang::Concrete::BootstrapLweOp bstOp, + ::mlir::PatternRewriter &rewriter) const override { + + auto baselog = bstOp.baseLog(); + auto level = bstOp.level(); + mlir::Value ct = bstOp.input_ciphertext(); + + auto ctType = + ct.getType().cast(); + auto inputLweDim = ctType.getDimension(); + + auto outType = bstOp.getResult() + .getType() + .cast(); + auto outputLweDim = outType.getDimension(); + + // copy bsk into GPU + mlir::Value bskGPU = + rewriter + .create( + bstOp.getLoc(), mlir::concretelang::Concrete::GPUBskType::get( + rewriter.getContext())) + .getResult(); + + mlir::Value inputLweDimCst = rewriter.create( + bstOp.getLoc(), inputLweDim, 32); + mlir::Value polySizeCst = rewriter.create( + bstOp.getLoc(), outputLweDim, 32); + mlir::Value levelCst = + rewriter.create(bstOp.getLoc(), level, 32); + mlir::Value baselogCst = rewriter.create( + bstOp.getLoc(), baselog, 32); + + mlir::Type tableType = + mlir::RankedTensorType::get({4}, rewriter.getI64Type()); + mlir::Value tableCst = rewriter.create( + bstOp.getLoc(), + mlir::DenseIntElementsAttr::get( + tableType, {llvm::APInt(64, 0), llvm::APInt(64, 0), + llvm::APInt(64, 0), llvm::APInt(64, 0)})); + + rewriter + .replaceOpWithNewOp( + bstOp, outType, ct, tableCst, inputLweDimCst, polySizeCst, levelCst, + baselogCst, bskGPU); + + // free bsk memory from GPU + rewriter.create( + bstOp.getLoc(), bskGPU); + + return ::mlir::success(); + }; +}; + +namespace { +struct ConcreteToGPUPass : public ConcreteToGPUBase { + void runOnOperation() final; +}; +} // namespace + +void ConcreteToGPUPass::runOnOperation() { + auto op = this->getOperation(); + + mlir::ConversionTarget target(getContext()); + mlir::RewritePatternSet 
patterns(&getContext()); + + target.addLegalDialect(); + target.addIllegalOp(); + + patterns.insert(&getContext()); + + // Apply conversion + if (mlir::applyPartialConversion(op, target, std::move(patterns)).failed()) { + this->signalPassFailure(); + } +} + +namespace mlir { +namespace concretelang { +std::unique_ptr> createConvertConcreteToGPUPass() { + return std::make_unique(); +} +} // namespace concretelang +} // namespace mlir diff --git a/compiler/lib/Conversion/Tools.cpp b/compiler/lib/Conversion/Tools.cpp index 6b0c0f5d9..c8dcaa67e 100644 --- a/compiler/lib/Conversion/Tools.cpp +++ b/compiler/lib/Conversion/Tools.cpp @@ -6,6 +6,7 @@ #include "mlir/Dialect/Func/IR/FuncOps.h" #include "concretelang/Conversion/Tools.h" +#include "concretelang/Dialect/Concrete/IR/ConcreteTypes.h" mlir::LogicalResult insertForwardDeclaration(mlir::Operation *op, mlir::OpBuilder &rewriter, @@ -35,3 +36,27 @@ mlir::LogicalResult insertForwardDeclaration(mlir::Operation *op, mlir::SymbolTable::lookupSymbolIn(module, funcName))); return mlir::success(); } + +/// Returns the value of the context argument from the enclosing func +mlir::Value getContextArgument(mlir::Operation *op) { + mlir::Block *block = op->getBlock(); + while (block != nullptr) { + if (llvm::isa(block->getParentOp())) { + + auto context = std::find_if( + block->getArguments().rbegin(), block->getArguments().rend(), + [](mlir::BlockArgument &arg) { + return arg.getType() + .isa(); + }); + + assert(context != block->getArguments().rend() && + "Cannot find the Concrete.context"); + + return *context; + } + block = block->getParentOp()->getBlock(); + } + assert("can't find a function that enclose the op"); + return nullptr; +} diff --git a/compiler/lib/Dialect/BConcrete/Transforms/BufferizableOpInterfaceImpl.cpp b/compiler/lib/Dialect/BConcrete/Transforms/BufferizableOpInterfaceImpl.cpp index fb829b78d..4ef763371 100644 --- a/compiler/lib/Dialect/BConcrete/Transforms/BufferizableOpInterfaceImpl.cpp +++ 
b/compiler/lib/Dialect/BConcrete/Transforms/BufferizableOpInterfaceImpl.cpp @@ -7,6 +7,7 @@ #include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h" #include "mlir/Dialect/Bufferization/Transforms/BufferUtils.h" #include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" @@ -75,6 +76,7 @@ char memref_bootstrap_lwe_u64[] = "memref_bootstrap_lwe_u64"; char memref_keyswitch_async_lwe_u64[] = "memref_keyswitch_async_lwe_u64"; char memref_bootstrap_async_lwe_u64[] = "memref_bootstrap_async_lwe_u64"; char memref_await_future[] = "memref_await_future"; +char memref_bootstrap_lwe_cuda_u64[] = "memref_bootstrap_lwe_cuda_u64"; char memref_expand_lut_in_trivial_glwe_ct_u64[] = "memref_expand_lut_in_trivial_glwe_ct_u64"; @@ -89,6 +91,9 @@ mlir::LogicalResult insertForwardDeclarationOfTheCAPI( mlir::concretelang::RT::FutureType::get(rewriter.getIndexType()); auto contextType = mlir::concretelang::Concrete::ContextType::get(rewriter.getContext()); + auto i32Type = rewriter.getI32Type(); + auto i64PointerType = mlir::LLVM::LLVMPointerType::get(rewriter.getI64Type()); + mlir::FunctionType funcType; if (funcName == memref_add_lwe_ciphertexts_u64) { @@ -124,6 +129,12 @@ mlir::LogicalResult insertForwardDeclarationOfTheCAPI( funcType = mlir::FunctionType::get( rewriter.getContext(), {memref1DType, futureType, memref1DType, memref1DType}, {}); + } else if (funcName == memref_bootstrap_lwe_cuda_u64) { + funcType = mlir::FunctionType::get(rewriter.getContext(), + {memref1DType, memref1DType, + memref1DType, i32Type, i32Type, i32Type, + i32Type, i64PointerType}, + {}); } else if (funcName == memref_expand_lut_in_trivial_glwe_ct_u64) { funcType = mlir::FunctionType::get(rewriter.getContext(), { @@ -156,32 +167,6 @@ mlir::LogicalResult insertForwardDeclarationOfTheCAPI( return insertForwardDeclaration(op, rewriter, funcName, 
funcType); } -/// Returns the value of the context argument from the enclosing func -mlir::Value getContextArgument(mlir::Operation *op) { - mlir::Block *block = op->getBlock(); - while (block != nullptr) { - if (llvm::isa(block->getParentOp())) { - block = &mlir::cast(block->getParentOp()) - .getBody() - .front(); - - auto context = - std::find_if(block->getArguments().rbegin(), - block->getArguments().rend(), [](BlockArgument &arg) { - return arg.getType() - .isa(); - }); - assert(context != block->getArguments().rend() && - "Cannot find the Concrete.context"); - - return *context; - } - block = block->getParentOp()->getBlock(); - } - assert("can't find a function that enclose the op"); - return nullptr; -}; - template void pushAdditionalArgs(Op op, mlir::SmallVector &operands, RewriterBase &rewriter); @@ -578,6 +563,10 @@ void mlir::concretelang::BConcrete:: BufferizableWithCallOpInterface>( *ctx); + BConcrete::BootstrapLweGPUBufferOp::attachInterface< + BufferizableWithCallOpInterface>( + *ctx); BConcrete::KeySwitchLweBufferOp::attachInterface< BufferizableWithCallOpInterface>(*ctx); diff --git a/compiler/lib/Runtime/CMakeLists.txt b/compiler/lib/Runtime/CMakeLists.txt index 04f992f0c..615dfd3c5 100644 --- a/compiler/lib/Runtime/CMakeLists.txt +++ b/compiler/lib/Runtime/CMakeLists.txt @@ -18,6 +18,16 @@ if(CONCRETELANG_DATAFLOW_EXECUTION_ENABLED) ) endif() +if(CONCRETELANG_CUDA_SUPPORT) + target_link_libraries( + ConcretelangRuntime + PRIVATE + ConcreteCUDA + -L/usr/local/cuda/lib64 + cudart + ) +endif() + if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin") target_link_libraries(ConcretelangRuntime PUBLIC omp) else() diff --git a/compiler/lib/Runtime/wrappers.cpp b/compiler/lib/Runtime/wrappers.cpp index db7883039..24067960f 100644 --- a/compiler/lib/Runtime/wrappers.cpp +++ b/compiler/lib/Runtime/wrappers.cpp @@ -57,6 +57,122 @@ void encode_and_expand_lut(uint64_t *output, size_t output_size, #include "concretelang/ClientLib/CRT.h" #include 
"concretelang/Runtime/wrappers.h" +#ifdef CONCRETELANG_CUDA_SUPPORT + +// We need to define the double2 struct from the CUDA backend header files +// This shouldn't be defined here, but included along with concrete-cuda header +// files +typedef struct double2 { + double x, y; +} double2; + +#include "bootstrap.h" +#include "device.h" + +void memref_keyswitch_lwe_cuda_u64(uint64_t *out_allocated, + uint64_t *out_aligned, uint64_t out_offset, + uint64_t out_size, uint64_t out_stride, + uint64_t *ct0_allocated, + uint64_t *ct0_aligned, uint64_t ct0_offset, + uint64_t ct0_size, uint64_t ct0_stride, + void *ksk_gpu) { + // TODO: GPU implementation +} + +void *move_ct_to_gpu(uint64_t *ct_allocated, uint64_t *ct_aligned, + uint64_t ct_offset, uint64_t ct_size, uint64_t ct_stride, + uint32_t gpu_idx) { + void *stream = cuda_create_stream(gpu_idx); + void *ct_gpu = cuda_malloc(ct_size * sizeof(uint64_t), gpu_idx); + cuda_memcpy_async_to_gpu(ct_gpu, ct_aligned + ct_offset, + ct_size * sizeof(uint64_t), stream, gpu_idx); + cuda_synchronize_device(gpu_idx); + cuda_destroy_stream(stream, gpu_idx); + return ct_gpu; +} + +void *move_bsk_to_gpu(mlir::concretelang::RuntimeContext *context, + uint32_t gpu_idx = 0) { + void *stream = cuda_create_stream(gpu_idx); + LweBootstrapKey_u64 *bsk = get_bootstrap_key_u64(context); + BufferView bskBuffer = bootstrap_buffer_lwe_u64(bsk); + void *bsk_gpu = cuda_malloc(bskBuffer.length, gpu_idx); + cuda_memcpy_async_to_gpu(bsk_gpu, (void *)bskBuffer.pointer, bskBuffer.length, + stream, gpu_idx); + cuda_synchronize_device(gpu_idx); + cuda_destroy_stream(stream, gpu_idx); + return bsk_gpu; +} + +void move_ct_to_cpu(uint64_t *out_allocated, uint64_t *out_aligned, + uint64_t out_offset, uint64_t out_size, uint64_t out_stride, + void *ct_gpu, size_t size, uint32_t gpu_idx) { + void *stream = cuda_create_stream(gpu_idx); + cuda_memcpy_async_to_cpu(out_aligned + out_offset, ct_gpu, + size * sizeof(uint64_t), stream, gpu_idx); + 
cuda_synchronize_device(gpu_idx); + cuda_destroy_stream(stream, gpu_idx); +} + +void free_from_gpu(void *gpu_ptr, uint32_t gpu_idx = 0) { + cuda_drop(gpu_ptr, gpu_idx); +} + +/// Bootstrap one LWE ciphertext (num_samples is fixed to 1) with the +/// concrete-cuda low latency kernel on GPU 0 and copy the result into `out`. +/// The test vector is hardcoded (every coefficient is 1 << 61); the tlu_* +/// arguments are currently not read. `bsk_gpu` is forwarded to the kernel and is not released here. +void memref_bootstrap_lwe_cuda_u64( + uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset, + uint64_t out_size, uint64_t out_stride, uint64_t *ct0_allocated, + uint64_t *ct0_aligned, uint64_t ct0_offset, uint64_t ct0_size, + uint64_t ct0_stride, uint64_t *tlu_allocated, uint64_t *tlu_aligned, + uint64_t tlu_offset, uint64_t tlu_size, uint64_t tlu_stride, + uint32_t input_lwe_dim, uint32_t poly_size, uint32_t level, + uint32_t base_log, void *bsk_gpu) { + + uint32_t gpu_idx = 0; + void *stream = cuda_create_stream(gpu_idx); + + // move input ciphertext into gpu + void *ct0_gpu = move_ct_to_gpu(ct0_allocated, ct0_aligned, ct0_offset, + ct0_size, ct0_stride, gpu_idx); + // move output ciphertext into gpu + void *out_gpu = move_ct_to_gpu(out_allocated, out_aligned, out_offset, + out_size, out_stride, gpu_idx); + // hardcoded values + uint32_t num_samples = 1, num_test_vectors = 1, lwe_idx = 0; + void *test_vector_idxes = malloc(num_samples * sizeof(uint32_t)); + ((uint32_t *)test_vector_idxes)[0] = 0; + void *test_vector = malloc(poly_size * sizeof(uint64_t)); + for (size_t i = 0; i < poly_size; i++) { + ((uint64_t *)test_vector)[i] = (uint64_t)1 << 61; + } + // move test vector into gpu + void *test_vector_gpu = cuda_malloc(poly_size * sizeof(uint64_t), gpu_idx); + cuda_memcpy_async_to_gpu(test_vector_gpu, test_vector, + poly_size * sizeof(uint64_t), stream, gpu_idx); + // move test vector indexes into gpu + void *test_vector_idxes_gpu = + cuda_malloc(num_samples * sizeof(uint32_t), gpu_idx); + cuda_memcpy_async_to_gpu(test_vector_idxes_gpu, test_vector_idxes, + num_samples * sizeof(uint32_t), stream, gpu_idx); + // run gpu bootstrap + cuda_bootstrap_low_latency_lwe_ciphertext_vector_64( + stream, out_gpu, test_vector_gpu, test_vector_idxes_gpu, ct0_gpu, bsk_gpu, + input_lwe_dim,
poly_size, base_log, level, num_samples, num_test_vectors, + lwe_idx, cuda_get_max_shared_memory(gpu_idx)); + // copy output ciphertext back to cpu + move_ct_to_cpu(out_allocated, out_aligned, out_offset, out_size, out_stride, + out_gpu, out_size, gpu_idx); + cuda_synchronize_device(gpu_idx); + // free the host staging buffers now that the device has been synchronized (they were previously leaked on every call) + free(test_vector); + free(test_vector_idxes); + // free memory that we allocated on gpu + cuda_drop(ct0_gpu, gpu_idx); + cuda_drop(out_gpu, gpu_idx); + cuda_drop(test_vector_gpu, gpu_idx); + cuda_drop(test_vector_idxes_gpu, gpu_idx); + + cuda_destroy_stream(stream, gpu_idx); +} + +#endif + void memref_expand_lut_in_trivial_glwe_ct_u64( uint64_t *glwe_ct_allocated, uint64_t *glwe_ct_aligned, uint64_t glwe_ct_offset, uint64_t glwe_ct_size, uint64_t glwe_ct_stride, diff --git a/compiler/lib/Support/CompilerEngine.cpp b/compiler/lib/Support/CompilerEngine.cpp index 642676ec8..e67380f48 100644 --- a/compiler/lib/Support/CompilerEngine.cpp +++ b/compiler/lib/Support/CompilerEngine.cpp @@ -317,6 +317,14 @@ CompilerEngine::compile(llvm::SourceMgr &sm, Target target, OptionalLib lib) { return errorDiag("Optimizing Concrete failed"); } + // Transforming into GPU + if (this->compilerOptions.useGPU && + mlir::concretelang::pipeline::transformsConcreteToGPU(mlirContext, module, + this->enablePass) + .failed()) { + return errorDiag("Transforming Concrete to GPU failed"); + } + if (target == Target::CONCRETE) return std::move(res); diff --git a/compiler/lib/Support/Pipeline.cpp b/compiler/lib/Support/Pipeline.cpp index 076addbda..75abb4f8d 100644 --- a/compiler/lib/Support/Pipeline.cpp +++ b/compiler/lib/Support/Pipeline.cpp @@ -239,6 +239,16 @@ optimizeConcrete(mlir::MLIRContext &context, mlir::ModuleOp &module, return pm.run(module.getOperation()); } +mlir::LogicalResult +transformsConcreteToGPU(mlir::MLIRContext &context, mlir::ModuleOp &module, + std::function enablePass) { + mlir::PassManager pm(&context); + pipelinePrinting("ConcreteToGPU", pm, context); + addPotentiallyNestedPass( + pm,
mlir::concretelang::createConvertConcreteToGPUPass(), enablePass); + return pm.run(module.getOperation()); +} + mlir::LogicalResult lowerConcreteToBConcrete(mlir::MLIRContext &context, mlir::ModuleOp &module, std::function enablePass, @@ -283,6 +293,8 @@ lowerBConcreteToStd(mlir::MLIRContext &context, mlir::ModuleOp &module, enablePass); addPotentiallyNestedPass(pm, mlir::concretelang::createAddRuntimeContext(), enablePass); + addPotentiallyNestedPass( + pm, mlir::concretelang::createConvertBConcreteToCAPIPass(), enablePass); return pm.run(module.getOperation()); } diff --git a/compiler/src/main.cpp b/compiler/src/main.cpp index 2bbdc9875..04da13418 100644 --- a/compiler/src/main.cpp +++ b/compiler/src/main.cpp @@ -98,6 +98,12 @@ llvm::cl::opt "dialects. (Enabled by default)"), llvm::cl::init(true)); +llvm::cl::opt + useGPU("use-gpu", + llvm::cl::desc("enable/disable generating concrete GPU " + "operations (Disabled by default)"), + llvm::cl::init(false)); + llvm::cl::list passes( "passes", llvm::cl::desc("Specify the passes to run (use only for compiler tests)"), @@ -283,6 +289,7 @@ cmdlineCompilationOptions() { options.loopParallelize = cmdline::loopParallelize; options.dataflowParallelize = cmdline::dataflowParallelize; options.optimizeConcrete = cmdline::optimizeConcrete; + options.useGPU = cmdline::useGPU; if (!cmdline::v0Constraint.empty()) { if (cmdline::v0Constraint.size() != 2) {