feat: support GPU (bootstrapping)

youben11
2022-07-21 14:45:28 +01:00
committed by Ayoub Benaissa
parent a487b03699
commit d169a27fc0
26 changed files with 715 additions and 47 deletions

View File

@@ -55,7 +55,19 @@ include_directories(${CONCRETE_FFI_RELEASE})
add_library(Concrete STATIC IMPORTED)
set_target_properties(Concrete PROPERTIES IMPORTED_LOCATION ${CONCRETE_FFI_RELEASE}/libconcrete_core_ffi.a)
# -------------------------------------------------------------------------------
#--------------------------------------------------------------------------------
# Concrete Cuda Configuration
#--------------------------------------------------------------------------------
option(CONCRETELANG_CUDA_SUPPORT "Support Concrete CUDA Execution." OFF)
if(CONCRETELANG_CUDA_SUPPORT)
message(STATUS "Building with Concrete CUDA execution support")
include_directories(${CONCRETE_CORE_PATH}/concrete-cuda/cuda/include)
add_library(ConcreteCUDA STATIC IMPORTED)
set_target_properties(ConcreteCUDA PROPERTIES IMPORTED_LOCATION ${CONCRETE_CORE_PATH}/concrete-cuda/cuda/build/src/libconcrete_cuda.a)
add_compile_options(-DCONCRETELANG_CUDA_SUPPORT)
endif()
#--------------------------------------------------------------------------------
# Python Configuration
# -------------------------------------------------------------------------------
option(CONCRETELANG_BINDINGS_PYTHON_ENABLED "Enables ConcreteLang Python bindings." ON)
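
A hedged build sketch, not prescribed by this commit: concrete-cuda is assumed to be prebuilt under ${CONCRETE_CORE_PATH}/concrete-cuda/cuda/build (the path the imported target points at), after which the option can be flipped on at configure time.

# hypothetical configure invocation, assuming concrete-cuda was built first
cmake -DCONCRETELANG_CUDA_SUPPORT=ON -DCONCRETE_CORE_PATH=/path/to/concrete-core ..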

View File

@@ -0,0 +1,18 @@
// Part of the Concrete Compiler Project, under the BSD3 License with Zama
// Exceptions. See
// https://github.com/zama-ai/concrete-compiler-internal/blob/main/LICENSE.txt
// for license information.
#ifndef ZAMALANG_CONVERSION_BCONCRETETOCAPI_PASS_H_
#define ZAMALANG_CONVERSION_BCONCRETETOCAPI_PASS_H_
#include "mlir/Pass/Pass.h"
namespace mlir {
namespace concretelang {
/// Create a pass to convert `BConcrete` dialect to CAPI calls.
std::unique_ptr<OperationPass<ModuleOp>> createConvertBConcreteToCAPIPass();
} // namespace concretelang
} // namespace mlir
#endif
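
A minimal usage sketch, assuming a standard MLIR pass-manager setup; the factory and include path are the ones declared above, the helper function itself is illustrative:

#include "concretelang/Conversion/BConcreteToCAPI/Pass.h"
#include "mlir/Pass/PassManager.h"

// Register the new lowering in a pipeline: BConcrete GPU ops become CAPI calls.
void addBConcreteToCAPI(mlir::PassManager &pm) {
  pm.addPass(mlir::concretelang::createConvertBConcreteToCAPIPass());
}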

View File

@@ -0,0 +1,18 @@
// Part of the Concrete Compiler Project, under the BSD3 License with Zama
// Exceptions. See
// https://github.com/zama-ai/concrete-compiler-internal/blob/main/LICENSE.txt
// for license information.
#ifndef ZAMALANG_CONVERSION_CONCRETETOGPU_PASS_H_
#define ZAMALANG_CONVERSION_CONCRETETOGPU_PASS_H_
#include "mlir/Pass/Pass.h"
namespace mlir {
namespace concretelang {
/// Create a pass to convert `Concrete` operations to GPU.
std::unique_ptr<OperationPass<ModuleOp>> createConvertConcreteToGPUPass();
} // namespace concretelang
} // namespace mlir
#endif

View File

@@ -13,7 +13,9 @@
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "concretelang/Conversion/BConcreteToCAPI/Pass.h"
#include "concretelang/Conversion/ConcreteToBConcrete/Pass.h"
#include "concretelang/Conversion/ConcreteToGPU/Pass.h"
#include "concretelang/Conversion/FHETensorOpsToLinalg/Pass.h"
#include "concretelang/Conversion/FHEToTFHE/Pass.h"
#include "concretelang/Conversion/LinalgExtras/Passes.h"

View File

@@ -47,6 +47,20 @@ def ConcreteToBConcrete : Pass<"concrete-to-bconcrete", "mlir::ModuleOp"> {
let dependentDialects = ["mlir::linalg::LinalgDialect", "mlir::concretelang::Concrete::ConcreteDialect", "mlir::concretelang::BConcrete::BConcreteDialect"];
}
def BConcreteToCAPI : Pass<"bconcrete-to-capi", "mlir::ModuleOp"> {
let summary = "Lowers operations from the BConcrete dialect to CAPI calls";
let description = [{ Lowers operations from the BConcrete dialect to CAPI calls }];
let constructor = "mlir::concretelang::createConvertBConcreteToCAPIPass()";
let dependentDialects = ["mlir::concretelang::BConcrete::BConcreteDialect"];
}
def ConcreteToGPU : Pass<"concrete-to-gpu", "mlir::ModuleOp"> {
let summary = "Transforms operations in the Concrete dialect to GPU";
let description = [{ Transforms operations in the Concrete dialect to GPU }];
let constructor = "mlir::concretelang::createConvertConcreteToGPUPass()";
let dependentDialects = ["mlir::concretelang::Concrete::ConcreteDialect"];
}
def MLIRLowerableDialectsToLLVM : Pass<"mlir-lowerable-dialects-to-llvm", "mlir::ModuleOp"> {
let summary = "Lowers operations from MLIR lowerable dialects to LLVM";
let constructor = "mlir::concretelang::createConvertMLIRLowerableDialectsToLLVMPass()";

View File

@@ -9,3 +9,9 @@ mlir::LogicalResult insertForwardDeclaration(mlir::Operation *op,
mlir::OpBuilder &rewriter,
llvm::StringRef funcName,
mlir::FunctionType funcType);
/// \brief Returns the value of the context argument from the enclosing func
///
/// \param op initial operation to start the search from
/// \return mlir::Value the context value
mlir::Value getContextArgument(mlir::Operation *op);

View File

@@ -6,6 +6,7 @@
#ifndef ZAMALANG_DIALECT_BConcrete_BConcrete_OPS_H
#define ZAMALANG_DIALECT_BConcrete_BConcrete_OPS_H
#include <mlir/Dialect/LLVMIR/LLVMDialect.h>
#include <mlir/IR/Builders.h>
#include <mlir/IR/BuiltinOps.h>
#include <mlir/IR/BuiltinTypes.h>

View File

@@ -5,6 +5,7 @@ include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/Interfaces/ControlFlowInterfaces.td"
include "mlir/IR/BuiltinTypes.td"
include "mlir/Dialect/MemRef/IR/MemRefBase.td"
include "mlir/Dialect/LLVMIR/LLVMOpBase.td"
include "concretelang/Dialect/BConcrete/IR/BConcreteDialect.td"
include "concretelang/Dialect/Concrete/IR/ConcreteTypes.td"
@@ -157,4 +158,27 @@ def BConcrete_AwaitFutureOp :
let results = (outs 1DTensorOf<[I64]>:$result);
}
def BConcrete_BootstrapLweGPUBufferOp : BConcrete_Op<"bootstrap_lwe_gpu_buffer"> {
let arguments = (ins
1DTensorOf<[I64]>:$input_ciphertext,
1DTensorOf<[I64]>:$table,
I32:$inputLweDim,
I32:$polySize,
I32:$level,
I32:$baseLog,
LLVM_PointerTo<I64>:$bsk
);
let results = (outs 1DTensorOf<[I64]>:$result);
}
def BConcrete_MoveBskToGPUOp : BConcrete_Op<"move_bsk_to_gpu"> {
let arguments = (ins);
let results = (outs LLVM_PointerTo<I64>:$bsk);
}
def BConcrete_FreeBskFromGPUOp : BConcrete_Op<"free_bsk_from_gpu"> {
let arguments = (ins LLVM_PointerTo<I64>:$bsk);
let results = (outs);
}
#endif

View File

@@ -52,7 +52,7 @@ def Concrete_NegateLweCiphertextOp : Concrete_Op<"negate_lwe_ciphertext"> {
let results = (outs Concrete_LweCiphertextType:$result);
}
def Concrete_GlweFromTable : Concrete_Op<"glwe_from_table"> {
def Concrete_GlweFromTable : Concrete_Op<"glwe_from_table", [NoSideEffect]> {
let summary = "Creates a GLWE ciphertext which is the trivial encrytion of a the input table interpreted as a polynomial (to use later in a bootstrap)";
let arguments = (ins 1DTensorOf<[I64]>:$table);
@@ -71,6 +71,35 @@ def Concrete_BootstrapLweOp : Concrete_Op<"bootstrap_lwe"> {
let results = (outs Concrete_LweCiphertextType:$result);
}
def Concrete_BootstrapLweGPUOp : Concrete_Op<"bootstrap_lwe_gpu"> {
let summary = "Bootstrap an LWE ciphertext in GPU using a lookup table";
let arguments = (ins
Concrete_LweCiphertextType:$input_ciphertext,
1DTensorOf<[I64]>:$table,
I32:$inputLweDim,
I32:$polySize,
I32:$level,
I32:$baseLog,
Concrete_GPUBsk:$bsk
);
let results = (outs Concrete_LweCiphertextType:$result);
}
def Concrete_MoveBskToGPUOp : Concrete_Op<"move_bsk_to_gpu"> {
let summary = "Move bsk to GPU";
let arguments = (ins);
let results = (outs Concrete_GPUBsk:$bsk);
}
def Concrete_FreeBskFromGPUOp : Concrete_Op<"free_bsk_from_gpu"> {
let summary = "Free bsk memory from GPU";
let arguments = (ins Concrete_GPUBsk:$bsk);
let results = (outs);
}
def Concrete_KeySwitchLweOp : Concrete_Op<"keyswitch_lwe"> {
let summary = "Keyswitches a LWE ciphertext";

View File

@@ -93,4 +93,14 @@ def Concrete_Context : Concrete_Type<"Context"> {
}];
}
def Concrete_GPUBsk : Concrete_Type<"GPUBsk"> {
let mnemonic = "gpu_bsk";
let summary = "A bsk in GPU";
let description = [{
A bootstrapping key in GPU memory
}];
}
#endif
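
A small sketch of working with the new type through the usual MLIR generated-type API; the include path matches the one used elsewhere in this commit, and the helpers themselves are illustrative:

#include "concretelang/Dialect/Concrete/IR/ConcreteTypes.h"

// Build the GPU bsk type and test whether a value carries it.
mlir::Type getGpuBskType(mlir::MLIRContext *ctx) {
  return mlir::concretelang::Concrete::GPUBskType::get(ctx);
}

bool isGpuBsk(mlir::Value v) {
  return v.getType().isa<mlir::concretelang::Concrete::GPUBskType>();
}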

View File

@@ -105,6 +105,87 @@ void memref_copy_one_rank(uint64_t *src_allocated, uint64_t *src_aligned,
uint64_t src_stride, uint64_t *dst_allocated,
uint64_t *dst_aligned, uint64_t dst_offset,
uint64_t dst_size, uint64_t dst_stride);
}
/// \brief Run bootstrapping on GPU.
///
/// It handles copying the different arguments from CPU to GPU and freeing
/// that memory afterwards, except for the bootstrapping key, which should
/// already be in GPU memory.
///
/// \param out_allocated allocated pointer of the output ciphertext memref
/// \param out_aligned aligned pointer of the output ciphertext memref
/// \param out_offset offset into the output memref
/// \param out_size size of the output memref
/// \param out_stride stride of the output memref
/// \param ct0_allocated allocated pointer of the input ciphertext memref
/// \param ct0_aligned aligned pointer of the input ciphertext memref
/// \param ct0_offset offset into the input memref
/// \param ct0_size size of the input memref
/// \param ct0_stride stride of the input memref
/// \param tlu_allocated allocated pointer of the lookup table memref
/// \param tlu_aligned aligned pointer of the lookup table memref
/// \param tlu_offset offset into the lookup table memref
/// \param tlu_size size of the lookup table memref
/// \param tlu_stride stride of the lookup table memref
/// \param input_lwe_dim LWE input dimension
/// \param poly_size polynomial size
/// \param level level
/// \param base_log base log
/// \param bsk pointer to bsk on GPU
void memref_bootstrap_lwe_cuda_u64(
uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset,
uint64_t out_size, uint64_t out_stride, uint64_t *ct0_allocated,
uint64_t *ct0_aligned, uint64_t ct0_offset, uint64_t ct0_size,
uint64_t ct0_stride, uint64_t *tlu_allocated, uint64_t *tlu_aligned,
uint64_t tlu_offset, uint64_t tlu_size, uint64_t tlu_stride,
uint32_t input_lwe_dim, uint32_t poly_size, uint32_t level,
uint32_t base_log, void *bsk);
/// \brief Copy ciphertext from CPU to GPU using a single stream.
///
/// It handles memory allocation on GPU.
///
/// \param ct_allocated allocated pointer of the ciphertext memref
/// \param ct_aligned aligned pointer of the ciphertext memref
/// \param ct_offset offset into the ciphertext memref
/// \param ct_size size of the ciphertext memref
/// \param ct_stride stride of the ciphertext memref
/// \param gpu_idx index of the GPU to use
/// \return void* pointer to the GPU ciphertext
void *move_ct_to_gpu(uint64_t *ct_allocated, uint64_t *ct_aligned,
uint64_t ct_offset, uint64_t ct_size, uint64_t ct_stride,
uint32_t gpu_idx);
/// \brief Copy ciphertext from GPU to CPU using a single stream.
///
/// Memory on GPU won't be freed after the copy.
///
/// \param out_allocated allocated pointer of the output memref
/// \param out_aligned aligned pointer of the output memref
/// \param out_offset offset into the output memref
/// \param out_size size of the output memref
/// \param out_stride stride of the output memref
/// \param ct_gpu pointer to the ciphertext in GPU memory
/// \param size number of elements to copy back
/// \param gpu_idx index of the GPU to use
void move_ct_to_cpu(uint64_t *out_allocated, uint64_t *out_aligned,
uint64_t out_offset, uint64_t out_size, uint64_t out_stride,
void *ct_gpu, size_t size, uint32_t gpu_idx);
/// \brief Copy bootstrapping key from CPU to GPU using a single stream.
///
/// It handles memory allocation on GPU.
///
/// \param context runtime context holding the bootstrapping key
/// \param gpu_idx index of the GPU to use
/// \return void* pointer to the GPU bsk
void *move_bsk_to_gpu(mlir::concretelang::RuntimeContext *context,
uint32_t gpu_idx);
/// \brief Free GPU memory.
///
/// \param gpu_ptr pointer to the GPU memory to free
/// \param gpu_idx index of the GPU to use
void free_from_gpu(void *gpu_ptr, uint32_t gpu_idx);
}
#endif
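
A hedged round-trip sketch of the single-stream copy helpers declared above (this header is assumed to be included; buffer size and contents are illustrative only):

#include <cstdint>
#include <vector>

void roundTrip(uint32_t gpu_idx) {
  // a ciphertext of lwe_dim + 1 = 1025 words, values illustrative
  std::vector<uint64_t> ct(1025, 0);
  // copy to the GPU; device memory is allocated by the helper
  void *ct_gpu = move_ct_to_gpu(ct.data(), ct.data(), /*ct_offset=*/0,
                                ct.size(), /*ct_stride=*/1, gpu_idx);
  // ... launch GPU work on ct_gpu here ...
  // copy back; the helper does not free the device buffer
  move_ct_to_cpu(ct.data(), ct.data(), 0, ct.size(), 1, ct_gpu, ct.size(),
                 gpu_idx);
  free_from_gpu(ct_gpu, gpu_idx);
}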

View File

@@ -54,6 +54,8 @@ struct CompilationOptions {
bool dataflowParallelize;
bool asyncOffload;
bool optimizeConcrete;
/// Use the GPU during execution by generating GPU operations where possible
bool useGPU;
llvm::Optional<std::vector<int64_t>> fhelinalgTileSizes;
llvm::Optional<std::string> clientParametersFuncName;
@@ -64,7 +66,7 @@ struct CompilationOptions {
: v0FHEConstraints(llvm::None), verifyDiagnostics(false),
autoParallelize(false), loopParallelize(false),
dataflowParallelize(false), asyncOffload(false), optimizeConcrete(true),
clientParametersFuncName(llvm::None),
useGPU(false), clientParametersFuncName(llvm::None),
optimizerConfig(optimizer::DEFAULT_CONFIG){};
CompilationOptions(std::string funcname) : CompilationOptions() {
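
A minimal sketch of opting into the new flag from C++; only the useGPU field comes from this commit, while the header location and namespace are assumptions:

#include "concretelang/Support/CompilerEngine.h" // assumed location of CompilationOptions

mlir::concretelang::CompilationOptions gpuOptions() {
  mlir::concretelang::CompilationOptions options("main");
  options.useGPU = true; // generate GPU bootstrap operations where possible
  return options;
}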

View File

@@ -57,6 +57,10 @@ mlir::LogicalResult asyncOffload(mlir::MLIRContext &context,
mlir::ModuleOp &module,
std::function<bool(mlir::Pass *)> enablePass);
mlir::LogicalResult
transformsConcreteToGPU(mlir::MLIRContext &context, mlir::ModuleOp &module,
std::function<bool(mlir::Pass *)> enablePass);
mlir::LogicalResult
lowerBConcreteToStd(mlir::MLIRContext &context, mlir::ModuleOp &module,
std::function<bool(mlir::Pass *)> enablePass);

View File

@@ -0,0 +1,119 @@
// Part of the Concrete Compiler Project, under the BSD3 License with Zama
// Exceptions. See
// https://github.com/zama-ai/concrete-compiler-internal/blob/main/LICENSE.txt
// for license information.
#include <mlir/Dialect/Func/IR/FuncOps.h>
#include <mlir/Pass/Pass.h>
#include <mlir/Transforms/DialectConversion.h>
#include "concretelang/Conversion/Passes.h"
#include "concretelang/Conversion/Tools.h"
#include "concretelang/Dialect/BConcrete/IR/BConcreteDialect.h"
#include "concretelang/Dialect/BConcrete/IR/BConcreteOps.h"
char move_bsk_to_gpu[] = "move_bsk_to_gpu";
char free_from_gpu[] = "free_from_gpu";
/// \brief Rewrites `BConcrete.move_bsk_to_gpu` into a CAPI call to
/// `move_bsk_to_gpu`
///
/// Also inserts the forward declaration of `move_bsk_to_gpu`
struct MoveBskOpPattern : public mlir::OpRewritePattern<
mlir::concretelang::BConcrete::MoveBskToGPUOp> {
MoveBskOpPattern(::mlir::MLIRContext *context,
mlir::PatternBenefit benefit = 1)
: ::mlir::OpRewritePattern<mlir::concretelang::BConcrete::MoveBskToGPUOp>(
context, benefit) {}
::mlir::LogicalResult
matchAndRewrite(mlir::concretelang::BConcrete::MoveBskToGPUOp moveBskOp,
::mlir::PatternRewriter &rewriter) const override {
auto ctx = getContextArgument(moveBskOp);
mlir::SmallVector<mlir::Value> operands{ctx};
// Insert forward declaration of the function
auto contextType =
mlir::concretelang::Concrete::ContextType::get(rewriter.getContext());
auto funcType = mlir::FunctionType::get(
rewriter.getContext(), {contextType},
{mlir::LLVM::LLVMPointerType::get(rewriter.getI64Type())});
if (insertForwardDeclaration(moveBskOp, rewriter, move_bsk_to_gpu, funcType)
.failed()) {
return mlir::failure();
}
rewriter.replaceOpWithNewOp<mlir::func::CallOp>(
moveBskOp, move_bsk_to_gpu, moveBskOp.getResult().getType(), operands);
return ::mlir::success();
};
};
/// \brief Rewrites `BConcrete.free_bsk_from_gpu` into a CAPI call to
/// `free_from_gpu`
///
/// Also inserts the forward declaration of `free_from_gpu`
struct FreeBskOpPattern : public mlir::OpRewritePattern<
mlir::concretelang::BConcrete::FreeBskFromGPUOp> {
FreeBskOpPattern(::mlir::MLIRContext *context,
mlir::PatternBenefit benefit = 1)
: ::mlir::OpRewritePattern<
mlir::concretelang::BConcrete::FreeBskFromGPUOp>(context, benefit) {
}
::mlir::LogicalResult
matchAndRewrite(mlir::concretelang::BConcrete::FreeBskFromGPUOp freeBskOp,
::mlir::PatternRewriter &rewriter) const override {
mlir::SmallVector<mlir::Value> operands{freeBskOp.bsk()};
// Insert forward declaration of the function
auto funcType = mlir::FunctionType::get(
rewriter.getContext(),
{mlir::LLVM::LLVMPointerType::get(rewriter.getI64Type())}, {});
if (insertForwardDeclaration(freeBskOp, rewriter, free_from_gpu, funcType)
.failed()) {
return mlir::failure();
}
rewriter.replaceOpWithNewOp<mlir::func::CallOp>(
freeBskOp, free_from_gpu, mlir::TypeRange({}), operands);
return ::mlir::success();
};
};
namespace {
struct BConcreteToCAPIPass : public BConcreteToCAPIBase<BConcreteToCAPIPass> {
void runOnOperation() final;
};
} // namespace
void BConcreteToCAPIPass::runOnOperation() {
auto op = this->getOperation();
mlir::ConversionTarget target(getContext());
mlir::RewritePatternSet patterns(&getContext());
target.addIllegalOp<mlir::concretelang::BConcrete::MoveBskToGPUOp>();
target.addLegalDialect<mlir::func::FuncDialect>();
patterns.insert<MoveBskOpPattern>(&getContext());
patterns.insert<FreeBskOpPattern>(&getContext());
// Apply conversion
if (mlir::applyPartialConversion(op, target, std::move(patterns)).failed()) {
this->signalPassFailure();
}
}
namespace mlir {
namespace concretelang {
std::unique_ptr<OperationPass<ModuleOp>> createConvertBConcreteToCAPIPass() {
return std::make_unique<BConcreteToCAPIPass>();
}
} // namespace concretelang
} // namespace mlir

View File

@@ -0,0 +1,15 @@
add_mlir_dialect_library(BConcreteToCAPI
BConcreteToCAPI.cpp
ADDITIONAL_HEADER_DIRS
${PROJECT_SOURCE_DIR}/include/concretelang/Dialect/BConcrete
DEPENDS
BConcreteDialect
mlir-headers
LINK_LIBS PUBLIC
MLIRIR
MLIRTransforms)
target_link_libraries(BConcreteToCAPI PUBLIC BConcreteDialect MLIRIR)

View File

@@ -3,6 +3,8 @@ add_subdirectory(TFHEGlobalParametrization)
add_subdirectory(TFHEToConcrete)
add_subdirectory(FHETensorOpsToLinalg)
add_subdirectory(ConcreteToBConcrete)
add_subdirectory(ConcreteToGPU)
add_subdirectory(BConcreteToCAPI)
add_subdirectory(MLIRLowerableDialectsToLLVM)
add_subdirectory(LinalgExtras)

View File

@@ -9,6 +9,7 @@
#include <mlir/Dialect/Affine/IR/AffineOps.h>
#include <mlir/Dialect/Bufferization/IR/Bufferization.h>
#include <mlir/Dialect/Func/IR/FuncOps.h>
#include <mlir/Dialect/LLVMIR/LLVMDialect.h>
#include <mlir/Dialect/Linalg/IR/Linalg.h>
#include <mlir/Dialect/SCF/IR/SCF.h>
#include <mlir/Dialect/Tensor/IR/Tensor.h>
@@ -64,6 +65,10 @@ class ConcreteToBConcreteTypeConverter : public mlir::TypeConverter {
public:
ConcreteToBConcreteTypeConverter() {
addConversion([](mlir::Type type) { return type; });
addConversion([&](mlir::concretelang::Concrete::GPUBskType type) {
return mlir::LLVM::LLVMPointerType::get(
mlir::IntegerType::get(type.getContext(), 64));
});
addConversion([&](mlir::concretelang::Concrete::PlaintextType type) {
return mlir::IntegerType::get(type.getContext(), 64);
});
@@ -160,28 +165,34 @@ struct LowToBConcrete : public mlir::OpRewritePattern<ConcreteOp> {
matchAndRewrite(ConcreteOp concreteOp,
::mlir::PatternRewriter &rewriter) const override {
ConcreteToBConcreteTypeConverter converter;
mlir::concretelang::Concrete::LweCiphertextType resultTy =
((mlir::Type)concreteOp->getResult(0).getType())
.cast<mlir::concretelang::Concrete::LweCiphertextType>();
auto newResultTy =
converter.convertType(resultTy).cast<mlir::RankedTensorType>();
mlir::TypeRange resultTyRange = concreteOp->getResultTypes();
llvm::ArrayRef<::mlir::NamedAttribute> attributes =
concreteOp.getOperation()->getAttrs();
auto crt = resultTy.getCrtDecomposition();
mlir::Operation *bConcreteOp;
if (crt.empty()) {
bConcreteOp = rewriter.replaceOpWithNewOp<BConcreteOp>(
concreteOp, newResultTy, concreteOp.getOperation()->getOperands(),
attributes);
if (resultTyRange.size() == 1 &&
resultTyRange.front()
.isa<mlir::concretelang::Concrete::LweCiphertextType>()) {
auto crt = resultTyRange.front()
.cast<mlir::concretelang::Concrete::LweCiphertextType>()
.getCrtDecomposition();
if (crt.empty()) {
bConcreteOp = rewriter.replaceOpWithNewOp<BConcreteOp>(
concreteOp, resultTyRange, concreteOp.getOperation()->getOperands(),
attributes);
} else {
auto newAttributes = attributes.vec();
newAttributes.push_back(rewriter.getNamedAttr(
"crtDecomposition", rewriter.getI64ArrayAttr(crt)));
bConcreteOp = rewriter.replaceOpWithNewOp<BConcreteCRTOp>(
concreteOp, resultTyRange, concreteOp.getOperation()->getOperands(),
newAttributes);
}
} else {
auto newAttributes = attributes.vec();
newAttributes.push_back(rewriter.getNamedAttr(
"crtDecomposition", rewriter.getI64ArrayAttr(crt)));
bConcreteOp = rewriter.replaceOpWithNewOp<BConcreteCRTOp>(
concreteOp, newResultTy, concreteOp.getOperation()->getOperands(),
newAttributes);
bConcreteOp = rewriter.replaceOpWithNewOp<BConcreteOp>(
concreteOp, resultTyRange, concreteOp.getOperation()->getOperands(),
attributes);
}
mlir::concretelang::convertOperandAndResultTypes(
@@ -906,7 +917,16 @@ void ConcreteToBConcretePass::runOnOperation() {
mlir::concretelang::BConcrete::KeySwitchLweBufferOp>,
LowToBConcrete<mlir::concretelang::Concrete::BootstrapLweOp,
mlir::concretelang::BConcrete::BootstrapLweBufferOp,
mlir::concretelang::BConcrete::KeySwitchLweBufferOp>,
mlir::concretelang::BConcrete::BootstrapLweBufferOp>,
LowToBConcrete<mlir::concretelang::Concrete::BootstrapLweGPUOp,
mlir::concretelang::BConcrete::BootstrapLweGPUBufferOp,
mlir::concretelang::BConcrete::BootstrapLweGPUBufferOp>,
LowToBConcrete<mlir::concretelang::Concrete::MoveBskToGPUOp,
mlir::concretelang::BConcrete::MoveBskToGPUOp,
mlir::concretelang::BConcrete::MoveBskToGPUOp>,
LowToBConcrete<mlir::concretelang::Concrete::FreeBskFromGPUOp,
mlir::concretelang::BConcrete::FreeBskFromGPUOp,
mlir::concretelang::BConcrete::FreeBskFromGPUOp>,
LowToBConcrete<Concrete::WopPBSLweOp, BConcrete::WopPBSCRTLweBufferOp,
BConcrete::WopPBSCRTLweBufferOp>>(&getContext());

View File

@@ -0,0 +1,16 @@
add_mlir_dialect_library(ConcreteToGPU
ConcreteToGPU.cpp
ADDITIONAL_HEADER_DIRS
${PROJECT_SOURCE_DIR}/include/concretelang/Dialect/Concrete
DEPENDS
ConcreteDialect
mlir-headers
LINK_LIBS PUBLIC
MLIRIR
MLIRTransforms
)
target_link_libraries(ConcreteToGPU PUBLIC ConcreteDialect MLIRIR)

View File

@@ -0,0 +1,108 @@
// Part of the Concrete Compiler Project, under the BSD3 License with Zama
// Exceptions. See
// https://github.com/zama-ai/concrete-compiler-internal/blob/main/LICENSE.txt
// for license information.
#include <mlir/Pass/Pass.h>
#include <mlir/Transforms/DialectConversion.h>
#include "concretelang/Conversion/Passes.h"
#include "concretelang/Dialect/Concrete/IR/ConcreteDialect.h"
#include "concretelang/Dialect/Concrete/IR/ConcreteOps.h"
#include "concretelang/Dialect/Concrete/IR/ConcreteTypes.h"
/// This rewrite pattern transforms any instance of `Concrete.bootstrap_lwe`
/// into `Concrete.bootstrap_lwe_gpu`. It also inserts operations to allocate
/// memory, copy the bsk to the GPU, and free the memory after bootstrapping.
struct BstOpPattern : public mlir::OpRewritePattern<
mlir::concretelang::Concrete::BootstrapLweOp> {
BstOpPattern(::mlir::MLIRContext *context, mlir::PatternBenefit benefit = 1)
: ::mlir::OpRewritePattern<mlir::concretelang::Concrete::BootstrapLweOp>(
context, benefit) {}
::mlir::LogicalResult
matchAndRewrite(mlir::concretelang::Concrete::BootstrapLweOp bstOp,
::mlir::PatternRewriter &rewriter) const override {
auto baselog = bstOp.baseLog();
auto level = bstOp.level();
mlir::Value ct = bstOp.input_ciphertext();
auto ctType =
ct.getType().cast<mlir::concretelang::Concrete::LweCiphertextType>();
auto inputLweDim = ctType.getDimension();
auto outType = bstOp.getResult()
.getType()
.cast<mlir::concretelang::Concrete::LweCiphertextType>();
auto outputLweDim = outType.getDimension();
// copy bsk into GPU
mlir::Value bskGPU =
rewriter
.create<mlir::concretelang::Concrete::MoveBskToGPUOp>(
bstOp.getLoc(), mlir::concretelang::Concrete::GPUBskType::get(
rewriter.getContext()))
.getResult();
mlir::Value inputLweDimCst = rewriter.create<mlir::arith::ConstantIntOp>(
bstOp.getLoc(), inputLweDim, 32);
// the output LWE dimension is used as the polynomial size
mlir::Value polySizeCst = rewriter.create<mlir::arith::ConstantIntOp>(
bstOp.getLoc(), outputLweDim, 32);
mlir::Value levelCst =
rewriter.create<mlir::arith::ConstantIntOp>(bstOp.getLoc(), level, 32);
mlir::Value baselogCst = rewriter.create<mlir::arith::ConstantIntOp>(
bstOp.getLoc(), baselog, 32);
// placeholder table: the CUDA wrapper currently builds its own test vector,
// so this 4-element constant is not used as the actual LUT
mlir::Type tableType =
mlir::RankedTensorType::get({4}, rewriter.getI64Type());
mlir::Value tableCst = rewriter.create<mlir::arith::ConstantOp>(
bstOp.getLoc(),
mlir::DenseIntElementsAttr::get(
tableType, {llvm::APInt(64, 0), llvm::APInt(64, 0),
llvm::APInt(64, 0), llvm::APInt(64, 0)}));
rewriter
.replaceOpWithNewOp<mlir::concretelang::Concrete::BootstrapLweGPUOp>(
bstOp, outType, ct, tableCst, inputLweDimCst, polySizeCst, levelCst,
baselogCst, bskGPU);
// free bsk memory from GPU
rewriter.create<mlir::concretelang::Concrete::FreeBskFromGPUOp>(
bstOp.getLoc(), bskGPU);
return ::mlir::success();
};
};
namespace {
struct ConcreteToGPUPass : public ConcreteToGPUBase<ConcreteToGPUPass> {
void runOnOperation() final;
};
} // namespace
void ConcreteToGPUPass::runOnOperation() {
auto op = this->getOperation();
mlir::ConversionTarget target(getContext());
mlir::RewritePatternSet patterns(&getContext());
target.addLegalDialect<mlir::concretelang::Concrete::ConcreteDialect,
mlir::arith::ArithmeticDialect>();
target.addIllegalOp<mlir::concretelang::Concrete::BootstrapLweOp>();
patterns.insert<BstOpPattern>(&getContext());
// Apply conversion
if (mlir::applyPartialConversion(op, target, std::move(patterns)).failed()) {
this->signalPassFailure();
}
}
namespace mlir {
namespace concretelang {
std::unique_ptr<OperationPass<ModuleOp>> createConvertConcreteToGPUPass() {
return std::make_unique<ConcreteToGPUPass>();
}
} // namespace concretelang
} // namespace mlir

View File

@@ -6,6 +6,7 @@
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "concretelang/Conversion/Tools.h"
#include "concretelang/Dialect/Concrete/IR/ConcreteTypes.h"
mlir::LogicalResult insertForwardDeclaration(mlir::Operation *op,
mlir::OpBuilder &rewriter,
@@ -35,3 +36,27 @@ mlir::LogicalResult insertForwardDeclaration(mlir::Operation *op,
mlir::SymbolTable::lookupSymbolIn(module, funcName)));
return mlir::success();
}
/// Returns the value of the context argument from the enclosing func
mlir::Value getContextArgument(mlir::Operation *op) {
mlir::Block *block = op->getBlock();
while (block != nullptr) {
if (llvm::isa<mlir::func::FuncOp>(block->getParentOp())) {
auto context = std::find_if(
block->getArguments().rbegin(), block->getArguments().rend(),
[](mlir::BlockArgument &arg) {
return arg.getType()
.isa<mlir::concretelang::Concrete::ContextType>();
});
assert(context != block->getArguments().rend() &&
"Cannot find the Concrete.context");
return *context;
}
block = block->getParentOp()->getBlock();
}
assert("can't find a function that enclose the op");
return nullptr;
}

View File

@@ -7,6 +7,7 @@
#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
#include "mlir/Dialect/Bufferization/Transforms/BufferUtils.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
@@ -75,6 +76,7 @@ char memref_bootstrap_lwe_u64[] = "memref_bootstrap_lwe_u64";
char memref_keyswitch_async_lwe_u64[] = "memref_keyswitch_async_lwe_u64";
char memref_bootstrap_async_lwe_u64[] = "memref_bootstrap_async_lwe_u64";
char memref_await_future[] = "memref_await_future";
char memref_bootstrap_lwe_cuda_u64[] = "memref_bootstrap_lwe_cuda_u64";
char memref_expand_lut_in_trivial_glwe_ct_u64[] =
"memref_expand_lut_in_trivial_glwe_ct_u64";
@@ -89,6 +91,9 @@ mlir::LogicalResult insertForwardDeclarationOfTheCAPI(
mlir::concretelang::RT::FutureType::get(rewriter.getIndexType());
auto contextType =
mlir::concretelang::Concrete::ContextType::get(rewriter.getContext());
auto i32Type = rewriter.getI32Type();
auto i64PointerType = mlir::LLVM::LLVMPointerType::get(rewriter.getI64Type());
mlir::FunctionType funcType;
if (funcName == memref_add_lwe_ciphertexts_u64) {
@@ -124,6 +129,12 @@ mlir::LogicalResult insertForwardDeclarationOfTheCAPI(
funcType = mlir::FunctionType::get(
rewriter.getContext(),
{memref1DType, futureType, memref1DType, memref1DType}, {});
} else if (funcName == memref_bootstrap_lwe_cuda_u64) {
funcType = mlir::FunctionType::get(rewriter.getContext(),
{memref1DType, memref1DType,
memref1DType, i32Type, i32Type, i32Type,
i32Type, i64PointerType},
{});
} else if (funcName == memref_expand_lut_in_trivial_glwe_ct_u64) {
funcType = mlir::FunctionType::get(rewriter.getContext(),
{
@@ -156,32 +167,6 @@ mlir::LogicalResult insertForwardDeclarationOfTheCAPI(
return insertForwardDeclaration(op, rewriter, funcName, funcType);
}
/// Returns the value of the context argument from the enclosing func
mlir::Value getContextArgument(mlir::Operation *op) {
mlir::Block *block = op->getBlock();
while (block != nullptr) {
if (llvm::isa<mlir::func::FuncOp>(block->getParentOp())) {
block = &mlir::cast<mlir::func::FuncOp>(block->getParentOp())
.getBody()
.front();
auto context =
std::find_if(block->getArguments().rbegin(),
block->getArguments().rend(), [](BlockArgument &arg) {
return arg.getType()
.isa<mlir::concretelang::Concrete::ContextType>();
});
assert(context != block->getArguments().rend() &&
"Cannot find the Concrete.context");
return *context;
}
block = block->getParentOp()->getBlock();
}
assert("can't find a function that enclose the op");
return nullptr;
};
template <typename Op>
void pushAdditionalArgs(Op op, mlir::SmallVector<mlir::Value> &operands,
RewriterBase &rewriter);
@@ -578,6 +563,10 @@ void mlir::concretelang::BConcrete::
BufferizableWithCallOpInterface<BConcrete::NegateLweBufferOp,
memref_negate_lwe_ciphertext_u64>>(
*ctx);
BConcrete::BootstrapLweGPUBufferOp::attachInterface<
BufferizableWithCallOpInterface<BConcrete::BootstrapLweGPUBufferOp,
memref_bootstrap_lwe_cuda_u64, false>>(
*ctx);
BConcrete::KeySwitchLweBufferOp::attachInterface<
BufferizableWithCallOpInterface<BConcrete::KeySwitchLweBufferOp,
memref_keyswitch_lwe_u64>>(*ctx);

View File

@@ -18,6 +18,16 @@ if(CONCRETELANG_DATAFLOW_EXECUTION_ENABLED)
)
endif()
if(CONCRETELANG_CUDA_SUPPORT)
# link against concrete-cuda and the CUDA runtime (assumes the default CUDA
# toolkit location /usr/local/cuda)
target_link_libraries(
ConcretelangRuntime
PRIVATE
ConcreteCUDA
-L/usr/local/cuda/lib64
cudart
)
endif()
if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
target_link_libraries(ConcretelangRuntime PUBLIC omp)
else()

View File

@@ -57,6 +57,122 @@ void encode_and_expand_lut(uint64_t *output, size_t output_size,
#include "concretelang/ClientLib/CRT.h"
#include "concretelang/Runtime/wrappers.h"
#ifdef CONCRETELANG_CUDA_SUPPORT
// We need to define the double2 struct used by the CUDA backend header files.
// It shouldn't be defined here, but rather included along with the
// concrete-cuda header files.
typedef struct double2 {
double x, y;
} double2;
#include "bootstrap.h"
#include "device.h"
void memref_keyswitch_lwe_cuda_u64(uint64_t *out_allocated,
uint64_t *out_aligned, uint64_t out_offset,
uint64_t out_size, uint64_t out_stride,
uint64_t *ct0_allocated,
uint64_t *ct0_aligned, uint64_t ct0_offset,
uint64_t ct0_size, uint64_t ct0_stride,
void *ksk_gpu) {
// TODO: GPU implementation
}
void *move_ct_to_gpu(uint64_t *ct_allocated, uint64_t *ct_aligned,
uint64_t ct_offset, uint64_t ct_size, uint64_t ct_stride,
uint32_t gpu_idx) {
void *stream = cuda_create_stream(gpu_idx);
void *ct_gpu = cuda_malloc(ct_size * sizeof(uint64_t), gpu_idx);
cuda_memcpy_async_to_gpu(ct_gpu, ct_aligned + ct_offset,
ct_size * sizeof(uint64_t), stream, gpu_idx);
cuda_synchronize_device(gpu_idx);
cuda_destroy_stream(stream, gpu_idx);
return ct_gpu;
}
void *move_bsk_to_gpu(mlir::concretelang::RuntimeContext *context,
uint32_t gpu_idx = 0) {
void *stream = cuda_create_stream(gpu_idx);
LweBootstrapKey_u64 *bsk = get_bootstrap_key_u64(context);
BufferView bskBuffer = bootstrap_buffer_lwe_u64(bsk);
void *bsk_gpu = cuda_malloc(bskBuffer.length, gpu_idx);
cuda_memcpy_async_to_gpu(bsk_gpu, (void *)bskBuffer.pointer, bskBuffer.length,
stream, gpu_idx);
cuda_synchronize_device(gpu_idx);
cuda_destroy_stream(stream, gpu_idx);
return bsk_gpu;
}
void move_ct_to_cpu(uint64_t *out_allocated, uint64_t *out_aligned,
uint64_t out_offset, uint64_t out_size, uint64_t out_stride,
void *ct_gpu, size_t size, uint32_t gpu_idx) {
void *stream = cuda_create_stream(gpu_idx);
cuda_memcpy_async_to_cpu(out_aligned + out_offset, ct_gpu,
size * sizeof(uint64_t), stream, gpu_idx);
cuda_synchronize_device(gpu_idx);
cuda_destroy_stream(stream, gpu_idx);
}
void free_from_gpu(void *gpu_ptr, uint32_t gpu_idx = 0) {
cuda_drop(gpu_ptr, gpu_idx);
}
void memref_bootstrap_lwe_cuda_u64(
uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset,
uint64_t out_size, uint64_t out_stride, uint64_t *ct0_allocated,
uint64_t *ct0_aligned, uint64_t ct0_offset, uint64_t ct0_size,
uint64_t ct0_stride, uint64_t *tlu_allocated, uint64_t *tlu_aligned,
uint64_t tlu_offset, uint64_t tlu_size, uint64_t tlu_stride,
uint32_t input_lwe_dim, uint32_t poly_size, uint32_t level,
uint32_t base_log, void *bsk_gpu) {
uint32_t gpu_idx = 0;
void *stream = cuda_create_stream(gpu_idx);
// move input ciphertext into gpu
void *ct0_gpu = move_ct_to_gpu(ct0_allocated, ct0_aligned, ct0_offset,
ct0_size, ct0_stride, gpu_idx);
// move output ciphertext into gpu
void *out_gpu = move_ct_to_gpu(out_allocated, out_aligned, out_offset,
out_size, out_stride, gpu_idx);
// hardcoded values for a single bootstrap
uint32_t num_samples = 1, num_test_vectors = 1, lwe_idx = 0;
void *test_vector_idxes = malloc(num_samples * sizeof(uint32_t));
((uint32_t *)test_vector_idxes)[0] = 0;
// NOTE: the tlu arguments are ignored for now; a constant test vector is
// built instead
void *test_vector = malloc(poly_size * sizeof(uint64_t));
for (size_t i = 0; i < poly_size; i++) {
((uint64_t *)test_vector)[i] = (uint64_t)1 << 61;
}
// move test vector into gpu
void *test_vector_gpu = cuda_malloc(poly_size * sizeof(uint64_t), gpu_idx);
cuda_memcpy_async_to_gpu(test_vector_gpu, test_vector,
poly_size * sizeof(uint64_t), stream, gpu_idx);
// move test vector indexes into gpu
void *test_vector_idxes_gpu =
cuda_malloc(num_samples * sizeof(uint32_t), gpu_idx);
cuda_memcpy_async_to_gpu(test_vector_idxes_gpu, test_vector_idxes,
num_samples * sizeof(uint32_t), stream, gpu_idx);
// run gpu bootstrap
cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
stream, out_gpu, test_vector_gpu, test_vector_idxes_gpu, ct0_gpu, bsk_gpu,
input_lwe_dim, poly_size, base_log, level, num_samples, num_test_vectors,
lwe_idx, cuda_get_max_shared_memory(gpu_idx));
// copy output ciphertext back to cpu
move_ct_to_cpu(out_allocated, out_aligned, out_offset, out_size, out_stride,
out_gpu, out_size, gpu_idx);
cuda_synchronize_device(gpu_idx);
// free memory that we allocated on gpu
cuda_drop(ct0_gpu, gpu_idx);
cuda_drop(out_gpu, gpu_idx);
cuda_drop(test_vector_gpu, gpu_idx);
cuda_drop(test_vector_idxes_gpu, gpu_idx);
cuda_destroy_stream(stream, gpu_idx);
// free the host-side buffers allocated for the test vector and its indexes
free(test_vector);
free(test_vector_idxes);
}
#endif
void memref_expand_lut_in_trivial_glwe_ct_u64(
uint64_t *glwe_ct_allocated, uint64_t *glwe_ct_aligned,
uint64_t glwe_ct_offset, uint64_t glwe_ct_size, uint64_t glwe_ct_stride,
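
An illustrative host-side call into the CUDA bootstrap wrapper above, following the MLIR memref ABI (allocated pointer, aligned pointer, offset, size, stride); all parameter values below are placeholders, not recommended cryptographic parameters:

#include <cstdint>
#include <vector>

void bootstrapOnGpu(mlir::concretelang::RuntimeContext *context) {
  const uint32_t input_lwe_dim = 600, poly_size = 1024, level = 3, base_log = 7;
  std::vector<uint64_t> out(poly_size + 1), ct(input_lwe_dim + 1), tlu(poly_size);
  // the bsk must already be in GPU memory
  void *bsk_gpu = move_bsk_to_gpu(context, /*gpu_idx=*/0);
  memref_bootstrap_lwe_cuda_u64(
      out.data(), out.data(), 0, out.size(), 1, // output ciphertext
      ct.data(), ct.data(), 0, ct.size(), 1,    // input ciphertext
      tlu.data(), tlu.data(), 0, tlu.size(), 1, // lookup table
      input_lwe_dim, poly_size, level, base_log, bsk_gpu);
  free_from_gpu(bsk_gpu, /*gpu_idx=*/0);
}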

View File

@@ -317,6 +317,14 @@ CompilerEngine::compile(llvm::SourceMgr &sm, Target target, OptionalLib lib) {
return errorDiag("Optimizing Concrete failed");
}
// Transform Concrete operations into GPU operations if requested
if (this->compilerOptions.useGPU &&
mlir::concretelang::pipeline::transformsConcreteToGPU(mlirContext, module,
this->enablePass)
.failed()) {
return errorDiag("Transforming Concrete to GPU failed");
}
if (target == Target::CONCRETE)
return std::move(res);

View File

@@ -239,6 +239,16 @@ optimizeConcrete(mlir::MLIRContext &context, mlir::ModuleOp &module,
return pm.run(module.getOperation());
}
mlir::LogicalResult
transformsConcreteToGPU(mlir::MLIRContext &context, mlir::ModuleOp &module,
std::function<bool(mlir::Pass *)> enablePass) {
mlir::PassManager pm(&context);
pipelinePrinting("ConcreteToGPU", pm, context);
addPotentiallyNestedPass(
pm, mlir::concretelang::createConvertConcreteToGPUPass(), enablePass);
return pm.run(module.getOperation());
}
mlir::LogicalResult
lowerConcreteToBConcrete(mlir::MLIRContext &context, mlir::ModuleOp &module,
std::function<bool(mlir::Pass *)> enablePass,
@@ -283,6 +293,8 @@ lowerBConcreteToStd(mlir::MLIRContext &context, mlir::ModuleOp &module,
enablePass);
addPotentiallyNestedPass(pm, mlir::concretelang::createAddRuntimeContext(),
enablePass);
addPotentiallyNestedPass(
pm, mlir::concretelang::createConvertBConcreteToCAPIPass(), enablePass);
return pm.run(module.getOperation());
}

View File

@@ -98,6 +98,12 @@ llvm::cl::opt<bool>
"dialects. (Enabled by default)"),
llvm::cl::init<bool>(true));
llvm::cl::opt<bool>
useGPU("use-gpu",
llvm::cl::desc("enable/disable generating concrete GPU "
"operations (Disabled by default)"),
llvm::cl::init<bool>(false));
llvm::cl::list<std::string> passes(
"passes",
llvm::cl::desc("Specify the passes to run (use only for compiler tests)"),
@@ -283,6 +289,7 @@ cmdlineCompilationOptions() {
options.loopParallelize = cmdline::loopParallelize;
options.dataflowParallelize = cmdline::dataflowParallelize;
options.optimizeConcrete = cmdline::optimizeConcrete;
options.useGPU = cmdline::useGPU;
if (!cmdline::v0Constraint.empty()) {
if (cmdline::v0Constraint.size() != 2) {