feat: support GPU (bootstrapping)

2026-04-17 03:00:54 -04:00 · 2022-07-21 14:45:28 +01:00
parent a487b03699
commit d169a27fc0
26 changed files with 715 additions and 47 deletions
--- a/compiler/include/concretelang/Conversion/BConcreteToCAPI/Pass.h
+++ b/compiler/include/concretelang/Conversion/BConcreteToCAPI/Pass.h
@@ -0,0 +1,18 @@
+// Part of the Concrete Compiler Project, under the BSD3 License with Zama
+// Exceptions. See
+// https://github.com/zama-ai/concrete-compiler-internal/blob/main/LICENSE.txt
+// for license information.
+
+#ifndef ZAMALANG_CONVERSION_BCONCRETETOCAPI_PASS_H_
+#define ZAMALANG_CONVERSION_BCONCRETETOCAPI_PASS_H_
+
+#include "mlir/Pass/Pass.h"
+
+namespace mlir {
+namespace concretelang {
+/// Creates a `ModuleOp` pass that lowers `BConcrete` operations to CAPI calls.
+std::unique_ptr<OperationPass<ModuleOp>> createConvertBConcreteToCAPIPass();
+} // namespace concretelang
+} // namespace mlir
+
+#endif
--- a/compiler/include/concretelang/Conversion/ConcreteToGPU/Pass.h
+++ b/compiler/include/concretelang/Conversion/ConcreteToGPU/Pass.h
@@ -0,0 +1,18 @@
+// Part of the Concrete Compiler Project, under the BSD3 License with Zama
+// Exceptions. See
+// https://github.com/zama-ai/concrete-compiler-internal/blob/main/LICENSE.txt
+// for license information.
+
+#ifndef ZAMALANG_CONVERSION_CONCRETETOGPU_PASS_H_
+#define ZAMALANG_CONVERSION_CONCRETETOGPU_PASS_H_
+
+#include "mlir/Pass/Pass.h"
+
+namespace mlir {
+namespace concretelang {
+/// Creates a `ModuleOp` pass that converts `Concrete` operations to GPU.
+std::unique_ptr<OperationPass<ModuleOp>> createConvertConcreteToGPUPass();
+} // namespace concretelang
+} // namespace mlir
+
+#endif
--- a/compiler/include/concretelang/Conversion/Passes.h
+++ b/compiler/include/concretelang/Conversion/Passes.h
@@ -13,7 +13,9 @@
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"

+#include "concretelang/Conversion/BConcreteToCAPI/Pass.h"
 #include "concretelang/Conversion/ConcreteToBConcrete/Pass.h"
+#include "concretelang/Conversion/ConcreteToGPU/Pass.h"
 #include "concretelang/Conversion/FHETensorOpsToLinalg/Pass.h"
 #include "concretelang/Conversion/FHEToTFHE/Pass.h"
 #include "concretelang/Conversion/LinalgExtras/Passes.h"
--- a/compiler/include/concretelang/Conversion/Passes.td
+++ b/compiler/include/concretelang/Conversion/Passes.td
@@ -47,6 +47,20 @@ def ConcreteToBConcrete : Pass<"concrete-to-bconcrete", "mlir::ModuleOp"> {
  let dependentDialects = ["mlir::linalg::LinalgDialect", "mlir::concretelang::Concrete::ConcreteDialect", "mlir::concretelang::BConcrete::BConcreteDialect"];
 }

+def BConcreteToCAPI : Pass<"bconcrete-to-capi", "mlir::ModuleOp"> {
+  let summary = "Lowers operations from the BConcrete dialect to CAPI calls";
+  let description = [{ Lowers operations from the BConcrete dialect to CAPI calls }];
+  let constructor = "mlir::concretelang::createConvertBConcreteToCAPIPass()";
+  let dependentDialects = ["mlir::concretelang::BConcrete::BConcreteDialect"];
+}
+
+def ConcreteToGPU : Pass<"concrete-to-gpu", "mlir::ModuleOp"> {
+  let summary = "Transforms operations in the Concrete dialect to GPU";
+  let description = [{ Transforms operations in the Concrete dialect to GPU }];
+  let constructor = "mlir::concretelang::createConvertConcreteToGPUPass()";
+  let dependentDialects = ["mlir::concretelang::Concrete::ConcreteDialect"];
+}
+
 def MLIRLowerableDialectsToLLVM : Pass<"mlir-lowerable-dialects-to-llvm", "mlir::ModuleOp"> {
  let summary = "Lowers operations from MLIR lowerable dialects to LLVM";
  let constructor = "mlir::concretelang::createConvertMLIRLowerableDialectsToLLVMPass()";
--- a/compiler/include/concretelang/Conversion/Tools.h
+++ b/compiler/include/concretelang/Conversion/Tools.h
@@ -9,3 +9,9 @@ mlir::LogicalResult insertForwardDeclaration(mlir::Operation *op,
                                             mlir::OpBuilder &rewriter,
                                             llvm::StringRef funcName,
                                             mlir::FunctionType funcType);
+
+/// \brief Returns the context argument of the function enclosing `op`.
+///
+/// \param op operation from which the search for the enclosing func starts
+/// \return the context value of that enclosing function
+mlir::Value getContextArgument(mlir::Operation *op);
--- a/compiler/include/concretelang/Dialect/BConcrete/IR/BConcreteOps.h
+++ b/compiler/include/concretelang/Dialect/BConcrete/IR/BConcreteOps.h
@@ -6,6 +6,7 @@
 #ifndef ZAMALANG_DIALECT_BConcrete_BConcrete_OPS_H
 #define ZAMALANG_DIALECT_BConcrete_BConcrete_OPS_H

+#include <mlir/Dialect/LLVMIR/LLVMDialect.h>
 #include <mlir/IR/Builders.h>
 #include <mlir/IR/BuiltinOps.h>
 #include <mlir/IR/BuiltinTypes.h>
--- a/compiler/include/concretelang/Dialect/BConcrete/IR/BConcreteOps.td
+++ b/compiler/include/concretelang/Dialect/BConcrete/IR/BConcreteOps.td
@@ -5,6 +5,7 @@ include "mlir/Interfaces/SideEffectInterfaces.td"
 include "mlir/Interfaces/ControlFlowInterfaces.td"
 include "mlir/IR/BuiltinTypes.td"
 include "mlir/Dialect/MemRef/IR/MemRefBase.td"
+include "mlir/Dialect/LLVMIR/LLVMOpBase.td"

 include "concretelang/Dialect/BConcrete/IR/BConcreteDialect.td"
 include "concretelang/Dialect/Concrete/IR/ConcreteTypes.td"
@@ -157,4 +158,27 @@ def BConcrete_AwaitFutureOp :
    let results = (outs 1DTensorOf<[I64]>:$result);
 }

+def BConcrete_BootstrapLweGPUBufferOp : BConcrete_Op<"bootstrap_lwe_gpu_buffer"> {
+    let arguments = (ins
+        1DTensorOf<[I64]>:$input_ciphertext,
+        1DTensorOf<[I64]>:$table,
+        I32:$inputLweDim,
+        I32:$polySize,
+        I32:$level,
+        I32:$baseLog,
+        LLVM_PointerTo<I64>:$bsk
+    );
+    let results = (outs 1DTensorOf<[I64]>:$result);
+}
+
+def BConcrete_MoveBskToGPUOp : BConcrete_Op<"move_bsk_to_gpu"> {
+    let arguments = (ins);
+    let results = (outs LLVM_PointerTo<I64>:$bsk);
+}
+
+def BConcrete_FreeBskFromGPUOp : BConcrete_Op<"free_bsk_from_gpu"> {
+    let arguments = (ins LLVM_PointerTo<I64>:$bsk);
+    let results = (outs);
+}
+
 #endif
--- a/compiler/include/concretelang/Dialect/Concrete/IR/ConcreteOps.td
+++ b/compiler/include/concretelang/Dialect/Concrete/IR/ConcreteOps.td
@@ -52,7 +52,7 @@ def Concrete_NegateLweCiphertextOp : Concrete_Op<"negate_lwe_ciphertext"> {
    let results = (outs Concrete_LweCiphertextType:$result);
 }

-def Concrete_GlweFromTable : Concrete_Op<"glwe_from_table"> {
+def Concrete_GlweFromTable : Concrete_Op<"glwe_from_table", [NoSideEffect]> {
    let summary = "Creates a GLWE ciphertext which is the trivial encrytion of a the input table interpreted as a polynomial (to use later in a bootstrap)";

    let arguments = (ins 1DTensorOf<[I64]>:$table);
@@ -71,6 +71,35 @@ def Concrete_BootstrapLweOp : Concrete_Op<"bootstrap_lwe"> {
    let results = (outs Concrete_LweCiphertextType:$result);
 }

+def Concrete_BootstrapLweGPUOp : Concrete_Op<"bootstrap_lwe_gpu"> {
+    let summary = "Bootstrap an LWE ciphertext in GPU using a lookup table";
+
+    let arguments = (ins
+        Concrete_LweCiphertextType:$input_ciphertext,
+        1DTensorOf<[I64]>:$table,
+        I32:$inputLweDim,
+        I32:$polySize,
+        I32:$level,
+        I32:$baseLog,
+        Concrete_GPUBsk:$bsk
+    );
+    let results = (outs Concrete_LweCiphertextType:$result);
+}
+
+def Concrete_MoveBskToGPUOp : Concrete_Op<"move_bsk_to_gpu"> {
+    let summary = "Move bsk to GPU";
+
+    let arguments = (ins);
+    let results = (outs Concrete_GPUBsk:$bsk);
+}
+
+def Concrete_FreeBskFromGPUOp : Concrete_Op<"free_bsk_from_gpu"> {
+    let summary = "Free bsk memory from GPU";
+
+    let arguments = (ins Concrete_GPUBsk:$bsk);
+    let results = (outs);
+}
+
 def Concrete_KeySwitchLweOp : Concrete_Op<"keyswitch_lwe"> {
    let summary = "Keyswitches a LWE ciphertext";

--- a/compiler/include/concretelang/Dialect/Concrete/IR/ConcreteTypes.td
+++ b/compiler/include/concretelang/Dialect/Concrete/IR/ConcreteTypes.td
@@ -93,4 +93,14 @@ def Concrete_Context : Concrete_Type<"Context"> {
    }];
 }

+def Concrete_GPUBsk : Concrete_Type<"GPUBsk"> {
+    let mnemonic = "gpu_bsk";
+
+    let summary = "A bsk in GPU";
+
+    let description = [{
+       A bootstrapping key in GPU memory
+    }];
+}
+
 #endif
--- a/compiler/include/concretelang/Runtime/wrappers.h
+++ b/compiler/include/concretelang/Runtime/wrappers.h
@@ -105,6 +105,87 @@ void memref_copy_one_rank(uint64_t *src_allocated, uint64_t *src_aligned,
                          uint64_t src_stride, uint64_t *dst_allocated,
                          uint64_t *dst_aligned, uint64_t dst_offset,
                          uint64_t dst_size, uint64_t dst_stride);
-}

+/// \brief Run bootstrapping on GPU.
+///
+/// It handles copying the arguments from CPU to GPU and freeing memory, except
+/// for the bootstrapping key, which should already be in GPU. NOTE(review): the
+/// out_, ct0_ and tlu_ groups look like rank-1 memref descriptors -- confirm.
+///
+/// \param out_allocated allocated pointer of the output
+/// \param out_aligned aligned pointer of the output
+/// \param out_offset offset of the output
+/// \param out_size size of the output
+/// \param out_stride stride of the output
+/// \param ct0_allocated allocated pointer of ct0 (presumably input ciphertext)
+/// \param ct0_aligned aligned pointer of ct0
+/// \param ct0_offset offset of ct0
+/// \param ct0_size size of ct0
+/// \param ct0_stride stride of ct0
+/// \param tlu_allocated allocated pointer of tlu (presumably the lookup table)
+/// \param tlu_aligned aligned pointer of tlu
+/// \param tlu_offset offset of tlu
+/// \param tlu_size size of tlu
+/// \param tlu_stride stride of tlu
+/// \param input_lwe_dim LWE input dimension
+/// \param poly_size polynomial size
+/// \param level level
+/// \param base_log base log
+/// \param bsk pointer to bsk on GPU
+void memref_bootstrap_lwe_cuda_u64(
+    uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset,
+    uint64_t out_size, uint64_t out_stride, uint64_t *ct0_allocated,
+    uint64_t *ct0_aligned, uint64_t ct0_offset, uint64_t ct0_size,
+    uint64_t ct0_stride, uint64_t *tlu_allocated, uint64_t *tlu_aligned,
+    uint64_t tlu_offset, uint64_t tlu_size, uint64_t tlu_stride,
+    uint32_t input_lwe_dim, uint32_t poly_size, uint32_t level,
+    uint32_t base_log, void *bsk);
+
+/// \brief Copy ciphertext from CPU to GPU using a single stream.
+///
+/// It handles memory allocation on GPU.
+///
+/// \param ct_allocated allocated pointer of the CPU ciphertext
+/// \param ct_aligned aligned pointer of the CPU ciphertext
+/// \param ct_offset offset of the CPU ciphertext
+/// \param ct_size size of the CPU ciphertext
+/// \param ct_stride stride of the CPU ciphertext
+/// \param gpu_idx index of the GPU to use
+/// \return void* pointer to the GPU ciphertext
+void *move_ct_to_gpu(uint64_t *ct_allocated, uint64_t *ct_aligned,
+                     uint64_t ct_offset, uint64_t ct_size, uint64_t ct_stride,
+                     uint32_t gpu_idx);
+
+/// \brief Copy ciphertext from GPU to CPU using a single stream.
+///
+/// Memory on GPU won't be freed after the copy.
+///
+/// \param out_allocated allocated pointer of the CPU destination
+/// \param out_aligned aligned pointer of the CPU destination
+/// \param out_offset offset of the CPU destination
+/// \param out_size size of the CPU destination
+/// \param out_stride stride of the CPU destination
+/// \param ct_gpu pointer to the ciphertext on GPU
+/// \param size size of the copy; NOTE(review): unit is not stated -- confirm
+/// \param gpu_idx index of the GPU to use
+void move_ct_to_cpu(uint64_t *out_allocated, uint64_t *out_aligned,
+                    uint64_t out_offset, uint64_t out_size, uint64_t out_stride,
+                    void *ct_gpu, size_t size, uint32_t gpu_idx);
+
+/// \brief Copy bootstrapping key from CPU to GPU using a single stream.
+///
+/// It handles memory allocation on GPU.
+///
+/// \param context runtime context; presumably the source of the bsk -- confirm
+/// \param gpu_idx index of the GPU to use
+/// \return void*  pointer to the GPU bsk
+void *move_bsk_to_gpu(mlir::concretelang::RuntimeContext *context,
+                      uint32_t gpu_idx);
+
+/// \brief Free GPU memory.
+///
+/// \param gpu_ptr pointer to the GPU memory to free
+/// \param gpu_idx index of the GPU to use
+void free_from_gpu(void *gpu_ptr, uint32_t gpu_idx);
+}
 #endif
--- a/compiler/include/concretelang/Support/CompilerEngine.h
+++ b/compiler/include/concretelang/Support/CompilerEngine.h
@@ -54,6 +54,8 @@ struct CompilationOptions {
  bool dataflowParallelize;
  bool asyncOffload;
  bool optimizeConcrete;
+  /// use GPU during execution by generating GPU operations if possible
+  bool useGPU;
  llvm::Optional<std::vector<int64_t>> fhelinalgTileSizes;

  llvm::Optional<std::string> clientParametersFuncName;
@@ -64,7 +66,7 @@ struct CompilationOptions {
      : v0FHEConstraints(llvm::None), verifyDiagnostics(false),
        autoParallelize(false), loopParallelize(false),
        dataflowParallelize(false), asyncOffload(false), optimizeConcrete(true),
-        clientParametersFuncName(llvm::None),
+        useGPU(false), clientParametersFuncName(llvm::None),
        optimizerConfig(optimizer::DEFAULT_CONFIG){};

  CompilationOptions(std::string funcname) : CompilationOptions() {
--- a/compiler/include/concretelang/Support/Pipeline.h
+++ b/compiler/include/concretelang/Support/Pipeline.h
@@ -57,6 +57,10 @@ mlir::LogicalResult asyncOffload(mlir::MLIRContext &context,
                                 mlir::ModuleOp &module,
                                 std::function<bool(mlir::Pass *)> enablePass);

+mlir::LogicalResult
+transformsConcreteToGPU(mlir::MLIRContext &context, mlir::ModuleOp &module,
+                        std::function<bool(mlir::Pass *)> enablePass);
+
 mlir::LogicalResult
 lowerBConcreteToStd(mlir::MLIRContext &context, mlir::ModuleOp &module,
                    std::function<bool(mlir::Pass *)> enablePass);