feat: support GPU (bootstrapping)

youben11
2022-07-21 14:45:28 +01:00
committed by Ayoub Benaissa
parent a487b03699
commit d169a27fc0
26 changed files with 715 additions and 47 deletions

View File

@@ -55,7 +55,19 @@ include_directories(${CONCRETE_FFI_RELEASE})
add_library(Concrete STATIC IMPORTED)
set_target_properties(Concrete PROPERTIES IMPORTED_LOCATION ${CONCRETE_FFI_RELEASE}/libconcrete_core_ffi.a)
# -------------------------------------------------------------------------------
#--------------------------------------------------------------------------------
# Concrete Cuda Configuration
#--------------------------------------------------------------------------------
option(CONCRETELANG_CUDA_SUPPORT "Support Concrete CUDA Execution." OFF)
if(CONCRETELANG_CUDA_SUPPORT)
message(STATUS "Building with Concrete CUDA execution support")
include_directories(${CONCRETE_CORE_PATH}/concrete-cuda/cuda/include)
add_library(ConcreteCUDA STATIC IMPORTED)
set_target_properties(ConcreteCUDA PROPERTIES IMPORTED_LOCATION ${CONCRETE_CORE_PATH}/concrete-cuda/cuda/build/src/libconcrete_cuda.a)
add_compile_options(-DCONCRETELANG_CUDA_SUPPORT)
endif()
#--------------------------------------------------------------------------------
# Python Configuration
# -------------------------------------------------------------------------------
option(CONCRETELANG_BINDINGS_PYTHON_ENABLED "Enables ConcreteLang Python bindings." ON)
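
A hedged build sketch, not prescribed by this commit: concrete-cuda is assumed to be prebuilt under ${CONCRETE_CORE_PATH}/concrete-cuda/cuda/build (the path the imported target points at), after which the option can be flipped on at configure time.

# hypothetical configure invocation, assuming concrete-cuda was built first
cmake -DCONCRETELANG_CUDA_SUPPORT=ON -DCONCRETE_CORE_PATH=/path/to/concrete-core ..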

View File

@@ -0,0 +1,18 @@
// Part of the Concrete Compiler Project, under the BSD3 License with Zama
// Exceptions. See
// https://github.com/zama-ai/concrete-compiler-internal/blob/main/LICENSE.txt
// for license information.
#ifndef ZAMALANG_CONVERSION_BCONCRETETOCAPI_PASS_H_
#define ZAMALANG_CONVERSION_BCONCRETETOCAPI_PASS_H_
#include "mlir/Pass/Pass.h"
namespace mlir {
namespace concretelang {
/// Create a pass to convert `BConcrete` dialect to CAPI calls.
std::unique_ptr<OperationPass<ModuleOp>> createConvertBConcreteToCAPIPass();
} // namespace concretelang
} // namespace mlir
#endif
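
A minimal usage sketch, assuming a standard MLIR pass-manager setup; the factory and include path are the ones declared above, the helper function itself is illustrative:

#include "concretelang/Conversion/BConcreteToCAPI/Pass.h"
#include "mlir/Pass/PassManager.h"

// Register the new lowering in a pipeline: BConcrete GPU ops become CAPI calls.
void addBConcreteToCAPI(mlir::PassManager &pm) {
  pm.addPass(mlir::concretelang::createConvertBConcreteToCAPIPass());
}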

View File

@@ -0,0 +1,18 @@
// Part of the Concrete Compiler Project, under the BSD3 License with Zama
// Exceptions. See
// https://github.com/zama-ai/concrete-compiler-internal/blob/main/LICENSE.txt
// for license information.
#ifndef ZAMALANG_CONVERSION_CONCRETETOGPU_PASS_H_
#define ZAMALANG_CONVERSION_CONCRETETOGPU_PASS_H_
#include "mlir/Pass/Pass.h"
namespace mlir {
namespace concretelang {
/// Create a pass to convert `Concrete` operations to GPU.
std::unique_ptr<OperationPass<ModuleOp>> createConvertConcreteToGPUPass();
} // namespace concretelang
} // namespace mlir
#endif

View File

@@ -13,7 +13,9 @@
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "concretelang/Conversion/BConcreteToCAPI/Pass.h"
#include "concretelang/Conversion/ConcreteToBConcrete/Pass.h"
#include "concretelang/Conversion/ConcreteToGPU/Pass.h"
#include "concretelang/Conversion/FHETensorOpsToLinalg/Pass.h"
#include "concretelang/Conversion/FHEToTFHE/Pass.h"
#include "concretelang/Conversion/LinalgExtras/Passes.h"

View File

@@ -47,6 +47,20 @@ def ConcreteToBConcrete : Pass<"concrete-to-bconcrete", "mlir::ModuleOp"> {
let dependentDialects = ["mlir::linalg::LinalgDialect", "mlir::concretelang::Concrete::ConcreteDialect", "mlir::concretelang::BConcrete::BConcreteDialect"];
}
def BConcreteToCAPI : Pass<"bconcrete-to-capi", "mlir::ModuleOp"> {
let summary = "Lowers operations from the BConcrete dialect to CAPI calls";
let description = [{ Lowers operations from the BConcrete dialect to CAPI calls }];
let constructor = "mlir::concretelang::createConvertBConcreteToCAPIPass()";
let dependentDialects = ["mlir::concretelang::BConcrete::BConcreteDialect"];
}
def ConcreteToGPU : Pass<"concrete-to-gpu", "mlir::ModuleOp"> {
let summary = "Transforms operations in the Concrete dialect to GPU";
let description = [{ Transforms operations in the Concrete dialect to GPU }];
let constructor = "mlir::concretelang::createConvertConcreteToGPUPass()";
let dependentDialects = ["mlir::concretelang::Concrete::ConcreteDialect"];
}
def MLIRLowerableDialectsToLLVM : Pass<"mlir-lowerable-dialects-to-llvm", "mlir::ModuleOp"> {
let summary = "Lowers operations from MLIR lowerable dialects to LLVM";
let constructor = "mlir::concretelang::createConvertMLIRLowerableDialectsToLLVMPass()";

View File

@@ -9,3 +9,9 @@ mlir::LogicalResult insertForwardDeclaration(mlir::Operation *op,
mlir::OpBuilder &rewriter,
llvm::StringRef funcName,
mlir::FunctionType funcType);
/// \brief Returns the value of the context argument from the enclosing func
///
/// \param op initial operation to start the search from
/// \return mlir::Value the context value
mlir::Value getContextArgument(mlir::Operation *op);

View File

@@ -6,6 +6,7 @@
#ifndef ZAMALANG_DIALECT_BConcrete_BConcrete_OPS_H
#define ZAMALANG_DIALECT_BConcrete_BConcrete_OPS_H
#include <mlir/Dialect/LLVMIR/LLVMDialect.h>
#include <mlir/IR/Builders.h>
#include <mlir/IR/BuiltinOps.h>
#include <mlir/IR/BuiltinTypes.h>

View File

@@ -5,6 +5,7 @@ include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/Interfaces/ControlFlowInterfaces.td"
include "mlir/IR/BuiltinTypes.td"
include "mlir/Dialect/MemRef/IR/MemRefBase.td"
include "mlir/Dialect/LLVMIR/LLVMOpBase.td"
include "concretelang/Dialect/BConcrete/IR/BConcreteDialect.td"
include "concretelang/Dialect/Concrete/IR/ConcreteTypes.td"
@@ -157,4 +158,27 @@ def BConcrete_AwaitFutureOp :
let results = (outs 1DTensorOf<[I64]>:$result);
}
def BConcrete_BootstrapLweGPUBufferOp : BConcrete_Op<"bootstrap_lwe_gpu_buffer"> {
let arguments = (ins
1DTensorOf<[I64]>:$input_ciphertext,
1DTensorOf<[I64]>:$table,
I32:$inputLweDim,
I32:$polySize,
I32:$level,
I32:$baseLog,
LLVM_PointerTo<I64>:$bsk
);
let results = (outs 1DTensorOf<[I64]>:$result);
}
def BConcrete_MoveBskToGPUOp : BConcrete_Op<"move_bsk_to_gpu"> {
let arguments = (ins);
let results = (outs LLVM_PointerTo<I64>:$bsk);
}
def BConcrete_FreeBskFromGPUOp : BConcrete_Op<"free_bsk_from_gpu"> {
let arguments = (ins LLVM_PointerTo<I64>:$bsk);
let results = (outs);
}
#endif

View File

@@ -52,7 +52,7 @@ def Concrete_NegateLweCiphertextOp : Concrete_Op<"negate_lwe_ciphertext"> {
let results = (outs Concrete_LweCiphertextType:$result);
}
def Concrete_GlweFromTable : Concrete_Op<"glwe_from_table"> {
def Concrete_GlweFromTable : Concrete_Op<"glwe_from_table", [NoSideEffect]> {
let summary = "Creates a GLWE ciphertext which is the trivial encrytion of a the input table interpreted as a polynomial (to use later in a bootstrap)";
let arguments = (ins 1DTensorOf<[I64]>:$table);
@@ -71,6 +71,35 @@ def Concrete_BootstrapLweOp : Concrete_Op<"bootstrap_lwe"> {
let results = (outs Concrete_LweCiphertextType:$result);
}
def Concrete_BootstrapLweGPUOp : Concrete_Op<"bootstrap_lwe_gpu"> {
let summary = "Bootstrap an LWE ciphertext in GPU using a lookup table";
let arguments = (ins
Concrete_LweCiphertextType:$input_ciphertext,
1DTensorOf<[I64]>:$table,
I32:$inputLweDim,
I32:$polySize,
I32:$level,
I32:$baseLog,
Concrete_GPUBsk:$bsk
);
let results = (outs Concrete_LweCiphertextType:$result);
}
def Concrete_MoveBskToGPUOp : Concrete_Op<"move_bsk_to_gpu"> {
let summary = "Move bsk to GPU";
let arguments = (ins);
let results = (outs Concrete_GPUBsk:$bsk);
}
def Concrete_FreeBskFromGPUOp : Concrete_Op<"free_bsk_from_gpu"> {
let summary = "Free bsk memory from GPU";
let arguments = (ins Concrete_GPUBsk:$bsk);
let results = (outs);
}
def Concrete_KeySwitchLweOp : Concrete_Op<"keyswitch_lwe"> {
let summary = "Keyswitches a LWE ciphertext";

View File

@@ -93,4 +93,14 @@ def Concrete_Context : Concrete_Type<"Context"> {
}];
}
def Concrete_GPUBsk : Concrete_Type<"GPUBsk"> {
let mnemonic = "gpu_bsk";
let summary = "A bsk in GPU";
let description = [{
A bootstrapping key in GPU memory
}];
}
#endif
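
A small sketch of working with the new type through the usual MLIR generated-type API; the include path matches the one used elsewhere in this commit, and the helpers themselves are illustrative:

#include "concretelang/Dialect/Concrete/IR/ConcreteTypes.h"

// Build the GPU bsk type and test whether a value carries it.
mlir::Type getGpuBskType(mlir::MLIRContext *ctx) {
  return mlir::concretelang::Concrete::GPUBskType::get(ctx);
}

bool isGpuBsk(mlir::Value v) {
  return v.getType().isa<mlir::concretelang::Concrete::GPUBskType>();
}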

View File

@@ -105,6 +105,87 @@ void memref_copy_one_rank(uint64_t *src_allocated, uint64_t *src_aligned,
uint64_t src_stride, uint64_t *dst_allocated,
uint64_t *dst_aligned, uint64_t dst_offset,
uint64_t dst_size, uint64_t dst_stride);
}
/// \brief Run bootstrapping on GPU.
///
/// It handles copying the different arguments from CPU to GPU and freeing
/// that memory afterwards, except for the bootstrapping key, which should
/// already be in GPU memory.
///
/// \param out_allocated allocated pointer of the output ciphertext memref
/// \param out_aligned aligned pointer of the output ciphertext memref
/// \param out_offset offset into the output memref
/// \param out_size size of the output memref
/// \param out_stride stride of the output memref
/// \param ct0_allocated allocated pointer of the input ciphertext memref
/// \param ct0_aligned aligned pointer of the input ciphertext memref
/// \param ct0_offset offset into the input memref
/// \param ct0_size size of the input memref
/// \param ct0_stride stride of the input memref
/// \param tlu_allocated allocated pointer of the lookup table memref
/// \param tlu_aligned aligned pointer of the lookup table memref
/// \param tlu_offset offset into the lookup table memref
/// \param tlu_size size of the lookup table memref
/// \param tlu_stride stride of the lookup table memref
/// \param input_lwe_dim LWE input dimension
/// \param poly_size polynomial size
/// \param level level
/// \param base_log base log
/// \param bsk pointer to bsk on GPU
void memref_bootstrap_lwe_cuda_u64(
uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset,
uint64_t out_size, uint64_t out_stride, uint64_t *ct0_allocated,
uint64_t *ct0_aligned, uint64_t ct0_offset, uint64_t ct0_size,
uint64_t ct0_stride, uint64_t *tlu_allocated, uint64_t *tlu_aligned,
uint64_t tlu_offset, uint64_t tlu_size, uint64_t tlu_stride,
uint32_t input_lwe_dim, uint32_t poly_size, uint32_t level,
uint32_t base_log, void *bsk);
/// \brief Copy ciphertext from CPU to GPU using a single stream.
///
/// It handles memory allocation on GPU.
///
/// \param ct_allocated allocated pointer of the ciphertext memref
/// \param ct_aligned aligned pointer of the ciphertext memref
/// \param ct_offset offset into the ciphertext memref
/// \param ct_size size of the ciphertext memref
/// \param ct_stride stride of the ciphertext memref
/// \param gpu_idx index of the GPU to use
/// \return void* pointer to the GPU ciphertext
void *move_ct_to_gpu(uint64_t *ct_allocated, uint64_t *ct_aligned,
uint64_t ct_offset, uint64_t ct_size, uint64_t ct_stride,
uint32_t gpu_idx);
/// \brief Copy ciphertext from GPU to CPU using a single stream.
///
/// Memory on GPU won't be freed after the copy.
///
/// \param out_allocated allocated pointer of the output memref
/// \param out_aligned aligned pointer of the output memref
/// \param out_offset offset into the output memref
/// \param out_size size of the output memref
/// \param out_stride stride of the output memref
/// \param ct_gpu pointer to the ciphertext in GPU memory
/// \param size number of elements to copy back
/// \param gpu_idx index of the GPU to use
void move_ct_to_cpu(uint64_t *out_allocated, uint64_t *out_aligned,
uint64_t out_offset, uint64_t out_size, uint64_t out_stride,
void *ct_gpu, size_t size, uint32_t gpu_idx);
/// \brief Copy bootstrapping key from CPU to GPU using a single stream.
///
/// It handles memory allocation on GPU.
///
/// \param context runtime context holding the bootstrapping key
/// \param gpu_idx index of the GPU to use
/// \return void* pointer to the GPU bsk
void *move_bsk_to_gpu(mlir::concretelang::RuntimeContext *context,
uint32_t gpu_idx);
/// \brief Free GPU memory.
///
/// \param gpu_ptr pointer to the GPU memory to free
/// \param gpu_idx index of the GPU to use
void free_from_gpu(void *gpu_ptr, uint32_t gpu_idx);
}
#endif
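
A hedged round-trip sketch of the single-stream copy helpers declared above (this header is assumed to be included; buffer size and contents are illustrative only):

#include <cstdint>
#include <vector>

void roundTrip(uint32_t gpu_idx) {
  // a ciphertext of lwe_dim + 1 = 1025 words, values illustrative
  std::vector<uint64_t> ct(1025, 0);
  // copy to the GPU; device memory is allocated by the helper
  void *ct_gpu = move_ct_to_gpu(ct.data(), ct.data(), /*ct_offset=*/0,
                                ct.size(), /*ct_stride=*/1, gpu_idx);
  // ... launch GPU work on ct_gpu here ...
  // copy back; the helper does not free the device buffer
  move_ct_to_cpu(ct.data(), ct.data(), 0, ct.size(), 1, ct_gpu, ct.size(),
                 gpu_idx);
  free_from_gpu(ct_gpu, gpu_idx);
}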

View File

@@ -54,6 +54,8 @@ struct CompilationOptions {
bool dataflowParallelize;
bool asyncOffload;
bool optimizeConcrete;
/// Use the GPU during execution by generating GPU operations where possible
bool useGPU;
llvm::Optional<std::vector<int64_t>> fhelinalgTileSizes;
llvm::Optional<std::string> clientParametersFuncName;
@@ -64,7 +66,7 @@ struct CompilationOptions {
: v0FHEConstraints(llvm::None), verifyDiagnostics(false),
autoParallelize(false), loopParallelize(false),
dataflowParallelize(false), asyncOffload(false), optimizeConcrete(true),
clientParametersFuncName(llvm::None),
useGPU(false), clientParametersFuncName(llvm::None),
optimizerConfig(optimizer::DEFAULT_CONFIG){};
CompilationOptions(std::string funcname) : CompilationOptions() {
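
A minimal sketch of opting into the new flag from C++; only the useGPU field comes from this commit, while the header location and namespace are assumptions:

#include "concretelang/Support/CompilerEngine.h" // assumed location of CompilationOptions

mlir::concretelang::CompilationOptions gpuOptions() {
  mlir::concretelang::CompilationOptions options("main");
  options.useGPU = true; // generate GPU bootstrap operations where possible
  return options;
}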

View File

@@ -57,6 +57,10 @@ mlir::LogicalResult asyncOffload(mlir::MLIRContext &context,
mlir::ModuleOp &module,
std::function<bool(mlir::Pass *)> enablePass);
mlir::LogicalResult
transformsConcreteToGPU(mlir::MLIRContext &context, mlir::ModuleOp &module,
std::function<bool(mlir::Pass *)> enablePass);
mlir::LogicalResult
lowerBConcreteToStd(mlir::MLIRContext &context, mlir::ModuleOp &module,
std::function<bool(mlir::Pass *)> enablePass);

View File

@@ -0,0 +1,119 @@
// Part of the Concrete Compiler Project, under the BSD3 License with Zama
// Exceptions. See
// https://github.com/zama-ai/concrete-compiler-internal/blob/main/LICENSE.txt
// for license information.
#include <mlir/Dialect/Func/IR/FuncOps.h>
#include <mlir/Pass/Pass.h>
#include <mlir/Transforms/DialectConversion.h>
#include "concretelang/Conversion/Passes.h"
#include "concretelang/Conversion/Tools.h"
#include "concretelang/Dialect/BConcrete/IR/BConcreteDialect.h"
#include "concretelang/Dialect/BConcrete/IR/BConcreteOps.h"
char move_bsk_to_gpu[] = "move_bsk_to_gpu";
char free_from_gpu[] = "free_from_gpu";
/// \brief Rewrites `BConcrete.move_bsk_to_gpu` into a CAPI call to
/// `move_bsk_to_gpu`
///
/// Also inserts the forward declaration of `move_bsk_to_gpu`
struct MoveBskOpPattern : public mlir::OpRewritePattern<
mlir::concretelang::BConcrete::MoveBskToGPUOp> {
MoveBskOpPattern(::mlir::MLIRContext *context,
mlir::PatternBenefit benefit = 1)
: ::mlir::OpRewritePattern<mlir::concretelang::BConcrete::MoveBskToGPUOp>(
context, benefit) {}
::mlir::LogicalResult
matchAndRewrite(mlir::concretelang::BConcrete::MoveBskToGPUOp moveBskOp,
::mlir::PatternRewriter &rewriter) const override {
auto ctx = getContextArgument(moveBskOp);
mlir::SmallVector<mlir::Value> operands{ctx};
// Insert forward declaration of the function
auto contextType =
mlir::concretelang::Concrete::ContextType::get(rewriter.getContext());
auto funcType = mlir::FunctionType::get(
rewriter.getContext(), {contextType},
{mlir::LLVM::LLVMPointerType::get(rewriter.getI64Type())});
if (insertForwardDeclaration(moveBskOp, rewriter, move_bsk_to_gpu, funcType)
.failed()) {
return mlir::failure();
}
rewriter.replaceOpWithNewOp<mlir::func::CallOp>(
moveBskOp, move_bsk_to_gpu, moveBskOp.getResult().getType(), operands);
return ::mlir::success();
};
};
/// \brief Rewrites `BConcrete.free_bsk_from_gpu` into a CAPI call to
/// `free_from_gpu`
///
/// Also inserts the forward declaration of `free_from_gpu`
struct FreeBskOpPattern : public mlir::OpRewritePattern<
mlir::concretelang::BConcrete::FreeBskFromGPUOp> {
FreeBskOpPattern(::mlir::MLIRContext *context,
mlir::PatternBenefit benefit = 1)
: ::mlir::OpRewritePattern<
mlir::concretelang::BConcrete::FreeBskFromGPUOp>(context, benefit) {
}
::mlir::LogicalResult
matchAndRewrite(mlir::concretelang::BConcrete::FreeBskFromGPUOp freeBskOp,
::mlir::PatternRewriter &rewriter) const override {
mlir::SmallVector<mlir::Value> operands{freeBskOp.bsk()};
// Insert forward declaration of the function
auto funcType = mlir::FunctionType::get(
rewriter.getContext(),
{mlir::LLVM::LLVMPointerType::get(rewriter.getI64Type())}, {});
if (insertForwardDeclaration(freeBskOp, rewriter, free_from_gpu, funcType)
.failed()) {
return mlir::failure();
}
rewriter.replaceOpWithNewOp<mlir::func::CallOp>(
freeBskOp, free_from_gpu, mlir::TypeRange({}), operands);
return ::mlir::success();
};
};
namespace {
struct BConcreteToCAPIPass : public BConcreteToCAPIBase<BConcreteToCAPIPass> {
void runOnOperation() final;
};
} // namespace
void BConcreteToCAPIPass::runOnOperation() {
auto op = this->getOperation();
mlir::ConversionTarget target(getContext());
mlir::RewritePatternSet patterns(&getContext());
target.addIllegalOp<mlir::concretelang::BConcrete::MoveBskToGPUOp>();
target.addLegalDialect<mlir::func::FuncDialect>();
patterns.insert<MoveBskOpPattern>(&getContext());
patterns.insert<FreeBskOpPattern>(&getContext());
// Apply conversion
if (mlir::applyPartialConversion(op, target, std::move(patterns)).failed()) {
this->signalPassFailure();
}
}
namespace mlir {
namespace concretelang {
std::unique_ptr<OperationPass<ModuleOp>> createConvertBConcreteToCAPIPass() {
return std::make_unique<BConcreteToCAPIPass>();
}
} // namespace concretelang
} // namespace mlir

View File

@@ -0,0 +1,15 @@
add_mlir_dialect_library(BConcreteToCAPI
BConcreteToCAPI.cpp
ADDITIONAL_HEADER_DIRS
${PROJECT_SOURCE_DIR}/include/concretelang/Dialect/BConcrete
DEPENDS
BConcreteDialect
mlir-headers
LINK_LIBS PUBLIC
MLIRIR
MLIRTransforms)
target_link_libraries(BConcreteToCAPI PUBLIC BConcreteDialect MLIRIR)

View File

@@ -3,6 +3,8 @@ add_subdirectory(TFHEGlobalParametrization)
add_subdirectory(TFHEToConcrete)
add_subdirectory(FHETensorOpsToLinalg)
add_subdirectory(ConcreteToBConcrete)
add_subdirectory(ConcreteToGPU)
add_subdirectory(BConcreteToCAPI)
add_subdirectory(MLIRLowerableDialectsToLLVM)
add_subdirectory(LinalgExtras)

View File

@@ -9,6 +9,7 @@
#include <mlir/Dialect/Affine/IR/AffineOps.h>
#include <mlir/Dialect/Bufferization/IR/Bufferization.h>
#include <mlir/Dialect/Func/IR/FuncOps.h>
#include <mlir/Dialect/LLVMIR/LLVMDialect.h>
#include <mlir/Dialect/Linalg/IR/Linalg.h>
#include <mlir/Dialect/SCF/IR/SCF.h>
#include <mlir/Dialect/Tensor/IR/Tensor.h>
@@ -64,6 +65,10 @@ class ConcreteToBConcreteTypeConverter : public mlir::TypeConverter {
public:
ConcreteToBConcreteTypeConverter() {
addConversion([](mlir::Type type) { return type; });
addConversion([&](mlir::concretelang::Concrete::GPUBskType type) {
return mlir::LLVM::LLVMPointerType::get(
mlir::IntegerType::get(type.getContext(), 64));
});
addConversion([&](mlir::concretelang::Concrete::PlaintextType type) {
return mlir::IntegerType::get(type.getContext(), 64);
});
@@ -160,28 +165,34 @@ struct LowToBConcrete : public mlir::OpRewritePattern<ConcreteOp> {
matchAndRewrite(ConcreteOp concreteOp,
::mlir::PatternRewriter &rewriter) const override {
ConcreteToBConcreteTypeConverter converter;
mlir::concretelang::Concrete::LweCiphertextType resultTy =
((mlir::Type)concreteOp->getResult(0).getType())
.cast<mlir::concretelang::Concrete::LweCiphertextType>();
auto newResultTy =
converter.convertType(resultTy).cast<mlir::RankedTensorType>();
mlir::TypeRange resultTyRange = concreteOp->getResultTypes();
llvm::ArrayRef<::mlir::NamedAttribute> attributes =
concreteOp.getOperation()->getAttrs();
auto crt = resultTy.getCrtDecomposition();
mlir::Operation *bConcreteOp;
if (crt.empty()) {
bConcreteOp = rewriter.replaceOpWithNewOp<BConcreteOp>(
concreteOp, newResultTy, concreteOp.getOperation()->getOperands(),
attributes);
if (resultTyRange.size() == 1 &&
resultTyRange.front()
.isa<mlir::concretelang::Concrete::LweCiphertextType>()) {
auto crt = resultTyRange.front()
.cast<mlir::concretelang::Concrete::LweCiphertextType>()
.getCrtDecomposition();
if (crt.empty()) {
bConcreteOp = rewriter.replaceOpWithNewOp<BConcreteOp>(
concreteOp, resultTyRange, concreteOp.getOperation()->getOperands(),
attributes);
} else {
auto newAttributes = attributes.vec();
newAttributes.push_back(rewriter.getNamedAttr(
"crtDecomposition", rewriter.getI64ArrayAttr(crt)));
bConcreteOp = rewriter.replaceOpWithNewOp<BConcreteCRTOp>(
concreteOp, resultTyRange, concreteOp.getOperation()->getOperands(),
newAttributes);
}
} else {
auto newAttributes = attributes.vec();
newAttributes.push_back(rewriter.getNamedAttr(
"crtDecomposition", rewriter.getI64ArrayAttr(crt)));
bConcreteOp = rewriter.replaceOpWithNewOp<BConcreteCRTOp>(
concreteOp, newResultTy, concreteOp.getOperation()->getOperands(),
newAttributes);
bConcreteOp = rewriter.replaceOpWithNewOp<BConcreteOp>(
concreteOp, resultTyRange, concreteOp.getOperation()->getOperands(),
attributes);
}
mlir::concretelang::convertOperandAndResultTypes(
@@ -906,7 +917,16 @@ void ConcreteToBConcretePass::runOnOperation() {
mlir::concretelang::BConcrete::KeySwitchLweBufferOp>,
LowToBConcrete<mlir::concretelang::Concrete::BootstrapLweOp,
mlir::concretelang::BConcrete::BootstrapLweBufferOp,
mlir::concretelang::BConcrete::KeySwitchLweBufferOp>,
mlir::concretelang::BConcrete::BootstrapLweBufferOp>,
LowToBConcrete<mlir::concretelang::Concrete::BootstrapLweGPUOp,
mlir::concretelang::BConcrete::BootstrapLweGPUBufferOp,
mlir::concretelang::BConcrete::BootstrapLweGPUBufferOp>,
LowToBConcrete<mlir::concretelang::Concrete::MoveBskToGPUOp,
mlir::concretelang::BConcrete::MoveBskToGPUOp,
mlir::concretelang::BConcrete::MoveBskToGPUOp>,
LowToBConcrete<mlir::concretelang::Concrete::FreeBskFromGPUOp,
mlir::concretelang::BConcrete::FreeBskFromGPUOp,
mlir::concretelang::BConcrete::FreeBskFromGPUOp>,
LowToBConcrete<Concrete::WopPBSLweOp, BConcrete::WopPBSCRTLweBufferOp,
BConcrete::WopPBSCRTLweBufferOp>>(&getContext());

View File

@@ -0,0 +1,16 @@
add_mlir_dialect_library(ConcreteToGPU
ConcreteToGPU.cpp
ADDITIONAL_HEADER_DIRS
${PROJECT_SOURCE_DIR}/include/concretelang/Dialect/Concrete
DEPENDS
ConcreteDialect
mlir-headers
LINK_LIBS PUBLIC
MLIRIR
MLIRTransforms
)
target_link_libraries(ConcreteToGPU PUBLIC ConcreteDialect MLIRIR)

View File

@@ -0,0 +1,108 @@
// Part of the Concrete Compiler Project, under the BSD3 License with Zama
// Exceptions. See
// https://github.com/zama-ai/concrete-compiler-internal/blob/main/LICENSE.txt
// for license information.
#include <mlir/Pass/Pass.h>
#include <mlir/Transforms/DialectConversion.h>
#include "concretelang/Conversion/Passes.h"
#include "concretelang/Dialect/Concrete/IR/ConcreteDialect.h"
#include "concretelang/Dialect/Concrete/IR/ConcreteOps.h"
#include "concretelang/Dialect/Concrete/IR/ConcreteTypes.h"
/// This rewrite pattern transforms any instance of `Concrete.bootstrap_lwe`
/// into `Concrete.bootstrap_lwe_gpu`. It also inserts operations to allocate
/// memory, copy the bsk to the GPU, and free the memory after bootstrapping.
struct BstOpPattern : public mlir::OpRewritePattern<
mlir::concretelang::Concrete::BootstrapLweOp> {
BstOpPattern(::mlir::MLIRContext *context, mlir::PatternBenefit benefit = 1)
: ::mlir::OpRewritePattern<mlir::concretelang::Concrete::BootstrapLweOp>(
context, benefit) {}
::mlir::LogicalResult
matchAndRewrite(mlir::concretelang::Concrete::BootstrapLweOp bstOp,
::mlir::PatternRewriter &rewriter) const override {
auto baselog = bstOp.baseLog();
auto level = bstOp.level();
mlir::Value ct = bstOp.input_ciphertext();
auto ctType =
ct.getType().cast<mlir::concretelang::Concrete::LweCiphertextType>();
auto inputLweDim = ctType.getDimension();
auto outType = bstOp.getResult()
.getType()
.cast<mlir::concretelang::Concrete::LweCiphertextType>();
auto outputLweDim = outType.getDimension();
// copy bsk into GPU
mlir::Value bskGPU =
rewriter
.create<mlir::concretelang::Concrete::MoveBskToGPUOp>(
bstOp.getLoc(), mlir::concretelang::Concrete::GPUBskType::get(
rewriter.getContext()))
.getResult();
mlir::Value inputLweDimCst = rewriter.create<mlir::arith::ConstantIntOp>(
bstOp.getLoc(), inputLweDim, 32);
// the output LWE dimension is used as the polynomial size
mlir::Value polySizeCst = rewriter.create<mlir::arith::ConstantIntOp>(
bstOp.getLoc(), outputLweDim, 32);
mlir::Value levelCst =
rewriter.create<mlir::arith::ConstantIntOp>(bstOp.getLoc(), level, 32);
mlir::Value baselogCst = rewriter.create<mlir::arith::ConstantIntOp>(
bstOp.getLoc(), baselog, 32);
// placeholder table: the CUDA wrapper currently builds its own test vector,
// so this 4-element constant is not used as the actual LUT
mlir::Type tableType =
mlir::RankedTensorType::get({4}, rewriter.getI64Type());
mlir::Value tableCst = rewriter.create<mlir::arith::ConstantOp>(
bstOp.getLoc(),
mlir::DenseIntElementsAttr::get(
tableType, {llvm::APInt(64, 0), llvm::APInt(64, 0),
llvm::APInt(64, 0), llvm::APInt(64, 0)}));
rewriter
.replaceOpWithNewOp<mlir::concretelang::Concrete::BootstrapLweGPUOp>(
bstOp, outType, ct, tableCst, inputLweDimCst, polySizeCst, levelCst,
baselogCst, bskGPU);
// free bsk memory from GPU
rewriter.create<mlir::concretelang::Concrete::FreeBskFromGPUOp>(
bstOp.getLoc(), bskGPU);
return ::mlir::success();
};
};
namespace {
struct ConcreteToGPUPass : public ConcreteToGPUBase<ConcreteToGPUPass> {
void runOnOperation() final;
};
} // namespace
void ConcreteToGPUPass::runOnOperation() {
auto op = this->getOperation();
mlir::ConversionTarget target(getContext());
mlir::RewritePatternSet patterns(&getContext());
target.addLegalDialect<mlir::concretelang::Concrete::ConcreteDialect,
mlir::arith::ArithmeticDialect>();
target.addIllegalOp<mlir::concretelang::Concrete::BootstrapLweOp>();
patterns.insert<BstOpPattern>(&getContext());
// Apply conversion
if (mlir::applyPartialConversion(op, target, std::move(patterns)).failed()) {
this->signalPassFailure();
}
}
namespace mlir {
namespace concretelang {
std::unique_ptr<OperationPass<ModuleOp>> createConvertConcreteToGPUPass() {
return std::make_unique<ConcreteToGPUPass>();
}
} // namespace concretelang
} // namespace mlir

View File

@@ -6,6 +6,7 @@
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "concretelang/Conversion/Tools.h"
#include "concretelang/Dialect/Concrete/IR/ConcreteTypes.h"
mlir::LogicalResult insertForwardDeclaration(mlir::Operation *op,
mlir::OpBuilder &rewriter,
@@ -35,3 +36,27 @@ mlir::LogicalResult insertForwardDeclaration(mlir::Operation *op,
mlir::SymbolTable::lookupSymbolIn(module, funcName)));
return mlir::success();
}
/// Returns the value of the context argument from the enclosing func
mlir::Value getContextArgument(mlir::Operation *op) {
mlir::Block *block = op->getBlock();
while (block != nullptr) {
if (llvm::isa<mlir::func::FuncOp>(block->getParentOp())) {
auto context = std::find_if(
block->getArguments().rbegin(), block->getArguments().rend(),
[](mlir::BlockArgument &arg) {
return arg.getType()
.isa<mlir::concretelang::Concrete::ContextType>();
});
assert(context != block->getArguments().rend() &&
"Cannot find the Concrete.context");
return *context;
}
block = block->getParentOp()->getBlock();
}
assert("can't find a function that enclose the op");
return nullptr;
}

View File

@@ -7,6 +7,7 @@
#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
#include "mlir/Dialect/Bufferization/Transforms/BufferUtils.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
@@ -75,6 +76,7 @@ char memref_bootstrap_lwe_u64[] = "memref_bootstrap_lwe_u64";
char memref_keyswitch_async_lwe_u64[] = "memref_keyswitch_async_lwe_u64";
char memref_bootstrap_async_lwe_u64[] = "memref_bootstrap_async_lwe_u64";
char memref_await_future[] = "memref_await_future";
char memref_bootstrap_lwe_cuda_u64[] = "memref_bootstrap_lwe_cuda_u64";
char memref_expand_lut_in_trivial_glwe_ct_u64[] =
"memref_expand_lut_in_trivial_glwe_ct_u64";
@@ -89,6 +91,9 @@ mlir::LogicalResult insertForwardDeclarationOfTheCAPI(
mlir::concretelang::RT::FutureType::get(rewriter.getIndexType());
auto contextType =
mlir::concretelang::Concrete::ContextType::get(rewriter.getContext());
auto i32Type = rewriter.getI32Type();
auto i64PointerType = mlir::LLVM::LLVMPointerType::get(rewriter.getI64Type());
mlir::FunctionType funcType;
if (funcName == memref_add_lwe_ciphertexts_u64) {
@@ -124,6 +129,12 @@ mlir::LogicalResult insertForwardDeclarationOfTheCAPI(
funcType = mlir::FunctionType::get(
rewriter.getContext(),
{memref1DType, futureType, memref1DType, memref1DType}, {});
} else if (funcName == memref_bootstrap_lwe_cuda_u64) {
funcType = mlir::FunctionType::get(rewriter.getContext(),
{memref1DType, memref1DType,
memref1DType, i32Type, i32Type, i32Type,
i32Type, i64PointerType},
{});
} else if (funcName == memref_expand_lut_in_trivial_glwe_ct_u64) {
funcType = mlir::FunctionType::get(rewriter.getContext(),
{
@@ -156,32 +167,6 @@ mlir::LogicalResult insertForwardDeclarationOfTheCAPI(
return insertForwardDeclaration(op, rewriter, funcName, funcType);
}
/// Returns the value of the context argument from the enclosing func
mlir::Value getContextArgument(mlir::Operation *op) {
mlir::Block *block = op->getBlock();
while (block != nullptr) {
if (llvm::isa<mlir::func::FuncOp>(block->getParentOp())) {
block = &mlir::cast<mlir::func::FuncOp>(block->getParentOp())
.getBody()
.front();
auto context =
std::find_if(block->getArguments().rbegin(),
block->getArguments().rend(), [](BlockArgument &arg) {
return arg.getType()
.isa<mlir::concretelang::Concrete::ContextType>();
});
assert(context != block->getArguments().rend() &&
"Cannot find the Concrete.context");
return *context;
}
block = block->getParentOp()->getBlock();
}
assert("can't find a function that enclose the op");
return nullptr;
};
template <typename Op>
void pushAdditionalArgs(Op op, mlir::SmallVector<mlir::Value> &operands,
RewriterBase &rewriter);
@@ -578,6 +563,10 @@ void mlir::concretelang::BConcrete::
BufferizableWithCallOpInterface<BConcrete::NegateLweBufferOp,
memref_negate_lwe_ciphertext_u64>>(
*ctx);
BConcrete::BootstrapLweGPUBufferOp::attachInterface<
BufferizableWithCallOpInterface<BConcrete::BootstrapLweGPUBufferOp,
memref_bootstrap_lwe_cuda_u64, false>>(
*ctx);
BConcrete::KeySwitchLweBufferOp::attachInterface<
BufferizableWithCallOpInterface<BConcrete::KeySwitchLweBufferOp,
memref_keyswitch_lwe_u64>>(*ctx);

View File

@@ -18,6 +18,16 @@ if(CONCRETELANG_DATAFLOW_EXECUTION_ENABLED)
)
endif()
if(CONCRETELANG_CUDA_SUPPORT)
# link against concrete-cuda and the CUDA runtime (assumes the default CUDA
# toolkit location /usr/local/cuda)
target_link_libraries(
ConcretelangRuntime
PRIVATE
ConcreteCUDA
-L/usr/local/cuda/lib64
cudart
)
endif()
if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
target_link_libraries(ConcretelangRuntime PUBLIC omp)
else()

View File

@@ -57,6 +57,122 @@ void encode_and_expand_lut(uint64_t *output, size_t output_size,
#include "concretelang/ClientLib/CRT.h"
#include "concretelang/Runtime/wrappers.h"
#ifdef CONCRETELANG_CUDA_SUPPORT
// We need to define the double2 struct used by the CUDA backend header files.
// It shouldn't be defined here, but rather included along with the
// concrete-cuda header files.
typedef struct double2 {
double x, y;
} double2;
#include "bootstrap.h"
#include "device.h"
void memref_keyswitch_lwe_cuda_u64(uint64_t *out_allocated,
uint64_t *out_aligned, uint64_t out_offset,
uint64_t out_size, uint64_t out_stride,
uint64_t *ct0_allocated,
uint64_t *ct0_aligned, uint64_t ct0_offset,
uint64_t ct0_size, uint64_t ct0_stride,
void *ksk_gpu) {
// TODO: GPU implementation
}
void *move_ct_to_gpu(uint64_t *ct_allocated, uint64_t *ct_aligned,
uint64_t ct_offset, uint64_t ct_size, uint64_t ct_stride,
uint32_t gpu_idx) {
void *stream = cuda_create_stream(gpu_idx);
void *ct_gpu = cuda_malloc(ct_size * sizeof(uint64_t), gpu_idx);
cuda_memcpy_async_to_gpu(ct_gpu, ct_aligned + ct_offset,
ct_size * sizeof(uint64_t), stream, gpu_idx);
cuda_synchronize_device(gpu_idx);
cuda_destroy_stream(stream, gpu_idx);
return ct_gpu;
}
void *move_bsk_to_gpu(mlir::concretelang::RuntimeContext *context,
uint32_t gpu_idx = 0) {
void *stream = cuda_create_stream(gpu_idx);
LweBootstrapKey_u64 *bsk = get_bootstrap_key_u64(context);
BufferView bskBuffer = bootstrap_buffer_lwe_u64(bsk);
void *bsk_gpu = cuda_malloc(bskBuffer.length, gpu_idx);
cuda_memcpy_async_to_gpu(bsk_gpu, (void *)bskBuffer.pointer, bskBuffer.length,
stream, gpu_idx);
cuda_synchronize_device(gpu_idx);
cuda_destroy_stream(stream, gpu_idx);
return bsk_gpu;
}
void move_ct_to_cpu(uint64_t *out_allocated, uint64_t *out_aligned,
uint64_t out_offset, uint64_t out_size, uint64_t out_stride,
void *ct_gpu, size_t size, uint32_t gpu_idx) {
void *stream = cuda_create_stream(gpu_idx);
cuda_memcpy_async_to_cpu(out_aligned + out_offset, ct_gpu,
size * sizeof(uint64_t), stream, gpu_idx);
cuda_synchronize_device(gpu_idx);
cuda_destroy_stream(stream, gpu_idx);
}
void free_from_gpu(void *gpu_ptr, uint32_t gpu_idx = 0) {
cuda_drop(gpu_ptr, gpu_idx);
}
void memref_bootstrap_lwe_cuda_u64(
uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset,
uint64_t out_size, uint64_t out_stride, uint64_t *ct0_allocated,
uint64_t *ct0_aligned, uint64_t ct0_offset, uint64_t ct0_size,
uint64_t ct0_stride, uint64_t *tlu_allocated, uint64_t *tlu_aligned,
uint64_t tlu_offset, uint64_t tlu_size, uint64_t tlu_stride,
uint32_t input_lwe_dim, uint32_t poly_size, uint32_t level,
uint32_t base_log, void *bsk_gpu) {
uint32_t gpu_idx = 0;
void *stream = cuda_create_stream(gpu_idx);
// move input ciphertext into gpu
void *ct0_gpu = move_ct_to_gpu(ct0_allocated, ct0_aligned, ct0_offset,
ct0_size, ct0_stride, gpu_idx);
// move output ciphertext into gpu
void *out_gpu = move_ct_to_gpu(out_allocated, out_aligned, out_offset,
out_size, out_stride, gpu_idx);
// hardcoded values for a single bootstrap
uint32_t num_samples = 1, num_test_vectors = 1, lwe_idx = 0;
void *test_vector_idxes = malloc(num_samples * sizeof(uint32_t));
((uint32_t *)test_vector_idxes)[0] = 0;
// NOTE: the tlu arguments are ignored for now; a constant test vector is
// built instead
void *test_vector = malloc(poly_size * sizeof(uint64_t));
for (size_t i = 0; i < poly_size; i++) {
((uint64_t *)test_vector)[i] = (uint64_t)1 << 61;
}
// move test vector into gpu
void *test_vector_gpu = cuda_malloc(poly_size * sizeof(uint64_t), gpu_idx);
cuda_memcpy_async_to_gpu(test_vector_gpu, test_vector,
poly_size * sizeof(uint64_t), stream, gpu_idx);
// move test vector indexes into gpu
void *test_vector_idxes_gpu =
cuda_malloc(num_samples * sizeof(uint32_t), gpu_idx);
cuda_memcpy_async_to_gpu(test_vector_idxes_gpu, test_vector_idxes,
num_samples * sizeof(uint32_t), stream, gpu_idx);
// run gpu bootstrap
cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
stream, out_gpu, test_vector_gpu, test_vector_idxes_gpu, ct0_gpu, bsk_gpu,
input_lwe_dim, poly_size, base_log, level, num_samples, num_test_vectors,
lwe_idx, cuda_get_max_shared_memory(gpu_idx));
// copy output ciphertext back to cpu
move_ct_to_cpu(out_allocated, out_aligned, out_offset, out_size, out_stride,
out_gpu, out_size, gpu_idx);
cuda_synchronize_device(gpu_idx);
// free memory that we allocated on gpu
cuda_drop(ct0_gpu, gpu_idx);
cuda_drop(out_gpu, gpu_idx);
cuda_drop(test_vector_gpu, gpu_idx);
cuda_drop(test_vector_idxes_gpu, gpu_idx);
cuda_destroy_stream(stream, gpu_idx);
// free the host-side buffers allocated for the test vector and its indexes
free(test_vector);
free(test_vector_idxes);
}
#endif
void memref_expand_lut_in_trivial_glwe_ct_u64(
uint64_t *glwe_ct_allocated, uint64_t *glwe_ct_aligned,
uint64_t glwe_ct_offset, uint64_t glwe_ct_size, uint64_t glwe_ct_stride,
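
An illustrative host-side call into the CUDA bootstrap wrapper above, following the MLIR memref ABI (allocated pointer, aligned pointer, offset, size, stride); all parameter values below are placeholders, not recommended cryptographic parameters:

#include <cstdint>
#include <vector>

void bootstrapOnGpu(mlir::concretelang::RuntimeContext *context) {
  const uint32_t input_lwe_dim = 600, poly_size = 1024, level = 3, base_log = 7;
  std::vector<uint64_t> out(poly_size + 1), ct(input_lwe_dim + 1), tlu(poly_size);
  // the bsk must already be in GPU memory
  void *bsk_gpu = move_bsk_to_gpu(context, /*gpu_idx=*/0);
  memref_bootstrap_lwe_cuda_u64(
      out.data(), out.data(), 0, out.size(), 1, // output ciphertext
      ct.data(), ct.data(), 0, ct.size(), 1,    // input ciphertext
      tlu.data(), tlu.data(), 0, tlu.size(), 1, // lookup table
      input_lwe_dim, poly_size, level, base_log, bsk_gpu);
  free_from_gpu(bsk_gpu, /*gpu_idx=*/0);
}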

View File

@@ -317,6 +317,14 @@ CompilerEngine::compile(llvm::SourceMgr &sm, Target target, OptionalLib lib) {
return errorDiag("Optimizing Concrete failed");
}
// Transform Concrete operations into GPU operations if requested
if (this->compilerOptions.useGPU &&
mlir::concretelang::pipeline::transformsConcreteToGPU(mlirContext, module,
this->enablePass)
.failed()) {
return errorDiag("Transforming Concrete to GPU failed");
}
if (target == Target::CONCRETE)
return std::move(res);

View File

@@ -239,6 +239,16 @@ optimizeConcrete(mlir::MLIRContext &context, mlir::ModuleOp &module,
return pm.run(module.getOperation());
}
mlir::LogicalResult
transformsConcreteToGPU(mlir::MLIRContext &context, mlir::ModuleOp &module,
std::function<bool(mlir::Pass *)> enablePass) {
mlir::PassManager pm(&context);
pipelinePrinting("ConcreteToGPU", pm, context);
addPotentiallyNestedPass(
pm, mlir::concretelang::createConvertConcreteToGPUPass(), enablePass);
return pm.run(module.getOperation());
}
mlir::LogicalResult
lowerConcreteToBConcrete(mlir::MLIRContext &context, mlir::ModuleOp &module,
std::function<bool(mlir::Pass *)> enablePass,
@@ -283,6 +293,8 @@ lowerBConcreteToStd(mlir::MLIRContext &context, mlir::ModuleOp &module,
enablePass);
addPotentiallyNestedPass(pm, mlir::concretelang::createAddRuntimeContext(),
enablePass);
addPotentiallyNestedPass(
pm, mlir::concretelang::createConvertBConcreteToCAPIPass(), enablePass);
return pm.run(module.getOperation());
}

View File

@@ -98,6 +98,12 @@ llvm::cl::opt<bool>
"dialects. (Enabled by default)"),
llvm::cl::init<bool>(true));
llvm::cl::opt<bool>
useGPU("use-gpu",
llvm::cl::desc("enable/disable generating concrete GPU "
"operations (Disabled by default)"),
llvm::cl::init<bool>(false));
llvm::cl::list<std::string> passes(
"passes",
llvm::cl::desc("Specify the passes to run (use only for compiler tests)"),
@@ -283,6 +289,7 @@ cmdlineCompilationOptions() {
options.loopParallelize = cmdline::loopParallelize;
options.dataflowParallelize = cmdline::dataflowParallelize;
options.optimizeConcrete = cmdline::optimizeConcrete;
options.useGPU = cmdline::useGPU;
if (!cmdline::v0Constraint.empty()) {
if (cmdline::v0Constraint.size() != 2) {