feat: support GPU keyswitching

Author: youben11
Date: 2022-09-20 09:25:26 +01:00
Committed by: Ayoub Benaissa
Parent: 7e08614e6c
Commit: d615ff47f2
6 changed files with 245 additions and 50 deletions

View File

@@ -170,4 +170,18 @@ def BConcrete_BootstrapLweGPUBufferOp : BConcrete_Op<"bootstrap_lwe_gpu_buffer">
let results = (outs 1DTensorOf<[I64]>:$result);
}
// This op is separate in BConcrete only because of the way we currently lower to the CAPI.
// Once the CAPI lowering is decoupled from bufferization, we can remove this op and lower
// to the appropriate CAPI (GPU or CPU) depending on the useGPU compilation option.
def BConcrete_KeySwitchLweGPUBufferOp : BConcrete_Op<"keyswitch_lwe_gpu_buffer"> {
let arguments = (ins
1DTensorOf<[I64]>:$ciphertext,
I32:$level,
I32:$baseLog,
I32:$lwe_dim_in,
I32:$lwe_dim_out
);
let results = (outs 1DTensorOf<[I64]>:$result);
}
#endif
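
For reference, the textual form of the new op, as exercised by the FileCheck test added at the
end of this commit (SSA value names simplified), is:

%0 = "BConcrete.keyswitch_lwe_gpu_buffer"(%arg0, %c5_i32, %c2_i32, %c1024_i32, %c575_i32)
       : (tensor<1025xi64>, i32, i32, i32, i32) -> tensor<576xi64>

The operands follow the argument order above: input ciphertext, level, baseLog, input LWE
dimension, output LWE dimension.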

View File

@@ -129,8 +129,7 @@ void memref_copy_one_rank(uint64_t *src_allocated, uint64_t *src_aligned,
/// \brief Run bootstrapping on GPU.
///
/// It handles memory copy of the different arguments from CPU to GPU, and
/// freeing memory, except for the bootstrapping key, which should already be in
/// GPU.
/// freeing memory.
///
/// \param out_allocated
/// \param out_aligned
@@ -164,7 +163,35 @@ void memref_bootstrap_lwe_cuda_u64(
uint32_t base_log, uint32_t glwe_dim, uint32_t precision,
mlir::concretelang::RuntimeContext *context);
/// \brief Copy ciphertext from CPU to GPU using a single stream.
/// \brief Run keyswitching on GPU.
///
/// It handles memory copy of the different arguments from CPU to GPU, and
/// freeing memory.
///
/// \param out_allocated
/// \param out_aligned
/// \param out_offset
/// \param out_size
/// \param out_stride
/// \param ct0_allocated
/// \param ct0_aligned
/// \param ct0_offset
/// \param ct0_size
/// \param ct0_stride
/// \param level
/// \param base_log
/// \param input_lwe_dim LWE input dimension
/// \param output_lwe_dim LWE output dimension
/// \param context
void memref_keyswitch_lwe_cuda_u64(
uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset,
uint64_t out_size, uint64_t out_stride, uint64_t *ct0_allocated,
uint64_t *ct0_aligned, uint64_t ct0_offset, uint64_t ct0_size,
uint64_t ct0_stride, uint32_t level, uint32_t base_log,
uint32_t input_lwe_dim, uint32_t output_lwe_dim,
mlir::concretelang::RuntimeContext *context);
/// \brief Copy ciphertext from CPU to GPU.
///
/// It handles memory allocation on GPU.
///
@@ -174,12 +201,14 @@ void memref_bootstrap_lwe_cuda_u64(
/// \param ct_size
/// \param ct_stride
/// \param gpu_idx index of the GPU to use
/// \param stream cuda stream to use for the copy
/// \return void* pointer to the GPU ciphertext
void *move_ct_to_gpu(uint64_t *ct_allocated, uint64_t *ct_aligned,
uint64_t ct_offset, uint64_t ct_size, uint64_t ct_stride,
uint32_t gpu_idx);
void *memcpy_async_ct_to_gpu(uint64_t *ct_allocated, uint64_t *ct_aligned,
uint64_t ct_offset, uint64_t ct_size,
uint64_t ct_stride, uint32_t gpu_idx,
void *stream);
/// \brief Copy ciphertext from GPU to CPU using a single stream.
/// \brief Copy ciphertext from GPU to CPU.
///
/// Memory on GPU won't be freed after the copy.
///
@@ -191,11 +220,13 @@ void *move_ct_to_gpu(uint64_t *ct_allocated, uint64_t *ct_aligned,
/// \param ct_gpu
/// \param size
/// \param gpu_idx index of the GPU to use
void move_ct_to_cpu(uint64_t *out_allocated, uint64_t *out_aligned,
uint64_t out_offset, uint64_t out_size, uint64_t out_stride,
void *ct_gpu, size_t size, uint32_t gpu_idx);
/// \param stream cuda stream to use for the copy
void memcpy_async_ct_to_cpu(uint64_t *out_allocated, uint64_t *out_aligned,
uint64_t out_offset, uint64_t out_size,
uint64_t out_stride, void *ct_gpu, size_t size,
uint32_t gpu_idx, void *stream);
/// \brief Copy bootstrapping key from CPU to GPU using a single stream.
/// \brief Copy bootstrapping key from CPU to GPU.
///
/// It handles memory allocation on GPU, as well as conversion to the Fourier
/// domain.
@@ -206,10 +237,28 @@ void move_ct_to_cpu(uint64_t *out_allocated, uint64_t *out_aligned,
/// \param level
/// \param glwe_dim
/// \param gpu_idx index of the GPU to use
/// \param stream cuda stream to use for the copy
/// \return void* pointer to the GPU bsk
void *move_bsk_to_gpu(mlir::concretelang::RuntimeContext *context,
uint32_t input_lwe_dim, uint32_t poly_size,
uint32_t level, uint32_t glwe_dim, uint32_t gpu_idx);
void *memcpy_async_bsk_to_gpu(mlir::concretelang::RuntimeContext *context,
uint32_t input_lwe_dim, uint32_t poly_size,
uint32_t level, uint32_t glwe_dim,
uint32_t gpu_idx, void *stream);
/// \brief Copy keyswitching key from CPU to GPU.
///
/// It handles memory allocation on GPU.
///
/// \param context
/// \param level
/// \param input_lwe_dim
/// \param output_lwe_dim
/// \param gpu_idx index of the GPU to use
/// \param stream cuda stream to use for the copy
/// \return void* pointer to the GPU ksk
void *memcpy_async_ksk_to_gpu(mlir::concretelang::RuntimeContext *context,
uint32_t level, uint32_t input_lwe_dim,
uint32_t output_lwe_dim, uint32_t gpu_idx,
void *stream);
/// \brief Free gpu memory.
///
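
Taken together, these memcpy_async_* entry points replace the earlier move_* helpers, which
created and destroyed a CUDA stream on every call; the new variants take a caller-owned stream so
that all transfers and the kernel launch can be queued on the same stream and synchronized once.
A condensed sketch of the intended call sequence, mirroring memref_keyswitch_lwe_cuda_u64 as
implemented later in this commit (the buffer and parameter names are the wrapper's arguments):

void *stream = cuda_create_stream(gpu_idx);
// stage the input/output ciphertexts and the keyswitching key on the GPU
void *ct0_gpu = memcpy_async_ct_to_gpu(ct0_allocated, ct0_aligned, ct0_offset,
                                       ct0_size, ct0_stride, gpu_idx, stream);
void *out_gpu = memcpy_async_ct_to_gpu(out_allocated, out_aligned, out_offset,
                                       out_size, out_stride, gpu_idx, stream);
void *ksk_gpu = memcpy_async_ksk_to_gpu(context, level, input_lwe_dim,
                                        output_lwe_dim, gpu_idx, stream);
// ... enqueue the keyswitch kernel on the same stream ...
// copy the result back, then synchronize and release the GPU resources
memcpy_async_ct_to_cpu(out_allocated, out_aligned, out_offset, out_size,
                       out_stride, out_gpu, out_size, gpu_idx, stream);
cuda_synchronize_device(gpu_idx);
cuda_drop(ct0_gpu, gpu_idx);
cuda_drop(out_gpu, gpu_idx);
cuda_drop(ksk_gpu, gpu_idx);
cuda_destroy_stream(stream, gpu_idx);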

View File

@@ -201,6 +201,50 @@ struct LowToBConcrete : public mlir::OpRewritePattern<ConcreteOp> {
};
};
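// Lower Concrete.keyswitch_lwe to BConcrete.keyswitch_lwe_gpu_buffer, materializing the
// level and baseLog attributes, as well as the input/output LWE dimensions (read from the
// operand and result ciphertext types), as i32 constant operands. On the IR of the
// FileCheck test added below, this rewrites (SSA names abbreviated):
//   %0 = "Concrete.keyswitch_lwe"(%arg0) {baseLog = 2 : i32, level = 5 : i32}
//          : (!Concrete.lwe_ciphertext<1024,2>) -> !Concrete.lwe_ciphertext<575,2>
// into
//   %0 = "BConcrete.keyswitch_lwe_gpu_buffer"(%arg0, %c5_i32, %c2_i32, %c1024_i32, %c575_i32)
//          : (tensor<1025xi64>, i32, i32, i32, i32) -> tensor<576xi64>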
struct KeySwitchToGPU : public mlir::OpRewritePattern<
mlir::concretelang::Concrete::KeySwitchLweOp> {
KeySwitchToGPU(::mlir::MLIRContext *context, mlir::PatternBenefit benefit = 1)
: ::mlir::OpRewritePattern<mlir::concretelang::Concrete::KeySwitchLweOp>(
context, benefit) {}
::mlir::LogicalResult
matchAndRewrite(mlir::concretelang::Concrete::KeySwitchLweOp keySwitchOp,
::mlir::PatternRewriter &rewriter) const override {
ConcreteToBConcreteTypeConverter converter;
mlir::Value levelCst = rewriter.create<mlir::arith::ConstantIntOp>(
keySwitchOp.getLoc(), keySwitchOp.level(), 32);
mlir::Value baseLogCst = rewriter.create<mlir::arith::ConstantIntOp>(
keySwitchOp.getLoc(), keySwitchOp.baseLog(), 32);
// construct operands for in/out dimensions
mlir::concretelang::Concrete::LweCiphertextType outType =
keySwitchOp.getType();
auto outDim = outType.getDimension();
mlir::Value outDimCst = rewriter.create<mlir::arith::ConstantIntOp>(
keySwitchOp.getLoc(), outDim, 32);
auto inputType =
keySwitchOp.ciphertext()
.getType()
.cast<mlir::concretelang::Concrete::LweCiphertextType>();
auto inputDim = inputType.getDimension();
mlir::Value inputDimCst = rewriter.create<mlir::arith::ConstantIntOp>(
keySwitchOp.getLoc(), inputDim, 32);
mlir::Operation *bKeySwitchGPUOp = rewriter.replaceOpWithNewOp<
mlir::concretelang::BConcrete::KeySwitchLweGPUBufferOp>(
keySwitchOp, outType, keySwitchOp.ciphertext(), levelCst, baseLogCst,
inputDimCst, outDimCst);
mlir::concretelang::convertOperandAndResultTypes(
rewriter, bKeySwitchGPUOp, [&](mlir::MLIRContext *, mlir::Type t) {
return converter.convertType(t);
});
return ::mlir::success();
};
};
struct AddPlaintextLweCiphertextOpPattern
: public mlir::OpRewritePattern<Concrete::AddPlaintextLweCiphertextOp> {
AddPlaintextLweCiphertextOpPattern(::mlir::MLIRContext *context,
@@ -872,23 +916,24 @@ void ConcreteToBConcretePass::runOnOperation() {
LowToBConcrete<mlir::concretelang::Concrete::NegateLweCiphertextOp,
mlir::concretelang::BConcrete::NegateLweBufferOp,
BConcrete::NegateCRTLweBufferOp>,
LowToBConcrete<mlir::concretelang::Concrete::KeySwitchLweOp,
mlir::concretelang::BConcrete::KeySwitchLweBufferOp,
mlir::concretelang::BConcrete::KeySwitchLweBufferOp>,
LowToBConcrete<Concrete::WopPBSLweOp, BConcrete::WopPBSCRTLweBufferOp,
BConcrete::WopPBSCRTLweBufferOp>>(&getContext());
if (this->useGPU) {
patterns.insert<LowToBConcrete<
mlir::concretelang::Concrete::BootstrapLweOp,
mlir::concretelang::BConcrete::BootstrapLweGPUBufferOp,
mlir::concretelang::BConcrete::BootstrapLweGPUBufferOp>>(
&getContext());
patterns
.insert<LowToBConcrete<
mlir::concretelang::Concrete::BootstrapLweOp,
mlir::concretelang::BConcrete::BootstrapLweGPUBufferOp,
mlir::concretelang::BConcrete::BootstrapLweGPUBufferOp>,
KeySwitchToGPU>(&getContext());
} else {
patterns.insert<
LowToBConcrete<mlir::concretelang::Concrete::BootstrapLweOp,
mlir::concretelang::BConcrete::BootstrapLweBufferOp,
mlir::concretelang::BConcrete::BootstrapLweBufferOp>>(
mlir::concretelang::BConcrete::BootstrapLweBufferOp>,
LowToBConcrete<mlir::concretelang::Concrete::KeySwitchLweOp,
mlir::concretelang::BConcrete::KeySwitchLweBufferOp,
mlir::concretelang::BConcrete::KeySwitchLweBufferOp>>(
&getContext());
}
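
With this split, the GPU lowering is selected only when the pass is created with useGPU, which
the new FileCheck test drives through the compiler flag (the input path here is a placeholder):

concretecompiler --passes concrete-to-bconcrete --action=dump-bconcrete --use-gpu keyswitch_gpu.mlir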

View File

@@ -76,6 +76,7 @@ char memref_bootstrap_lwe_u64[] = "memref_bootstrap_lwe_u64";
char memref_keyswitch_async_lwe_u64[] = "memref_keyswitch_async_lwe_u64";
char memref_bootstrap_async_lwe_u64[] = "memref_bootstrap_async_lwe_u64";
char memref_await_future[] = "memref_await_future";
char memref_keyswitch_lwe_cuda_u64[] = "memref_keyswitch_lwe_cuda_u64";
char memref_bootstrap_lwe_cuda_u64[] = "memref_bootstrap_lwe_cuda_u64";
char memref_expand_lut_in_trivial_glwe_ct_u64[] =
"memref_expand_lut_in_trivial_glwe_ct_u64";
@@ -112,6 +113,11 @@ mlir::LogicalResult insertForwardDeclarationOfTheCAPI(
} else if (funcName == memref_keyswitch_lwe_u64) {
funcType = mlir::FunctionType::get(
rewriter.getContext(), {memref1DType, memref1DType, contextType}, {});
} else if (funcName == memref_keyswitch_lwe_cuda_u64) {
funcType = mlir::FunctionType::get(rewriter.getContext(),
{memref1DType, memref1DType, i32Type,
i32Type, i32Type, i32Type, contextType},
{});
} else if (funcName == memref_bootstrap_lwe_u64) {
funcType = mlir::FunctionType::get(rewriter.getContext(),
{memref1DType, memref1DType,
@@ -482,6 +488,10 @@ void mlir::concretelang::BConcrete::
BufferizableWithCallOpInterface<BConcrete::NegateLweBufferOp,
memref_negate_lwe_ciphertext_u64>>(
*ctx);
BConcrete::KeySwitchLweGPUBufferOp::attachInterface<
BufferizableWithCallOpInterface<BConcrete::KeySwitchLweGPUBufferOp,
memref_keyswitch_lwe_cuda_u64, true>>(
*ctx);
BConcrete::BootstrapLweGPUBufferOp::attachInterface<
BufferizableWithCallOpInterface<BConcrete::BootstrapLweGPUBufferOp,
memref_bootstrap_lwe_cuda_u64, true>>(

View File

@@ -65,32 +65,57 @@ typedef struct double2 {
// From concrete-cuda
#include "bootstrap.h"
#include "device.h"
#include "keyswitch.h"
void memref_keyswitch_lwe_cuda_u64(
uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset,
uint64_t out_size, uint64_t out_stride, uint64_t *ct0_allocated,
uint64_t *ct0_aligned, uint64_t ct0_offset, uint64_t ct0_size,
uint64_t ct0_stride, mlir::concretelang::RuntimeContext *context) {
// TODO: GPU implementation
uint64_t ct0_stride, uint32_t level, uint32_t base_log,
uint32_t input_lwe_dim, uint32_t output_lwe_dim,
mlir::concretelang::RuntimeContext *context) {
// We currently just use the first GPU, but this should be decided
// dynamically or at compile time in the future.
uint32_t gpu_idx = 0;
uint32_t num_samples = 1;
void *stream = cuda_create_stream(gpu_idx);
// move input ciphertext into gpu
void *ct0_gpu = memcpy_async_ct_to_gpu(ct0_allocated, ct0_aligned, ct0_offset,
ct0_size, ct0_stride, gpu_idx, stream);
// move output ciphertext into gpu
void *out_gpu = memcpy_async_ct_to_gpu(out_allocated, out_aligned, out_offset,
out_size, out_stride, gpu_idx, stream);
void *ksk_gpu = memcpy_async_ksk_to_gpu(context, level, input_lwe_dim,
output_lwe_dim, gpu_idx, stream);
cuda_keyswitch_lwe_ciphertext_vector_64(stream, out_gpu, ct0_gpu, ksk_gpu,
input_lwe_dim, output_lwe_dim,
base_log, level, num_samples);
// copy output ciphertext back to cpu
memcpy_async_ct_to_cpu(out_allocated, out_aligned, out_offset, out_size,
out_stride, out_gpu, out_size, gpu_idx, stream);
cuda_synchronize_device(gpu_idx);
// free memory that we allocated on gpu
cuda_drop(ct0_gpu, gpu_idx);
cuda_drop(out_gpu, gpu_idx);
cuda_drop(ksk_gpu, gpu_idx);
cuda_destroy_stream(stream, gpu_idx);
}
void *move_ct_to_gpu(uint64_t *ct_allocated, uint64_t *ct_aligned,
uint64_t ct_offset, uint64_t ct_size, uint64_t ct_stride,
uint32_t gpu_idx) {
void *stream = cuda_create_stream(gpu_idx);
void *memcpy_async_ct_to_gpu(uint64_t *ct_allocated, uint64_t *ct_aligned,
uint64_t ct_offset, uint64_t ct_size,
uint64_t ct_stride, uint32_t gpu_idx,
void *stream) {
size_t buf_size = ct_size * sizeof(uint64_t);
void *ct_gpu = cuda_malloc(buf_size, gpu_idx);
cuda_memcpy_async_to_gpu(ct_gpu, ct_aligned + ct_offset, buf_size, stream,
gpu_idx);
cuda_synchronize_device(gpu_idx);
cuda_destroy_stream(stream, gpu_idx);
return ct_gpu;
}
void *move_bsk_to_gpu(mlir::concretelang::RuntimeContext *context,
uint32_t input_lwe_dim, uint32_t poly_size,
uint32_t level, uint32_t glwe_dim, uint32_t gpu_idx = 0) {
void *stream = cuda_create_stream(gpu_idx);
void *memcpy_async_bsk_to_gpu(mlir::concretelang::RuntimeContext *context,
uint32_t input_lwe_dim, uint32_t poly_size,
uint32_t level, uint32_t glwe_dim,
uint32_t gpu_idx, void *stream) {
LweBootstrapKey64 *bsk = get_bootstrap_key_u64(context);
size_t bsk_buffer_len =
input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * poly_size * level;
@@ -105,20 +130,41 @@ void *move_bsk_to_gpu(mlir::concretelang::RuntimeContext *context,
cuda_initialize_twiddles(poly_size, gpu_idx);
cuda_convert_lwe_bootstrap_key_64(fbsk_gpu, bsk_buffer, stream, gpu_idx,
input_lwe_dim, glwe_dim, level, poly_size);
// This is currently not 100% async as we have to free CPU memory after
// conversion
cuda_synchronize_device(gpu_idx);
cuda_destroy_stream(stream, gpu_idx);
free(bsk_buffer);
return fbsk_gpu;
}
void move_ct_to_cpu(uint64_t *out_allocated, uint64_t *out_aligned,
uint64_t out_offset, uint64_t out_size, uint64_t out_stride,
void *ct_gpu, size_t size, uint32_t gpu_idx) {
void *stream = cuda_create_stream(gpu_idx);
void *memcpy_async_ksk_to_gpu(mlir::concretelang::RuntimeContext *context,
uint32_t level, uint32_t input_lwe_dim,
uint32_t output_lwe_dim, uint32_t gpu_idx,
void *stream) {
LweKeyswitchKey64 *ksk = get_keyswitch_key_u64(context);
size_t ksk_buffer_len = input_lwe_dim * (output_lwe_dim + 1) * level;
size_t ksk_buffer_size = sizeof(uint64_t) * ksk_buffer_len;
uint64_t *ksk_buffer =
(uint64_t *)aligned_alloc(U64_ALIGNMENT, ksk_buffer_size);
void *ksk_gpu = cuda_malloc(ksk_buffer_size, gpu_idx);
CAPI_ASSERT_ERROR(
default_engine_discard_convert_lwe_keyswitch_key_to_lwe_keyswitch_key_mut_view_u64_raw_ptr_buffers(
get_levelled_engine(), ksk, ksk_buffer));
cuda_memcpy_async_to_gpu(ksk_gpu, ksk_buffer, ksk_buffer_size, stream,
gpu_idx);
// This is currently not 100% async as we have to free CPU memory after
// conversion
cuda_synchronize_device(gpu_idx);
free(ksk_buffer);
return ksk_gpu;
}
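// As a size sanity check for memcpy_async_ksk_to_gpu: with the dimensions exercised by the
// FileCheck test added in this commit (input_lwe_dim = 1024, output_lwe_dim = 575, level = 5),
// ksk_buffer_len is 1024 * (575 + 1) * 5 = 2,949,120 64-bit words, i.e. about 22.5 MiB staged
// on the host and the same amount allocated on the GPU.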
void memcpy_async_ct_to_cpu(uint64_t *out_allocated, uint64_t *out_aligned,
uint64_t out_offset, uint64_t out_size,
uint64_t out_stride, void *ct_gpu, size_t size,
uint32_t gpu_idx, void *stream) {
cuda_memcpy_async_to_cpu(out_aligned + out_offset, ct_gpu,
size * sizeof(uint64_t), stream, gpu_idx);
cuda_synchronize_device(gpu_idx);
cuda_destroy_stream(stream, gpu_idx);
}
void free_from_gpu(void *gpu_ptr, uint32_t gpu_idx = 0) {
@@ -139,14 +185,14 @@ void memref_bootstrap_lwe_cuda_u64(
uint32_t gpu_idx = 0;
void *stream = cuda_create_stream(gpu_idx);
// move bsk to gpu
void *fbsk_gpu = move_bsk_to_gpu(context, input_lwe_dim, poly_size, level,
glwe_dim, gpu_idx);
void *fbsk_gpu = memcpy_async_bsk_to_gpu(context, input_lwe_dim, poly_size,
level, glwe_dim, gpu_idx, stream);
// move input ciphertext into gpu
void *ct0_gpu = move_ct_to_gpu(ct0_allocated, ct0_aligned, ct0_offset,
ct0_size, ct0_stride, gpu_idx);
void *ct0_gpu = memcpy_async_ct_to_gpu(ct0_allocated, ct0_aligned, ct0_offset,
ct0_size, ct0_stride, gpu_idx, stream);
// move output ciphertext into gpu
void *out_gpu = move_ct_to_gpu(out_allocated, out_aligned, out_offset,
out_size, out_stride, gpu_idx);
void *out_gpu = memcpy_async_ct_to_gpu(out_allocated, out_aligned, out_offset,
out_size, out_stride, gpu_idx, stream);
// construct LUT GLWE ciphertext
uint64_t glwe_ct_len = poly_size * (glwe_dim + 1);
uint64_t glwe_ct_size = glwe_ct_len * sizeof(uint64_t);
@@ -179,8 +225,8 @@ void memref_bootstrap_lwe_cuda_u64(
fbsk_gpu, input_lwe_dim, poly_size, base_log, level, num_samples,
num_test_vectors, lwe_idx, cuda_get_max_shared_memory(gpu_idx));
// copy output ciphertext back to cpu
move_ct_to_cpu(out_allocated, out_aligned, out_offset, out_size, out_stride,
out_gpu, out_size, gpu_idx);
memcpy_async_ct_to_cpu(out_allocated, out_aligned, out_offset, out_size,
out_stride, out_gpu, out_size, gpu_idx, stream);
cuda_synchronize_device(gpu_idx);
// free memory that we allocated on gpu
cuda_drop(fbsk_gpu, gpu_idx);

View File

@@ -0,0 +1,31 @@
// RUN: concretecompiler --passes concrete-to-bconcrete --action=dump-bconcrete --use-gpu %s 2>&1| FileCheck %s
//CHECK: func.func @main(%arg0: tensor<1025xi64>) -> tensor<1025xi64> {
//CHECK: %c1_i32 = arith.constant 1 : i32
//CHECK: %c8_i32 = arith.constant 8 : i32
//CHECK: %c2_i32 = arith.constant 2 : i32
//CHECK: %c1024_i32 = arith.constant 1024 : i32
//CHECK: %c575_i32 = arith.constant 575 : i32
//CHECK: %cst = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi64>
//CHECK: %c5_i32 = arith.constant 5 : i32
//CHECK: %c2_i32_0 = arith.constant 2 : i32
//CHECK: %c575_i32_1 = arith.constant 575 : i32
//CHECK: %c1024_i32_2 = arith.constant 1024 : i32
//CHECK: %0 = "BConcrete.keyswitch_lwe_gpu_buffer"(%arg0, %c5_i32, %c2_i32_0, %c1024_i32_2, %c575_i32_1) : (tensor<1025xi64>, i32, i32, i32, i32) -> tensor<576xi64>
//CHECK: %1 = "BConcrete.bootstrap_lwe_gpu_buffer"(%0, %cst, %c575_i32, %c1024_i32, %c2_i32, %c8_i32, %c1_i32, %c2_i32) : (tensor<576xi64>, tensor<4xi64>, i32, i32, i32, i32, i32, i32) -> tensor<1025xi64>
//CHECK: return %1 : tensor<1025xi64>
//CHECK: }
func.func @main(%arg0: !Concrete.lwe_ciphertext<1024,2>) -> !Concrete.lwe_ciphertext<1024,2> {
%c1_i32 = arith.constant 1 : i32
%c8_i32 = arith.constant 8 : i32
%c2_i32 = arith.constant 2 : i32
%c1024_i32 = arith.constant 1024 : i32
%c575_i32 = arith.constant 575 : i32
%cst = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi64>
%0 = "Concrete.keyswitch_lwe"(%arg0) {baseLog = 2 : i32, level = 5 : i32} : (!Concrete.lwe_ciphertext<1024,2>) -> !Concrete.lwe_ciphertext<575,2>
%1 = "Concrete.bootstrap_lwe"(%0, %cst, %c575_i32, %c1024_i32, %c2_i32, %c8_i32, %c1_i32, %c2_i32) : (!Concrete.lwe_ciphertext<575,2>, tensor<4xi64>, i32, i32, i32, i32, i32, i32) -> !Concrete.lwe_ciphertext<1024,2>
return %1 : !Concrete.lwe_ciphertext<1024,2>
}