Mirror of https://github.com/zama-ai/concrete.git, synced 2026-02-09 03:55:04 -05:00
feat(GPU-runtime): add a SDFG+DFR backend and runtime for dynamic GPU offloading.
Committed by: Quentin Bourgerie
Parent: 6eb8841652
Commit: 291019ba0f
Submodule compilers/concrete-compiler/compiler/concrete-core updated: bf79f5db63...0cfd7dd938
@@ -45,9 +45,9 @@ typedef struct RuntimeContext {
#ifdef CONCRETELANG_CUDA_SUPPORT
    for (int i = 0; i < num_devices; ++i) {
      if (bsk_gpu[i] != nullptr)
        cuda_drop(bsk_gpu[i], i);
      cuda_drop(bsk_gpu[i], i);
      if (ksk_gpu[i] != nullptr)
        cuda_drop(ksk_gpu[i], i);
      cuda_drop(ksk_gpu[i], i);
    }
#endif
  };
@@ -94,15 +94,15 @@ public:
    size_t bsk_buffer_len = bsk.size();
    size_t bsk_gpu_buffer_size = bsk_buffer_len * sizeof(double);

    void *bsk_gpu_tmp = cuda_malloc(bsk_gpu_buffer_size, gpu_idx);
    cuda_initialize_twiddles(poly_size, gpu_idx);
    cuda_convert_lwe_bootstrap_key_64(bsk_gpu_tmp, (void *)bsk.buffer(), stream,
                                      gpu_idx, input_lwe_dim, glwe_dim, level,
                                      poly_size);
    // This is currently not 100% async as
    // we have to free CPU memory after
    // conversion
    cuda_synchronize_device(gpu_idx);
    void *bsk_gpu_tmp =
        cuda_malloc_async(bsk_gpu_buffer_size, (cudaStream_t *)stream, gpu_idx);
    cuda_convert_lwe_bootstrap_key_64(
        bsk_gpu_tmp, const_cast<uint64_t *>(bsk.buffer()),
        (cudaStream_t *)stream, gpu_idx, input_lwe_dim, glwe_dim, level,
        poly_size);
    // Synchronization here is not optional as it works with mutex to
    // prevent other GPU streams from reading partially copied keys.
    cudaStreamSynchronize(*(cudaStream_t *)stream);
    bsk_gpu[gpu_idx] = bsk_gpu_tmp;
    return bsk_gpu[gpu_idx];
  }
@@ -122,11 +122,14 @@ public:

    size_t ksk_buffer_size = sizeof(uint64_t) * ksk.size();

    void *ksk_gpu_tmp = cuda_malloc(ksk_buffer_size, gpu_idx);
    void *ksk_gpu_tmp =
        cuda_malloc_async(ksk_buffer_size, (cudaStream_t *)stream, gpu_idx);

    cuda_memcpy_async_to_gpu(ksk_gpu_tmp, (void *)ksk.buffer(), ksk_buffer_size,
                             stream, gpu_idx);
    cuda_synchronize_device(gpu_idx);
    cuda_memcpy_async_to_gpu(ksk_gpu_tmp, const_cast<uint64_t *>(ksk.buffer()),
                             ksk_buffer_size, (cudaStream_t *)stream, gpu_idx);
    // Synchronization here is not optional as it works with mutex to
    // prevent other GPU streams from reading partially copied keys.
    cudaStreamSynchronize(*(cudaStream_t *)stream);
    ksk_gpu[gpu_idx] = ksk_gpu_tmp;
    return ksk_gpu[gpu_idx];
  }

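The two hunks above share one pattern: each per-GPU key is uploaded once, under a mutex, and the uploading stream is synchronized before the pointer is published, so other GPU streams can never observe a partially copied key. A minimal stand-alone sketch of that pattern, written against the plain CUDA runtime API rather than the concrete_cuda wrappers (get_key_gpu, host_key and the static cache are hypothetical names used only for illustration, not part of this commit):

#include <cstdint>
#include <mutex>
#include <cuda_runtime.h>

// Illustrative sketch only: upload-once key cache guarded by a mutex, with a
// stream synchronization before the device pointer becomes visible to others.
void *get_key_gpu(const uint64_t *host_key, size_t len, cudaStream_t stream) {
  static std::mutex m;
  static void *cached = nullptr;
  std::lock_guard<std::mutex> guard(m);
  if (cached == nullptr) {
    void *tmp = nullptr;
    cudaMallocAsync(&tmp, len * sizeof(uint64_t), stream);
    cudaMemcpyAsync(tmp, host_key, len * sizeof(uint64_t),
                    cudaMemcpyHostToDevice, stream);
    // Not optional: the key must be fully resident before any other stream
    // is allowed to read it.
    cudaStreamSynchronize(stream);
    cached = tmp;
  }
  return cached;
}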
@@ -171,7 +171,7 @@ struct ExtractSDFGOpsPass : public ExtractSDFGOpsBase<ExtractSDFGOpsPass> {
        func.getLoc(), rewriter.getType<SDFG::DFGType>());
    SDFG::Start start = rewriter.create<SDFG::Start>(func.getLoc(), dfg);

    rewriter.setInsertionPoint(func.getBlocks().front().getTerminator());
    rewriter.setInsertionPoint(func.getBlocks().back().getTerminator());
    rewriter.create<SDFG::Shutdown>(func.getLoc(), dfg);

    mlir::ImplicitLocOpBuilder ilb(func.getLoc(), rewriter);

@@ -1,4 +1,8 @@
add_library(ConcretelangRuntime SHARED context.cpp wrappers.cpp DFRuntime.cpp StreamEmulator.cpp)
if(CONCRETELANG_CUDA_SUPPORT)
  add_library(ConcretelangRuntime SHARED context.cpp wrappers.cpp DFRuntime.cpp GPUDFG.cpp)
else()
  add_library(ConcretelangRuntime SHARED context.cpp wrappers.cpp DFRuntime.cpp StreamEmulator.cpp)
endif()

add_dependencies(ConcretelangRuntime concrete_cpu)

@@ -9,7 +13,7 @@ if(CONCRETELANG_DATAFLOW_EXECUTION_ENABLED)
endif()

if(CONCRETELANG_CUDA_SUPPORT)
  target_link_libraries(ConcretelangRuntime PRIVATE concrete_cuda)
  target_link_libraries(ConcretelangRuntime LINK_PUBLIC concrete_cuda)
endif()

if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
@@ -39,5 +43,9 @@ target_link_libraries(
  $<TARGET_OBJECTS:mlir_float16_utils>
  $<TARGET_OBJECTS:MLIRSparseTensorRuntime>)

install(TARGETS ConcretelangRuntime omp EXPORT ConcretelangRuntime)
if(CONCRETELANG_CUDA_SUPPORT)
  install(TARGETS ConcretelangRuntime omp concrete_cuda EXPORT ConcretelangRuntime)
else()
  install(TARGETS ConcretelangRuntime omp EXPORT ConcretelangRuntime)
endif()
install(EXPORT ConcretelangRuntime DESTINATION "./")

compilers/concrete-compiler/compiler/lib/Runtime/GPUDFG.cpp (new file, 636 lines)
@@ -0,0 +1,636 @@
// Part of the Concrete Compiler Project, under the BSD3 License with Zama
// Exceptions. See
// https://github.com/zama-ai/concrete-compiler-internal/blob/main/LICENSE.txt
// for license information.

#include <atomic>
#include <cstdarg>
#include <iostream>
#include <list>
#include <memory>
#include <numeric>
#include <queue>
#include <thread>
#include <utility>
#include <vector>

#include <concretelang/ClientLib/Types.h>
#include <concretelang/Runtime/stream_emulator_api.h>
#include <concretelang/Runtime/wrappers.h>

#ifdef CONCRETELANG_CUDA_SUPPORT
#include "bootstrap.h"
#include "device.h"
#include "keyswitch.h"
#include "linear_algebra.h"

using MemRef2 = concretelang::clientlib::MemRefDescriptor<2>;
using RuntimeContext = mlir::concretelang::RuntimeContext;

namespace mlir {
namespace concretelang {
namespace gpu_dfg {
namespace {

static std::atomic<size_t> next_device = {0};
static size_t num_devices = 0;

struct Void {};
union Param {
  Void _;
  uint32_t val;
};
union Context {
  Void _;
  RuntimeContext *val;
};
static const int32_t host_location = -1;
struct Stream;
struct Dependence;
struct PBS_buffer {
  PBS_buffer(void *stream, uint32_t gpu_idx, uint32_t glwe_dimension,
             uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count)
      : max_pbs_buffer_samples(input_lwe_ciphertext_count),
        glwe_dim(glwe_dimension), poly_size(polynomial_size),
        gpu_stream(stream), gpu_index(gpu_idx) {
    scratch_cuda_bootstrap_amortized_64(
        gpu_stream, gpu_index, &pbs_buffer, glwe_dim, poly_size,
        max_pbs_buffer_samples, cuda_get_max_shared_memory(gpu_index), true);
  }
  ~PBS_buffer() {
    assert(pbs_buffer != nullptr);
    cleanup_cuda_bootstrap_amortized(gpu_stream, gpu_index, &pbs_buffer);
  }
  int8_t *get_pbs_buffer(void *stream, uint32_t gpu_idx,
                         uint32_t glwe_dimension, uint32_t polynomial_size,
                         uint32_t input_lwe_ciphertext_count) {
    assert(glwe_dimension == glwe_dim);
    assert(polynomial_size == poly_size);
    assert(input_lwe_ciphertext_count <= max_pbs_buffer_samples);
    assert(stream == gpu_stream);
    assert(gpu_idx == gpu_index);
    assert(pbs_buffer != nullptr);
    return pbs_buffer;
  }

private:
  int8_t *pbs_buffer;
  uint32_t max_pbs_buffer_samples;
  uint32_t glwe_dim;
  uint32_t poly_size;
  void *gpu_stream;
  uint32_t gpu_index;
};
struct GPU_DFG {
  uint32_t gpu_idx;
  void *gpu_stream;
  GPU_DFG(uint32_t idx) : gpu_idx(idx), pbs_buffer(nullptr) {
    gpu_stream = cuda_create_stream(idx);
  }
  ~GPU_DFG() {
    if (pbs_buffer != nullptr)
      delete pbs_buffer;
    free_streams();
    cuda_destroy_stream((cudaStream_t *)gpu_stream, gpu_idx);
    free_stream_order_dependent_data();
  }
  inline void register_stream(Stream *s) { streams.push_back(s); }
  inline void register_stream_order_dependent_allocation(void *p) {
    to_free_list.push_back(p);
  }
  inline void free_stream_order_dependent_data() {
    for (auto p : to_free_list)
      free(p);
    to_free_list.clear();
  }
  inline int8_t *get_pbs_buffer(uint32_t glwe_dimension,
                                uint32_t polynomial_size,
                                uint32_t input_lwe_ciphertext_count) {
    if (pbs_buffer == nullptr)
      pbs_buffer = new PBS_buffer(gpu_stream, gpu_idx, glwe_dimension,
                                  polynomial_size, input_lwe_ciphertext_count);
    return pbs_buffer->get_pbs_buffer(gpu_stream, gpu_idx, glwe_dimension,
                                      polynomial_size,
                                      input_lwe_ciphertext_count);
  }
  void drop_pbs_buffer() {
    delete pbs_buffer;
    pbs_buffer = nullptr;
  }
  void free_streams();

private:
  std::list<void *> to_free_list;
  std::list<Stream *> streams;
  PBS_buffer *pbs_buffer;
};
struct Dependence {
  int32_t location;
  int32_t rank;
  MemRef2 host_data;
  void *device_data;
  bool onHostReady;
  bool hostAllocated;
  bool used;
  bool read;
  Dependence(int32_t l, int32_t r, MemRef2 &hd, void *dd, bool ohr,
             bool alloc = false)
      : location(l), rank(r), host_data(hd), device_data(dd), onHostReady(ohr),
        hostAllocated(alloc), used(false), read(false) {}
  Dependence(int32_t l, int32_t r, uint64_t val, void *dd, bool ohr,
             bool alloc = false)
      : location(l), rank(r), device_data(dd), onHostReady(ohr),
        hostAllocated(alloc), used(false), read(false) {
    *host_data.aligned = val;
  }
  inline void free_data(GPU_DFG *dfg) {
    if (location >= 0) {
      cuda_drop_async(device_data, (cudaStream_t *)dfg->gpu_stream, location);
    }
    if (onHostReady && host_data.allocated != nullptr && hostAllocated) {
      // As streams are not synchronized aside from the GET operation,
      // we cannot free host-side data until after the synchronization
      // point as it could still be used by an asynchronous operation.
      dfg->register_stream_order_dependent_allocation(host_data.allocated);
    }
    delete (this);
  }
};
struct Process {
  std::vector<Stream *> input_streams;
  std::vector<Stream *> output_streams;
  GPU_DFG *dfg;
  Param level;
  Param base_log;
  Param input_lwe_dim;
  Param output_lwe_dim;
  Param poly_size;
  Param glwe_dim;
  Param precision;
  Param output_size;
  Context ctx;
  void (*fun)(Process *);
  char name[80];
};

static inline void schedule_kernel(Process *p) { p->fun(p); }
struct Stream {
  stream_type type;
  Dependence *dep;
  Process *producer;
  std::vector<Process *> consumers;
  GPU_DFG *dfg;
  Stream(stream_type t)
      : type(t), dep(nullptr), producer(nullptr), dfg(nullptr) {}
  ~Stream() {
    if (dep != nullptr)
      dep->free_data(dfg);
    delete producer;
  }
  void put(Dependence *d) {
    if (type == TS_STREAM_TYPE_X86_TO_TOPO_LSAP) {
      assert(d->onHostReady &&
             "Host-to-device stream should have data initially on host.");
      size_t data_size = sizeof(uint64_t);
      for (int i = 0; i < d->rank; ++i)
        data_size *= d->host_data.sizes[i];
      d->device_data = cuda_malloc_async(
          data_size, (cudaStream_t *)dfg->gpu_stream, dfg->gpu_idx);
      cuda_memcpy_async_to_gpu(
          d->device_data, d->host_data.aligned + d->host_data.offset, data_size,
          (cudaStream_t *)dfg->gpu_stream, dfg->gpu_idx);
      d->location = dfg->gpu_idx;
    }
    if (type == TS_STREAM_TYPE_TOPO_TO_TOPO_LSAP)
      assert(d->location == (int32_t)dfg->gpu_idx &&
             "Data transfers between GPUs not supported yet");
    // TODO: in case of TS_STREAM_TYPE_TOPO_TO_X86_LSAP, we could
    // initiate transfer back to host early here - but need to
    // allocate memory and then copy out again. Tradeoff might be
    // worth testing.

    // If a dependence was already present, schedule deallocation.
    if (dep != nullptr)
      dep->free_data(dfg);
    dep = d;
  }
  void schedule_work() {
    // If there's no producer process for this stream, it is fed by
    // the control program - nothing to do
    if (producer == nullptr) {
      assert(dep != nullptr && "Data missing on control program stream.");
      return;
    }
    // Recursively go up the DFG to check if new data is available
    for (auto s : producer->input_streams)
      s->schedule_work();
    // Check if any of the inputs have changed - and if so recompute
    // this value. Do not recompute if no changes.
    for (auto s : producer->input_streams)
      if (dep == nullptr || s->dep->used == false) {
        schedule_kernel(producer);
        break;
      }
  }
  Dependence *get_on_host(MemRef2 &out, bool has_scheduled = false) {
    if (!has_scheduled)
      schedule_work();
    assert(dep != nullptr && "GET on empty stream not allowed.");
    dep->used = true;
    // If this was already copied to host, copy out
    if (dep->onHostReady) {
      memref_copy_one_rank(dep->host_data.allocated, dep->host_data.aligned,
                           dep->host_data.offset, dep->host_data.sizes[0],
                           dep->host_data.strides[0], out.allocated,
                           out.aligned, out.offset, out.sizes[0],
                           out.strides[0]);
    } else {
      cuda_memcpy_async_to_cpu(out.aligned + out.offset, dep->device_data,
                               out.sizes[0] * sizeof(uint64_t),
                               (cudaStream_t *)dfg->gpu_stream, dep->location);
      cudaStreamSynchronize(*(cudaStream_t *)dfg->gpu_stream);
      // After this synchronization point, all of the host-side
      // allocated memory can be freed as we know all asynchronous
      // operations have finished.
      dfg->free_stream_order_dependent_data();
      dep->onHostReady = true;
    }
    return dep;
  }
  Dependence *get(int32_t location) {
    schedule_work();
    assert(dep != nullptr && "Dependence could not be computed.");
    dep->used = true;
    if (location == host_location) {
      if (dep->onHostReady)
        return dep;
      size_t data_size = sizeof(uint64_t);
      for (int i = 0; i < dep->rank; ++i)
        data_size *= dep->host_data.sizes[i];
      dep->host_data.allocated = dep->host_data.aligned =
          (uint64_t *)malloc(data_size);
      dep->hostAllocated = true;
      get_on_host(dep->host_data, true);
      return dep;
    }
    assert(dep->location == location &&
           "Multi-GPU within the same SDFG not supported");
    return dep;
  }
};

void GPU_DFG::free_streams() {
  streams.sort();
  streams.unique();
  for (auto s : streams)
    delete s;
}

static inline mlir::concretelang::gpu_dfg::Process *
make_process_1_1(void *dfg, void *sin1, void *sout, void (*fun)(Process *)) {
  mlir::concretelang::gpu_dfg::Process *p =
      new mlir::concretelang::gpu_dfg::Process;
  mlir::concretelang::gpu_dfg::Stream *s1 =
      (mlir::concretelang::gpu_dfg::Stream *)sin1;
  mlir::concretelang::gpu_dfg::Stream *so =
      (mlir::concretelang::gpu_dfg::Stream *)sout;
  p->input_streams.push_back(s1);
  p->dfg = (GPU_DFG *)dfg;
  p->fun = fun;
  p->output_streams.push_back(so);
  s1->consumers.push_back(p);
  so->producer = p;
  so->dfg = s1->dfg = (GPU_DFG *)dfg;
  p->dfg->register_stream(s1);
  p->dfg->register_stream(so);
  return p;
}

static inline mlir::concretelang::gpu_dfg::Process *
make_process_2_1(void *dfg, void *sin1, void *sin2, void *sout,
                 void (*fun)(Process *)) {
  mlir::concretelang::gpu_dfg::Process *p =
      new mlir::concretelang::gpu_dfg::Process;
  mlir::concretelang::gpu_dfg::Stream *s1 =
      (mlir::concretelang::gpu_dfg::Stream *)sin1;
  mlir::concretelang::gpu_dfg::Stream *s2 =
      (mlir::concretelang::gpu_dfg::Stream *)sin2;
  mlir::concretelang::gpu_dfg::Stream *so =
      (mlir::concretelang::gpu_dfg::Stream *)sout;
  p->input_streams.push_back(s1);
  p->input_streams.push_back(s2);
  p->dfg = (GPU_DFG *)dfg;
  p->fun = fun;
  p->output_streams.push_back(so);
  s1->consumers.push_back(p);
  s2->consumers.push_back(p);
  so->producer = p;
  so->dfg = s1->dfg = s2->dfg = (GPU_DFG *)dfg;
  p->dfg->register_stream(s1);
  p->dfg->register_stream(s2);
  p->dfg->register_stream(so);
  return p;
}

// Stream emulator processes
void memref_keyswitch_lwe_u64_process(Process *p) {
  Dependence *idep = p->input_streams[0]->get(p->dfg->gpu_idx);
  void *ct0_gpu = idep->device_data;
  void *ksk_gpu = p->ctx.val->get_ksk_gpu(
      p->level.val, p->input_lwe_dim.val, p->output_lwe_dim.val,
      p->dfg->gpu_idx, (cudaStream_t *)p->dfg->gpu_stream);
  MemRef2 out = {0, 0, 0, {p->output_size.val}, {1}};
  void *out_gpu =
      cuda_malloc_async(out.sizes[0] * sizeof(uint64_t),
                        (cudaStream_t *)p->dfg->gpu_stream, p->dfg->gpu_idx);
  // Schedule the keyswitch kernel on the GPU
  cuda_keyswitch_lwe_ciphertext_vector_64(
      (cudaStream_t *)p->dfg->gpu_stream, p->dfg->gpu_idx, out_gpu, ct0_gpu,
      ksk_gpu, p->input_lwe_dim.val, p->output_lwe_dim.val, p->base_log.val,
      p->level.val, 1 /*batch size*/);
  Dependence *dep =
      new Dependence((int32_t)p->dfg->gpu_idx, 1, out, out_gpu, false);
  p->output_streams[0]->put(dep);
}

// Construct the glwe accumulator (on CPU) then put it on a stream as
// input to the bootstrap
void memref_bootstrap_glwe_accumulator_process(Process *p) {
  uint64_t glwe_ct_len = p->poly_size.val * (p->glwe_dim.val + 1);
  uint64_t glwe_ct_size = glwe_ct_len * sizeof(uint64_t);
  uint64_t *glwe_ct = (uint64_t *)malloc(glwe_ct_size);
  Dependence *idep = p->input_streams[0]->get(host_location);
  MemRef2 &mtlu = idep->host_data;
  auto tlu = mtlu.aligned + mtlu.offset;

  // Glwe trivial encryption
  for (size_t i = 0; i < p->poly_size.val * p->glwe_dim.val; i++) {
    glwe_ct[i] = 0;
  }
  for (size_t i = 0; i < p->poly_size.val; i++) {
    glwe_ct[p->poly_size.val * p->glwe_dim.val + i] = tlu[i];
  }
  MemRef2 m = {glwe_ct, glwe_ct, 0, {glwe_ct_len}, {1}};
  Dependence *dep = new Dependence(host_location, 1, m, nullptr, true, true);
  p->output_streams[0]->put(dep);
}

void memref_bootstrap_lwe_u64_process(Process *p) {
  assert(p->output_size.val == p->glwe_dim.val * p->poly_size.val + 1);
  uint32_t num_samples = 1; // TODO batching
  void *fbsk_gpu = p->ctx.val->get_bsk_gpu(
      p->input_lwe_dim.val, p->poly_size.val, p->level.val, p->glwe_dim.val,
      p->dfg->gpu_idx, (cudaStream_t *)p->dfg->gpu_stream);
  Dependence *idep0 = p->input_streams[0]->get(p->dfg->gpu_idx);
  Dependence *idep1 = p->input_streams[1]->get(p->dfg->gpu_idx);
  void *ct0_gpu = idep0->device_data;
  void *glwe_ct_gpu = idep1->device_data;

  MemRef2 out = {0, 0, 0, {p->output_size.val}, {1}};
  uint64_t out_batch_size = out.sizes[0];
  void *out_gpu =
      cuda_malloc_async(out_batch_size * sizeof(uint64_t),
                        (cudaStream_t *)p->dfg->gpu_stream, p->dfg->gpu_idx);
  // Move test vector indexes to the GPU, the test vector indexes is set of 0
  uint32_t num_test_vectors = 1, lwe_idx = 0,
           test_vector_idxes_size = num_samples * sizeof(uint64_t);
  void *test_vector_idxes = malloc(test_vector_idxes_size);
  memset(test_vector_idxes, 0, test_vector_idxes_size);
  void *test_vector_idxes_gpu =
      cuda_malloc_async(test_vector_idxes_size,
                        (cudaStream_t *)p->dfg->gpu_stream, p->dfg->gpu_idx);
  cuda_memcpy_async_to_gpu(test_vector_idxes_gpu, test_vector_idxes,
                           test_vector_idxes_size,
                           (cudaStream_t *)p->dfg->gpu_stream, p->dfg->gpu_idx);
  // Schedule the bootstrap kernel on the GPU
  int8_t *pbs_buffer =
      p->dfg->get_pbs_buffer(p->glwe_dim.val, p->poly_size.val, num_samples);
  cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
      (cudaStream_t *)p->dfg->gpu_stream, p->dfg->gpu_idx, out_gpu, glwe_ct_gpu,
      test_vector_idxes_gpu, ct0_gpu, fbsk_gpu, (int8_t *)pbs_buffer,
      p->input_lwe_dim.val, p->glwe_dim.val, p->poly_size.val, p->base_log.val,
      p->level.val, num_samples, num_test_vectors, lwe_idx,
      cuda_get_max_shared_memory(p->dfg->gpu_idx));
  p->dfg->drop_pbs_buffer();
  cuda_drop_async(test_vector_idxes_gpu, (cudaStream_t *)p->dfg->gpu_stream,
                  p->dfg->gpu_idx);
  Dependence *dep =
      new Dependence((int32_t)p->dfg->gpu_idx, 1, out, out_gpu, false);
  // As streams are not synchronized, we can only free this vector
  // after a later synchronization point where we are guaranteed that
  // this vector is no longer needed.
  p->dfg->register_stream_order_dependent_allocation(test_vector_idxes);
  p->output_streams[0]->put(dep);
}

void memref_add_lwe_ciphertexts_u64_process(Process *p) {
  Dependence *idep0 = p->input_streams[0]->get(p->dfg->gpu_idx);
  Dependence *idep1 = p->input_streams[1]->get(p->dfg->gpu_idx);
  MemRef2 ct0 = idep0->host_data;
  void *out_gpu =
      cuda_malloc_async(ct0.sizes[0] * sizeof(uint64_t),
                        (cudaStream_t *)p->dfg->gpu_stream, p->dfg->gpu_idx);
  MemRef2 out = {0, 0, 0, {ct0.sizes[0]}, {1}};
  cuda_add_lwe_ciphertext_vector_64(
      (cudaStream_t *)p->dfg->gpu_stream, p->dfg->gpu_idx, out_gpu,
      idep0->device_data, idep1->device_data,
      /*p->input_lwe_dim.val*/ ct0.sizes[0] - 1, 1 /* num_samples */);
  Dependence *dep = new Dependence(p->dfg->gpu_idx, 1, out, out_gpu, false);
  p->output_streams[0]->put(dep);
}

void memref_add_plaintext_lwe_ciphertext_u64_process(Process *p) {
  Dependence *idep0 = p->input_streams[0]->get(p->dfg->gpu_idx);
  Dependence *idep1 = p->input_streams[1]->get(p->dfg->gpu_idx);
  MemRef2 ct0 = idep0->host_data;
  void *out_gpu =
      cuda_malloc_async(ct0.sizes[0] * sizeof(uint64_t),
                        (cudaStream_t *)p->dfg->gpu_stream, p->dfg->gpu_idx);
  MemRef2 out = {0, 0, 0, {ct0.sizes[0]}, {1}};
  cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
      (cudaStream_t *)p->dfg->gpu_stream, p->dfg->gpu_idx, out_gpu,
      idep0->device_data, idep1->device_data,
      /*p->input_lwe_dim.val*/ ct0.sizes[0] - 1, 1 /* num_samples */);

  Dependence *dep = new Dependence(p->dfg->gpu_idx, 1, out, out_gpu, false);
  p->output_streams[0]->put(dep);
}

void memref_mul_cleartext_lwe_ciphertext_u64_process(Process *p) {
  Dependence *idep0 = p->input_streams[0]->get(p->dfg->gpu_idx);
  Dependence *idep1 = p->input_streams[1]->get(p->dfg->gpu_idx);
  MemRef2 ct0 = idep0->host_data;
  void *out_gpu =
      cuda_malloc_async(ct0.sizes[0] * sizeof(uint64_t),
                        (cudaStream_t *)p->dfg->gpu_stream, p->dfg->gpu_idx);
  MemRef2 out = {0, 0, 0, {ct0.sizes[0]}, {1}};
  cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
      (cudaStream_t *)p->dfg->gpu_stream, p->dfg->gpu_idx, out_gpu,
      idep0->device_data, idep1->device_data,
      /*p->input_lwe_dim.val*/ ct0.sizes[0] - 1, 1 /* num_samples */);

  Dependence *dep = new Dependence(p->dfg->gpu_idx, 1, out, out_gpu, false);
  p->output_streams[0]->put(dep);
}

void memref_negate_lwe_ciphertext_u64_process(Process *p) {
  Dependence *idep = p->input_streams[0]->get(p->dfg->gpu_idx);
  MemRef2 ct0 = idep->host_data;
  void *out_gpu =
      cuda_malloc_async(ct0.sizes[0] * sizeof(uint64_t),
                        (cudaStream_t *)p->dfg->gpu_stream, p->dfg->gpu_idx);
  MemRef2 out = {0, 0, 0, {ct0.sizes[0]}, {1}};
  cuda_negate_lwe_ciphertext_vector_64(
      (cudaStream_t *)p->dfg->gpu_stream, p->dfg->gpu_idx, out_gpu,
      idep->device_data,
      /*p->input_lwe_dim.val*/ ct0.sizes[0] - 1, 1 /* num_samples */);
  Dependence *dep = new Dependence(p->dfg->gpu_idx, 1, out, out_gpu, false);
  p->output_streams[0]->put(dep);
}

} // namespace
} // namespace gpu_dfg
} // namespace concretelang
} // namespace mlir

using namespace mlir::concretelang::gpu_dfg;

// Code generation interface
void stream_emulator_make_memref_add_lwe_ciphertexts_u64_process(void *dfg,
                                                                 void *sin1,
                                                                 void *sin2,
                                                                 void *sout) {
  Process *p = make_process_2_1(dfg, sin1, sin2, sout,
                                memref_add_lwe_ciphertexts_u64_process);
  static int count = 0;
  sprintf(p->name, "add_lwe_ciphertexts_%d", count++);
}

void stream_emulator_make_memref_add_plaintext_lwe_ciphertext_u64_process(
    void *dfg, void *sin1, void *sin2, void *sout) {
  Process *p = make_process_2_1(
      dfg, sin1, sin2, sout, memref_add_plaintext_lwe_ciphertext_u64_process);
  static int count = 0;
  sprintf(p->name, "add_plaintext_lwe_ciphertexts_%d", count++);
}

void stream_emulator_make_memref_mul_cleartext_lwe_ciphertext_u64_process(
    void *dfg, void *sin1, void *sin2, void *sout) {
  Process *p = make_process_2_1(
      dfg, sin1, sin2, sout, memref_mul_cleartext_lwe_ciphertext_u64_process);
  static int count = 0;
  sprintf(p->name, "mul_cleartext_lwe_ciphertexts_%d", count++);
}

void stream_emulator_make_memref_negate_lwe_ciphertext_u64_process(void *dfg,
                                                                    void *sin1,
                                                                    void *sout) {
  Process *p = make_process_1_1(dfg, sin1, sout,
                                memref_negate_lwe_ciphertext_u64_process);
  static int count = 0;
  sprintf(p->name, "negate_lwe_ciphertext_%d", count++);
}

void stream_emulator_make_memref_keyswitch_lwe_u64_process(
    void *dfg, void *sin1, void *sout, uint32_t level, uint32_t base_log,
    uint32_t input_lwe_dim, uint32_t output_lwe_dim, uint32_t output_size,
    void *context) {
  Process *p =
      make_process_1_1(dfg, sin1, sout, memref_keyswitch_lwe_u64_process);
  p->level.val = level;
  p->base_log.val = base_log;
  p->input_lwe_dim.val = input_lwe_dim;
  p->output_lwe_dim.val = output_lwe_dim;
  p->output_size.val = output_size;
  p->ctx.val = (RuntimeContext *)context;
  static int count = 0;
  sprintf(p->name, "keyswitch_%d", count++);
}

void stream_emulator_make_memref_bootstrap_lwe_u64_process(
    void *dfg, void *sin1, void *sin2, void *sout, uint32_t input_lwe_dim,
    uint32_t poly_size, uint32_t level, uint32_t base_log, uint32_t glwe_dim,
    uint32_t precision, uint32_t output_size, void *context) {

  // We need to generate two processes: one for building the glwe
  // accumulator and one for the bootstrap (plus a stream to connect).
  void *accu_s =
      stream_emulator_make_memref_stream("", TS_STREAM_TYPE_X86_TO_TOPO_LSAP);
  Process *accu_p = make_process_1_1(dfg, sin2, accu_s,
                                     memref_bootstrap_glwe_accumulator_process);
  accu_p->input_lwe_dim.val = input_lwe_dim;
  accu_p->poly_size.val = poly_size;
  accu_p->level.val = level;
  accu_p->base_log.val = base_log;
  accu_p->glwe_dim.val = glwe_dim;
  accu_p->precision.val = precision;
  accu_p->output_size.val = output_size;
  accu_p->ctx.val = (RuntimeContext *)context;
  static int count_ = 0;
  sprintf(accu_p->name, "glwe_accumulator_%d", count_++);

  Process *p = make_process_2_1(dfg, sin1, accu_s, sout,
                                memref_bootstrap_lwe_u64_process);
  p->input_lwe_dim.val = input_lwe_dim;
  p->poly_size.val = poly_size;
  p->level.val = level;
  p->base_log.val = base_log;
  p->glwe_dim.val = glwe_dim;
  p->precision.val = precision;
  p->output_size.val = output_size;
  p->ctx.val = (RuntimeContext *)context;
  static int count = 0;
  sprintf(p->name, "bootstrap_%d", count++);
}

void *stream_emulator_make_uint64_stream(const char *name, stream_type stype) {
  return (void *)new Stream(stype);
}
void stream_emulator_put_uint64(void *stream, uint64_t e) {
  Stream *s = (Stream *)stream;
  uint64_t *data = (uint64_t *)malloc(sizeof(uint64_t));
  *data = e;
  MemRef2 m = {data, data, 0, {1}, {1}};
  Dependence *dep = new Dependence(host_location, 1, m, nullptr, true, true);
  s->put(dep);
}
uint64_t stream_emulator_get_uint64(void *stream) {
  uint64_t res;
  auto s = (Stream *)stream;
  MemRef2 m = {&res, &res, 0, {1}, {1}};
  s->get_on_host(m);
  return res;
}

void *stream_emulator_make_memref_stream(const char *name, stream_type stype) {
  return (void *)new Stream(stype);
}
void stream_emulator_put_memref(void *stream, uint64_t *allocated,
                                uint64_t *aligned, uint64_t offset,
                                uint64_t size, uint64_t stride) {
  Stream *s = (Stream *)stream;
  MemRef2 m = {allocated, aligned, offset, {size}, {stride}};
  Dependence *dep = new Dependence(host_location, 1, m, nullptr, true);
  s->put(dep);
}
void stream_emulator_get_memref(void *stream, uint64_t *out_allocated,
                                uint64_t *out_aligned, uint64_t out_offset,
                                uint64_t out_size, uint64_t out_stride) {
  MemRef2 mref = {
      out_allocated, out_aligned, out_offset, {out_size}, {out_stride}};
  auto s = (Stream *)stream;
  s->get_on_host(mref);
}

void *stream_emulator_init() {
  int num;
  if (num_devices == 0) {
    assert(cudaGetDeviceCount(&num) == cudaSuccess);
    num_devices = num;
  }
  int device = next_device.fetch_add(1) % num_devices;
  return new GPU_DFG(device);
}
void stream_emulator_run(void *dfg) {}
void stream_emulator_delete(void *dfg) { delete (GPU_DFG *)dfg; }
#endif
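Taken together, the functions above form the interface that compiler-generated SDFG code is expected to call. A minimal usage sketch follows (illustrative only, not part of the diff; ct0, ct1, res and lwe_size are hypothetical host buffers and sizes). It also shows that scheduling is demand-driven: kernels only run when the final get pulls data back to the host.

// Illustrative sketch: wiring one LWE addition through the GPU DFG runtime.
void *dfg = stream_emulator_init();
void *in0 = stream_emulator_make_memref_stream("", TS_STREAM_TYPE_X86_TO_TOPO_LSAP);
void *in1 = stream_emulator_make_memref_stream("", TS_STREAM_TYPE_X86_TO_TOPO_LSAP);
void *out = stream_emulator_make_memref_stream("", TS_STREAM_TYPE_TOPO_TO_X86_LSAP);
stream_emulator_make_memref_add_lwe_ciphertexts_u64_process(dfg, in0, in1, out);
stream_emulator_put_memref(in0, ct0, ct0, 0, lwe_size, 1); // enqueues the host-to-GPU copy
stream_emulator_put_memref(in1, ct1, ct1, 0, lwe_size, 1);
stream_emulator_get_memref(out, res, res, 0, lwe_size, 1); // schedules the add, syncs, copies back
stream_emulator_delete(dfg);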
@@ -43,9 +43,9 @@ void *alloc_and_memcpy_async_to_gpu(uint64_t *buf_ptr, uint64_t buf_offset,
                                    uint64_t buf_size, uint32_t gpu_idx,
                                    void *stream) {
  size_t buf_size_ = buf_size * sizeof(uint64_t);
  void *ct_gpu = cuda_malloc(buf_size_, gpu_idx);
  cuda_memcpy_async_to_gpu(ct_gpu, buf_ptr + buf_offset, buf_size_, stream,
                           gpu_idx);
  void *ct_gpu = cuda_malloc_async(buf_size_, (cudaStream_t *)stream, gpu_idx);
  cuda_memcpy_async_to_gpu(ct_gpu, buf_ptr + buf_offset, buf_size_,
                           (cudaStream_t *)stream, gpu_idx);
  return ct_gpu;
}

@@ -53,7 +53,8 @@ void memcpy_async_to_cpu(uint64_t *buf_ptr, uint64_t buf_offset,
                         uint64_t buf_size, void *buf_gpu, uint32_t gpu_idx,
                         void *stream) {
  cuda_memcpy_async_to_cpu(buf_ptr + buf_offset, buf_gpu,
                           buf_size * sizeof(uint64_t), stream, gpu_idx);
                           buf_size * sizeof(uint64_t), (cudaStream_t *)stream,
                           gpu_idx);
}

void free_from_gpu(void *gpu_ptr, uint32_t gpu_idx = 0) {
@@ -128,8 +129,9 @@ void memref_batched_keyswitch_lwe_cuda_u64(
  // Move the input and output batch of ciphertexts to the GPU
  // TODO: The allocation should be done by the compiler codegen
  void *ct0_gpu = alloc_and_memcpy_async_to_gpu(
      ct0_aligned, ct0_offset, ct0_batch_size, gpu_idx, stream);
  void *out_gpu = cuda_malloc(out_batch_size * sizeof(uint64_t), gpu_idx);
      ct0_aligned, ct0_offset, ct0_batch_size, gpu_idx, (cudaStream_t *)stream);
  void *out_gpu = cuda_malloc_async(out_batch_size * sizeof(uint64_t),
                                    (cudaStream_t *)stream, gpu_idx);
  // Run the keyswitch kernel on the GPU
  cuda_keyswitch_lwe_ciphertext_vector_64(
      stream, gpu_idx, out_gpu, ct0_gpu, ksk_gpu, input_lwe_dim, output_lwe_dim,
@@ -141,7 +143,7 @@ void memref_batched_keyswitch_lwe_cuda_u64(
  // free memory that we allocated on gpu
  cuda_drop(ct0_gpu, gpu_idx);
  cuda_drop(out_gpu, gpu_idx);
  cuda_destroy_stream(stream, gpu_idx);
  cuda_destroy_stream((cudaStream_t *)stream, gpu_idx);
}

void memref_batched_bootstrap_lwe_cuda_u64(
@@ -155,11 +157,13 @@ void memref_batched_bootstrap_lwe_cuda_u64(
    uint32_t level, uint32_t base_log, uint32_t glwe_dim, uint32_t precision,
    mlir::concretelang::RuntimeContext *context) {
  assert(out_size0 == ct0_size0);
  assert(out_size1 == glwe_dim * poly_size + 1);
  // TODO: Multi GPU
  uint32_t gpu_idx = 0;
  uint32_t num_samples = out_size0;
  uint64_t ct0_batch_size = ct0_size0 * ct0_size1;
  uint64_t out_batch_size = out_size0 * out_size1;
  int8_t *pbs_buffer = nullptr;

  // Create the cuda stream
  // TODO: Should be created by the compiler codegen
@@ -170,8 +174,9 @@ void memref_batched_bootstrap_lwe_cuda_u64(
  // Move the input and output batch of ciphertext to the GPU
  // TODO: The allocation should be done by the compiler codegen
  void *ct0_gpu = alloc_and_memcpy_async_to_gpu(
      ct0_aligned, ct0_offset, ct0_batch_size, gpu_idx, stream);
  void *out_gpu = cuda_malloc(out_batch_size * sizeof(uint64_t), gpu_idx);
      ct0_aligned, ct0_offset, ct0_batch_size, gpu_idx, (cudaStream_t *)stream);
  void *out_gpu = cuda_malloc_async(out_batch_size * sizeof(uint64_t),
                                    (cudaStream_t *)stream, gpu_idx);
  // Construct the glwe accumulator (on CPU)
  // TODO: Should be done outside of the bootstrap call, compile time if
  // possible. Refactor in progress
@@ -188,36 +193,42 @@ void memref_batched_bootstrap_lwe_cuda_u64(
  }

  // Move the glwe accumulator to the GPU
  void *glwe_ct_gpu =
      alloc_and_memcpy_async_to_gpu(glwe_ct, 0, glwe_ct_size, gpu_idx, stream);
  void *glwe_ct_gpu = alloc_and_memcpy_async_to_gpu(
      glwe_ct, 0, glwe_ct_size, gpu_idx, (cudaStream_t *)stream);

  // Move test vector indexes to the GPU, the test vector indexes is set of 0
  uint32_t num_test_vectors = 1, lwe_idx = 0,
           test_vector_idxes_size = num_samples * sizeof(uint32_t);
           test_vector_idxes_size = num_samples * sizeof(uint64_t);
  void *test_vector_idxes = malloc(test_vector_idxes_size);
  memset(test_vector_idxes, 0, test_vector_idxes_size);
  void *test_vector_idxes_gpu = cuda_malloc(test_vector_idxes_size, gpu_idx);
  void *test_vector_idxes_gpu = cuda_malloc_async(
      test_vector_idxes_size, (cudaStream_t *)stream, gpu_idx);
  cuda_memcpy_async_to_gpu(test_vector_idxes_gpu, test_vector_idxes,
                           test_vector_idxes_size, stream, gpu_idx);
                           test_vector_idxes_size, (cudaStream_t *)stream,
                           gpu_idx);
  // Allocate PBS buffer on GPU
  scratch_cuda_bootstrap_amortized_64(
      stream, gpu_idx, &pbs_buffer, glwe_dim, poly_size, num_samples,
      cuda_get_max_shared_memory(gpu_idx), true);
  // Run the bootstrap kernel on the GPU
  cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
      stream, gpu_idx, out_gpu, glwe_ct_gpu, test_vector_idxes_gpu, ct0_gpu,
      fbsk_gpu, input_lwe_dim, glwe_dim, poly_size, base_log, level,
      fbsk_gpu, pbs_buffer, input_lwe_dim, glwe_dim, poly_size, base_log, level,
      num_samples, num_test_vectors, lwe_idx,
      cuda_get_max_shared_memory(gpu_idx));
  cleanup_cuda_bootstrap_amortized(stream, gpu_idx, &pbs_buffer);
  // Copy the output batch of ciphertext back to CPU
  memcpy_async_to_cpu(out_aligned, out_offset, out_batch_size, out_gpu, gpu_idx,
                      stream);
  cuda_synchronize_device(gpu_idx);
  // free memory that we allocated on gpu
  cuda_drop_async(ct0_gpu, (cudaStream_t *)stream, gpu_idx);
  cuda_drop_async(out_gpu, (cudaStream_t *)stream, gpu_idx);
  cuda_drop_async(glwe_ct_gpu, (cudaStream_t *)stream, gpu_idx);
  cuda_drop_async(test_vector_idxes_gpu, (cudaStream_t *)stream, gpu_idx);
  cudaStreamSynchronize(*(cudaStream_t *)stream);
  // Free the glwe accumulator (on CPU)
  free(glwe_ct);
  // free memory that we allocated on gpu
  cuda_drop(ct0_gpu, gpu_idx);
  cuda_drop(out_gpu, gpu_idx);
  cuda_drop(glwe_ct_gpu, gpu_idx);
  cuda_drop(test_vector_idxes_gpu, gpu_idx);

  cuda_destroy_stream(stream, gpu_idx);
  cuda_destroy_stream((cudaStream_t *)stream, gpu_idx);
}

#endif

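The wrapper hunks above consistently replace blocking cuda_malloc/cuda_drop plus cuda_synchronize_device with their stream-ordered counterparts and a single cudaStreamSynchronize before host memory is reused. The same lifecycle, sketched for reference with the plain CUDA runtime API rather than the concrete_cuda wrappers (batched_op, d_in, d_out and n are illustrative names, and the kernel launch is elided):

#include <cstdint>
#include <cuda_runtime.h>

// Illustrative sketch of the stream-ordered allocation lifecycle used above.
void batched_op(const uint64_t *in, uint64_t *out, size_t n, cudaStream_t s) {
  void *d_in = nullptr, *d_out = nullptr;
  cudaMallocAsync(&d_in, n * sizeof(uint64_t), s);
  cudaMallocAsync(&d_out, n * sizeof(uint64_t), s);
  cudaMemcpyAsync(d_in, in, n * sizeof(uint64_t), cudaMemcpyHostToDevice, s);
  // ... enqueue the GPU kernel(s) reading d_in and writing d_out on stream s ...
  cudaMemcpyAsync(out, d_out, n * sizeof(uint64_t), cudaMemcpyDeviceToHost, s);
  cudaFreeAsync(d_in, s);   // frees are stream-ordered too
  cudaFreeAsync(d_out, s);
  cudaStreamSynchronize(s); // only now may the host read `out` or reuse `in`
}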
@@ -44,8 +44,11 @@ compile(std::string outputLib, std::string source,
  mlir::concretelang::CompilerEngine ce{ccx};
  mlir::concretelang::CompilationOptions options(funcname);
  options.emitSDFGOps = true;
#ifdef CONCRETELANG_CUDA_SUPPORT
  options.emitGPUOps = true;
#endif
#ifdef CONCRETELANG_DATAFLOW_TESTING_ENABLED
  // options.dataflowParallelize = true;
  options.dataflowParallelize = true;
#endif
  ce.setCompilationOptions(options);
  auto result = ce.compile(sources, outputLib);
@@ -189,3 +192,25 @@ func.func @main(%arg0: !FHE.eint<3>) -> !FHE.eint<3> {
    ASSERT_EQ_OUTCOME(res, (scalar_out)a);
  }
}

TEST(SDFG_unit_tests, tlu_tree) {
  std::string source = R"(
func.func @main(%arg0: !FHE.eint<4>) -> !FHE.eint<4> {
  %tlu_4 = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : tensor<16xi64>
  %1 = "FHE.apply_lookup_table"(%arg0, %tlu_4): (!FHE.eint<4>, tensor<16xi64>) -> (!FHE.eint<4>)
  %2 = "FHE.apply_lookup_table"(%arg0, %tlu_4): (!FHE.eint<4>, tensor<16xi64>) -> (!FHE.eint<4>)
  %3 = "FHE.apply_lookup_table"(%1, %tlu_4): (!FHE.eint<4>, tensor<16xi64>) -> (!FHE.eint<4>)
  %4 = "FHE.apply_lookup_table"(%2, %tlu_4): (!FHE.eint<4>, tensor<16xi64>) -> (!FHE.eint<4>)
  %5 = "FHE.add_eint"(%3, %4): (!FHE.eint<4>, !FHE.eint<4>) -> (!FHE.eint<4>)
  %6 = "FHE.apply_lookup_table"(%5, %tlu_4): (!FHE.eint<4>, tensor<16xi64>) -> (!FHE.eint<4>)
  return %6: !FHE.eint<4>
}
)";
  std::string outputLib = outputLibFromThis(this->test_info_);
  auto compiled = compile(outputLib, source);
  auto lambda = load<TestTypedLambda<scalar_out, scalar_in>>(outputLib);
  for (auto a : values_3bits()) {
    auto res = lambda.call(a);
    ASSERT_EQ_OUTCOME(res, (scalar_out)((a * 2) % 16));
  }
}