From 2a89f62c1a9e36f505ebe35718f41f696401bf62 Mon Sep 17 00:00:00 2001
From: Quentin Bourgerie
Date: Tue, 6 Dec 2022 11:59:39 +0100
Subject: [PATCH] fix(cuda): Include cuda_runtime.h in device.h to provide the
 definition of cudaStream_t

---
 compiler/CMakeLists.txt                        |  1 +
 compiler/concrete-core                         |  2 +-
 .../include/concretelang/Runtime/context.h     |  7 -------
 compiler/lib/Runtime/wrappers.cpp              | 21 ++++++++++---------
 4 files changed, 13 insertions(+), 18 deletions(-)

diff --git a/compiler/CMakeLists.txt b/compiler/CMakeLists.txt
index 288ed7478..58fe46cfa 100644
--- a/compiler/CMakeLists.txt
+++ b/compiler/CMakeLists.txt
@@ -78,6 +78,7 @@ if(CONCRETELANG_CUDA_SUPPORT)
   link_directories(${CUDAToolkit_LIBRARY_DIR})
   add_subdirectory(${CONCRETE_CORE_PATH}/concrete-cuda/cuda)
   include_directories(${CONCRETE_CORE_PATH}/concrete-cuda/cuda/include)
+  include_directories(${CUDAToolkit_INCLUDE_DIRS})
   add_compile_options(-DCONCRETELANG_CUDA_SUPPORT)
 endif()
 
diff --git a/compiler/concrete-core b/compiler/concrete-core
index f1246ac4f..bf79f5db6 160000
--- a/compiler/concrete-core
+++ b/compiler/concrete-core
@@ -1 +1 @@
-Subproject commit f1246ac4f326856aa5520dbfa043bf9b25ebc6a9
+Subproject commit bf79f5db635cff7a224a44d01918aa6cf59b5493
diff --git a/compiler/include/concretelang/Runtime/context.h b/compiler/include/concretelang/Runtime/context.h
index 1f1cc6a22..e2c429dfa 100644
--- a/compiler/include/concretelang/Runtime/context.h
+++ b/compiler/include/concretelang/Runtime/context.h
@@ -18,13 +18,6 @@
 #include "concretelang/Common/Error.h"
 
 #ifdef CONCRETELANG_CUDA_SUPPORT
-// We need to define the double2 struct from the CUDA backend header files
-// This shouldn't be defined here, but included along with concrete-cuda header
-// files
-typedef struct double2 {
-  double x, y;
-} double2;
-// From concrete-cuda
 #include "bootstrap.h"
 #include "device.h"
 #include "keyswitch.h"
diff --git a/compiler/lib/Runtime/wrappers.cpp b/compiler/lib/Runtime/wrappers.cpp
index 84d7f1173..5e3ce1ee0 100644
--- a/compiler/lib/Runtime/wrappers.cpp
+++ b/compiler/lib/Runtime/wrappers.cpp
@@ -146,8 +146,8 @@ void memref_batched_keyswitch_lwe_cuda_u64(
     uint32_t base_log, uint32_t input_lwe_dim, uint32_t output_lwe_dim,
     mlir::concretelang::RuntimeContext *context) {
   assert(out_size0 == ct0_size0);
-  assert(out_size1 == output_lwe_dim+1);
-  assert(ct0_size1 == input_lwe_dim+1);
+  assert(out_size1 == output_lwe_dim + 1);
+  assert(ct0_size1 == input_lwe_dim + 1);
   // TODO: Multi GPU
   uint32_t gpu_idx = 0;
   uint32_t num_samples = out_size0;
@@ -167,9 +167,9 @@ void memref_batched_keyswitch_lwe_cuda_u64(
   void *out_gpu = alloc_and_memcpy_async_to_gpu(
       out_aligned, out_offset, out_batch_size, gpu_idx, stream);
   // Run the keyswitch kernel on the GPU
-  cuda_keyswitch_lwe_ciphertext_vector_64(stream, out_gpu, ct0_gpu, ksk_gpu,
-                                          input_lwe_dim, output_lwe_dim,
-                                          base_log, level, num_samples);
+  cuda_keyswitch_lwe_ciphertext_vector_64(
+      stream, gpu_idx, out_gpu, ct0_gpu, ksk_gpu, input_lwe_dim, output_lwe_dim,
+      base_log, level, num_samples);
   // Copy the output batch of ciphertext back to CPU
   memcpy_async_to_cpu(out_aligned, out_offset, out_batch_size, out_gpu,
                       gpu_idx, stream);
@@ -242,12 +242,13 @@ void memref_batched_bootstrap_lwe_cuda_u64(
       test_vector_idxes_size, stream, gpu_idx);
   // Run the bootstrap kernel on the GPU
   cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
-      stream, out_gpu, glwe_ct_gpu, test_vector_idxes_gpu, ct0_gpu, fbsk_gpu,
-      input_lwe_dim, glwe_dim, poly_size, base_log, level, num_samples,
-      num_test_vectors, lwe_idx, cuda_get_max_shared_memory(gpu_idx));
+      stream, gpu_idx, out_gpu, glwe_ct_gpu, test_vector_idxes_gpu, ct0_gpu,
+      fbsk_gpu, input_lwe_dim, glwe_dim, poly_size, base_log, level,
+      num_samples, num_test_vectors, lwe_idx,
+      cuda_get_max_shared_memory(gpu_idx));
   // Copy the output batch of ciphertext back to CPU
-  memcpy_async_to_cpu(out_aligned, out_offset, out_batch_size, out_gpu,
-                      gpu_idx, stream);
+  memcpy_async_to_cpu(out_aligned, out_offset, out_batch_size, out_gpu, gpu_idx,
+                      stream);
   cuda_synchronize_device(gpu_idx);
   // free memory that we allocated on gpu
   cuda_drop(ct0_gpu, gpu_idx);
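
Note: the device.h change named in the subject lives in the concrete-core
submodule bump above (f1246ac4f -> bf79f5db6), so it is not visible in this
diff. The pattern it applies is that a header which names cudaStream_t (or
CUDA vector types such as double2) in its declarations must include
cuda_runtime.h itself rather than rely on each includer re-declaring the
types, which is why the local double2 typedef in context.h can be dropped.
Below is a minimal sketch of that pattern, with a hypothetical file and
function name (not the actual concrete-cuda declarations):

  // device_like.h -- hypothetical header following the same pattern as the
  // device.h fix: include cuda_runtime.h so cudaStream_t and vector types
  // such as double2 are defined for every includer.
  #pragma once
  #include <cuda_runtime.h>
  #include <cstdint>

  // This declaration needs cudaStream_t from the CUDA runtime; without the
  // include above, includers would have to define the type themselves, as
  // context.h previously did for double2.
  void *alloc_on_gpu_async(uint64_t size, cudaStream_t stream,
                           uint32_t gpu_index);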