From 2a89f62c1a9e36f505ebe35718f41f696401bf62 Mon Sep 17 00:00:00 2001
From: Quentin Bourgerie
Date: Tue, 6 Dec 2022 11:59:39 +0100
Subject: [PATCH] fix(cuda): Include cuda_runtime.h in device.h to provide the
 definition of cudaStream_t

---
 compiler/CMakeLists.txt                        |  1 +
 compiler/concrete-core                         |  2 +-
 .../include/concretelang/Runtime/context.h     |  7 -------
 compiler/lib/Runtime/wrappers.cpp              | 21 ++++++++++---------
 4 files changed, 13 insertions(+), 18 deletions(-)

diff --git a/compiler/CMakeLists.txt b/compiler/CMakeLists.txt
index 288ed7478..58fe46cfa 100644
--- a/compiler/CMakeLists.txt
+++ b/compiler/CMakeLists.txt
@@ -78,6 +78,7 @@ if(CONCRETELANG_CUDA_SUPPORT)
   link_directories(${CUDAToolkit_LIBRARY_DIR})
   add_subdirectory(${CONCRETE_CORE_PATH}/concrete-cuda/cuda)
   include_directories(${CONCRETE_CORE_PATH}/concrete-cuda/cuda/include)
+  include_directories(${CUDAToolkit_INCLUDE_DIRS})
   add_compile_options(-DCONCRETELANG_CUDA_SUPPORT)
 endif()
 
diff --git a/compiler/concrete-core b/compiler/concrete-core
index f1246ac4f..bf79f5db6 160000
--- a/compiler/concrete-core
+++ b/compiler/concrete-core
@@ -1 +1 @@
-Subproject commit f1246ac4f326856aa5520dbfa043bf9b25ebc6a9
+Subproject commit bf79f5db635cff7a224a44d01918aa6cf59b5493
diff --git a/compiler/include/concretelang/Runtime/context.h b/compiler/include/concretelang/Runtime/context.h
index 1f1cc6a22..e2c429dfa 100644
--- a/compiler/include/concretelang/Runtime/context.h
+++ b/compiler/include/concretelang/Runtime/context.h
@@ -18,13 +18,6 @@
 #include "concretelang/Common/Error.h"
 
 #ifdef CONCRETELANG_CUDA_SUPPORT
-// We need to define the double2 struct from the CUDA backend header files
-// This shouldn't be defined here, but included along with concrete-cuda header
-// files
-typedef struct double2 {
-  double x, y;
-} double2;
-// From concrete-cuda
 #include "bootstrap.h"
 #include "device.h"
 #include "keyswitch.h"
diff --git a/compiler/lib/Runtime/wrappers.cpp b/compiler/lib/Runtime/wrappers.cpp
index 84d7f1173..5e3ce1ee0 100644
--- a/compiler/lib/Runtime/wrappers.cpp
+++ b/compiler/lib/Runtime/wrappers.cpp
@@ -146,8 +146,8 @@ void memref_batched_keyswitch_lwe_cuda_u64(
     uint32_t base_log, uint32_t input_lwe_dim, uint32_t output_lwe_dim,
     mlir::concretelang::RuntimeContext *context) {
   assert(out_size0 == ct0_size0);
-  assert(out_size1 == output_lwe_dim+1);
-  assert(ct0_size1 == input_lwe_dim+1);
+  assert(out_size1 == output_lwe_dim + 1);
+  assert(ct0_size1 == input_lwe_dim + 1);
   // TODO: Multi GPU
   uint32_t gpu_idx = 0;
   uint32_t num_samples = out_size0;
@@ -167,9 +167,9 @@ void memref_batched_keyswitch_lwe_cuda_u64(
   void *out_gpu = alloc_and_memcpy_async_to_gpu(
       out_aligned, out_offset, out_batch_size, gpu_idx, stream);
   // Run the keyswitch kernel on the GPU
-  cuda_keyswitch_lwe_ciphertext_vector_64(stream, out_gpu, ct0_gpu, ksk_gpu,
-                                          input_lwe_dim, output_lwe_dim,
-                                          base_log, level, num_samples);
+  cuda_keyswitch_lwe_ciphertext_vector_64(
+      stream, gpu_idx, out_gpu, ct0_gpu, ksk_gpu, input_lwe_dim, output_lwe_dim,
+      base_log, level, num_samples);
   // Copy the output batch of ciphertext back to CPU
   memcpy_async_to_cpu(out_aligned, out_offset, out_batch_size, out_gpu,
                       gpu_idx, stream);
@@ -242,12 +242,13 @@ void memref_batched_bootstrap_lwe_cuda_u64(
       test_vector_idxes_size, stream, gpu_idx);
   // Run the bootstrap kernel on the GPU
   cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
-      stream, out_gpu, glwe_ct_gpu, test_vector_idxes_gpu, ct0_gpu, fbsk_gpu,
-      input_lwe_dim, glwe_dim, poly_size, base_log, level, num_samples,
-      num_test_vectors, lwe_idx, cuda_get_max_shared_memory(gpu_idx));
+      stream, gpu_idx, out_gpu, glwe_ct_gpu, test_vector_idxes_gpu, ct0_gpu,
+      fbsk_gpu, input_lwe_dim, glwe_dim, poly_size, base_log, level,
+      num_samples, num_test_vectors, lwe_idx,
+      cuda_get_max_shared_memory(gpu_idx));
   // Copy the output batch of ciphertext back to CPU
-  memcpy_async_to_cpu(out_aligned, out_offset, out_batch_size, out_gpu,
-                      gpu_idx, stream);
+  memcpy_async_to_cpu(out_aligned, out_offset, out_batch_size, out_gpu, gpu_idx,
+                      stream);
   cuda_synchronize_device(gpu_idx);
   // free memory that we allocated on gpu
   cuda_drop(ct0_gpu, gpu_idx);
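
Note: the device.h change named in the subject lives in the concrete-core
submodule bump above (f1246ac4f -> bf79f5db6), so it is not visible in this
diff. The pattern it applies is that a header which names cudaStream_t (or
CUDA vector types such as double2) in its declarations must include
cuda_runtime.h itself rather than rely on each includer re-declaring the
types, which is why the local double2 typedef in context.h can be dropped.
Below is a minimal sketch of that pattern, with a hypothetical file and
function name (not the actual concrete-cuda declarations):

  // device_like.h -- hypothetical header following the same pattern as the
  // device.h fix: include cuda_runtime.h so cudaStream_t and vector types
  // such as double2 are defined for every includer.
  #pragma once
  #include <cuda_runtime.h>
  #include <cstdint>

  // This declaration needs cudaStream_t from the CUDA runtime; without the
  // include above, includers would have to define the type themselves, as
  // context.h previously did for double2.
  void *alloc_on_gpu_async(uint64_t size, cudaStream_t stream,
                           uint32_t gpu_index);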