From 6eb8841652fc28f35020586f216b4eea9254beec Mon Sep 17 00:00:00 2001
From: Antoniu Pop
Date: Thu, 16 Feb 2023 10:30:25 +0000
Subject: [PATCH] feat(GPU-runtime): add per device cache of BS and KS keys.

Replace the single cached bootstrap-key and keyswitch-key GPU pointers
(and their two mutexes) with vectors indexed by gpu_idx. The vectors are
sized in the RuntimeContext constructor from cudaGetDeviceCount(), with
one heap-allocated mutex per device and per key kind. The destructor
calls cuda_drop(ptr, i) for every non-null entry i.

The device count is queried outside of assert(): when NDEBUG is defined
the assert expression is not evaluated, so a call placed inside it would
be skipped and num_devices would be left uninitialized.
---
Notes (ignored by git am):
  This copy of the patch had lost its line breaks and every <...> token.
  Template arguments were restored from usage. The system header names in
  the first context.h hunk and the position of blank context lines are
  reconstructed assumptions -- confirm against the tree before applying.
  The commit id and the blob ids on the context.cpp index line predate
  the assert change in the last hunk.

 .../include/concretelang/Runtime/context.h    | 53 +++++++++----------
 .../compiler/lib/Runtime/context.cpp          | 15 +++++++++--
 2 files changed, 38 insertions(+), 30 deletions(-)

diff --git a/compilers/concrete-compiler/compiler/include/concretelang/Runtime/context.h b/compilers/concrete-compiler/compiler/include/concretelang/Runtime/context.h
index 3e4294616..a1acda734 100644
--- a/compilers/concrete-compiler/compiler/include/concretelang/Runtime/context.h
+++ b/compilers/concrete-compiler/compiler/include/concretelang/Runtime/context.h
@@ -10,6 +10,7 @@
 #include <map>
 #include <mutex>
 #include <pthread.h>
+#include <vector>
 
 #include "concretelang/ClientLib/EvaluationKeys.h"
 #include "concretelang/Common/Error.h"
@@ -39,15 +40,14 @@ typedef struct FFT {
 typedef struct RuntimeContext {
 
   RuntimeContext() = delete;
-
   RuntimeContext(::concretelang::clientlib::EvaluationKeys evaluationKeys);
   ~RuntimeContext() {
 #ifdef CONCRETELANG_CUDA_SUPPORT
-    if (bsk_gpu != nullptr) {
-      cuda_drop(bsk_gpu, 0);
-    }
-    if (ksk_gpu != nullptr) {
-      cuda_drop(ksk_gpu, 0);
+    for (int i = 0; i < num_devices; ++i) {
+      if (bsk_gpu[i] != nullptr)
+        cuda_drop(bsk_gpu[i], i);
+      if (ksk_gpu[i] != nullptr)
+        cuda_drop(ksk_gpu[i], i);
     }
 #endif
   };
@@ -80,13 +80,13 @@ public:
 
   void *get_bsk_gpu(uint32_t input_lwe_dim, uint32_t poly_size, uint32_t level,
                     uint32_t glwe_dim, uint32_t gpu_idx, void *stream) {
-    if (bsk_gpu != nullptr) {
-      return bsk_gpu;
+    if (bsk_gpu[gpu_idx] != nullptr) {
+      return bsk_gpu[gpu_idx];
     }
-    const std::lock_guard<std::mutex> guard(bsk_gpu_mutex);
+    const std::lock_guard<std::mutex> guard(*bsk_gpu_mutex[gpu_idx]);
 
-    if (bsk_gpu != nullptr) {
-      return bsk_gpu;
+    if (bsk_gpu[gpu_idx] != nullptr) {
+      return bsk_gpu[gpu_idx];
     }
 
     auto bsk = evaluationKeys.getBootstrapKey(0);
@@ -103,20 +103,20 @@ public:
     // we have to free CPU memory after
     // conversion
     cuda_synchronize_device(gpu_idx);
-    bsk_gpu = bsk_gpu_tmp;
-    return bsk_gpu;
+    bsk_gpu[gpu_idx] = bsk_gpu_tmp;
+    return bsk_gpu[gpu_idx];
   }
 
   void *get_ksk_gpu(uint32_t level, uint32_t input_lwe_dim,
                     uint32_t output_lwe_dim, uint32_t gpu_idx, void *stream) {
 
-    if (ksk_gpu != nullptr) {
-      return ksk_gpu;
+    if (ksk_gpu[gpu_idx] != nullptr) {
+      return ksk_gpu[gpu_idx];
     }
 
-    const std::lock_guard<std::mutex> guard(ksk_gpu_mutex);
-    if (ksk_gpu != nullptr) {
-      return ksk_gpu;
+    const std::lock_guard<std::mutex> guard(*ksk_gpu_mutex[gpu_idx]);
+    if (ksk_gpu[gpu_idx] != nullptr) {
+      return ksk_gpu[gpu_idx];
     }
 
     auto ksk = evaluationKeys.getKeyswitchKey(0);
@@ -126,21 +126,18 @@ public:
 
     cuda_memcpy_async_to_gpu(ksk_gpu_tmp, (void *)ksk.buffer(), ksk_buffer_size,
                              stream, gpu_idx);
-    // This is currently not 100% async as
-    // we have to free CPU memory after
-    // conversion
     cuda_synchronize_device(gpu_idx);
-    ksk_gpu = ksk_gpu_tmp;
-    return ksk_gpu;
+    ksk_gpu[gpu_idx] = ksk_gpu_tmp;
+    return ksk_gpu[gpu_idx];
   }
 
 private:
-  std::mutex bsk_gpu_mutex;
-  void *bsk_gpu;
-  std::mutex ksk_gpu_mutex;
-  void *ksk_gpu;
+  std::vector<std::unique_ptr<std::mutex>> bsk_gpu_mutex;
+  std::vector<void *> bsk_gpu;
+  std::vector<std::unique_ptr<std::mutex>> ksk_gpu_mutex;
+  std::vector<void *> ksk_gpu;
+  int num_devices;
 #endif
-
 } RuntimeContext;
 
 } // namespace concretelang
diff --git a/compilers/concrete-compiler/compiler/lib/Runtime/context.cpp b/compilers/concrete-compiler/compiler/lib/Runtime/context.cpp
index 18feb159b..589e08b7c 100644
--- a/compilers/concrete-compiler/compiler/lib/Runtime/context.cpp
+++ b/compilers/concrete-compiler/compiler/lib/Runtime/context.cpp
@@ -71,8 +71,19 @@ RuntimeContext::RuntimeContext(clientlib::EvaluationKeys evaluationKeys)
   }
 
 #ifdef CONCRETELANG_CUDA_SUPPORT
-  bsk_gpu = nullptr;
-  ksk_gpu = nullptr;
+  // Query the device count outside of assert(): when NDEBUG is defined the
+  // assert expression is not evaluated, which would skip the call and leave
+  // num_devices uninitialized for the loops below and in the destructor.
+  const auto device_count_status = cudaGetDeviceCount(&num_devices);
+  assert(device_count_status == cudaSuccess);
+  if (device_count_status != cudaSuccess)
+    num_devices = 0;
+  bsk_gpu.resize(num_devices, nullptr);
+  ksk_gpu.resize(num_devices, nullptr);
+  for (int i = 0; i < num_devices; ++i) {
+    bsk_gpu_mutex.push_back(std::make_unique<std::mutex>());
+    ksk_gpu_mutex.push_back(std::make_unique<std::mutex>());
+  }
 #endif
 }
 }