enhance(runtime/gpu): Cache GPU copies of the keys

Quentin Bourgerie
2022-10-14 20:40:49 +02:00
parent f36e1fe882
commit 792d46fa80
2 changed files with 101 additions and 54 deletions
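
Previously, memcpy_async_bsk_to_gpu and memcpy_async_ksk_to_gpu converted and uploaded the bootstrap key (BSK) and keyswitch key (KSK) on every call. This change moves that work into RuntimeContext, which caches the resulting device pointers behind a mutex and releases them in its destructor, so each key is transferred to the GPU at most once per context.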

View File

@@ -17,6 +17,19 @@
#include "concrete-core-ffi.h"
#include "concretelang/Common/Error.h"
#ifdef CONCRETELANG_CUDA_SUPPORT
// We need to define the double2 struct that normally comes from the CUDA
// backend headers. It shouldn't be defined here, but included along with
// the concrete-cuda headers.
typedef struct double2 {
double x, y;
} double2;
// From concrete-cuda
#include "bootstrap.h"
#include "device.h"
#include "keyswitch.h"
#endif
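
Since double2 normally comes from CUDA's vector_types.h, the stand-in definition above can clash if the real CUDA headers end up in the same translation unit. A possible guard, sketched under the assumption that the toolkit's include guard is __VECTOR_TYPES_H__ (verify against the installed CUDA version):

// Sketch only: skip the stand-in when CUDA's vector_types.h was already
// included; __VECTOR_TYPES_H__ is assumed to be its include guard.
#ifndef __VECTOR_TYPES_H__
typedef struct double2 {
  double x, y;
} double2;
#endif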
namespace mlir {
namespace concretelang {
@@ -24,16 +37,14 @@ typedef struct RuntimeContext {
RuntimeContext() {
CAPI_ASSERT_ERROR(new_default_engine(best_seeder, &default_engine));
#ifdef CONCRETELANG_CUDA_SUPPORT
bsk_gpu = nullptr;
ksk_gpu = nullptr;
#endif
}
/// Ensure that the engines map is not copied
RuntimeContext(const RuntimeContext &ctx)
: evaluationKeys(ctx.evaluationKeys) {
CAPI_ASSERT_ERROR(new_default_engine(best_seeder, &default_engine));
}
RuntimeContext(const RuntimeContext &&other)
: evaluationKeys(other.evaluationKeys),
default_engine(other.default_engine) {}
RuntimeContext(const RuntimeContext &ctx){};
~RuntimeContext() {
CAPI_ASSERT_ERROR(destroy_default_engine(default_engine));
@@ -43,6 +54,14 @@ typedef struct RuntimeContext {
if (fbsk != nullptr) {
CAPI_ASSERT_ERROR(destroy_fft_fourier_lwe_bootstrap_key_u64(fbsk));
}
#ifdef CONCRETELANG_CUDA_SUPPORT
if (bsk_gpu != nullptr) {
cuda_drop(bsk_gpu, 0);
}
if (ksk_gpu != nullptr) {
cuda_drop(ksk_gpu, 0);
}
#endif
}
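
Note that the destructor drops both keys on device 0, while get_bsk_gpu/get_ksk_gpu allocate on a caller-supplied gpu_idx. A hypothetical RAII owner (not part of this commit) that remembers the allocating device, relying only on the cuda_malloc/cuda_drop signatures used in this diff:

// Hypothetical RAII wrapper for a cached device buffer.
struct CudaBuffer {
  void *ptr = nullptr;
  uint32_t gpu_idx = 0;
  CudaBuffer() = default;
  CudaBuffer(size_t size, uint32_t idx)
      : ptr(cuda_malloc(size, idx)), gpu_idx(idx) {}
  ~CudaBuffer() {
    if (ptr != nullptr)
      cuda_drop(ptr, gpu_idx); // free on the allocating device, not device 0
  }
  CudaBuffer(const CudaBuffer &) = delete; // unique ownership
  CudaBuffer &operator=(const CudaBuffer &) = delete;
};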
FftEngine *get_fft_engine() {
@@ -80,6 +99,70 @@ typedef struct RuntimeContext {
return fbsk;
}
#ifdef CONCRETELANG_CUDA_SUPPORT
void *get_bsk_gpu(uint32_t input_lwe_dim, uint32_t poly_size, uint32_t level,
uint32_t glwe_dim, uint32_t gpu_idx, void *stream) {
if (bsk_gpu != nullptr) {
return bsk_gpu;
}
const std::lock_guard<std::mutex> guard(bsk_gpu_mutex);
if (bsk_gpu != nullptr) {
return bsk_gpu;
}
LweBootstrapKey64 *bsk = get_bsk();
size_t bsk_buffer_len =
input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * poly_size * level;
size_t bsk_buffer_size = bsk_buffer_len * sizeof(uint64_t);
uint64_t *bsk_buffer =
(uint64_t *)aligned_alloc(U64_ALIGNMENT, bsk_buffer_size);
size_t bsk_gpu_buffer_size = bsk_buffer_len * sizeof(double);
bsk_gpu = cuda_malloc(bsk_gpu_buffer_size, gpu_idx);
CAPI_ASSERT_ERROR(
default_engine_discard_convert_lwe_bootstrap_key_to_lwe_bootstrap_key_mut_view_u64_raw_ptr_buffers(
default_engine, bsk, bsk_buffer));
cuda_initialize_twiddles(poly_size, gpu_idx);
cuda_convert_lwe_bootstrap_key_64(bsk_gpu, bsk_buffer, stream, gpu_idx,
input_lwe_dim, glwe_dim, level,
poly_size);
// This is currently not 100% async as we have to free CPU memory after
// conversion
cuda_synchronize_device(gpu_idx);
free(bsk_buffer);
return bsk_gpu;
}
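
get_bsk_gpu is a double-checked lock: an unlocked fast-path read, then a mutex-guarded re-check before the one-time conversion and upload. Strictly speaking the unlocked read races with the locked write; a sketch of the same pattern made race-free with std::atomic (names are illustrative, not from this commit):

#include <atomic>
#include <mutex>

// Generic double-checked cache; upload() stands in for the
// convert-and-copy logic of get_bsk_gpu/get_ksk_gpu.
template <typename Upload>
void *get_cached(std::atomic<void *> &cache, std::mutex &mu, Upload upload) {
  void *p = cache.load(std::memory_order_acquire);
  if (p != nullptr)
    return p; // fast path: key already resident on the GPU
  const std::lock_guard<std::mutex> guard(mu);
  p = cache.load(std::memory_order_relaxed);
  if (p == nullptr) {
    p = upload(); // runs at most once per context
    cache.store(p, std::memory_order_release);
  }
  return p;
}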
void *get_ksk_gpu(uint32_t level, uint32_t input_lwe_dim,
uint32_t output_lwe_dim, uint32_t gpu_idx, void *stream) {
if (ksk_gpu != nullptr) {
return ksk_gpu;
}
const std::lock_guard<std::mutex> guard(ksk_gpu_mutex);
if (ksk_gpu != nullptr) {
return ksk_gpu;
}
LweKeyswitchKey64 *ksk = get_ksk();
size_t ksk_buffer_len = input_lwe_dim * (output_lwe_dim + 1) * level;
size_t ksk_buffer_size = sizeof(uint64_t) * ksk_buffer_len;
uint64_t *ksk_buffer =
(uint64_t *)aligned_alloc(U64_ALIGNMENT, ksk_buffer_size);
// Assign the member (a shadowing local here would defeat the cache and
// leak the allocation on every call).
ksk_gpu = cuda_malloc(ksk_buffer_size, gpu_idx);
CAPI_ASSERT_ERROR(
default_engine_discard_convert_lwe_keyswitch_key_to_lwe_keyswitch_key_mut_view_u64_raw_ptr_buffers(
default_engine, ksk, ksk_buffer));
cuda_memcpy_async_to_gpu(ksk_gpu, ksk_buffer, ksk_buffer_size, stream,
gpu_idx);
// This is currently not 100% async as we have to free CPU memory after
// conversion
cuda_synchronize_device(gpu_idx);
free(ksk_buffer);
return ksk_gpu;
}
#endif
LweBootstrapKey64 *get_bsk() { return evaluationKeys.getBsk(); }
LweKeyswitchKey64 *get_ksk() { return evaluationKeys.getKsk(); }
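
The buffer-length formulas in get_bsk_gpu and get_ksk_gpu determine how much memory each cached key occupies. A worked sizing example with purely hypothetical parameters (none are taken from this commit):

#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical bootstrap-key parameters, for the arithmetic only.
  uint64_t input_lwe_dim = 600, glwe_dim = 1, poly_size = 2048, level = 1;
  uint64_t bsk_len =
      input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * poly_size * level;
  // 600 * 2 * 2 * 2048 * 1 = 4,915,200 64-bit words, about 37.5 MiB,
  // now uploaded once per context instead of once per bootstrap.
  printf("bsk: %llu words, %.1f MiB\n", (unsigned long long)bsk_len,
         bsk_len * 8.0 / (1024 * 1024));
  // Hypothetical keyswitch-key parameters.
  uint64_t ks_in = 2048, ks_out = 600, ks_level = 3;
  uint64_t ksk_len = ks_in * (ks_out + 1) * ks_level;
  // 2048 * 601 * 3 = 3,692,544 words, about 28.2 MiB.
  printf("ksk: %llu words, %.1f MiB\n", (unsigned long long)ksk_len,
         ksk_len * 8.0 / (1024 * 1024));
  return 0;
}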
@@ -102,6 +185,13 @@ private:
std::map<pthread_t, FftEngine *> fft_engines;
std::mutex engines_map_guard;
#ifdef CONCRETELANG_CUDA_SUPPORT
std::mutex bsk_gpu_mutex;
void *bsk_gpu;
std::mutex ksk_gpu_mutex;
void *ksk_gpu;
#endif
} RuntimeContext;
} // namespace concretelang

View File

@@ -56,17 +56,6 @@ void encode_and_expand_lut(uint64_t *output, size_t output_size,
#ifdef CONCRETELANG_CUDA_SUPPORT
// We need to define the double2 struct that normally comes from the CUDA
// backend headers. It shouldn't be defined here, but included along with
// the concrete-cuda headers.
typedef struct double2 {
double x, y;
} double2;
// From concrete-cuda
#include "bootstrap.h"
#include "device.h"
#include "keyswitch.h"
void memref_keyswitch_lwe_cuda_u64(
uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset,
uint64_t out_size, uint64_t out_stride, uint64_t *ct0_allocated,
@@ -116,47 +105,16 @@ void *memcpy_async_bsk_to_gpu(mlir::concretelang::RuntimeContext *context,
uint32_t input_lwe_dim, uint32_t poly_size,
uint32_t level, uint32_t glwe_dim,
uint32_t gpu_idx, void *stream) {
LweBootstrapKey64 *bsk = get_bootstrap_key_u64(context);
size_t bsk_buffer_len =
input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * poly_size * level;
size_t bsk_buffer_size = bsk_buffer_len * sizeof(uint64_t);
uint64_t *bsk_buffer =
(uint64_t *)aligned_alloc(U64_ALIGNMENT, bsk_buffer_size);
size_t fbsk_gpu_buffer_size = bsk_buffer_len * sizeof(double);
void *fbsk_gpu = cuda_malloc(fbsk_gpu_buffer_size, gpu_idx);
CAPI_ASSERT_ERROR(
default_engine_discard_convert_lwe_bootstrap_key_to_lwe_bootstrap_key_mut_view_u64_raw_ptr_buffers(
get_levelled_engine(), bsk, bsk_buffer));
cuda_initialize_twiddles(poly_size, gpu_idx);
cuda_convert_lwe_bootstrap_key_64(fbsk_gpu, bsk_buffer, stream, gpu_idx,
input_lwe_dim, glwe_dim, level, poly_size);
// This is currently not 100% async as we have to free CPU memory after
// conversion
cuda_synchronize_device(gpu_idx);
free(bsk_buffer);
return fbsk_gpu;
return context->get_bsk_gpu(input_lwe_dim, poly_size, level, glwe_dim,
gpu_idx, stream);
}
void *memcpy_async_ksk_to_gpu(mlir::concretelang::RuntimeContext *context,
uint32_t level, uint32_t input_lwe_dim,
uint32_t output_lwe_dim, uint32_t gpu_idx,
void *stream) {
LweKeyswitchKey64 *ksk = get_keyswitch_key_u64(context);
size_t ksk_buffer_len = input_lwe_dim * (output_lwe_dim + 1) * level;
size_t ksk_buffer_size = sizeof(uint64_t) * ksk_buffer_len;
uint64_t *ksk_buffer =
(uint64_t *)aligned_alloc(U64_ALIGNMENT, ksk_buffer_size);
void *ksk_gpu = cuda_malloc(ksk_buffer_size, gpu_idx);
CAPI_ASSERT_ERROR(
default_engine_discard_convert_lwe_keyswitch_key_to_lwe_keyswitch_key_mut_view_u64_raw_ptr_buffers(
get_levelled_engine(), ksk, ksk_buffer));
cuda_memcpy_async_to_gpu(ksk_gpu, ksk_buffer, ksk_buffer_size, stream,
gpu_idx);
// This is currently not 100% async as we have to free CPU memory after
// conversion
cuda_synchronize_device(gpu_idx);
free(ksk_buffer);
return ksk_gpu;
return context->get_ksk_gpu(level, input_lwe_dim, output_lwe_dim, gpu_idx,
stream);
}
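
After this change the two wrappers are thin shims over the context cache, so repeated calls on the same context return the same device pointer. A hypothetical call sequence (ctx, stream, and all parameter values are illustrative; needs <cassert>):

// The second call hits the cache inside RuntimeContext instead of
// re-uploading the key.
void *k1 = memcpy_async_bsk_to_gpu(ctx, /*input_lwe_dim=*/600,
                                   /*poly_size=*/2048, /*level=*/1,
                                   /*glwe_dim=*/1, /*gpu_idx=*/0, stream);
void *k2 = memcpy_async_bsk_to_gpu(ctx, 600, 2048, 1, 1, 0, stream);
assert(k1 == k2); // one upload per context, one device buffer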
void memcpy_async_ct_to_cpu(uint64_t *out_allocated, uint64_t *out_aligned,
@@ -229,7 +187,6 @@ void memref_bootstrap_lwe_cuda_u64(
out_stride, out_gpu, out_size, gpu_idx, stream);
cuda_synchronize_device(gpu_idx);
// free memory that we allocated on gpu
cuda_drop(fbsk_gpu, gpu_idx);
cuda_drop(ct0_gpu, gpu_idx);
cuda_drop(out_gpu, gpu_idx);
cuda_drop(test_vector_gpu, gpu_idx);
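
With the bootstrap key now owned by the RuntimeContext cache, the per-call cuda_drop of fbsk_gpu disappears here; only the per-call ciphertext, result, and test-vector buffers are still freed, and the cached key lives until the context is destroyed.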