From 3a679a6f0a607f1ecef113a5633ab43cc5d0d0d5 Mon Sep 17 00:00:00 2001
From: Antoniu Pop
Date: Tue, 11 Apr 2023 14:00:49 +0100
Subject: [PATCH] feat(compiler): add mapped version of batched bootstrap wrappers for CPU and GPU.

---
 .../include/concretelang/Runtime/wrappers.h  |  23 ++++
 .../compiler/lib/Runtime/wrappers.cpp        | 121 ++++++++++++++++++
 2 files changed, 144 insertions(+)

diff --git a/compilers/concrete-compiler/compiler/include/concretelang/Runtime/wrappers.h b/compilers/concrete-compiler/compiler/include/concretelang/Runtime/wrappers.h
index cd9bb0d22..4d69fdd1e 100644
--- a/compilers/concrete-compiler/compiler/include/concretelang/Runtime/wrappers.h
+++ b/compilers/concrete-compiler/compiler/include/concretelang/Runtime/wrappers.h
@@ -169,6 +169,18 @@ void memref_batched_bootstrap_lwe_u64(
     uint32_t level, uint32_t base_log, uint32_t glwe_dim, uint32_t bsk_index,
     mlir::concretelang::RuntimeContext *context);
 
+void memref_batched_mapped_bootstrap_lwe_u64(
+    uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset,
+    uint64_t out_size0, uint64_t out_size1, uint64_t out_stride0,
+    uint64_t out_stride1, uint64_t *ct0_allocated, uint64_t *ct0_aligned,
+    uint64_t ct0_offset, uint64_t ct0_size0, uint64_t ct0_size1,
+    uint64_t ct0_stride0, uint64_t ct0_stride1, uint64_t *tlu_allocated,
+    uint64_t *tlu_aligned, uint64_t tlu_offset, uint64_t tlu_size0,
+    uint64_t tlu_size1, uint64_t tlu_stride0, uint64_t tlu_stride1,
+    uint32_t input_lwe_dim, uint32_t poly_size, uint32_t level,
+    uint32_t base_log, uint32_t glwe_dim, uint32_t bsk_index,
+    mlir::concretelang::RuntimeContext *context);
+
 void *memref_bootstrap_async_lwe_u64(
     uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset,
     uint64_t out_size, uint64_t out_stride, uint64_t *ct0_allocated,
@@ -271,6 +283,17 @@ void memref_batched_bootstrap_lwe_cuda_u64(
     uint32_t level, uint32_t base_log, uint32_t glwe_dim, uint32_t bsk_index,
     mlir::concretelang::RuntimeContext *context);
 
+void memref_batched_mapped_bootstrap_lwe_cuda_u64(
+    uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset,
+    uint64_t out_size0, uint64_t out_size1, uint64_t out_stride0,
+    uint64_t out_stride1, uint64_t *ct0_allocated, uint64_t *ct0_aligned,
+    uint64_t ct0_offset, uint64_t ct0_size0, uint64_t ct0_size1,
+    uint64_t ct0_stride0, uint64_t ct0_stride1, uint64_t *tlu_allocated,
+    uint64_t *tlu_aligned, uint64_t tlu_offset, uint64_t tlu_size0,
+    uint64_t tlu_size1, uint64_t tlu_stride0, uint64_t tlu_stride1,
+    uint32_t input_lwe_dim, uint32_t poly_size, uint32_t level,
+    uint32_t base_log, uint32_t glwe_dim, uint32_t bsk_index,
+    mlir::concretelang::RuntimeContext *context);
 // Tracing ////////////////////////////////////////////////////////////////////
 void memref_trace_ciphertext(uint64_t *ct0_allocated, uint64_t *ct0_aligned,
                              uint64_t ct0_offset, uint64_t ct0_size,
diff --git a/compilers/concrete-compiler/compiler/lib/Runtime/wrappers.cpp b/compilers/concrete-compiler/compiler/lib/Runtime/wrappers.cpp
index 8e43a3d9c..6058d7dc0 100644
--- a/compilers/concrete-compiler/compiler/lib/Runtime/wrappers.cpp
+++ b/compilers/concrete-compiler/compiler/lib/Runtime/wrappers.cpp
@@ -233,6 +233,104 @@ void memref_batched_bootstrap_lwe_cuda_u64(
   cuda_destroy_stream((cudaStream_t *)stream, gpu_idx);
 }
 
+void memref_batched_mapped_bootstrap_lwe_cuda_u64(
+    uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset,
+    uint64_t out_size0, uint64_t out_size1, uint64_t out_stride0,
+    uint64_t out_stride1, uint64_t *ct0_allocated, uint64_t *ct0_aligned,
+    uint64_t ct0_offset, uint64_t ct0_size0, uint64_t ct0_size1,
+    uint64_t ct0_stride0, uint64_t ct0_stride1, uint64_t *tlu_allocated,
+    uint64_t *tlu_aligned, uint64_t tlu_offset, uint64_t tlu_size0,
+    uint64_t tlu_size1, uint64_t tlu_stride0, uint64_t tlu_stride1,
+    uint32_t input_lwe_dim, uint32_t poly_size, uint32_t level,
+    uint32_t base_log, uint32_t glwe_dim, uint32_t bsk_index,
+    mlir::concretelang::RuntimeContext *context) {
+  assert(bsk_index == 0 && "multiple bsk is not yet implemented on GPU");
+  assert(out_size0 == ct0_size0);
+  assert(out_size1 == glwe_dim * poly_size + 1);
+  assert((out_size0 == tlu_size0 || tlu_size0 == 1) && "Number of LUTs does not match batch size");
+  // TODO: Multi GPU
+  uint32_t gpu_idx = 0;
+  uint32_t num_samples = out_size0;
+  uint32_t num_lut_vectors = tlu_size0;
+  uint64_t ct0_batch_size = ct0_size0 * ct0_size1;
+  uint64_t out_batch_size = out_size0 * out_size1;
+  int8_t *pbs_buffer = nullptr;
+
+  // Create the cuda stream
+  // TODO: Should be created by the compiler codegen
+  void *stream = cuda_create_stream(gpu_idx);
+  // Get the pointer to the bootstrapping key on the GPU
+  void *fbsk_gpu = memcpy_async_bsk_to_gpu(context, input_lwe_dim, poly_size,
+                                           level, glwe_dim, gpu_idx, stream);
+  // Move the input and output batch of ciphertext to the GPU
+  // TODO: The allocation should be done by the compiler codegen
+  void *ct0_gpu = alloc_and_memcpy_async_to_gpu(
+      ct0_aligned, ct0_offset, ct0_batch_size, gpu_idx, (cudaStream_t *)stream);
+  void *out_gpu = cuda_malloc_async(out_batch_size * sizeof(uint64_t),
+                                    (cudaStream_t *)stream, gpu_idx);
+  // Construct the glwe accumulator (on CPU)
+  // TODO: Should be done outside of the bootstrap call, compile time if
+  // possible. Refactor in progress
+  uint64_t glwe_ct_size = poly_size * (glwe_dim + 1) * num_lut_vectors;
+  uint64_t *glwe_ct = (uint64_t *)malloc(glwe_ct_size * sizeof(uint64_t));
+  auto tlu = tlu_aligned + tlu_offset;
+
+  // Glwe trivial encryption
+  size_t pos = 0, postlu = 0;
+  for (size_t l = 0; l < num_lut_vectors; ++l) {
+    for (size_t i = 0; i < poly_size * glwe_dim; i++) {
+      glwe_ct[pos++] = 0;
+    }
+    for (size_t i = 0; i < poly_size; i++) {
+      glwe_ct[pos++] = tlu[postlu++];
+    }
+  }
+
+  // Move the glwe accumulator to the GPU
+  void *glwe_ct_gpu = alloc_and_memcpy_async_to_gpu(
+      glwe_ct, 0, glwe_ct_size, gpu_idx, (cudaStream_t *)stream);
+
+  // Move the test vector indexes to the GPU (all 0 when a single LUT is used)
+  uint32_t lwe_idx = 0,
+           test_vector_idxes_size = num_samples * sizeof(uint64_t);
+  uint64_t *test_vector_idxes = (uint64_t *)malloc(test_vector_idxes_size);
+  if (num_lut_vectors == 1) {
+    memset((void *)test_vector_idxes, 0, test_vector_idxes_size);
+  } else {
+    assert(num_lut_vectors == num_samples);
+    for (size_t i = 0; i < num_lut_vectors; ++i)
+      test_vector_idxes[i] = i;
+  }
+  void *test_vector_idxes_gpu = cuda_malloc_async(
+      test_vector_idxes_size, (cudaStream_t *)stream, gpu_idx);
+  cuda_memcpy_async_to_gpu(test_vector_idxes_gpu, (void *)test_vector_idxes,
+                           test_vector_idxes_size, (cudaStream_t *)stream,
+                           gpu_idx);
+  // Allocate PBS buffer on GPU
+  scratch_cuda_bootstrap_amortized_64(
+      stream, gpu_idx, &pbs_buffer, glwe_dim, poly_size, num_samples,
+      cuda_get_max_shared_memory(gpu_idx), true);
+  // Run the bootstrap kernel on the GPU
+  cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
+      stream, gpu_idx, out_gpu, glwe_ct_gpu, test_vector_idxes_gpu, ct0_gpu,
+      fbsk_gpu, pbs_buffer, input_lwe_dim, glwe_dim, poly_size, base_log, level,
+      num_samples, num_lut_vectors, lwe_idx,
+      cuda_get_max_shared_memory(gpu_idx));
+  cleanup_cuda_bootstrap_amortized(stream, gpu_idx, &pbs_buffer);
+  // Copy the output batch of ciphertext back to CPU
+  memcpy_async_to_cpu(out_aligned, out_offset, out_batch_size, out_gpu, gpu_idx,
+                      stream);
+  // Free the memory that was allocated on the GPU
+  cuda_drop_async(ct0_gpu, (cudaStream_t *)stream, gpu_idx);
+  cuda_drop_async(out_gpu, (cudaStream_t *)stream, gpu_idx);
+  cuda_drop_async(glwe_ct_gpu, (cudaStream_t *)stream, gpu_idx);
+  cuda_drop_async(test_vector_idxes_gpu, (cudaStream_t *)stream, gpu_idx);
+  cudaStreamSynchronize(*(cudaStream_t *)stream);
+  // Free the glwe accumulator (on CPU)
+  free(glwe_ct);
+  cuda_destroy_stream((cudaStream_t *)stream, gpu_idx);
+}
+
 #endif
 
 void memref_encode_plaintext_with_crt(
@@ -696,6 +794,29 @@ void memref_batched_bootstrap_lwe_u64(
   }
 }
 
+void memref_batched_mapped_bootstrap_lwe_u64(
+    uint64_t *out_allocated, uint64_t *out_aligned, uint64_t out_offset,
+    uint64_t out_size0, uint64_t out_size1, uint64_t out_stride0,
+    uint64_t out_stride1, uint64_t *ct0_allocated, uint64_t *ct0_aligned,
+    uint64_t ct0_offset, uint64_t ct0_size0, uint64_t ct0_size1,
+    uint64_t ct0_stride0, uint64_t ct0_stride1, uint64_t *tlu_allocated,
+    uint64_t *tlu_aligned, uint64_t tlu_offset, uint64_t tlu_size0,
+    uint64_t tlu_size1, uint64_t tlu_stride0, uint64_t tlu_stride1,
+    uint32_t input_lwe_dim, uint32_t poly_size, uint32_t level,
+    uint32_t base_log, uint32_t glwe_dim, uint32_t bsk_index,
+    mlir::concretelang::RuntimeContext *context) {
+  assert(out_size0 == tlu_size0 && "Number of LUTs does not match batch size");
+  for (size_t i = 0; i < out_size0; i++) {
+    memref_bootstrap_lwe_u64(
+        out_allocated + i * out_size1, out_aligned + i * out_size1, out_offset,
+        out_size1, out_stride1, ct0_allocated, ct0_aligned + i * ct0_size1,
+        ct0_offset, ct0_size1, ct0_stride1, tlu_allocated,
+        tlu_aligned + i * tlu_size1, tlu_offset, tlu_size1, tlu_stride1,
+        input_lwe_dim, poly_size, level, base_log, glwe_dim, bsk_index,
+        context);
+  }
+}
+
 uint64_t encode_crt(int64_t plaintext, uint64_t modulus, uint64_t product) {
   return concretelang::clientlib::crt::encode(plaintext, modulus, product);
 }
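
Illustrative sketch (not part of the patch): the toy program below models, on cleartext values, the two ideas the new wrappers rely on: the "mapped" indexing, where row i of the LUT buffer (tlu_aligned + i * tlu_size1) is applied to ciphertext i of the batch, and the trivial GLWE encryption built in the CUDA path, where each LUT becomes glwe_dim * poly_size zero mask coefficients followed by its poly_size values. The names toy_mapped_batch and toy_glwe_accumulators are hypothetical and do not exist in the Concrete runtime; real bootstrapping operates on ciphertexts, not on the cleartext stand-ins used here.

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

// Cleartext stand-in for the CPU wrapper's loop: sample i is looked up in
// LUT i, mirroring the per-row indexing of
// memref_batched_mapped_bootstrap_lwe_u64.
static void toy_mapped_batch(uint64_t *out, const uint64_t *msgs,
                             const uint64_t *tlu, size_t tlu_size1,
                             size_t batch) {
  for (size_t i = 0; i < batch; ++i) {
    const uint64_t *lut_i = tlu + i * tlu_size1; // per-sample LUT
    out[i] = lut_i[msgs[i] % tlu_size1];         // stands in for PBS(ct[i], lut_i)
  }
}

// Cleartext stand-in for the GLWE trivial encryption done in the CUDA wrapper:
// each LUT becomes glwe_dim * poly_size zeros (mask) followed by its
// poly_size values (body).
static std::vector<uint64_t> toy_glwe_accumulators(const uint64_t *tlu,
                                                   size_t num_luts,
                                                   size_t glwe_dim,
                                                   size_t poly_size) {
  std::vector<uint64_t> glwe_ct(num_luts * (glwe_dim + 1) * poly_size, 0);
  for (size_t l = 0; l < num_luts; ++l) {
    uint64_t *body = glwe_ct.data() + l * (glwe_dim + 1) * poly_size +
                     glwe_dim * poly_size;
    std::memcpy(body, tlu + l * poly_size, poly_size * sizeof(uint64_t));
  }
  return glwe_ct;
}

int main() {
  const size_t batch = 2, tlu_size1 = 4;
  uint64_t msgs[batch] = {1, 3};
  uint64_t tlu[batch * tlu_size1] = {10, 11, 12, 13,  // LUT 0
                                     20, 21, 22, 23}; // LUT 1
  uint64_t out[batch];
  toy_mapped_batch(out, msgs, tlu, tlu_size1, batch);
  std::printf("%llu %llu\n", (unsigned long long)out[0],
              (unsigned long long)out[1]); // prints "11 23"
  std::vector<uint64_t> acc =
      toy_glwe_accumulators(tlu, batch, /*glwe_dim=*/1, /*poly_size=*/tlu_size1);
  std::printf("accumulator size: %zu\n", acc.size()); // 2 * (1 + 1) * 4 = 16
  return 0;
}

With a single LUT (tlu_size0 == 1), the CUDA wrapper instead zeroes the test vector indexes so every sample reuses LUT 0, which is why its assertion also accepts tlu_size0 == 1 while the CPU wrapper requires exactly one LUT per sample.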