refactor(cuda): Refactor the low latency PBS.

Pedro Alves
2023-02-09 15:00:40 -03:00
committed by Agnès Leroy
parent 41be2b4832
commit db83fd7649
10 changed files with 165 additions and 170 deletions
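In short: the glwe_dimension == 1 asserts are dropped from the C entry points (extract bits, low latency PBS, circuit bootstrap, cmux tree, WOP-PBS), and glwe_dimension is threaded through host_extract_bits, host_bootstrap_low_latency, and the device_bootstrap_low_latency kernel instead. The separate mask and body join buffers are merged into a single join_buffer holding glwe_dimension + 1 polynomials per level, mul_ggsw_glwe loops over those polynomials rather than special-casing mask and body, the kernel grid grows to glwe_dimension + 1 y-blocks, and the polynomial helpers (GadgetMatrix, the negacyclic monomial routines, sample_extract_*) now default to operating on a single polynomial.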

View File

@@ -15,8 +15,6 @@ void cuda_extract_bits_32(
uint32_t max_shared_memory) {
assert(("Error (GPU extract bits): base log should be <= 32",
base_log_bsk <= 32));
assert(("Error (GPU extract bits): glwe_dimension should be equal to 1",
glwe_dimension == 1));
assert(("Error (GPU extract bits): lwe_dimension_in should be one of "
"512, 1024, 2048, 4096, 8192",
lwe_dimension_in == 512 || lwe_dimension_in == 1024 ||
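A note on the assertion style used in these entry points: assert(("message", cond)) relies on the C comma operator, so only cond is tested while the string literal survives into the diagnostic printed when the assertion fires; the extra parentheses keep the preprocessor from splitting the macro argument at the comma. A minimal sketch:

```cpp
#include <cassert>

int main() {
  unsigned base_log_bsk = 23;
  // The parenthesized comma expression evaluates to its right-hand operand,
  // so only the condition decides the assertion; the string literal is kept
  // because it shows up in the diagnostic if the assertion ever fires.
  assert(("Error (GPU extract bits): base log should be <= 32",
          base_log_bsk <= 32));
  return 0;
}
```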
@@ -44,8 +42,8 @@ void cuda_extract_bits_32(
(uint32_t *)lwe_array_out_pbs_buffer, (uint32_t *)lut_pbs,
(uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk,
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
number_of_samples, max_shared_memory);
glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
level_count_ksk, number_of_samples, max_shared_memory);
break;
case 1024:
host_extract_bits<uint32_t, Degree<1024>>(
@@ -56,8 +54,8 @@ void cuda_extract_bits_32(
(uint32_t *)lwe_array_out_pbs_buffer, (uint32_t *)lut_pbs,
(uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk,
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
number_of_samples, max_shared_memory);
glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
level_count_ksk, number_of_samples, max_shared_memory);
break;
case 2048:
host_extract_bits<uint32_t, Degree<2048>>(
@@ -68,8 +66,8 @@ void cuda_extract_bits_32(
(uint32_t *)lwe_array_out_pbs_buffer, (uint32_t *)lut_pbs,
(uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk,
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
number_of_samples, max_shared_memory);
glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
level_count_ksk, number_of_samples, max_shared_memory);
break;
case 4096:
host_extract_bits<uint32_t, Degree<4096>>(
@@ -80,8 +78,8 @@ void cuda_extract_bits_32(
(uint32_t *)lwe_array_out_pbs_buffer, (uint32_t *)lut_pbs,
(uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk,
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
number_of_samples, max_shared_memory);
glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
level_count_ksk, number_of_samples, max_shared_memory);
break;
case 8192:
host_extract_bits<uint32_t, Degree<8192>>(
@@ -92,8 +90,8 @@ void cuda_extract_bits_32(
(uint32_t *)lwe_array_out_pbs_buffer, (uint32_t *)lut_pbs,
(uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk,
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
number_of_samples, max_shared_memory);
glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
level_count_ksk, number_of_samples, max_shared_memory);
break;
default:
break;
@@ -157,8 +155,6 @@ void cuda_extract_bits_64(
uint32_t max_shared_memory) {
assert(("Error (GPU extract bits): base log should be <= 64",
base_log_bsk <= 64));
assert(("Error (GPU extract bits): glwe_dimension should be equal to 1",
glwe_dimension == 1));
assert(("Error (GPU extract bits): lwe_dimension_in should be one of "
"512, 1024, 2048, 4096, 8192",
lwe_dimension_in == 512 || lwe_dimension_in == 1024 ||
@@ -186,8 +182,8 @@ void cuda_extract_bits_64(
(uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs,
(uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk,
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
number_of_samples, max_shared_memory);
glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
level_count_ksk, number_of_samples, max_shared_memory);
break;
case 1024:
host_extract_bits<uint64_t, Degree<1024>>(
@@ -198,8 +194,8 @@ void cuda_extract_bits_64(
(uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs,
(uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk,
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
number_of_samples, max_shared_memory);
glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
level_count_ksk, number_of_samples, max_shared_memory);
break;
case 2048:
host_extract_bits<uint64_t, Degree<2048>>(
@@ -210,8 +206,8 @@ void cuda_extract_bits_64(
(uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs,
(uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk,
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
number_of_samples, max_shared_memory);
glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
level_count_ksk, number_of_samples, max_shared_memory);
break;
case 4096:
host_extract_bits<uint64_t, Degree<4096>>(
@@ -222,8 +218,8 @@ void cuda_extract_bits_64(
(uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs,
(uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk,
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
number_of_samples, max_shared_memory);
glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
level_count_ksk, number_of_samples, max_shared_memory);
break;
case 8192:
host_extract_bits<uint64_t, Degree<8192>>(
@@ -234,8 +230,8 @@ void cuda_extract_bits_64(
(uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs,
(uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk,
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
number_of_samples, max_shared_memory);
glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
level_count_ksk, number_of_samples, max_shared_memory);
break;
default:
break;

View File

@@ -140,9 +140,9 @@ __host__ void host_extract_bits(
Torus *lwe_array_out_pbs_buffer, Torus *lut_pbs,
uint32_t *lut_vector_indexes, Torus *ksk, double2 *fourier_bsk,
uint32_t number_of_bits, uint32_t delta_log, uint32_t lwe_dimension_in,
uint32_t lwe_dimension_out, uint32_t base_log_bsk, uint32_t level_count_bsk,
uint32_t base_log_ksk, uint32_t level_count_ksk, uint32_t number_of_samples,
uint32_t max_shared_memory) {
uint32_t lwe_dimension_out, uint32_t glwe_dimension, uint32_t base_log_bsk,
uint32_t level_count_bsk, uint32_t base_log_ksk, uint32_t level_count_ksk,
uint32_t number_of_samples, uint32_t max_shared_memory) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
@@ -189,8 +189,8 @@ __host__ void host_extract_bits(
host_bootstrap_low_latency<Torus, params>(
v_stream, gpu_index, lwe_array_out_pbs_buffer, lut_pbs,
lut_vector_indexes, lwe_array_out_ks_buffer, fourier_bsk,
lwe_dimension_out, lwe_dimension_in, base_log_bsk, level_count_bsk,
number_of_samples, 1, max_shared_memory);
glwe_dimension, lwe_dimension_out, lwe_dimension_in, base_log_bsk,
level_count_bsk, number_of_samples, 1, max_shared_memory);
// Add alpha where alpha = delta*2^{bit_idx-1} to end up with an encryption
// of 0 if the extracted bit was 0, and of 1 otherwise
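In the extract-bits path itself, glwe_dimension is now simply forwarded to host_bootstrap_low_latency, with the trailing 1 still requesting a single LUT vector per bootstrap.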

View File

@@ -16,8 +16,6 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
assert(("Error (GPU low latency PBS): base log should be <= 32",
base_log <= 32));
assert(("Error (GPU low latency PBS): glwe_dimension should be equal to 1",
glwe_dimension == 1));
assert(("Error (GPU low latency PBS): polynomial size should be one of 512, "
"1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 ||
@@ -39,36 +37,41 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
host_bootstrap_low_latency<uint32_t, Degree<512>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_lut_vectors, max_shared_memory);
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
max_shared_memory);
break;
case 1024:
host_bootstrap_low_latency<uint32_t, Degree<1024>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_lut_vectors, max_shared_memory);
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
max_shared_memory);
break;
case 2048:
host_bootstrap_low_latency<uint32_t, Degree<2048>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_lut_vectors, max_shared_memory);
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
max_shared_memory);
break;
case 4096:
host_bootstrap_low_latency<uint32_t, Degree<4096>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_lut_vectors, max_shared_memory);
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
max_shared_memory);
break;
case 8192:
host_bootstrap_low_latency<uint32_t, Degree<8192>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_lut_vectors, max_shared_memory);
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
max_shared_memory);
break;
default:
break;
@@ -160,8 +163,6 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
assert(("Error (GPU low latency PBS): base log should be <= 64",
base_log <= 64));
assert(("Error (GPU low latency PBS): glwe_dimension should be equal to 1",
glwe_dimension == 1));
assert(("Error (GPU low latency PBS): polynomial size should be one of 512, "
"1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 ||
@@ -183,36 +184,41 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
host_bootstrap_low_latency<uint64_t, Degree<512>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_lut_vectors, max_shared_memory);
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
max_shared_memory);
break;
case 1024:
host_bootstrap_low_latency<uint64_t, Degree<1024>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_lut_vectors, max_shared_memory);
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
max_shared_memory);
break;
case 2048:
host_bootstrap_low_latency<uint64_t, Degree<2048>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_lut_vectors, max_shared_memory);
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
max_shared_memory);
break;
case 4096:
host_bootstrap_low_latency<uint64_t, Degree<4096>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_lut_vectors, max_shared_memory);
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
max_shared_memory);
break;
case 8192:
host_bootstrap_low_latency<uint64_t, Degree<8192>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_lut_vectors, max_shared_memory);
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
max_shared_memory);
break;
default:
break;
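Note that the public signatures of cuda_bootstrap_low_latency_lwe_ciphertext_vector_32/_64 do not change here: glwe_dimension was already a parameter, and it is now honored and forwarded to the templated host function instead of being pinned to 1 by an assert.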

View File

@@ -25,11 +25,11 @@ using namespace cooperative_groups;
namespace cg = cooperative_groups;
template <typename Torus, class params>
__device__ void
mul_ggsw_glwe(Torus *accumulator, double2 *fft, double2 *mask_join_buffer,
double2 *body_join_buffer, double2 *bootstrapping_key,
int polynomial_size, int level_count, int iteration,
grid_group &grid) {
__device__ void mul_ggsw_glwe(Torus *accumulator, double2 *fft,
double2 *join_buffer, double2 *bootstrapping_key,
int polynomial_size, uint32_t glwe_dimension,
int level_count, int iteration,
grid_group &grid) {
// Switch to the FFT space
NSMFFT_direct<HalfDegree<params>>(fft);
@@ -40,52 +40,50 @@ mul_ggsw_glwe(Torus *accumulator, double2 *fft, double2 *mask_join_buffer,
// this function, so we end up getting the lines of the bootstrapping key
// needed to perform the external product in this block (corresponding to
// the same decomposition level)
auto bsk_slice = get_ith_mask_kth_block(
bootstrapping_key, iteration, blockIdx.y, blockIdx.x, polynomial_size,
glwe_dimension, level_count);
auto bsk_mask_slice =
get_ith_mask_kth_block(bootstrapping_key, iteration, blockIdx.y,
blockIdx.x, polynomial_size, 1, level_count);
auto bsk_body_slice =
get_ith_body_kth_block(bootstrapping_key, iteration, blockIdx.y,
blockIdx.x, polynomial_size, 1, level_count);
// Selects all GLWEs in a particular decomposition level
auto level_join_buffer =
join_buffer + blockIdx.x * (glwe_dimension + 1) * params::degree / 2;
// Perform the matrix multiplication between the GGSW and the GLWE,
// each block operating on a single level for mask and body
auto first_processed_bsk =
(blockIdx.y == 0) ? bsk_mask_slice : bsk_body_slice;
auto second_processed_bsk =
(blockIdx.y == 0) ? bsk_body_slice : bsk_mask_slice;
auto first_processed_acc =
(blockIdx.y == 0) ? &mask_join_buffer[params::degree / 2 * blockIdx.x]
: &body_join_buffer[params::degree / 2 * blockIdx.x];
auto second_processed_acc =
(blockIdx.y == 0) ? &body_join_buffer[params::degree / 2 * blockIdx.x]
: &mask_join_buffer[params::degree / 2 * blockIdx.x];
// The first product is used to initialize level_join_buffer
auto bsk_poly = bsk_slice + blockIdx.y * params::degree / 2;
auto buffer_slice = level_join_buffer + blockIdx.y * params::degree / 2;
int tid = threadIdx.x;
// first product
for (int i = 0; i < params::opt / 2; i++) {
first_processed_acc[tid] = fft[tid] * first_processed_bsk[tid];
buffer_slice[tid] = fft[tid] * bsk_poly[tid];
tid += params::degree / params::opt;
}
grid.sync();
tid = threadIdx.x;
// second product
for (int i = 0; i < params::opt / 2; i++) {
second_processed_acc[tid] += fft[tid] * second_processed_bsk[tid];
tid += params::degree / params::opt;
// Continues multiplying fft by every polynomial in that particular bsk level
// Each y-block accumulates in a different polynomial at each iteration
for (int j = 1; j < (glwe_dimension + 1); j++) {
int idx = (j + blockIdx.y) % (glwe_dimension + 1);
auto bsk_poly = bsk_slice + idx * params::degree / 2;
auto buffer_slice = level_join_buffer + idx * params::degree / 2;
int tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
buffer_slice[tid] += fft[tid] * bsk_poly[tid];
tid += params::degree / params::opt;
}
grid.sync();
}
// -----------------------------------------------------------------
// All blocks are synchronized here; after this sync, level_join_buffer has
// the values needed from every other block
// All blocks are synchronized here; after this sync, *_join_buffer has the
// values needed from every other block
grid.sync();
auto src_acc = (blockIdx.y == 0) ? mask_join_buffer : body_join_buffer;
auto src_acc = join_buffer + blockIdx.y * params::degree / 2;
// copy first product into fft buffer
tid = threadIdx.x;
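The rotated index in the new accumulation loop is what makes the merged join buffer safe: within a step, the glwe_dimension + 1 y-blocks touch pairwise distinct polynomials, and the grid sync between steps orders successive writers of the same polynomial. A host-side sketch of the schedule (illustrative only; the glwe_dimension value is arbitrary):

```cpp
#include <cstdio>

int main() {
  const int glwe_dimension = 2;      // arbitrary example value
  const int k1 = glwe_dimension + 1; // polynomials per GLWE (mask + body)
  // Step j = 0 is the "first product" that initializes the buffer; steps
  // j >= 1 accumulate. Within a step, the y-blocks hit pairwise distinct
  // polynomial indexes, so only the grid-wide sync between steps is needed
  // to order successive writers of the same polynomial.
  for (int j = 0; j < k1; j++) {
    printf("step %d:", j);
    for (int y = 0; y < k1; y++) {
      int idx = (j + y) % k1; // polynomial index written by y-block y
      printf("  block %d -> poly %d", y, idx);
    }
    printf("\n");
  }
  return 0;
}
```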
@@ -97,7 +95,7 @@ mul_ggsw_glwe(Torus *accumulator, double2 *fft, double2 *mask_join_buffer,
// accumulate rest of the products into fft buffer
for (int l = 1; l < gridDim.x; l++) {
auto cur_src_acc = &src_acc[l * params::degree / 2];
auto cur_src_acc = &src_acc[l * (glwe_dimension + 1) * params::degree / 2];
tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
fft[tid] += cur_src_acc[tid];
@@ -121,19 +119,22 @@ template <typename Torus, class params, sharedMemDegree SMD>
/*
* Kernel launched by the low latency version of the
* bootstrapping, that uses cooperative groups
* lwe_array_out vector of output lwe s, with length
* (polynomial_size+1)*num_samples lut_vector - vector of look up tables with
* length polynomial_size * num_samples lut_vector_indexes - mapping between
* lwe_array_in and lut_vector lwe_array_in
* - vector of lwe inputs with length (lwe_dimension + 1) * num_samples
*
* - lwe_array_out: vector of output LWE ciphertexts, with length
* (glwe_dimension * polynomial_size + 1) * num_samples
* - lut_vector: vector of lookup tables, with length
* (glwe_dimension + 1) * polynomial_size * num_samples
* - lut_vector_indexes: mapping between lwe_array_in and lut_vector
* - lwe_array_in: vector of LWE inputs, with length
* (lwe_dimension + 1) * num_samples
*
* Each y-block computes one element of the lwe_array_out.
*/
__global__ void device_bootstrap_low_latency(
Torus *lwe_array_out, Torus *lut_vector, Torus *lwe_array_in,
double2 *bootstrapping_key, double2 *mask_join_buffer,
double2 *body_join_buffer, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, char *device_mem,
int device_memory_size_per_block) {
double2 *bootstrapping_key, double2 *join_buffer, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
char *device_mem, int device_memory_size_per_block) {
grid_group grid = this_grid();
@@ -144,6 +145,7 @@ __global__ void device_bootstrap_low_latency(
char *selected_memory;
int block_index =
blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y;
uint32_t glwe_dimension = gridDim.y - 1;
if constexpr (SMD == FULLSM)
selected_memory = sharedmem;
@@ -163,12 +165,12 @@ __global__ void device_bootstrap_low_latency(
// this block is operating, in the case of batch bootstraps
auto block_lwe_array_in = &lwe_array_in[blockIdx.z * (lwe_dimension + 1)];
auto block_lut_vector = &lut_vector[blockIdx.z * params::degree * 2];
auto block_lut_vector =
&lut_vector[blockIdx.z * params::degree * (glwe_dimension + 1)];
auto block_mask_join_buffer =
&mask_join_buffer[blockIdx.z * level_count * params::degree / 2];
auto block_body_join_buffer =
&body_join_buffer[blockIdx.z * level_count * params::degree / 2];
auto block_join_buffer =
&join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
params::degree / 2];
// Since the space in L1 cache is small, we use the same memory location for
// the rotated accumulator and the fft accumulator, since we know that the
@@ -179,15 +181,10 @@ __global__ void device_bootstrap_low_latency(
rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
2 * params::degree);
if (blockIdx.y == 0) {
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, block_lut_vector, b_hat, false, 1);
} else {
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[params::degree], b_hat, false, 1);
}
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
false);
for (int i = 0; i < lwe_dimension; i++) {
synchronize_threads_in_block();
@@ -200,13 +197,13 @@ __global__ void device_bootstrap_low_latency(
// Perform ACC * (X^â - 1)
multiply_by_monomial_negacyclic_and_sub_polynomial<
Torus, params::opt, params::degree / params::opt>(
accumulator, accumulator_rotated, a_hat, 1);
accumulator, accumulator_rotated, a_hat);
// Perform a rounding to increase the accuracy of the
// bootstrapped ciphertext
round_to_closest_multiple_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator_rotated, base_log, level_count, 1);
accumulator_rotated, base_log, level_count);
synchronize_threads_in_block();
@@ -214,29 +211,33 @@ __global__ void device_bootstrap_low_latency(
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
accumulator_rotated, 1);
accumulator_rotated);
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
// We are using the same memory space for accumulator_fft and
// accumulator_rotated, so we need to synchronize here to make sure they
// don't modify the same memory space at the same time
synchronize_threads_in_block();
// Perform G^-1(ACC) * GGSW -> GLWE
mul_ggsw_glwe<Torus, params>(accumulator, accumulator_fft,
block_mask_join_buffer, block_body_join_buffer,
bootstrapping_key, polynomial_size,
level_count, i, grid);
mul_ggsw_glwe<Torus, params>(
accumulator, accumulator_fft, block_join_buffer, bootstrapping_key,
polynomial_size, glwe_dimension, level_count, i, grid);
synchronize_threads_in_block();
}
auto block_lwe_array_out = &lwe_array_out[blockIdx.z * (polynomial_size + 1)];
auto block_lwe_array_out =
&lwe_array_out[blockIdx.z * (glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
if (blockIdx.x == 0 && blockIdx.y == 0) {
if (blockIdx.x == 0 && blockIdx.y < glwe_dimension) {
// Perform a sample extract. At this point, all blocks have the result, but
// we do the computation at block 0 to avoid waiting for extra blocks, in
// case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator, 1);
} else if (blockIdx.x == 0 && blockIdx.y == 1) {
block_lwe_array_out[params::degree] = accumulator[0];
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
} else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
}
}
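With this epilogue, each output LWE is laid out as glwe_dimension * polynomial_size mask coefficients followed by one body coefficient: y-blocks 0 through glwe_dimension - 1 each extract the mask of one GLWE polynomial, and y-block glwe_dimension writes the body. A small sketch of the offset arithmetic (example values only):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t glwe_dimension = 2, polynomial_size = 512; // example values
  const uint32_t lwe_size = glwe_dimension * polynomial_size + 1;
  for (uint32_t z = 0; z < 2; z++) {                 // blockIdx.z: sample
    for (uint32_t y = 0; y <= glwe_dimension; y++) { // blockIdx.y
      uint32_t offset = z * lwe_size + y * polynomial_size;
      if (y < glwe_dimension)
        printf("sample %u, y-block %u: mask polynomial at offset %u\n",
               z, y, offset);
      else // y == glwe_dimension: the single body coefficient
        printf("sample %u, y-block %u: body coefficient at offset %u\n",
               z, y, offset);
    }
  }
  return 0;
}
```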
@@ -248,7 +249,7 @@ template <typename Torus, class params>
__host__ void host_bootstrap_low_latency(
void *v_stream, uint32_t gpu_index, Torus *lwe_array_out, Torus *lut_vector,
uint32_t *lut_vector_indexes, Torus *lwe_array_in,
double2 *bootstrapping_key, uint32_t lwe_dimension,
double2 *bootstrapping_key, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t num_lut_vectors,
uint32_t max_shared_memory) {
@@ -258,10 +259,8 @@ __host__ void host_bootstrap_low_latency(
int buffer_size_per_gpu = level_count * input_lwe_ciphertext_count *
polynomial_size / 2 * sizeof(double2);
double2 *mask_buffer_fft =
(double2 *)cuda_malloc_async(buffer_size_per_gpu, stream, gpu_index);
double2 *body_buffer_fft =
(double2 *)cuda_malloc_async(buffer_size_per_gpu, stream, gpu_index);
double2 *buffer_fft = (double2 *)cuda_malloc_async(
(glwe_dimension + 1) * buffer_size_per_gpu, stream, gpu_index);
// With SM each block corresponds to either the mask or body, no need to
// duplicate data for each
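For sizing intuition: the merged allocation replaces two buffers of buffer_size_per_gpu bytes (mask and body) with a single one scaled by glwe_dimension + 1, so for glwe_dimension = 1 the total byte count is unchanged. A quick host-side sanity check (example values; double2 is stood in for on the host):

```cpp
#include <cstddef>
#include <cstdio>

struct double2 { double x, y; }; // host stand-in for the CUDA vector type

int main() {
  const size_t level_count = 2, num_samples = 4, polynomial_size = 1024;
  const size_t glwe_dimension = 1; // example values
  size_t per_gpu = level_count * num_samples * polynomial_size / 2
                   * sizeof(double2);                // buffer_size_per_gpu
  size_t old_total = 2 * per_gpu;                    // mask + body buffers
  size_t new_total = (glwe_dimension + 1) * per_gpu; // single join buffer
  printf("old: %zu bytes, new: %zu bytes\n", old_total, new_total);
  return 0;
}
```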
@@ -279,35 +278,34 @@ __host__ void host_bootstrap_low_latency(
char *d_mem;
int thds = polynomial_size / params::opt;
dim3 grid(level_count, 2, input_lwe_ciphertext_count);
dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count);
void *kernel_args[12];
void *kernel_args[11];
kernel_args[0] = &lwe_array_out;
kernel_args[1] = &lut_vector;
kernel_args[2] = &lwe_array_in;
kernel_args[3] = &bootstrapping_key;
kernel_args[4] = &mask_buffer_fft;
kernel_args[5] = &body_buffer_fft;
kernel_args[6] = &lwe_dimension;
kernel_args[7] = &polynomial_size;
kernel_args[8] = &base_log;
kernel_args[9] = &level_count;
kernel_args[10] = &d_mem;
kernel_args[4] = &buffer_fft;
kernel_args[5] = &lwe_dimension;
kernel_args[6] = &polynomial_size;
kernel_args[7] = &base_log;
kernel_args[8] = &level_count;
kernel_args[9] = &d_mem;
if (max_shared_memory < SM_PART) {
kernel_args[11] = &DM_FULL;
kernel_args[10] = &DM_FULL;
check_cuda_error(cudaGetLastError());
d_mem = (char *)cuda_malloc_async(DM_FULL * input_lwe_ciphertext_count *
level_count * 2,
level_count * (glwe_dimension + 1),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
check_cuda_error(cudaLaunchCooperativeKernel(
(void *)device_bootstrap_low_latency<Torus, params, NOSM>, grid, thds,
(void **)kernel_args, 0, *stream));
} else if (max_shared_memory < SM_FULL) {
kernel_args[11] = &DM_PART;
kernel_args[10] = &DM_PART;
d_mem = (char *)cuda_malloc_async(DM_PART * input_lwe_ciphertext_count *
level_count * 2,
level_count * (glwe_dimension + 1),
stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
device_bootstrap_low_latency<Torus, params, PARTIALSM>,
@@ -322,7 +320,7 @@ __host__ void host_bootstrap_low_latency(
} else {
int DM_NONE = 0;
kernel_args[11] = &DM_NONE;
kernel_args[10] = &DM_NONE;
d_mem = (char *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
device_bootstrap_low_latency<Torus, params, FULLSM>,
@@ -337,8 +335,7 @@ __host__ void host_bootstrap_low_latency(
check_cuda_error(cudaGetLastError());
// Synchronize the streams before copying the result to lwe_array_out at the
// right place
cuda_drop_async(mask_buffer_fft, stream, gpu_index);
cuda_drop_async(body_buffer_fft, stream, gpu_index);
cuda_drop_async(buffer_fft, stream, gpu_index);
cuda_drop_async(d_mem, stream, gpu_index);
}
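Since device_bootstrap_low_latency calls grid.sync(), it must be launched through cudaLaunchCooperativeKernel with the argument-pointer array built above; the <<<>>> syntax would not provide a grid-wide barrier. A minimal, self-contained sketch of that launch pattern (a toy kernel, not the PBS; assumes compilation with something like nvcc -arch=sm_70 -rdc=true):

```cuda
#include <cooperative_groups.h>
#include <cstdio>
namespace cg = cooperative_groups;

__global__ void toy_kernel(int *data, unsigned n) {
  cg::grid_group grid = cg::this_grid();
  unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < n)
    data[idx] += 1;
  grid.sync(); // grid-wide barrier; only valid under a cooperative launch
  if (idx == 0)
    printf("all %u elements updated\n", n);
}

int main() {
  const unsigned n = 256;
  int *d_data;
  cudaMalloc(&d_data, n * sizeof(int));
  cudaMemset(d_data, 0, n * sizeof(int));
  dim3 grid(4), block(64);
  unsigned n_arg = n;
  // Arguments go in as an array of pointers to each parameter, mirroring
  // the kernel_args array built in host_bootstrap_low_latency above.
  void *kernel_args[] = {&d_data, &n_arg};
  cudaLaunchCooperativeKernel((void *)toy_kernel, grid, block, kernel_args,
                              /*sharedMem=*/0, /*stream=*/0);
  cudaDeviceSynchronize();
  cudaFree(d_data);
  return 0;
}
```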

View File

@@ -13,8 +13,6 @@ void cuda_circuit_bootstrap_32(
uint32_t level_bsk, uint32_t base_log_bsk, uint32_t level_pksk,
uint32_t base_log_pksk, uint32_t level_cbs, uint32_t base_log_cbs,
uint32_t number_of_samples, uint32_t max_shared_memory) {
assert(("Error (GPU circuit bootstrap): glwe_dimension should be equal to 1",
glwe_dimension == 1));
assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
"512, 1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 ||
@@ -137,8 +135,6 @@ void cuda_circuit_bootstrap_64(
uint32_t level_bsk, uint32_t base_log_bsk, uint32_t level_pksk,
uint32_t base_log_pksk, uint32_t level_cbs, uint32_t base_log_cbs,
uint32_t number_of_samples, uint32_t max_shared_memory) {
assert(("Error (GPU circuit bootstrap): glwe_dimension should be equal to 1",
glwe_dimension == 1));
assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
"512, 1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 ||

View File

@@ -12,6 +12,7 @@
* arbitrary number of levels by using decompose_and_compress_level().
*
* This class always decomposes the entire set of num_poly polynomials.
* By default, it works on a single polynomial.
*/
#pragma once
template <typename T, class params> class GadgetMatrix {
@@ -28,7 +29,7 @@ private:
public:
__device__ GadgetMatrix(uint32_t base_log, uint32_t level_count, T *state,
uint32_t num_poly)
uint32_t num_poly = 1)
: base_log(base_log), level_count(level_count), num_poly(num_poly),
state(state) {
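With the defaulted argument, single-polynomial call sites such as the GadgetMatrix construction in the kernel above can drop the trailing 1, while callers that decompose several concatenated polynomials keep passing num_poly explicitly.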

View File

@@ -61,11 +61,13 @@ __device__ void add_polynomial_inplace_low_lat(T *source, T *dst, int p_id) {
* Performs acc = input / X^j (negacyclic division) if zeroAcc = false
* Performs acc = 0 if zeroAcc
* takes single buffer and calculates inplace.
*
* By default, it works on a single polynomial.
*/
template <typename T, int elems_per_thread, int block_size>
__device__ void divide_by_monomial_negacyclic_inplace(T *accumulator, T *input,
uint32_t j, bool zeroAcc,
uint32_t num_poly) {
uint32_t num_poly = 1) {
constexpr int degree = block_size * elems_per_thread;
for (int z = 0; z < num_poly; z++) {
T *accumulator_slice = (T *)accumulator + (ptrdiff_t)(z * degree);
@@ -105,10 +107,12 @@ __device__ void divide_by_monomial_negacyclic_inplace(T *accumulator, T *input,
*
* Performs result_acc = acc * (X^j - 1), i.e. acc * X^j - acc
* takes single buffer as input and returns a single rotated buffer
*
* By default, it works on a single polynomial.
*/
template <typename T, int elems_per_thread, int block_size>
__device__ void multiply_by_monomial_negacyclic_and_sub_polynomial(
T *acc, T *result_acc, uint32_t j, uint32_t num_poly) {
T *acc, T *result_acc, uint32_t j, uint32_t num_poly = 1) {
constexpr int degree = block_size * elems_per_thread;
for (int z = 0; z < num_poly; z++) {
T *acc_slice = (T *)acc + (ptrdiff_t)(z * degree);
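Both monomial helpers rely on the negacyclic relation X^N = -1 in Z[X]/(X^N + 1): moving a coefficient past position N - 1 wraps it around with a sign flip. A toy host-side multiplication by X^j under that convention (illustrative only, not the optimized device code):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Multiply a polynomial by X^j modulo X^N + 1: every wrap past degree N - 1
// flips the sign of the wrapped coefficient, since X^N = -1.
std::vector<int64_t> multiply_by_monomial(const std::vector<int64_t> &poly,
                                          uint32_t j) {
  const uint32_t N = (uint32_t)poly.size();
  std::vector<int64_t> out(N);
  for (uint32_t i = 0; i < N; i++) {
    uint32_t dst = (i + j) % N;
    bool flipped = ((i + j) / N) % 2 == 1; // odd number of wraps => negate
    out[dst] = flipped ? -poly[i] : poly[i];
  }
  return out;
}

int main() {
  std::vector<int64_t> p = {1, 2, 3, 4}; // 1 + 2X + 3X^2 + 4X^3, N = 4
  auto q = multiply_by_monomial(p, 2);   // expect -3 - 4X + X^2 + 2X^3
  for (int64_t c : q)
    printf("%lld ", (long long)c);
  printf("\n");
  return 0;
}
```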
@@ -138,11 +142,13 @@ __device__ void multiply_by_monomial_negacyclic_and_sub_polynomial(
/*
* Receives num_poly concatenated polynomials of type T. For each performs a
* rounding to increase accuracy of the PBS. Calculates inplace.
*
* By default, it works on a single polynomial.
*/
template <typename T, int elems_per_thread, int block_size>
__device__ void round_to_closest_multiple_inplace(T *rotated_acc, int base_log,
int level_count,
uint32_t num_poly) {
uint32_t num_poly = 1) {
constexpr int degree = block_size * elems_per_thread;
for (int z = 0; z < num_poly; z++) {
T *rotated_acc_slice = (T *)rotated_acc + (ptrdiff_t)(z * degree);
@@ -192,20 +198,21 @@ __device__ void add_to_torus(double2 *m_values, Torus *result) {
}
}
// Extracts the body of a GLWE with dimension glwe_dimension
// Extracts the body of a GLWE.
// k is the offset to find the body element / polynomial in the lwe_array_out /
// accumulator
template <typename Torus, class params>
__device__ void sample_extract_body(Torus *lwe_array_out, Torus *accumulator,
uint32_t glwe_dimension) {
uint32_t k) {
// Set first coefficient of the accumulator as the body of the LWE sample
lwe_array_out[glwe_dimension * params::degree] =
accumulator[glwe_dimension * params::degree];
lwe_array_out[k * params::degree] = accumulator[k * params::degree];
}
// Extracts the mask of a GLWE with dimension glwe_dimension
// Extracts the mask from num_poly polynomials individually
template <typename Torus, class params>
__device__ void sample_extract_mask(Torus *lwe_array_out, Torus *accumulator,
uint32_t glwe_dimension) {
for (int z = 0; z < glwe_dimension; z++) {
uint32_t num_poly = 1) {
for (int z = 0; z < num_poly; z++) {
Torus *lwe_array_out_slice =
(Torus *)lwe_array_out + (ptrdiff_t)(z * params::degree);
Torus *accumulator_slice =

View File

@@ -17,8 +17,6 @@ void cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
polynomial_size == 2048 || polynomial_size == 4096 ||
polynomial_size == 8192));
// For larger k we will need to adjust the mask size
assert(("Error (GPU Cmux tree): glwe_dimension should be equal to 1",
glwe_dimension == 1));
assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should "
"be >= 1 ",
r >= 1));
@@ -99,8 +97,6 @@ void cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
polynomial_size == 2048 || polynomial_size == 4096 ||
polynomial_size == 8192));
// For larger k we will need to adjust the mask size
assert(("Error (GPU Cmux tree): glwe_dimension should be equal to 1",
glwe_dimension == 1));
assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should "
"be >= 1 ",
r >= 1));

View File

@@ -35,8 +35,6 @@ void cuda_circuit_bootstrap_vertical_packing_64(
uint32_t base_log_pksk, uint32_t level_count_cbs, uint32_t base_log_cbs,
uint32_t number_of_inputs, uint32_t lut_number,
uint32_t max_shared_memory) {
assert(("Error (GPU circuit bootstrap): glwe_dimension should be equal to 1",
glwe_dimension == 1));
assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
"512, 1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 ||
@@ -149,8 +147,6 @@ void cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index, void *lwe_array_out,
uint32_t number_of_bits_of_message_including_padding,
uint32_t number_of_bits_to_extract,
uint32_t number_of_inputs, uint32_t max_shared_memory) {
assert(("Error (GPU WOP PBS): glwe_dimension should be equal to 1",
glwe_dimension == 1));
assert(("Error (GPU WOP PBS): polynomial_size should be one of "
"512, 1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 ||

View File

@@ -174,8 +174,8 @@ __host__ void host_wop_pbs(
lwe_array_in_buffer, lwe_array_in_shifted_buffer, lwe_array_out_ks_buffer,
lwe_array_out_pbs_buffer, lut_pbs, lut_vector_indexes, ksk, fourier_bsk,
number_of_bits_to_extract, delta_log, polynomial_size, lwe_dimension,
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
number_of_inputs, max_shared_memory);
glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
level_count_ksk, number_of_inputs, max_shared_memory);
check_cuda_error(cudaGetLastError());
cuda_drop_async(lut_pbs, stream, gpu_index);
cuda_drop_async(lut_vector_indexes, stream, gpu_index);