refactor(cuda): Refactor the amortized PBS.

This commit is contained in:
Pedro Alves
2023-02-06 14:45:22 -03:00
committed by Agnès Leroy
parent 2a299664e7
commit 41be2b4832
8 changed files with 321 additions and 318 deletions

View File

@@ -13,8 +13,6 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
assert(
("Error (GPU amortized PBS): base log should be <= 32", base_log <= 32));
assert(("Error (GPU amortized PBS): glwe_dimension should be equal to 1",
glwe_dimension == 1));
assert(("Error (GPU amortized PBS): polynomial size should be one of 512, "
"1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 ||
@@ -26,36 +24,41 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
host_bootstrap_amortized<uint32_t, Degree<512>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 1024:
host_bootstrap_amortized<uint32_t, Degree<1024>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 2048:
host_bootstrap_amortized<uint32_t, Degree<2048>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 4096:
host_bootstrap_amortized<uint32_t, Degree<4096>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 8192:
host_bootstrap_amortized<uint32_t, Degree<8192>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
default:
break;
@@ -136,8 +139,6 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
assert(
("Error (GPU amortized PBS): base log should be <= 64", base_log <= 64));
assert(("Error (GPU amortized PBS): glwe_dimension should be equal to 1",
glwe_dimension == 1));
assert(("Error (GPU amortized PBS): polynomial size should be one of 512, "
"1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 ||
@@ -149,36 +150,41 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
host_bootstrap_amortized<uint64_t, Degree<512>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 1024:
host_bootstrap_amortized<uint64_t, Degree<1024>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 2048:
host_bootstrap_amortized<uint64_t, Degree<2048>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 4096:
host_bootstrap_amortized<uint64_t, Degree<4096>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 8192:
host_bootstrap_amortized<uint64_t, Degree<8192>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
default:
break;

View File

@@ -53,8 +53,8 @@ template <typename Torus, class params, sharedMemDegree SMD>
__global__ void device_bootstrap_amortized(
Torus *lwe_array_out, Torus *lut_vector, uint32_t *lut_vector_indexes,
Torus *lwe_array_in, double2 *bootstrapping_key, char *device_mem,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t lwe_idx,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t lwe_idx,
size_t device_memory_size_per_sample) {
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
@@ -69,26 +69,22 @@ __global__ void device_bootstrap_amortized(
// For GPU bootstrapping the GLWE dimension is hard-set to 1: there is only
// one mask polynomial and 1 body to handle.
Torus *accumulator_mask = (Torus *)selected_memory;
Torus *accumulator_body =
(Torus *)accumulator_mask + (ptrdiff_t)polynomial_size;
Torus *accumulator_mask_rotated =
(Torus *)accumulator_body + (ptrdiff_t)polynomial_size;
Torus *accumulator_body_rotated =
(Torus *)accumulator_mask_rotated + (ptrdiff_t)polynomial_size;
double2 *mask_res_fft = (double2 *)accumulator_body_rotated +
polynomial_size / (sizeof(double2) / sizeof(Torus));
double2 *body_res_fft =
(double2 *)mask_res_fft + (ptrdiff_t)polynomial_size / 2;
Torus *accumulator = (Torus *)selected_memory;
Torus *accumulator_rotated =
(Torus *)accumulator +
(ptrdiff_t)((glwe_dimension + 1) * polynomial_size);
double2 *res_fft =
(double2 *)accumulator_rotated + (glwe_dimension + 1) * polynomial_size /
(sizeof(double2) / sizeof(Torus));
double2 *accumulator_fft = (double2 *)sharedmem;
if constexpr (SMD != PARTIALSM)
accumulator_fft =
(double2 *)body_res_fft + (ptrdiff_t)(polynomial_size / 2);
accumulator_fft = (double2 *)res_fft +
(ptrdiff_t)((glwe_dimension + 1) * polynomial_size / 2);
auto block_lwe_array_in = &lwe_array_in[blockIdx.x * (lwe_dimension + 1)];
Torus *block_lut_vector =
&lut_vector[lut_vector_indexes[lwe_idx + blockIdx.x] * params::degree *
2];
(glwe_dimension + 1)];
// Put "b", the body, in [0, 2N[
Torus b_hat = 0;
@@ -97,11 +93,7 @@ __global__ void device_bootstrap_amortized(
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator_mask, block_lut_vector, b_hat, false);
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator_body, &block_lut_vector[params::degree], b_hat, false);
accumulator, block_lut_vector, b_hat, false, glwe_dimension + 1);
// Loop over all the mask elements of the sample to accumulate
// (X^a_i-1) multiplication, decomposition of the resulting polynomial
@@ -118,11 +110,7 @@ __global__ void device_bootstrap_amortized(
// Perform ACC * (X^â - 1)
multiply_by_monomial_negacyclic_and_sub_polynomial<
Torus, params::opt, params::degree / params::opt>(
accumulator_mask, accumulator_mask_rotated, a_hat);
multiply_by_monomial_negacyclic_and_sub_polynomial<
Torus, params::opt, params::degree / params::opt>(
accumulator_body, accumulator_body_rotated, a_hat);
accumulator, accumulator_rotated, a_hat, glwe_dimension + 1);
synchronize_threads_in_block();
@@ -130,153 +118,122 @@ __global__ void device_bootstrap_amortized(
// bootstrapped ciphertext
round_to_closest_multiple_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator_mask_rotated, base_log, level_count);
accumulator_rotated, base_log, level_count, glwe_dimension + 1);
round_to_closest_multiple_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator_body_rotated, base_log, level_count);
// Initialize the polynomial multiplication via FFT arrays
// The polynomial multiplications happens at the block level
// and each thread handles two or more coefficients
int pos = threadIdx.x;
for (int j = 0; j < params::opt / 2; j++) {
mask_res_fft[pos].x = 0;
mask_res_fft[pos].y = 0;
body_res_fft[pos].x = 0;
body_res_fft[pos].y = 0;
for (int j = 0; j < (glwe_dimension + 1) * params::opt / 2; j++) {
res_fft[pos].x = 0;
res_fft[pos].y = 0;
pos += params::degree / params::opt;
}
GadgetMatrix<Torus, params> gadget_mask(base_log, level_count,
accumulator_mask_rotated);
GadgetMatrix<Torus, params> gadget_body(base_log, level_count,
accumulator_body_rotated);
GadgetMatrix<Torus, params> gadget(base_log, level_count,
accumulator_rotated, glwe_dimension + 1);
// Now that the rotation is done, decompose the resulting polynomial
// coefficients so as to multiply each decomposed level with the
// corresponding part of the bootstrapping key
for (int level = level_count - 1; level >= 0; level--) {
gadget.decompose_and_compress_next(accumulator_fft);
for (int i = 0; i < (glwe_dimension + 1); i++) {
auto accumulator_fft_slice = accumulator_fft + i * params::degree / 2;
gadget_mask.decompose_and_compress_next(accumulator_fft);
// Switch to the FFT space
NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
// Switch to the FFT space
NSMFFT_direct<HalfDegree<params>>(accumulator_fft_slice);
synchronize_threads_in_block();
// Get the bootstrapping key piece necessary for the multiplication
// It is already in the Fourier domain
auto bsk_slice = get_ith_mask_kth_block(bootstrapping_key, iteration, i,
level, polynomial_size,
glwe_dimension, level_count);
synchronize_threads_in_block();
// Perform the coefficient-wise product with the two pieces of
// bootstrapping key
for (int j = 0; j < (glwe_dimension + 1); j++) {
auto bsk_poly = bsk_slice + j * params::degree / 2;
auto res_fft_poly = res_fft + j * params::degree / 2;
polynomial_product_accumulate_in_fourier_domain<params, double2>(
res_fft_poly, accumulator_fft_slice, bsk_poly);
}
}
synchronize_threads_in_block();
// Get the bootstrapping key piece necessary for the multiplication
// It is already in the Fourier domain
auto bsk_mask_slice =
get_ith_mask_kth_block(bootstrapping_key, iteration, 0, level,
polynomial_size, 1, level_count);
auto bsk_body_slice =
get_ith_body_kth_block(bootstrapping_key, iteration, 0, level,
polynomial_size, 1, level_count);
synchronize_threads_in_block();
// Perform the coefficient-wise product with the two pieces of
// bootstrapping key
polynomial_product_accumulate_in_fourier_domain<params, double2>(
mask_res_fft, accumulator_fft, bsk_mask_slice);
polynomial_product_accumulate_in_fourier_domain<params, double2>(
body_res_fft, accumulator_fft, bsk_body_slice);
synchronize_threads_in_block();
gadget_body.decompose_and_compress_next(accumulator_fft);
// Now handle the polynomial multiplication for the body
// in the same way
NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
synchronize_threads_in_block();
auto bsk_mask_slice_2 =
get_ith_mask_kth_block(bootstrapping_key, iteration, 1, level,
polynomial_size, 1, level_count);
auto bsk_body_slice_2 =
get_ith_body_kth_block(bootstrapping_key, iteration, 1, level,
polynomial_size, 1, level_count);
synchronize_threads_in_block();
polynomial_product_accumulate_in_fourier_domain<params, double2>(
mask_res_fft, accumulator_fft, bsk_mask_slice_2);
polynomial_product_accumulate_in_fourier_domain<params, double2>(
body_res_fft, accumulator_fft, bsk_body_slice_2);
}
// Come back to the coefficient representation
if constexpr (SMD == FULLSM || SMD == NOSM) {
synchronize_threads_in_block();
NSMFFT_inverse<HalfDegree<params>>(mask_res_fft);
NSMFFT_inverse<HalfDegree<params>>(body_res_fft);
for (int i = 0; i < (glwe_dimension + 1); i++) {
auto res_fft_slice = res_fft + i * params::degree / 2;
NSMFFT_inverse<HalfDegree<params>>(res_fft_slice);
}
synchronize_threads_in_block();
add_to_torus<Torus, params>(mask_res_fft, accumulator_mask);
add_to_torus<Torus, params>(body_res_fft, accumulator_body);
for (int i = 0; i < (glwe_dimension + 1); i++) {
auto accumulator_slice = accumulator + i * params::degree;
auto res_fft_slice = res_fft + i * params::degree / 2;
add_to_torus<Torus, params>(res_fft_slice, accumulator_slice);
}
synchronize_threads_in_block();
} else {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
accumulator_fft[tid] = mask_res_fft[tid];
tid = tid + params::degree / params::opt;
for (int i = 0; i < (glwe_dimension + 1); i++) {
auto accumulator_slice = accumulator + i * params::degree;
auto res_fft_slice = res_fft + i * params::degree / 2;
int tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
accumulator_fft[tid] = res_fft_slice[tid];
tid = tid + params::degree / params::opt;
}
synchronize_threads_in_block();
NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
synchronize_threads_in_block();
add_to_torus<Torus, params>(accumulator_fft, accumulator_slice);
}
synchronize_threads_in_block();
NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
synchronize_threads_in_block();
add_to_torus<Torus, params>(accumulator_fft, accumulator_mask);
synchronize_threads_in_block();
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
accumulator_fft[tid] = body_res_fft[tid];
tid = tid + params::degree / params::opt;
}
synchronize_threads_in_block();
NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
synchronize_threads_in_block();
add_to_torus<Torus, params>(accumulator_fft, accumulator_body);
synchronize_threads_in_block();
}
}
auto block_lwe_array_out = &lwe_array_out[blockIdx.x * (polynomial_size + 1)];
auto block_lwe_array_out =
&lwe_array_out[blockIdx.x * (glwe_dimension * polynomial_size + 1)];
// The blind rotation for this block is over
// Now we can perform the sample extraction: for the body it's just
// the resulting constant coefficient of the accumulator
// For the mask it's more complicated
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator_mask);
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator_body);
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator,
glwe_dimension);
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator,
glwe_dimension);
}
template <typename Torus, class params>
__host__ void host_bootstrap_amortized(
void *v_stream, uint32_t gpu_index, Torus *lwe_array_out, Torus *lut_vector,
uint32_t *lut_vector_indexes, Torus *lwe_array_in,
double2 *bootstrapping_key, uint32_t input_lwe_dimension,
double2 *bootstrapping_key, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t num_lut_vectors,
uint32_t lwe_idx, uint32_t max_shared_memory) {
cudaSetDevice(gpu_index);
int SM_FULL = sizeof(Torus) * polynomial_size + // accumulator mask
sizeof(Torus) * polynomial_size + // accumulator body
sizeof(Torus) * polynomial_size + // accumulator mask rotated
sizeof(Torus) * polynomial_size + // accumulator body rotated
sizeof(double2) * polynomial_size / 2 + // accumulator fft mask
sizeof(double2) * polynomial_size / 2 + // accumulator fft body
sizeof(double2) * polynomial_size / 2; // calculate buffer fft
int SM_FULL =
sizeof(Torus) * polynomial_size * (glwe_dimension + 1) + // accumulator
sizeof(Torus) * polynomial_size *
(glwe_dimension + 1) + // accumulator rotated
sizeof(double2) * polynomial_size / 2 *
(glwe_dimension + 1) + // accumulator fft
sizeof(double2) * polynomial_size / 2 *
(glwe_dimension + 1); // calculate buffer fft
int SM_PART = sizeof(double2) * polynomial_size / 2; // calculate buffer fft
int SM_PART = sizeof(double2) * polynomial_size / 2 *
(glwe_dimension + 1); // calculate buffer fft
int DM_PART = SM_FULL - SM_PART;
@@ -304,8 +261,8 @@ __host__ void host_bootstrap_amortized(
stream, gpu_index);
device_bootstrap_amortized<Torus, params, NOSM><<<grid, thds, 0, *stream>>>(
lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
bootstrapping_key, d_mem, input_lwe_dimension, polynomial_size,
base_log, level_count, lwe_idx, DM_FULL);
bootstrapping_key, d_mem, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, lwe_idx, DM_FULL);
} else if (max_shared_memory < SM_FULL) {
cudaFuncSetAttribute(device_bootstrap_amortized<Torus, params, PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, SM_PART);
@@ -316,8 +273,8 @@ __host__ void host_bootstrap_amortized(
device_bootstrap_amortized<Torus, params, PARTIALSM>
<<<grid, thds, SM_PART, *stream>>>(
lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
bootstrapping_key, d_mem, input_lwe_dimension, polynomial_size,
base_log, level_count, lwe_idx, DM_PART);
bootstrapping_key, d_mem, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, lwe_idx, DM_PART);
} else {
// For devices with compute capability 7.x a single thread block can
// address the full capacity of shared memory. Shared memory on the
@@ -335,11 +292,10 @@ __host__ void host_bootstrap_amortized(
device_bootstrap_amortized<Torus, params, FULLSM>
<<<grid, thds, SM_FULL, *stream>>>(
lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
bootstrapping_key, d_mem, input_lwe_dimension, polynomial_size,
base_log, level_count, lwe_idx, 0);
bootstrapping_key, d_mem, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, lwe_idx, 0);
}
check_cuda_error(cudaGetLastError());
cuda_drop_async(d_mem, stream, gpu_index);
}

View File

@@ -182,11 +182,11 @@ __global__ void device_bootstrap_low_latency(
if (blockIdx.y == 0) {
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, block_lut_vector, b_hat, false);
accumulator, block_lut_vector, b_hat, false, 1);
} else {
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[params::degree], b_hat, false);
accumulator, &block_lut_vector[params::degree], b_hat, false, 1);
}
for (int i = 0; i < lwe_dimension; i++) {
@@ -200,13 +200,13 @@ __global__ void device_bootstrap_low_latency(
// Perform ACC * (X^â - 1)
multiply_by_monomial_negacyclic_and_sub_polynomial<
Torus, params::opt, params::degree / params::opt>(
accumulator, accumulator_rotated, a_hat);
accumulator, accumulator_rotated, a_hat, 1);
// Perform a rounding to increase the accuracy of the
// bootstrapped ciphertext
round_to_closest_multiple_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator_rotated, base_log, level_count);
accumulator_rotated, base_log, level_count, 1);
synchronize_threads_in_block();
@@ -214,7 +214,7 @@ __global__ void device_bootstrap_low_latency(
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
accumulator_rotated);
accumulator_rotated, 1);
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
// We are using the same memory space for accumulator_fft and
@@ -234,9 +234,9 @@ __global__ void device_bootstrap_low_latency(
// Perform a sample extract. At this point, all blocks have the result, but
// we do the computation at block 0 to avoid waiting for extra blocks, in
// case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator, 1);
} else if (blockIdx.x == 0 && blockIdx.y == 1) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator);
block_lwe_array_out[params::degree] = accumulator[0];
}
}

View File

@@ -144,8 +144,8 @@ __host__ void host_circuit_bootstrap(
host_bootstrap_amortized<Torus, params>(
v_stream, gpu_index, lwe_array_out_pbs_buffer, lut_vector,
lut_vector_indexes, lwe_array_in_shifted_buffer, fourier_bsk,
lwe_dimension, polynomial_size, base_log_bsk, level_bsk, pbs_count,
level_cbs, 0, max_shared_memory);
glwe_dimension, lwe_dimension, polynomial_size, base_log_bsk, level_bsk,
pbs_count, level_cbs, 0, max_shared_memory);
dim3 copy_grid(pbs_count * (glwe_dimension + 1), 1, 1);
dim3 copy_block(params::degree / params::opt, 1, 1);

View File

@@ -15,6 +15,7 @@ __device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
level_count;
}
////////////////////////////////////////////////
template <typename T>
__device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
uint32_t polynomial_size,
@@ -35,8 +36,9 @@ __device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level,
level * polynomial_size / 2 * (glwe_dimension + 1) *
(glwe_dimension + 1) +
k * polynomial_size / 2 * (glwe_dimension + 1) +
polynomial_size / 2];
glwe_dimension * polynomial_size / 2];
}
////////////////////////////////////////////////
template <typename T, typename ST>
void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,

View File

@@ -4,6 +4,15 @@
#include "polynomial/polynomial.cuh"
#include <cstdint>
/**
* GadgetMatrix implements the iterator design pattern to decompose a set of
* num_poly consecutive polynomials with degree params::degree. A total of
* level_count levels is expected and each call to decompose_and_compress_next()
* writes to the result the next level. It is also possible to advance an
* arbitrary amount of levels by using decompose_and_compress_level().
*
* This class always decomposes the entire set of num_poly polynomials.
*/
#pragma once
template <typename T, class params> class GadgetMatrix {
private:
@@ -11,19 +20,22 @@ private:
uint32_t base_log;
uint32_t mask;
uint32_t halfbg;
uint32_t num_poly;
T offset;
int current_level;
T mask_mod_b;
T *state;
public:
__device__ GadgetMatrix(uint32_t base_log, uint32_t level_count, T *state)
: base_log(base_log), level_count(level_count), state(state) {
__device__ GadgetMatrix(uint32_t base_log, uint32_t level_count, T *state,
uint32_t num_poly)
: base_log(base_log), level_count(level_count), num_poly(num_poly),
state(state) {
mask_mod_b = (1ll << base_log) - 1ll;
current_level = level_count;
int tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
for (int i = 0; i < num_poly * params::opt; i++) {
state[tid] >>= (sizeof(T) * 8 - base_log * level_count);
tid += params::degree / params::opt;
}
@@ -31,26 +43,31 @@ public:
}
__device__ void decompose_and_compress_next(double2 *result) {
int tid = threadIdx.x;
current_level -= 1;
for (int i = 0; i < params::opt / 2; i++) {
T res_re = state[tid] & mask_mod_b;
T res_im = state[tid + params::degree / 2] & mask_mod_b;
state[tid] >>= base_log;
state[tid + params::degree / 2] >>= base_log;
T carry_re = ((res_re - 1ll) | state[tid]) & res_re;
T carry_im = ((res_im - 1ll) | state[tid + params::degree / 2]) & res_im;
carry_re >>= (base_log - 1);
carry_im >>= (base_log - 1);
state[tid] += carry_re;
state[tid + params::degree / 2] += carry_im;
res_re -= carry_re << base_log;
res_im -= carry_im << base_log;
for (int j = 0; j < num_poly; j++) {
int tid = threadIdx.x;
auto state_slice = state + j * params::degree;
auto result_slice = result + j * params::degree / 2;
for (int i = 0; i < params::opt / 2; i++) {
T res_re = state_slice[tid] & mask_mod_b;
T res_im = state_slice[tid + params::degree / 2] & mask_mod_b;
state_slice[tid] >>= base_log;
state_slice[tid + params::degree / 2] >>= base_log;
T carry_re = ((res_re - 1ll) | state_slice[tid]) & res_re;
T carry_im =
((res_im - 1ll) | state_slice[tid + params::degree / 2]) & res_im;
carry_re >>= (base_log - 1);
carry_im >>= (base_log - 1);
state_slice[tid] += carry_re;
state_slice[tid + params::degree / 2] += carry_im;
res_re -= carry_re << base_log;
res_im -= carry_im << base_log;
result[tid].x = (int32_t)res_re;
result[tid].y = (int32_t)res_im;
result_slice[tid].x = (int32_t)res_re;
result_slice[tid].y = (int32_t)res_im;
tid += params::degree / params::opt;
tid += params::degree / params::opt;
}
}
synchronize_threads_in_block();
}

View File

@@ -56,36 +56,78 @@ __device__ void add_polynomial_inplace_low_lat(T *source, T *dst, int p_id) {
}
/*
* Receives num_poly concatenated polynomials of type T. For each:
*
* Performs acc = acc * (X^â + 1) if zeroAcc = false
* Performs acc = 0 if zeroAcc
* takes single buffer and calculates inplace.
*/
template <typename T, int elems_per_thread, int block_size>
__device__ void divide_by_monomial_negacyclic_inplace(T *accumulator, T *input,
uint32_t j,
bool zeroAcc) {
int tid = threadIdx.x;
uint32_t j, bool zeroAcc,
uint32_t num_poly) {
constexpr int degree = block_size * elems_per_thread;
if (zeroAcc) {
for (int i = 0; i < elems_per_thread; i++) {
accumulator[tid] = 0;
tid += block_size;
for (int z = 0; z < num_poly; z++) {
T *accumulator_slice = (T *)accumulator + (ptrdiff_t)(z * degree);
T *input_slice = (T *)input + (ptrdiff_t)(z * degree);
int tid = threadIdx.x;
if (zeroAcc) {
for (int i = 0; i < elems_per_thread; i++) {
accumulator_slice[tid] = 0;
tid += block_size;
}
} else {
tid = threadIdx.x;
for (int i = 0; i < elems_per_thread; i++) {
if (j < degree) {
if (tid < degree - j) {
accumulator_slice[tid] = input_slice[tid + j];
} else {
accumulator_slice[tid] = -input_slice[tid - degree + j];
}
} else {
uint32_t jj = j - degree;
if (tid < degree - jj) {
accumulator_slice[tid] = -input_slice[tid + jj];
} else {
accumulator_slice[tid] = input_slice[tid - degree + jj];
}
}
tid += block_size;
}
}
} else {
tid = threadIdx.x;
}
}
/*
* Receives num_poly concatenated polynomials of type T. For each:
*
* Performs result_acc = acc * (X^â - 1) - acc
* takes single buffer as input and returns a single rotated buffer
*/
template <typename T, int elems_per_thread, int block_size>
__device__ void multiply_by_monomial_negacyclic_and_sub_polynomial(
T *acc, T *result_acc, uint32_t j, uint32_t num_poly) {
constexpr int degree = block_size * elems_per_thread;
for (int z = 0; z < num_poly; z++) {
T *acc_slice = (T *)acc + (ptrdiff_t)(z * degree);
T *result_acc_slice = (T *)result_acc + (ptrdiff_t)(z * degree);
int tid = threadIdx.x;
for (int i = 0; i < elems_per_thread; i++) {
if (j < degree) {
if (tid < degree - j) {
accumulator[tid] = input[tid + j];
if (tid < j) {
result_acc_slice[tid] = -acc_slice[tid - j + degree] - acc_slice[tid];
} else {
accumulator[tid] = -input[tid - degree + j];
result_acc_slice[tid] = acc_slice[tid - j] - acc_slice[tid];
}
} else {
uint32_t jj = j - degree;
if (tid < degree - jj) {
accumulator[tid] = -input[tid + jj];
if (tid < jj) {
result_acc_slice[tid] = acc_slice[tid - jj + degree] - acc_slice[tid];
} else {
accumulator[tid] = input[tid - degree + jj];
result_acc_slice[tid] = -acc_slice[tid - jj] - acc_slice[tid];
}
}
tid += block_size;
@@ -94,54 +136,28 @@ __device__ void divide_by_monomial_negacyclic_inplace(T *accumulator, T *input,
}
/*
* Performs result_acc = acc * (X^â - 1) - acc
* takes single buffer as input returns single rotated buffer
*/
template <typename T, int elems_per_thread, int block_size>
__device__ void
multiply_by_monomial_negacyclic_and_sub_polynomial(T *acc, T *result_acc,
uint32_t j) {
int tid = threadIdx.x;
constexpr int degree = block_size * elems_per_thread;
for (int i = 0; i < elems_per_thread; i++) {
if (j < degree) {
if (tid < j) {
result_acc[tid] = -acc[tid - j + degree] - acc[tid];
} else {
result_acc[tid] = acc[tid - j] - acc[tid];
}
} else {
uint32_t jj = j - degree;
if (tid < jj) {
result_acc[tid] = acc[tid - jj + degree] - acc[tid];
} else {
result_acc[tid] = -acc[tid - jj] - acc[tid];
}
}
tid += block_size;
}
}
/*
* performs a rounding to increase accuracy of the PBS
* calculates inplace.
* Receives num_poly concatenated polynomials of type T. For each performs a
* rounding to increase accuracy of the PBS. Calculates inplace.
*/
template <typename T, int elems_per_thread, int block_size>
__device__ void round_to_closest_multiple_inplace(T *rotated_acc, int base_log,
int level_count) {
int tid = threadIdx.x;
for (int i = 0; i < elems_per_thread; i++) {
T x_acc = rotated_acc[tid];
T shift = sizeof(T) * 8 - level_count * base_log;
T mask = 1ll << (shift - 1);
T b_acc = (x_acc & mask) >> (shift - 1);
T res_acc = x_acc >> shift;
res_acc += b_acc;
res_acc <<= shift;
rotated_acc[tid] = res_acc;
tid = tid + block_size;
int level_count,
uint32_t num_poly) {
constexpr int degree = block_size * elems_per_thread;
for (int z = 0; z < num_poly; z++) {
T *rotated_acc_slice = (T *)rotated_acc + (ptrdiff_t)(z * degree);
int tid = threadIdx.x;
for (int i = 0; i < elems_per_thread; i++) {
T x_acc = rotated_acc_slice[tid];
T shift = sizeof(T) * 8 - level_count * base_log;
T mask = 1ll << (shift - 1);
T b_acc = (x_acc & mask) >> (shift - 1);
T res_acc = x_acc >> shift;
res_acc += b_acc;
res_acc <<= shift;
rotated_acc_slice[tid] = res_acc;
tid = tid + block_size;
}
}
}
@@ -176,80 +192,85 @@ __device__ void add_to_torus(double2 *m_values, Torus *result) {
}
}
// Extracts the body of a GLWE with dimension glwe_dimension
template <typename Torus, class params>
__device__ void sample_extract_body(Torus *lwe_array_out, Torus *accumulator) {
__device__ void sample_extract_body(Torus *lwe_array_out, Torus *accumulator,
uint32_t glwe_dimension) {
// Set first coefficient of the accumulator as the body of the LWE sample
lwe_array_out[params::degree] = accumulator[0];
lwe_array_out[glwe_dimension * params::degree] =
accumulator[glwe_dimension * params::degree];
}
// Extracts the mask of a GLWE with dimension glwe_dimension
template <typename Torus, class params>
__device__ void sample_extract_mask(Torus *lwe_array_out, Torus *accumulator) {
// Set ACC = -ACC
// accumulator.negate_inplace();
__device__ void sample_extract_mask(Torus *lwe_array_out, Torus *accumulator,
uint32_t glwe_dimension) {
for (int z = 0; z < glwe_dimension; z++) {
Torus *lwe_array_out_slice =
(Torus *)lwe_array_out + (ptrdiff_t)(z * params::degree);
Torus *accumulator_slice =
(Torus *)accumulator + (ptrdiff_t)(z * params::degree);
int tid = threadIdx.x;
// Set ACC = -ACC
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
accumulator[tid] = -accumulator[tid];
tid = tid + params::degree / params::opt;
}
synchronize_threads_in_block();
// Reverse the accumulator
// accumulator.reverse_inplace();
tid = threadIdx.x;
Torus result[params::opt];
#pragma unroll
for (int i = 0; i < params::opt; i++) {
result[i] = accumulator[params::degree - tid - 1];
tid = tid + params::degree / params::opt;
}
synchronize_threads_in_block();
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
accumulator[tid] = result[i];
tid = tid + params::degree / params::opt;
}
synchronize_threads_in_block();
// Perform ACC * X
// accumulator.multiply_by_monomial_negacyclic_inplace(1);
tid = threadIdx.x;
result[params::opt];
for (int i = 0; i < params::opt; i++) {
if (1 < params::degree) {
if (tid < 1)
result[i] = -accumulator[tid - 1 + params::degree];
else
result[i] = accumulator[tid - 1];
} else {
uint32_t jj = 1 - (uint32_t)params::degree;
if (tid < jj)
result[i] = accumulator[tid - jj + params::degree];
else
result[i] = -accumulator[tid - jj];
for (int i = 0; i < params::opt; i++) {
accumulator_slice[tid] = -accumulator_slice[tid];
tid = tid + params::degree / params::opt;
}
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
accumulator[tid] = result[i];
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
synchronize_threads_in_block();
// Copy to the mask of the LWE sample
// accumulator.copy_into(lwe_array_out);
tid = threadIdx.x;
// Reverse the accumulator
tid = threadIdx.x;
Torus result[params::opt];
#pragma unroll
for (int i = 0; i < params::opt; i++) {
lwe_array_out[tid] = accumulator[tid];
tid = tid + params::degree / params::opt;
for (int i = 0; i < params::opt; i++) {
result[i] = accumulator_slice[params::degree - tid - 1];
tid = tid + params::degree / params::opt;
}
synchronize_threads_in_block();
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
accumulator_slice[tid] = result[i];
tid = tid + params::degree / params::opt;
}
synchronize_threads_in_block();
// Perform ACC * X
// (equivalent to multiply_by_monomial_negacyclic_inplace(1))
tid = threadIdx.x;
result[params::opt];
for (int i = 0; i < params::opt; i++) {
if (1 < params::degree) {
if (tid < 1)
result[i] = -accumulator_slice[tid - 1 + params::degree];
else
result[i] = accumulator_slice[tid - 1];
} else {
uint32_t jj = 1 - (uint32_t)params::degree;
if (tid < jj)
result[i] = accumulator_slice[tid - jj + params::degree];
else
result[i] = -accumulator_slice[tid - jj];
}
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
accumulator_slice[tid] = result[i];
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
// Copy to the mask of the LWE sample
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
lwe_array_out_slice[tid] = accumulator_slice[tid];
tid = tid + params::degree / params::opt;
}
}
}

View File

@@ -101,8 +101,10 @@ cmux(Torus *glwe_array_out, Torus *glwe_array_in, double2 *ggsw_in,
pos += params::degree / params::opt;
}
GadgetMatrix<Torus, params> gadget_mask(base_log, level_count, glwe_sub_mask);
GadgetMatrix<Torus, params> gadget_body(base_log, level_count, glwe_sub_body);
GadgetMatrix<Torus, params> gadget_mask(base_log, level_count, glwe_sub_mask,
1);
GadgetMatrix<Torus, params> gadget_body(base_log, level_count, glwe_sub_body,
1);
// Subtract each glwe operand, decompose the resulting
// polynomial coefficients to multiply each decomposed level
// with the corresponding part of the LUT
@@ -420,12 +422,12 @@ __global__ void device_blind_rotation_and_sample_extraction(
// Body
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator_c1, accumulator_c0, (1 << monomial_degree), false);
accumulator_c1, accumulator_c0, (1 << monomial_degree), false, 1);
// Mask
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator_c1 + polynomial_size, accumulator_c0 + polynomial_size,
(1 << monomial_degree), false);
(1 << monomial_degree), false, 1);
monomial_degree += 1;
@@ -445,9 +447,8 @@ __global__ void device_blind_rotation_and_sample_extraction(
// Now we can perform the sample extraction: for the body it's just
// the resulting constant coefficient of the accumulator
// For the mask it's more complicated
sample_extract_mask<Torus, params>(block_lwe_out, accumulator_c0);
sample_extract_body<Torus, params>(block_lwe_out,
accumulator_c0 + polynomial_size);
sample_extract_mask<Torus, params>(block_lwe_out, accumulator_c0, 1);
sample_extract_body<Torus, params>(block_lwe_out, accumulator_c0, 1);
}
template <typename Torus, typename STorus, class params>