diff --git a/src/bootstrap_amortized.cu b/src/bootstrap_amortized.cu index 612f5c33b..ead137428 100644 --- a/src/bootstrap_amortized.cu +++ b/src/bootstrap_amortized.cu @@ -13,8 +13,6 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32( assert( ("Error (GPU amortized PBS): base log should be <= 32", base_log <= 32)); - assert(("Error (GPU amortized PBS): glwe_dimension should be equal to 1", - glwe_dimension == 1)); assert(("Error (GPU amortized PBS): polynomial size should be one of 512, " "1024, 2048, 4096, 8192", polynomial_size == 512 || polynomial_size == 1024 || @@ -26,36 +24,41 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32( host_bootstrap_amortized>( v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, - (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, - level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory); + (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension, + polynomial_size, base_log, level_count, num_samples, num_lut_vectors, + lwe_idx, max_shared_memory); break; case 1024: host_bootstrap_amortized>( v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, - (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, - level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory); + (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension, + polynomial_size, base_log, level_count, num_samples, num_lut_vectors, + lwe_idx, max_shared_memory); break; case 2048: host_bootstrap_amortized>( v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, - (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, - level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory); + (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension, + polynomial_size, base_log, level_count, num_samples, num_lut_vectors, + lwe_idx, max_shared_memory); break; case 4096: host_bootstrap_amortized>( v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, - (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, - level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory); + (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension, + polynomial_size, base_log, level_count, num_samples, num_lut_vectors, + lwe_idx, max_shared_memory); break; case 8192: host_bootstrap_amortized>( v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, - (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, - level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory); + (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension, + polynomial_size, base_log, level_count, num_samples, num_lut_vectors, + lwe_idx, max_shared_memory); break; default: break; @@ -136,8 +139,6 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64( assert( ("Error (GPU amortized PBS): base log should be <= 64", base_log <= 64)); - assert(("Error (GPU amortized PBS): glwe_dimension should be equal to 1", - glwe_dimension == 1)); assert(("Error (GPU amortized PBS): polynomial size should be one of 512, " "1024, 2048, 4096, 8192", polynomial_size == 512 || polynomial_size == 1024 || @@ -149,36 +150,41 @@ void 
cuda_bootstrap_amortized_lwe_ciphertext_vector_64( host_bootstrap_amortized>( v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, - (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, - level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory); + (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension, + polynomial_size, base_log, level_count, num_samples, num_lut_vectors, + lwe_idx, max_shared_memory); break; case 1024: host_bootstrap_amortized>( v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, - (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, - level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory); + (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension, + polynomial_size, base_log, level_count, num_samples, num_lut_vectors, + lwe_idx, max_shared_memory); break; case 2048: host_bootstrap_amortized>( v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, - (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, - level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory); + (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension, + polynomial_size, base_log, level_count, num_samples, num_lut_vectors, + lwe_idx, max_shared_memory); break; case 4096: host_bootstrap_amortized>( v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, - (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, - level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory); + (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension, + polynomial_size, base_log, level_count, num_samples, num_lut_vectors, + lwe_idx, max_shared_memory); break; case 8192: host_bootstrap_amortized>( v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, - (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, - level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory); + (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension, + polynomial_size, base_log, level_count, num_samples, num_lut_vectors, + lwe_idx, max_shared_memory); break; default: break; diff --git a/src/bootstrap_amortized.cuh b/src/bootstrap_amortized.cuh index c937f3c32..4e13dc1c4 100644 --- a/src/bootstrap_amortized.cuh +++ b/src/bootstrap_amortized.cuh @@ -53,8 +53,8 @@ template __global__ void device_bootstrap_amortized( Torus *lwe_array_out, Torus *lut_vector, uint32_t *lut_vector_indexes, Torus *lwe_array_in, double2 *bootstrapping_key, char *device_mem, - uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t lwe_idx, + uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, + uint32_t base_log, uint32_t level_count, uint32_t lwe_idx, size_t device_memory_size_per_sample) { // We use shared memory for the polynomials that are used often during the // bootstrap, since shared memory is kept in L1 cache and accessing it is @@ -69,26 +69,22 @@ __global__ void device_bootstrap_amortized( // For GPU bootstrapping the GLWE dimension is hard-set to 1: there is only // one mask polynomial and 1 body to handle. 
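// Illustration (host-side sketch, not part of the device code below): the
// accumulator becomes a single buffer of glwe_dimension + 1 concatenated
// polynomials instead of separate mask/body buffers. The example values
// (64-bit torus, glwe_dimension = 1, polynomial_size = 1024) are assumptions
// chosen only to make the offsets concrete; an FFT coefficient is a double2,
// i.e. two doubles. The layout shown is the one used when everything lives in
// a single memory space (FULLSM / NOSM).
#include <cstddef>
#include <cstdint>
#include <cstdio>

int main() {
  const size_t torus_bytes = sizeof(uint64_t);      // Torus
  const size_t fft_coef_bytes = 2 * sizeof(double); // one double2
  const size_t glwe_dimension = 1;                  // k
  const size_t polynomial_size = 1024;              // N
  const size_t glwe_size = glwe_dimension + 1;      // polynomials per GLWE

  const size_t acc = glwe_size * polynomial_size * torus_bytes;
  const size_t fft = glwe_size * (polynomial_size / 2) * fft_coef_bytes;

  printf("accumulator         : offset %6zu, size %6zu bytes\n", (size_t)0, acc);
  printf("accumulator_rotated : offset %6zu, size %6zu bytes\n", acc, acc);
  printf("res_fft             : offset %6zu, size %6zu bytes\n", 2 * acc, fft);
  printf("accumulator_fft     : offset %6zu, size %6zu bytes\n", 2 * acc + fft, fft);
  printf("total (SM_FULL)     : %zu bytes\n", 2 * acc + 2 * fft);
  return 0;
}
// With these example values the four regions total 64 KB, which is why
// host_bootstrap_amortized falls back to the PARTIALSM / NOSM variants when
// max_shared_memory is smaller than SM_FULL.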
- Torus *accumulator_mask = (Torus *)selected_memory; - Torus *accumulator_body = - (Torus *)accumulator_mask + (ptrdiff_t)polynomial_size; - Torus *accumulator_mask_rotated = - (Torus *)accumulator_body + (ptrdiff_t)polynomial_size; - Torus *accumulator_body_rotated = - (Torus *)accumulator_mask_rotated + (ptrdiff_t)polynomial_size; - double2 *mask_res_fft = (double2 *)accumulator_body_rotated + - polynomial_size / (sizeof(double2) / sizeof(Torus)); - double2 *body_res_fft = - (double2 *)mask_res_fft + (ptrdiff_t)polynomial_size / 2; + Torus *accumulator = (Torus *)selected_memory; + Torus *accumulator_rotated = + (Torus *)accumulator + + (ptrdiff_t)((glwe_dimension + 1) * polynomial_size); + double2 *res_fft = + (double2 *)accumulator_rotated + (glwe_dimension + 1) * polynomial_size / + (sizeof(double2) / sizeof(Torus)); double2 *accumulator_fft = (double2 *)sharedmem; if constexpr (SMD != PARTIALSM) - accumulator_fft = - (double2 *)body_res_fft + (ptrdiff_t)(polynomial_size / 2); + accumulator_fft = (double2 *)res_fft + + (ptrdiff_t)((glwe_dimension + 1) * polynomial_size / 2); auto block_lwe_array_in = &lwe_array_in[blockIdx.x * (lwe_dimension + 1)]; Torus *block_lut_vector = &lut_vector[lut_vector_indexes[lwe_idx + blockIdx.x] * params::degree * - 2]; + (glwe_dimension + 1)]; // Put "b", the body, in [0, 2N[ Torus b_hat = 0; @@ -97,11 +93,7 @@ __global__ void device_bootstrap_amortized( divide_by_monomial_negacyclic_inplace( - accumulator_mask, block_lut_vector, b_hat, false); - - divide_by_monomial_negacyclic_inplace( - accumulator_body, &block_lut_vector[params::degree], b_hat, false); + accumulator, block_lut_vector, b_hat, false, glwe_dimension + 1); // Loop over all the mask elements of the sample to accumulate // (X^a_i-1) multiplication, decomposition of the resulting polynomial @@ -118,11 +110,7 @@ __global__ void device_bootstrap_amortized( // Perform ACC * (X^ä - 1) multiply_by_monomial_negacyclic_and_sub_polynomial< Torus, params::opt, params::degree / params::opt>( - accumulator_mask, accumulator_mask_rotated, a_hat); - - multiply_by_monomial_negacyclic_and_sub_polynomial< - Torus, params::opt, params::degree / params::opt>( - accumulator_body, accumulator_body_rotated, a_hat); + accumulator, accumulator_rotated, a_hat, glwe_dimension + 1); synchronize_threads_in_block(); @@ -130,153 +118,122 @@ __global__ void device_bootstrap_amortized( // bootstrapped ciphertext round_to_closest_multiple_inplace( - accumulator_mask_rotated, base_log, level_count); + accumulator_rotated, base_log, level_count, glwe_dimension + 1); - round_to_closest_multiple_inplace( - accumulator_body_rotated, base_log, level_count); // Initialize the polynomial multiplication via FFT arrays // The polynomial multiplications happens at the block level // and each thread handles two or more coefficients int pos = threadIdx.x; - for (int j = 0; j < params::opt / 2; j++) { - mask_res_fft[pos].x = 0; - mask_res_fft[pos].y = 0; - body_res_fft[pos].x = 0; - body_res_fft[pos].y = 0; + for (int j = 0; j < (glwe_dimension + 1) * params::opt / 2; j++) { + res_fft[pos].x = 0; + res_fft[pos].y = 0; pos += params::degree / params::opt; } - GadgetMatrix gadget_mask(base_log, level_count, - accumulator_mask_rotated); - GadgetMatrix gadget_body(base_log, level_count, - accumulator_body_rotated); - + GadgetMatrix gadget(base_log, level_count, + accumulator_rotated, glwe_dimension + 1); // Now that the rotation is done, decompose the resulting polynomial // coefficients so as to multiply each decomposed level with the // 
corresponding part of the bootstrapping key for (int level = level_count - 1; level >= 0; level--) { + gadget.decompose_and_compress_next(accumulator_fft); + for (int i = 0; i < (glwe_dimension + 1); i++) { + auto accumulator_fft_slice = accumulator_fft + i * params::degree / 2; - gadget_mask.decompose_and_compress_next(accumulator_fft); - // Switch to the FFT space - NSMFFT_direct>(accumulator_fft); + // Switch to the FFT space + NSMFFT_direct>(accumulator_fft_slice); + synchronize_threads_in_block(); + + // Get the bootstrapping key piece necessary for the multiplication + // It is already in the Fourier domain + auto bsk_slice = get_ith_mask_kth_block(bootstrapping_key, iteration, i, + level, polynomial_size, + glwe_dimension, level_count); + synchronize_threads_in_block(); + + // Perform the coefficient-wise product with the two pieces of + // bootstrapping key + for (int j = 0; j < (glwe_dimension + 1); j++) { + auto bsk_poly = bsk_slice + j * params::degree / 2; + auto res_fft_poly = res_fft + j * params::degree / 2; + polynomial_product_accumulate_in_fourier_domain( + res_fft_poly, accumulator_fft_slice, bsk_poly); + } + } synchronize_threads_in_block(); - - // Get the bootstrapping key piece necessary for the multiplication - // It is already in the Fourier domain - auto bsk_mask_slice = - get_ith_mask_kth_block(bootstrapping_key, iteration, 0, level, - polynomial_size, 1, level_count); - auto bsk_body_slice = - get_ith_body_kth_block(bootstrapping_key, iteration, 0, level, - polynomial_size, 1, level_count); - - synchronize_threads_in_block(); - - // Perform the coefficient-wise product with the two pieces of - // bootstrapping key - polynomial_product_accumulate_in_fourier_domain( - mask_res_fft, accumulator_fft, bsk_mask_slice); - polynomial_product_accumulate_in_fourier_domain( - body_res_fft, accumulator_fft, bsk_body_slice); - - synchronize_threads_in_block(); - gadget_body.decompose_and_compress_next(accumulator_fft); - - // Now handle the polynomial multiplication for the body - // in the same way - - NSMFFT_direct>(accumulator_fft); - synchronize_threads_in_block(); - - auto bsk_mask_slice_2 = - get_ith_mask_kth_block(bootstrapping_key, iteration, 1, level, - polynomial_size, 1, level_count); - auto bsk_body_slice_2 = - get_ith_body_kth_block(bootstrapping_key, iteration, 1, level, - polynomial_size, 1, level_count); - - synchronize_threads_in_block(); - - polynomial_product_accumulate_in_fourier_domain( - mask_res_fft, accumulator_fft, bsk_mask_slice_2); - polynomial_product_accumulate_in_fourier_domain( - body_res_fft, accumulator_fft, bsk_body_slice_2); } // Come back to the coefficient representation if constexpr (SMD == FULLSM || SMD == NOSM) { synchronize_threads_in_block(); - NSMFFT_inverse>(mask_res_fft); - NSMFFT_inverse>(body_res_fft); - + for (int i = 0; i < (glwe_dimension + 1); i++) { + auto res_fft_slice = res_fft + i * params::degree / 2; + NSMFFT_inverse>(res_fft_slice); + } synchronize_threads_in_block(); - add_to_torus(mask_res_fft, accumulator_mask); - add_to_torus(body_res_fft, accumulator_body); + for (int i = 0; i < (glwe_dimension + 1); i++) { + auto accumulator_slice = accumulator + i * params::degree; + auto res_fft_slice = res_fft + i * params::degree / 2; + add_to_torus(res_fft_slice, accumulator_slice); + } synchronize_threads_in_block(); - } else { - int tid = threadIdx.x; #pragma unroll - for (int i = 0; i < params::opt / 2; i++) { - accumulator_fft[tid] = mask_res_fft[tid]; - tid = tid + params::degree / params::opt; + for (int i = 0; i < 
(glwe_dimension + 1); i++) { + auto accumulator_slice = accumulator + i * params::degree; + auto res_fft_slice = res_fft + i * params::degree / 2; + int tid = threadIdx.x; + for (int i = 0; i < params::opt / 2; i++) { + accumulator_fft[tid] = res_fft_slice[tid]; + tid = tid + params::degree / params::opt; + } + synchronize_threads_in_block(); + + NSMFFT_inverse>(accumulator_fft); + synchronize_threads_in_block(); + + add_to_torus(accumulator_fft, accumulator_slice); } synchronize_threads_in_block(); - - NSMFFT_inverse>(accumulator_fft); - synchronize_threads_in_block(); - - add_to_torus(accumulator_fft, accumulator_mask); - synchronize_threads_in_block(); - - tid = threadIdx.x; -#pragma unroll - for (int i = 0; i < params::opt / 2; i++) { - accumulator_fft[tid] = body_res_fft[tid]; - tid = tid + params::degree / params::opt; - } - synchronize_threads_in_block(); - - NSMFFT_inverse>(accumulator_fft); - synchronize_threads_in_block(); - - add_to_torus(accumulator_fft, accumulator_body); - synchronize_threads_in_block(); } } - auto block_lwe_array_out = &lwe_array_out[blockIdx.x * (polynomial_size + 1)]; + auto block_lwe_array_out = + &lwe_array_out[blockIdx.x * (glwe_dimension * polynomial_size + 1)]; // The blind rotation for this block is over // Now we can perform the sample extraction: for the body it's just // the resulting constant coefficient of the accumulator // For the mask it's more complicated - sample_extract_mask(block_lwe_array_out, accumulator_mask); - sample_extract_body(block_lwe_array_out, accumulator_body); + sample_extract_mask(block_lwe_array_out, accumulator, + glwe_dimension); + sample_extract_body(block_lwe_array_out, accumulator, + glwe_dimension); } template __host__ void host_bootstrap_amortized( void *v_stream, uint32_t gpu_index, Torus *lwe_array_out, Torus *lut_vector, uint32_t *lut_vector_indexes, Torus *lwe_array_in, - double2 *bootstrapping_key, uint32_t input_lwe_dimension, + double2 *bootstrapping_key, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t input_lwe_ciphertext_count, uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory) { cudaSetDevice(gpu_index); - int SM_FULL = sizeof(Torus) * polynomial_size + // accumulator mask - sizeof(Torus) * polynomial_size + // accumulator body - sizeof(Torus) * polynomial_size + // accumulator mask rotated - sizeof(Torus) * polynomial_size + // accumulator body rotated - sizeof(double2) * polynomial_size / 2 + // accumulator fft mask - sizeof(double2) * polynomial_size / 2 + // accumulator fft body - sizeof(double2) * polynomial_size / 2; // calculate buffer fft + int SM_FULL = + sizeof(Torus) * polynomial_size * (glwe_dimension + 1) + // accumulator + sizeof(Torus) * polynomial_size * + (glwe_dimension + 1) + // accumulator rotated + sizeof(double2) * polynomial_size / 2 * + (glwe_dimension + 1) + // accumulator fft + sizeof(double2) * polynomial_size / 2 * + (glwe_dimension + 1); // calculate buffer fft - int SM_PART = sizeof(double2) * polynomial_size / 2; // calculate buffer fft + int SM_PART = sizeof(double2) * polynomial_size / 2 * + (glwe_dimension + 1); // calculate buffer fft int DM_PART = SM_FULL - SM_PART; @@ -304,8 +261,8 @@ __host__ void host_bootstrap_amortized( stream, gpu_index); device_bootstrap_amortized<<>>( lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in, - bootstrapping_key, d_mem, input_lwe_dimension, polynomial_size, - base_log, level_count, lwe_idx, DM_FULL); + bootstrapping_key, d_mem, 
glwe_dimension, lwe_dimension, + polynomial_size, base_log, level_count, lwe_idx, DM_FULL); } else if (max_shared_memory < SM_FULL) { cudaFuncSetAttribute(device_bootstrap_amortized, cudaFuncAttributeMaxDynamicSharedMemorySize, SM_PART); @@ -316,8 +273,8 @@ __host__ void host_bootstrap_amortized( device_bootstrap_amortized <<>>( lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in, - bootstrapping_key, d_mem, input_lwe_dimension, polynomial_size, - base_log, level_count, lwe_idx, DM_PART); + bootstrapping_key, d_mem, glwe_dimension, lwe_dimension, + polynomial_size, base_log, level_count, lwe_idx, DM_PART); } else { // For devices with compute capability 7.x a single thread block can // address the full capacity of shared memory. Shared memory on the @@ -335,11 +292,10 @@ __host__ void host_bootstrap_amortized( device_bootstrap_amortized <<>>( lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in, - bootstrapping_key, d_mem, input_lwe_dimension, polynomial_size, - base_log, level_count, lwe_idx, 0); + bootstrapping_key, d_mem, glwe_dimension, lwe_dimension, + polynomial_size, base_log, level_count, lwe_idx, 0); } check_cuda_error(cudaGetLastError()); - cuda_drop_async(d_mem, stream, gpu_index); } diff --git a/src/bootstrap_low_latency.cuh b/src/bootstrap_low_latency.cuh index 877950b1f..693884cf2 100644 --- a/src/bootstrap_low_latency.cuh +++ b/src/bootstrap_low_latency.cuh @@ -182,11 +182,11 @@ __global__ void device_bootstrap_low_latency( if (blockIdx.y == 0) { divide_by_monomial_negacyclic_inplace( - accumulator, block_lut_vector, b_hat, false); + accumulator, block_lut_vector, b_hat, false, 1); } else { divide_by_monomial_negacyclic_inplace( - accumulator, &block_lut_vector[params::degree], b_hat, false); + accumulator, &block_lut_vector[params::degree], b_hat, false, 1); } for (int i = 0; i < lwe_dimension; i++) { @@ -200,13 +200,13 @@ __global__ void device_bootstrap_low_latency( // Perform ACC * (X^ä - 1) multiply_by_monomial_negacyclic_and_sub_polynomial< Torus, params::opt, params::degree / params::opt>( - accumulator, accumulator_rotated, a_hat); + accumulator, accumulator_rotated, a_hat, 1); // Perform a rounding to increase the accuracy of the // bootstrapped ciphertext round_to_closest_multiple_inplace( - accumulator_rotated, base_log, level_count); + accumulator_rotated, base_log, level_count, 1); synchronize_threads_in_block(); @@ -214,7 +214,7 @@ __global__ void device_bootstrap_low_latency( // decomposition, for the mask and the body (so block 0 will have the // accumulator decomposed at level 0, 1 at 1, etc.) GadgetMatrix gadget_acc(base_log, level_count, - accumulator_rotated); + accumulator_rotated, 1); gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x); // We are using the same memory space for accumulator_fft and @@ -234,9 +234,9 @@ __global__ void device_bootstrap_low_latency( // Perform a sample extract. 
At this point, all blocks have the result, but // we do the computation at block 0 to avoid waiting for extra blocks, in // case they're not synchronized - sample_extract_mask(block_lwe_array_out, accumulator); + sample_extract_mask(block_lwe_array_out, accumulator, 1); } else if (blockIdx.x == 0 && blockIdx.y == 1) { - sample_extract_body(block_lwe_array_out, accumulator); + block_lwe_array_out[params::degree] = accumulator[0]; } } diff --git a/src/circuit_bootstrap.cuh b/src/circuit_bootstrap.cuh index dbca600a2..9bb713000 100644 --- a/src/circuit_bootstrap.cuh +++ b/src/circuit_bootstrap.cuh @@ -144,8 +144,8 @@ __host__ void host_circuit_bootstrap( host_bootstrap_amortized( v_stream, gpu_index, lwe_array_out_pbs_buffer, lut_vector, lut_vector_indexes, lwe_array_in_shifted_buffer, fourier_bsk, - lwe_dimension, polynomial_size, base_log_bsk, level_bsk, pbs_count, - level_cbs, 0, max_shared_memory); + glwe_dimension, lwe_dimension, polynomial_size, base_log_bsk, level_bsk, + pbs_count, level_cbs, 0, max_shared_memory); dim3 copy_grid(pbs_count * (glwe_dimension + 1), 1, 1); dim3 copy_block(params::degree / params::opt, 1, 1); diff --git a/src/crypto/bootstrapping_key.cuh b/src/crypto/bootstrapping_key.cuh index 71e40f7ce..4ed497e4b 100644 --- a/src/crypto/bootstrapping_key.cuh +++ b/src/crypto/bootstrapping_key.cuh @@ -15,6 +15,7 @@ __device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size, level_count; } +//////////////////////////////////////////////// template __device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level, uint32_t polynomial_size, @@ -35,8 +36,9 @@ __device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level, level * polynomial_size / 2 * (glwe_dimension + 1) * (glwe_dimension + 1) + k * polynomial_size / 2 * (glwe_dimension + 1) + - polynomial_size / 2]; + glwe_dimension * polynomial_size / 2]; } +//////////////////////////////////////////////// template void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream, diff --git a/src/crypto/gadget.cuh b/src/crypto/gadget.cuh index cd0918239..863288884 100644 --- a/src/crypto/gadget.cuh +++ b/src/crypto/gadget.cuh @@ -4,6 +4,15 @@ #include "polynomial/polynomial.cuh" #include +/** + * GadgetMatrix implements the iterator design pattern to decompose a set of + * num_poly consecutive polynomials with degree params::degree. A total of + * level_count levels is expected and each call to decompose_and_compress_next() + * writes to the result the next level. It is also possible to advance an + * arbitrary amount of levels by using decompose_and_compress_level(). + * + * This class always decomposes the entire set of num_poly polynomials. 
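 *
 * Usage sketch (mirroring the call sites in device_bootstrap_amortized and
 * cmux; the buffer names are taken from those kernels):
 *
 *   GadgetMatrix gadget(base_log, level_count, accumulator_rotated,
 *                       glwe_dimension + 1);
 *   for (int level = level_count - 1; level >= 0; level--) {
 *     gadget.decompose_and_compress_next(accumulator_fft);
 *     // ... multiply accumulator_fft with the matching level of the BSK ...
 *   }
 *
 * Note that the constructor shifts the state buffer in place, so the state
 * is consumed as the levels are produced.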
+ */ #pragma once template class GadgetMatrix { private: @@ -11,19 +20,22 @@ private: uint32_t base_log; uint32_t mask; uint32_t halfbg; + uint32_t num_poly; T offset; int current_level; T mask_mod_b; T *state; public: - __device__ GadgetMatrix(uint32_t base_log, uint32_t level_count, T *state) - : base_log(base_log), level_count(level_count), state(state) { + __device__ GadgetMatrix(uint32_t base_log, uint32_t level_count, T *state, + uint32_t num_poly) + : base_log(base_log), level_count(level_count), num_poly(num_poly), + state(state) { mask_mod_b = (1ll << base_log) - 1ll; current_level = level_count; int tid = threadIdx.x; - for (int i = 0; i < params::opt; i++) { + for (int i = 0; i < num_poly * params::opt; i++) { state[tid] >>= (sizeof(T) * 8 - base_log * level_count); tid += params::degree / params::opt; } @@ -31,26 +43,31 @@ public: } __device__ void decompose_and_compress_next(double2 *result) { - int tid = threadIdx.x; current_level -= 1; - for (int i = 0; i < params::opt / 2; i++) { - T res_re = state[tid] & mask_mod_b; - T res_im = state[tid + params::degree / 2] & mask_mod_b; - state[tid] >>= base_log; - state[tid + params::degree / 2] >>= base_log; - T carry_re = ((res_re - 1ll) | state[tid]) & res_re; - T carry_im = ((res_im - 1ll) | state[tid + params::degree / 2]) & res_im; - carry_re >>= (base_log - 1); - carry_im >>= (base_log - 1); - state[tid] += carry_re; - state[tid + params::degree / 2] += carry_im; - res_re -= carry_re << base_log; - res_im -= carry_im << base_log; + for (int j = 0; j < num_poly; j++) { + int tid = threadIdx.x; + auto state_slice = state + j * params::degree; + auto result_slice = result + j * params::degree / 2; + for (int i = 0; i < params::opt / 2; i++) { + T res_re = state_slice[tid] & mask_mod_b; + T res_im = state_slice[tid + params::degree / 2] & mask_mod_b; + state_slice[tid] >>= base_log; + state_slice[tid + params::degree / 2] >>= base_log; + T carry_re = ((res_re - 1ll) | state_slice[tid]) & res_re; + T carry_im = + ((res_im - 1ll) | state_slice[tid + params::degree / 2]) & res_im; + carry_re >>= (base_log - 1); + carry_im >>= (base_log - 1); + state_slice[tid] += carry_re; + state_slice[tid + params::degree / 2] += carry_im; + res_re -= carry_re << base_log; + res_im -= carry_im << base_log; - result[tid].x = (int32_t)res_re; - result[tid].y = (int32_t)res_im; + result_slice[tid].x = (int32_t)res_re; + result_slice[tid].y = (int32_t)res_im; - tid += params::degree / params::opt; + tid += params::degree / params::opt; + } } synchronize_threads_in_block(); } diff --git a/src/polynomial/functions.cuh b/src/polynomial/functions.cuh index 6cc1e49aa..1076bf755 100644 --- a/src/polynomial/functions.cuh +++ b/src/polynomial/functions.cuh @@ -56,36 +56,78 @@ __device__ void add_polynomial_inplace_low_lat(T *source, T *dst, int p_id) { } /* + * Receives num_poly concatenated polynomials of type T. For each: + * * Performs acc = acc * (X^ä + 1) if zeroAcc = false * Performs acc = 0 if zeroAcc * takes single buffer and calculates inplace. 
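 *
 * In the negacyclic ring (X^N = -1) dividing by a monomial is a rotation
 * with sign flips. Worked example with N = 4 and j = 1:
 *   input = [a0, a1, a2, a3]  ->  acc = [a1, a2, a3, -a0]
 * i.e. acc(X) = input(X) * X^(-j).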
*/ template __device__ void divide_by_monomial_negacyclic_inplace(T *accumulator, T *input, - uint32_t j, - bool zeroAcc) { - int tid = threadIdx.x; + uint32_t j, bool zeroAcc, + uint32_t num_poly) { constexpr int degree = block_size * elems_per_thread; - if (zeroAcc) { - for (int i = 0; i < elems_per_thread; i++) { - accumulator[tid] = 0; - tid += block_size; + for (int z = 0; z < num_poly; z++) { + T *accumulator_slice = (T *)accumulator + (ptrdiff_t)(z * degree); + T *input_slice = (T *)input + (ptrdiff_t)(z * degree); + + int tid = threadIdx.x; + if (zeroAcc) { + for (int i = 0; i < elems_per_thread; i++) { + accumulator_slice[tid] = 0; + tid += block_size; + } + } else { + tid = threadIdx.x; + for (int i = 0; i < elems_per_thread; i++) { + if (j < degree) { + if (tid < degree - j) { + accumulator_slice[tid] = input_slice[tid + j]; + } else { + accumulator_slice[tid] = -input_slice[tid - degree + j]; + } + } else { + uint32_t jj = j - degree; + if (tid < degree - jj) { + accumulator_slice[tid] = -input_slice[tid + jj]; + } else { + accumulator_slice[tid] = input_slice[tid - degree + jj]; + } + } + tid += block_size; + } } - } else { - tid = threadIdx.x; + } +} + +/* + * Receives num_poly concatenated polynomials of type T. For each: + * + * Performs result_acc = acc * (X^ä - 1) - acc + * takes single buffer as input and returns a single rotated buffer + */ +template +__device__ void multiply_by_monomial_negacyclic_and_sub_polynomial( + T *acc, T *result_acc, uint32_t j, uint32_t num_poly) { + constexpr int degree = block_size * elems_per_thread; + for (int z = 0; z < num_poly; z++) { + T *acc_slice = (T *)acc + (ptrdiff_t)(z * degree); + T *result_acc_slice = (T *)result_acc + (ptrdiff_t)(z * degree); + int tid = threadIdx.x; for (int i = 0; i < elems_per_thread; i++) { if (j < degree) { - if (tid < degree - j) { - accumulator[tid] = input[tid + j]; + if (tid < j) { + result_acc_slice[tid] = -acc_slice[tid - j + degree] - acc_slice[tid]; } else { - accumulator[tid] = -input[tid - degree + j]; + result_acc_slice[tid] = acc_slice[tid - j] - acc_slice[tid]; } } else { uint32_t jj = j - degree; - if (tid < degree - jj) { - accumulator[tid] = -input[tid + jj]; + if (tid < jj) { + result_acc_slice[tid] = acc_slice[tid - jj + degree] - acc_slice[tid]; + } else { - accumulator[tid] = input[tid - degree + jj]; + result_acc_slice[tid] = -acc_slice[tid - jj] - acc_slice[tid]; } } tid += block_size; @@ -94,54 +136,28 @@ __device__ void divide_by_monomial_negacyclic_inplace(T *accumulator, T *input, } /* - * Performs result_acc = acc * (X^ä - 1) - acc - * takes single buffer as input returns single rotated buffer - */ -template -__device__ void -multiply_by_monomial_negacyclic_and_sub_polynomial(T *acc, T *result_acc, - uint32_t j) { - int tid = threadIdx.x; - constexpr int degree = block_size * elems_per_thread; - for (int i = 0; i < elems_per_thread; i++) { - if (j < degree) { - if (tid < j) { - result_acc[tid] = -acc[tid - j + degree] - acc[tid]; - } else { - result_acc[tid] = acc[tid - j] - acc[tid]; - } - } else { - uint32_t jj = j - degree; - if (tid < jj) { - result_acc[tid] = acc[tid - jj + degree] - acc[tid]; - - } else { - result_acc[tid] = -acc[tid - jj] - acc[tid]; - } - } - tid += block_size; - } -} - -/* - * performs a rounding to increase accuracy of the PBS - * calculates inplace. + * Receives num_poly concatenated polynomials of type T. For each performs a + * rounding to increase accuracy of the PBS. Calculates inplace. 
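 *
 * Each coefficient is rounded to the closest multiple of
 * 2^(sizeof(T) * 8 - base_log * level_count): the bit just below that
 * position is added back before the low bits are cleared. For instance,
 * with a 64-bit torus, base_log = 8 and level_count = 3 (example values),
 * coefficients are rounded to the nearest multiple of 2^40, keeping only
 * the 24 bits that the decomposition will consume.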
*/ template __device__ void round_to_closest_multiple_inplace(T *rotated_acc, int base_log, - int level_count) { - int tid = threadIdx.x; - for (int i = 0; i < elems_per_thread; i++) { - - T x_acc = rotated_acc[tid]; - T shift = sizeof(T) * 8 - level_count * base_log; - T mask = 1ll << (shift - 1); - T b_acc = (x_acc & mask) >> (shift - 1); - T res_acc = x_acc >> shift; - res_acc += b_acc; - res_acc <<= shift; - rotated_acc[tid] = res_acc; - tid = tid + block_size; + int level_count, + uint32_t num_poly) { + constexpr int degree = block_size * elems_per_thread; + for (int z = 0; z < num_poly; z++) { + T *rotated_acc_slice = (T *)rotated_acc + (ptrdiff_t)(z * degree); + int tid = threadIdx.x; + for (int i = 0; i < elems_per_thread; i++) { + T x_acc = rotated_acc_slice[tid]; + T shift = sizeof(T) * 8 - level_count * base_log; + T mask = 1ll << (shift - 1); + T b_acc = (x_acc & mask) >> (shift - 1); + T res_acc = x_acc >> shift; + res_acc += b_acc; + res_acc <<= shift; + rotated_acc_slice[tid] = res_acc; + tid = tid + block_size; + } } } @@ -176,80 +192,85 @@ __device__ void add_to_torus(double2 *m_values, Torus *result) { } } +// Extracts the body of a GLWE with dimension glwe_dimension template -__device__ void sample_extract_body(Torus *lwe_array_out, Torus *accumulator) { +__device__ void sample_extract_body(Torus *lwe_array_out, Torus *accumulator, + uint32_t glwe_dimension) { // Set first coefficient of the accumulator as the body of the LWE sample - lwe_array_out[params::degree] = accumulator[0]; + lwe_array_out[glwe_dimension * params::degree] = + accumulator[glwe_dimension * params::degree]; } +// Extracts the mask of a GLWE with dimension glwe_dimension template -__device__ void sample_extract_mask(Torus *lwe_array_out, Torus *accumulator) { - // Set ACC = -ACC - // accumulator.negate_inplace(); +__device__ void sample_extract_mask(Torus *lwe_array_out, Torus *accumulator, + uint32_t glwe_dimension) { + for (int z = 0; z < glwe_dimension; z++) { + Torus *lwe_array_out_slice = + (Torus *)lwe_array_out + (ptrdiff_t)(z * params::degree); + Torus *accumulator_slice = + (Torus *)accumulator + (ptrdiff_t)(z * params::degree); - int tid = threadIdx.x; + // Set ACC = -ACC + int tid = threadIdx.x; #pragma unroll - for (int i = 0; i < params::opt; i++) { - accumulator[tid] = -accumulator[tid]; - tid = tid + params::degree / params::opt; - } - synchronize_threads_in_block(); - - // Reverse the accumulator - // accumulator.reverse_inplace(); - - tid = threadIdx.x; - Torus result[params::opt]; -#pragma unroll - for (int i = 0; i < params::opt; i++) { - result[i] = accumulator[params::degree - tid - 1]; - tid = tid + params::degree / params::opt; - } - synchronize_threads_in_block(); - tid = threadIdx.x; -#pragma unroll - for (int i = 0; i < params::opt; i++) { - accumulator[tid] = result[i]; - tid = tid + params::degree / params::opt; - } - synchronize_threads_in_block(); - - // Perform ACC * X - // accumulator.multiply_by_monomial_negacyclic_inplace(1); - - tid = threadIdx.x; - result[params::opt]; - for (int i = 0; i < params::opt; i++) { - if (1 < params::degree) { - if (tid < 1) - result[i] = -accumulator[tid - 1 + params::degree]; - else - result[i] = accumulator[tid - 1]; - } else { - uint32_t jj = 1 - (uint32_t)params::degree; - if (tid < jj) - result[i] = accumulator[tid - jj + params::degree]; - else - result[i] = -accumulator[tid - jj]; + for (int i = 0; i < params::opt; i++) { + accumulator_slice[tid] = -accumulator_slice[tid]; + tid = tid + params::degree / params::opt; } - tid += 
params::degree / params::opt; - } - synchronize_threads_in_block(); - tid = threadIdx.x; - for (int i = 0; i < params::opt; i++) { - accumulator[tid] = result[i]; - tid += params::degree / params::opt; - } - synchronize_threads_in_block(); + synchronize_threads_in_block(); - // Copy to the mask of the LWE sample - // accumulator.copy_into(lwe_array_out); - - tid = threadIdx.x; + // Reverse the accumulator + tid = threadIdx.x; + Torus result[params::opt]; #pragma unroll - for (int i = 0; i < params::opt; i++) { - lwe_array_out[tid] = accumulator[tid]; - tid = tid + params::degree / params::opt; + for (int i = 0; i < params::opt; i++) { + result[i] = accumulator_slice[params::degree - tid - 1]; + tid = tid + params::degree / params::opt; + } + synchronize_threads_in_block(); + tid = threadIdx.x; +#pragma unroll + for (int i = 0; i < params::opt; i++) { + accumulator_slice[tid] = result[i]; + tid = tid + params::degree / params::opt; + } + synchronize_threads_in_block(); + + // Perform ACC * X + // (equivalent to multiply_by_monomial_negacyclic_inplace(1)) + tid = threadIdx.x; + result[params::opt]; + for (int i = 0; i < params::opt; i++) { + if (1 < params::degree) { + if (tid < 1) + result[i] = -accumulator_slice[tid - 1 + params::degree]; + else + result[i] = accumulator_slice[tid - 1]; + } else { + uint32_t jj = 1 - (uint32_t)params::degree; + if (tid < jj) + result[i] = accumulator_slice[tid - jj + params::degree]; + else + result[i] = -accumulator_slice[tid - jj]; + } + tid += params::degree / params::opt; + } + synchronize_threads_in_block(); + tid = threadIdx.x; + for (int i = 0; i < params::opt; i++) { + accumulator_slice[tid] = result[i]; + tid += params::degree / params::opt; + } + synchronize_threads_in_block(); + + // Copy to the mask of the LWE sample + tid = threadIdx.x; +#pragma unroll + for (int i = 0; i < params::opt; i++) { + lwe_array_out_slice[tid] = accumulator_slice[tid]; + tid = tid + params::degree / params::opt; + } } } diff --git a/src/vertical_packing.cuh b/src/vertical_packing.cuh index 917273ecd..bb9b5a22b 100644 --- a/src/vertical_packing.cuh +++ b/src/vertical_packing.cuh @@ -101,8 +101,10 @@ cmux(Torus *glwe_array_out, Torus *glwe_array_in, double2 *ggsw_in, pos += params::degree / params::opt; } - GadgetMatrix gadget_mask(base_log, level_count, glwe_sub_mask); - GadgetMatrix gadget_body(base_log, level_count, glwe_sub_body); + GadgetMatrix gadget_mask(base_log, level_count, glwe_sub_mask, + 1); + GadgetMatrix gadget_body(base_log, level_count, glwe_sub_body, + 1); // Subtract each glwe operand, decompose the resulting // polynomial coefficients to multiply each decomposed level // with the corresponding part of the LUT @@ -420,12 +422,12 @@ __global__ void device_blind_rotation_and_sample_extraction( // Body divide_by_monomial_negacyclic_inplace( - accumulator_c1, accumulator_c0, (1 << monomial_degree), false); + accumulator_c1, accumulator_c0, (1 << monomial_degree), false, 1); // Mask divide_by_monomial_negacyclic_inplace( accumulator_c1 + polynomial_size, accumulator_c0 + polynomial_size, - (1 << monomial_degree), false); + (1 << monomial_degree), false, 1); monomial_degree += 1; @@ -445,9 +447,8 @@ __global__ void device_blind_rotation_and_sample_extraction( // Now we can perform the sample extraction: for the body it's just // the resulting constant coefficient of the accumulator // For the mask it's more complicated - sample_extract_mask(block_lwe_out, accumulator_c0); - sample_extract_body(block_lwe_out, - accumulator_c0 + polynomial_size); + 
sample_extract_mask(block_lwe_out, accumulator_c0, 1); + sample_extract_body(block_lwe_out, accumulator_c0, 1); } template
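// Reference sketch of the sample extraction at index 0 that the kernels above
// implement with the negate / reverse / multiply-by-X sequence. This is a
// plain CPU illustration under assumed toy parameters, not the GPU code: the
// GLWE is glwe_dimension mask polynomials followed by one body polynomial,
// stored back to back, and the torus is modeled as int64_t.
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

// Extracts LWE(m_0) from GLWE = (A_0(X), ..., A_{k-1}(X), B(X)): the LWE mask
// is (a_{z,0}, -a_{z,N-1}, ..., -a_{z,1}) for each mask polynomial A_z, and
// the LWE body is the constant coefficient of B(X).
std::vector<int64_t> sample_extract_index_0(const std::vector<int64_t> &glwe,
                                            size_t glwe_dimension, size_t N) {
  std::vector<int64_t> lwe(glwe_dimension * N + 1);
  for (size_t z = 0; z < glwe_dimension; z++) {
    const int64_t *a = &glwe[z * N];
    int64_t *out = &lwe[z * N];
    out[0] = a[0];
    for (size_t i = 1; i < N; i++)
      out[i] = -a[N - i];
  }
  lwe[glwe_dimension * N] = glwe[glwe_dimension * N]; // body coefficient 0
  return lwe;
}

int main() {
  const size_t k = 1, N = 4; // toy sizes, for illustration only
  std::vector<int64_t> glwe = {1, 2, 3, 4, /* body */ 10, 20, 30, 40};
  for (int64_t v : sample_extract_index_0(glwe, k, N))
    printf("%lld ", (long long)v); // prints: 1 -4 -3 -2 10
  printf("\n");
  return 0;
}
// The negate / reverse / multiply-by-X steps in sample_extract_mask compute
// exactly this permutation in place, one mask polynomial at a time, and
// sample_extract_body copies the body's constant coefficient.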