diff --git a/src/bit_extraction.cu b/src/bit_extraction.cu
index 5d9f29065..6b0fe8b53 100644
--- a/src/bit_extraction.cu
+++ b/src/bit_extraction.cu
@@ -15,8 +15,6 @@ void cuda_extract_bits_32(
     uint32_t max_shared_memory) {
   assert(("Error (GPU extract bits): base log should be <= 32",
           base_log_bsk <= 32));
-  assert(("Error (GPU extract bits): glwe_dimension should be equal to 1",
-          glwe_dimension == 1));
   assert(("Error (GPU extract bits): lwe_dimension_in should be one of "
           "512, 1024, 2048, 4096, 8192",
           lwe_dimension_in == 512 || lwe_dimension_in == 1024 ||
@@ -44,8 +42,8 @@ void cuda_extract_bits_32(
         (uint32_t *)lwe_array_out_pbs_buffer, (uint32_t *)lut_pbs,
         (uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk,
         number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
-        base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
-        number_of_samples, max_shared_memory);
+        glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
+        level_count_ksk, number_of_samples, max_shared_memory);
     break;
   case 1024:
     host_extract_bits<uint32_t, Degree<1024>>(
@@ -56,8 +54,8 @@ void cuda_extract_bits_32(
         (uint32_t *)lwe_array_out_pbs_buffer, (uint32_t *)lut_pbs,
         (uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk,
         number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
-        base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
-        number_of_samples, max_shared_memory);
+        glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
+        level_count_ksk, number_of_samples, max_shared_memory);
     break;
   case 2048:
     host_extract_bits<uint32_t, Degree<2048>>(
@@ -68,8 +66,8 @@ void cuda_extract_bits_32(
         (uint32_t *)lwe_array_out_pbs_buffer, (uint32_t *)lut_pbs,
         (uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk,
         number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
-        base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
-        number_of_samples, max_shared_memory);
+        glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
+        level_count_ksk, number_of_samples, max_shared_memory);
     break;
   case 4096:
     host_extract_bits<uint32_t, Degree<4096>>(
@@ -80,8 +78,8 @@ void cuda_extract_bits_32(
         (uint32_t *)lwe_array_out_pbs_buffer, (uint32_t *)lut_pbs,
         (uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk,
         number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
-        base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
-        number_of_samples, max_shared_memory);
+        glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
+        level_count_ksk, number_of_samples, max_shared_memory);
     break;
   case 8192:
     host_extract_bits<uint32_t, Degree<8192>>(
@@ -92,8 +90,8 @@ void cuda_extract_bits_32(
         (uint32_t *)lwe_array_out_pbs_buffer, (uint32_t *)lut_pbs,
         (uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk,
         number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
-        base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
-        number_of_samples, max_shared_memory);
+        glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
+        level_count_ksk, number_of_samples, max_shared_memory);
     break;
   default:
     break;
@@ -157,8 +155,6 @@ void cuda_extract_bits_64(
     uint32_t max_shared_memory) {
   assert(("Error (GPU extract bits): base log should be <= 64",
           base_log_bsk <= 64));
-  assert(("Error (GPU extract bits): glwe_dimension should be equal to 1",
-          glwe_dimension == 1));
   assert(("Error (GPU extract bits): lwe_dimension_in should be one of "
           "512, 1024, 2048, 4096, 8192",
           lwe_dimension_in == 512 || lwe_dimension_in == 1024 ||
@@ -186,8 +182,8 @@ void cuda_extract_bits_64(
         (uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs,
         (uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk,
         number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
-        base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
-        number_of_samples, max_shared_memory);
+        glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
+        level_count_ksk, number_of_samples, max_shared_memory);
     break;
   case 1024:
     host_extract_bits<uint64_t, Degree<1024>>(
@@ -198,8 +194,8 @@ void cuda_extract_bits_64(
         (uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs,
         (uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk,
         number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
-        base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
-        number_of_samples, max_shared_memory);
+        glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
+        level_count_ksk, number_of_samples, max_shared_memory);
     break;
   case 2048:
     host_extract_bits<uint64_t, Degree<2048>>(
@@ -210,8 +206,8 @@ void cuda_extract_bits_64(
         (uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs,
         (uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk,
         number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
-        base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
-        number_of_samples, max_shared_memory);
+        glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
+        level_count_ksk, number_of_samples, max_shared_memory);
     break;
   case 4096:
     host_extract_bits<uint64_t, Degree<4096>>(
@@ -222,8 +218,8 @@ void cuda_extract_bits_64(
         (uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs,
         (uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk,
         number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
-        base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
-        number_of_samples, max_shared_memory);
+        glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
+        level_count_ksk, number_of_samples, max_shared_memory);
     break;
   case 8192:
     host_extract_bits<uint64_t, Degree<8192>>(
@@ -234,8 +230,8 @@ void cuda_extract_bits_64(
         (uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs,
         (uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk,
         number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
-        base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
-        number_of_samples, max_shared_memory);
+        glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
+        level_count_ksk, number_of_samples, max_shared_memory);
     break;
   default:
     break;
diff --git a/src/bit_extraction.cuh b/src/bit_extraction.cuh
index f68dfab3b..4bb0aa9a8 100644
--- a/src/bit_extraction.cuh
+++ b/src/bit_extraction.cuh
@@ -140,9 +140,9 @@ __host__ void host_extract_bits(
     Torus *lwe_array_out_pbs_buffer, Torus *lut_pbs,
     uint32_t *lut_vector_indexes, Torus *ksk, double2 *fourier_bsk,
     uint32_t number_of_bits, uint32_t delta_log, uint32_t lwe_dimension_in,
-    uint32_t lwe_dimension_out, uint32_t base_log_bsk, uint32_t level_count_bsk,
-    uint32_t base_log_ksk, uint32_t level_count_ksk, uint32_t number_of_samples,
-    uint32_t max_shared_memory) {
+    uint32_t lwe_dimension_out, uint32_t glwe_dimension, uint32_t base_log_bsk,
+    uint32_t level_count_bsk, uint32_t base_log_ksk, uint32_t level_count_ksk,
+    uint32_t number_of_samples, uint32_t max_shared_memory) {
   cudaSetDevice(gpu_index);
   auto stream = static_cast<cudaStream_t *>(v_stream);

@@ -189,8 +189,8 @@ __host__ void host_extract_bits(
     host_bootstrap_low_latency<Torus, params>(
         v_stream, gpu_index, lwe_array_out_pbs_buffer, lut_pbs,
         lut_vector_indexes, lwe_array_out_ks_buffer, fourier_bsk,
-        lwe_dimension_out, lwe_dimension_in, base_log_bsk, level_count_bsk,
-        number_of_samples, 1, max_shared_memory);
+        glwe_dimension, lwe_dimension_out, lwe_dimension_in, base_log_bsk,
+        level_count_bsk, number_of_samples, 1, max_shared_memory);

     // Add alpha where alpha = delta*2^{bit_idx-1} to end up with an encryption
     // of 0 if the extracted bit was 0 and 1 in the other case
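A side note on the assertion style seen in both files above: `assert(("message", condition))` relies on the comma operator, which evaluates and discards the string literal, so the text only survives because the whole failing expression is stringified into the abort message. A minimal sketch of the same idiom behind a named macro — the macro and function below are illustrative, not part of this patch:

```cpp
// Hypothetical wrapper, not library code: same comma-operator trick as the
// inline asserts above, with the message kept next to the condition.
#include <cassert>
#include <cstdint>

#define GPU_ASSERT(cond, msg) assert(((void)(msg), (cond)))

void check_pbs_params(uint32_t polynomial_size) {
  // Mirrors the surviving checks; note there is no longer any
  // glwe_dimension == 1 restriction after this patch.
  GPU_ASSERT(polynomial_size == 512 || polynomial_size == 1024 ||
                 polynomial_size == 2048 || polynomial_size == 4096 ||
                 polynomial_size == 8192,
             "polynomial_size should be one of 512, 1024, 2048, 4096, 8192");
}
```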
diff --git a/src/bootstrap_low_latency.cu b/src/bootstrap_low_latency.cu
index 246963314..8651b2456 100644
--- a/src/bootstrap_low_latency.cu
+++ b/src/bootstrap_low_latency.cu
@@ -16,8 +16,6 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
   assert(("Error (GPU low latency PBS): base log should be <= 32",
           base_log <= 32));
-  assert(("Error (GPU low latency PBS): glwe_dimension should be equal to 1",
-          glwe_dimension == 1));
   assert(("Error (GPU low latency PBS): polynomial size should be one of 512, "
          "1024, 2048, 4096, 8192",
          polynomial_size == 512 || polynomial_size == 1024 ||
@@ -39,36 +37,41 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
     host_bootstrap_low_latency<uint32_t, Degree<512>>(
         v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
         (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
-        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
-        level_count, num_samples, num_lut_vectors, max_shared_memory);
+        (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
+        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
+        max_shared_memory);
     break;
   case 1024:
     host_bootstrap_low_latency<uint32_t, Degree<1024>>(
         v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
         (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
-        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
-        level_count, num_samples, num_lut_vectors, max_shared_memory);
+        (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
+        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
+        max_shared_memory);
     break;
   case 2048:
     host_bootstrap_low_latency<uint32_t, Degree<2048>>(
         v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
         (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
-        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
-        level_count, num_samples, num_lut_vectors, max_shared_memory);
+        (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
+        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
+        max_shared_memory);
     break;
   case 4096:
     host_bootstrap_low_latency<uint32_t, Degree<4096>>(
         v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
         (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
-        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
-        level_count, num_samples, num_lut_vectors, max_shared_memory);
+        (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
+        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
+        max_shared_memory);
     break;
   case 8192:
     host_bootstrap_low_latency<uint32_t, Degree<8192>>(
         v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
         (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
-        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
-        level_count, num_samples, num_lut_vectors, max_shared_memory);
+        (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
+        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
+        max_shared_memory);
     break;
   default:
     break;
@@ -160,8 +163,6 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
   assert(("Error (GPU low latency PBS): base log should be <= 64",
           base_log <= 64));
-  assert(("Error (GPU low latency PBS): glwe_dimension should be equal to 1",
-          glwe_dimension == 1));
   assert(("Error (GPU low latency PBS): polynomial size should be one of 512, "
          "1024, 2048, 4096, 8192",
          polynomial_size == 512 || polynomial_size == 1024 ||
@@ -183,36 +184,41 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
     host_bootstrap_low_latency<uint64_t, Degree<512>>(
         v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
         (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
-        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
-        level_count, num_samples, num_lut_vectors, max_shared_memory);
+        (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
+        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
+        max_shared_memory);
     break;
   case 1024:
     host_bootstrap_low_latency<uint64_t, Degree<1024>>(
         v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
         (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
-        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
-        level_count, num_samples, num_lut_vectors, max_shared_memory);
+        (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
+        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
+        max_shared_memory);
     break;
   case 2048:
     host_bootstrap_low_latency<uint64_t, Degree<2048>>(
         v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
         (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
-        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
-        level_count, num_samples, num_lut_vectors, max_shared_memory);
+        (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
+        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
+        max_shared_memory);
     break;
   case 4096:
     host_bootstrap_low_latency<uint64_t, Degree<4096>>(
         v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
         (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
-        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
-        level_count, num_samples, num_lut_vectors, max_shared_memory);
+        (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
+        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
+        max_shared_memory);
     break;
   case 8192:
     host_bootstrap_low_latency<uint64_t, Degree<8192>>(
         v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
         (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
-        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
-        level_count, num_samples, num_lut_vectors, max_shared_memory);
+        (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
+        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
+        max_shared_memory);
     break;
   default:
     break;
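The host-side change that pairs with these dispatch updates is in the scratch allocation: instead of one mask buffer and one body buffer, `host_bootstrap_low_latency` now allocates a single FFT join buffer scaled by `glwe_dimension + 1` (see the .cuh diff below). A back-of-the-envelope helper for its size, assuming the layout in this patch — the helper name is mine, not the library's:

```cpp
// Sketch (not library code): byte count of the buffer_fft scratch that
// host_bootstrap_low_latency allocates after this patch. Each sample needs
// level_count * (glwe_dimension + 1) polynomials in the FFT domain, i.e.
// polynomial_size / 2 complex coefficients (one double2, 16 bytes, each).
#include <cstddef>
#include <cstdint>

size_t join_buffer_bytes(uint32_t glwe_dimension, uint32_t level_count,
                         uint32_t polynomial_size, uint32_t num_samples) {
  size_t per_poly = (polynomial_size / 2) * 2 * sizeof(double); // sizeof(double2)
  return (size_t)num_samples * level_count * (glwe_dimension + 1) * per_poly;
}
// For k = 1, level_count = 2, N = 1024 and 100 samples this is
// 100 * 2 * 2 * 512 * 16 B = 3,276,800 B -- exactly the size of the old
// mask + body buffer pair, so k = 1 keeps its memory footprint.
```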
diff --git a/src/bootstrap_low_latency.cuh b/src/bootstrap_low_latency.cuh
index 693884cf2..ac24ba474 100644
--- a/src/bootstrap_low_latency.cuh
+++ b/src/bootstrap_low_latency.cuh
@@ -25,11 +25,11 @@ using namespace cooperative_groups;
 namespace cg = cooperative_groups;

 template <typename Torus, class params>
-__device__ void
-mul_ggsw_glwe(Torus *accumulator, double2 *fft, double2 *mask_join_buffer,
-              double2 *body_join_buffer, double2 *bootstrapping_key,
-              int polynomial_size, int level_count, int iteration,
-              grid_group &grid) {
+__device__ void mul_ggsw_glwe(Torus *accumulator, double2 *fft,
+                              double2 *join_buffer, double2 *bootstrapping_key,
+                              int polynomial_size, uint32_t glwe_dimension,
+                              int level_count, int iteration,
+                              grid_group &grid) {

   // Switch to the FFT space
   NSMFFT_direct<HalfDegree<params>>(fft);
@@ -40,52 +40,50 @@ mul_ggsw_glwe(Torus *accumulator, double2 *fft, double2 *mask_join_buffer,
   // this function, so we end up getting the lines of the bootstrapping key
   // needed to perform the external product in this block (corresponding to
   // the same decomposition level)
+  auto bsk_slice = get_ith_mask_kth_block(
+      bootstrapping_key, iteration, blockIdx.y, blockIdx.x, polynomial_size,
+      glwe_dimension, level_count);
-  auto bsk_mask_slice =
-      get_ith_mask_kth_block(bootstrapping_key, iteration, blockIdx.y,
-                             blockIdx.x, polynomial_size, 1, level_count);
-  auto bsk_body_slice =
-      get_ith_body_kth_block(bootstrapping_key, iteration, blockIdx.y,
-                             blockIdx.x, polynomial_size, 1, level_count);
+  // Selects all GLWEs in a particular decomposition level
+  auto level_join_buffer =
+      join_buffer + blockIdx.x * (glwe_dimension + 1) * params::degree / 2;

   // Perform the matrix multiplication between the GGSW and the GLWE,
   // each block operating on a single level for mask and body
-  auto first_processed_bsk =
-      (blockIdx.y == 0) ? bsk_mask_slice : bsk_body_slice;
-  auto second_processed_bsk =
-      (blockIdx.y == 0) ? bsk_body_slice : bsk_mask_slice;
-
-  auto first_processed_acc =
-      (blockIdx.y == 0) ? &mask_join_buffer[params::degree / 2 * blockIdx.x]
-                        : &body_join_buffer[params::degree / 2 * blockIdx.x];
-  auto second_processed_acc =
-      (blockIdx.y == 0) ? &body_join_buffer[params::degree / 2 * blockIdx.x]
-                        : &mask_join_buffer[params::degree / 2 * blockIdx.x];
+  // The first product is used to initialize level_join_buffer
+  auto bsk_poly = bsk_slice + blockIdx.y * params::degree / 2;
+  auto buffer_slice = level_join_buffer + blockIdx.y * params::degree / 2;

   int tid = threadIdx.x;
-
-  // first product
   for (int i = 0; i < params::opt / 2; i++) {
-    first_processed_acc[tid] = fft[tid] * first_processed_bsk[tid];
+    buffer_slice[tid] = fft[tid] * bsk_poly[tid];
     tid += params::degree / params::opt;
   }

   grid.sync();
-  tid = threadIdx.x;
-  // second product
-  for (int i = 0; i < params::opt / 2; i++) {
-    second_processed_acc[tid] += fft[tid] * second_processed_bsk[tid];
-    tid += params::degree / params::opt;
+
+  // Continues multiplying fft by every polynomial in that particular bsk level
+  // Each y-block accumulates in a different polynomial at each iteration
+  for (int j = 1; j < (glwe_dimension + 1); j++) {
+    int idx = (j + blockIdx.y) % (glwe_dimension + 1);
+
+    auto bsk_poly = bsk_slice + idx * params::degree / 2;
+    auto buffer_slice = level_join_buffer + idx * params::degree / 2;
+
+    int tid = threadIdx.x;
+    for (int i = 0; i < params::opt / 2; i++) {
+      buffer_slice[tid] += fft[tid] * bsk_poly[tid];
+      tid += params::degree / params::opt;
+    }
+    grid.sync();
   }

   // -----------------------------------------------------------------
+  // All blocks are synchronized here; after this sync, level_join_buffer has
+  // the values needed from every other block

-  // All blocks are synchronized here; after this sync, *_join_buffer has the
-  // values needed from every other block
-  grid.sync();
-
-  auto src_acc = (blockIdx.y == 0) ? mask_join_buffer : body_join_buffer;
+  auto src_acc = join_buffer + blockIdx.y * params::degree / 2;

   // copy first product into fft buffer
   tid = threadIdx.x;
@@ -97,7 +95,7 @@ mul_ggsw_glwe(Torus *accumulator, double2 *fft, double2 *mask_join_buffer,

   // accumulate rest of the products into fft buffer
   for (int l = 1; l < gridDim.x; l++) {
-    auto cur_src_acc = &src_acc[l * params::degree / 2];
+    auto cur_src_acc = &src_acc[l * (glwe_dimension + 1) * params::degree / 2];
     tid = threadIdx.x;
     for (int i = 0; i < params::opt / 2; i++) {
       fft[tid] += cur_src_acc[tid];
@@ -121,19 +119,22 @@ template <typename Torus, class params, sharedMemDegree SMD>
 /*
  * Kernel launched by the low latency version of the
  * bootstrapping, that uses cooperative groups
- * lwe_array_out vector of output lwe s, with length
- * (polynomial_size+1)*num_samples lut_vector - vector of look up tables with
- * length polynomial_size * num_samples lut_vector_indexes - mapping between
- * lwe_array_in and lut_vector lwe_array_in
- * - vector of lwe inputs with length (lwe_dimension + 1) * num_samples
+ * - lwe_array_out: vector of output LWEs, with length
+ *   (glwe_dimension * polynomial_size + 1) * num_samples
+ * - lut_vector: vector of look up tables with
+ *   length (glwe_dimension + 1) * polynomial_size * num_samples
+ * - lut_vector_indexes: mapping between lwe_array_in and lut_vector
+ * - lwe_array_in: vector of LWE inputs with length (lwe_dimension + 1) *
+ *   num_samples
+ *
+ * Each y-block computes one element of the lwe_array_out.
  */
 __global__ void device_bootstrap_low_latency(
     Torus *lwe_array_out, Torus *lut_vector, Torus *lwe_array_in,
-    double2 *bootstrapping_key, double2 *mask_join_buffer,
-    double2 *body_join_buffer, uint32_t lwe_dimension, uint32_t polynomial_size,
-    uint32_t base_log, uint32_t level_count, char *device_mem,
-    int device_memory_size_per_block) {
+    double2 *bootstrapping_key, double2 *join_buffer, uint32_t lwe_dimension,
+    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
+    char *device_mem, int device_memory_size_per_block) {

   grid_group grid = this_grid();
@@ -144,6 +145,7 @@ __global__ void device_bootstrap_low_latency(
   char *selected_memory;
   int block_index = blockIdx.x + blockIdx.y * gridDim.x +
                     blockIdx.z * gridDim.x * gridDim.y;
+  uint32_t glwe_dimension = gridDim.y - 1;

   if constexpr (SMD == FULLSM)
     selected_memory = sharedmem;
@@ -163,12 +165,12 @@ __global__ void device_bootstrap_low_latency(
   // this block is operating, in the case of batch bootstraps
   auto block_lwe_array_in = &lwe_array_in[blockIdx.z * (lwe_dimension + 1)];

-  auto block_lut_vector = &lut_vector[blockIdx.z * params::degree * 2];
+  auto block_lut_vector =
+      &lut_vector[blockIdx.z * params::degree * (glwe_dimension + 1)];

-  auto block_mask_join_buffer =
-      &mask_join_buffer[blockIdx.z * level_count * params::degree / 2];
-  auto block_body_join_buffer =
-      &body_join_buffer[blockIdx.z * level_count * params::degree / 2];
+  auto block_join_buffer =
+      &join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
+                   params::degree / 2];

   // Since the space is L1 cache is small, we use the same memory location for
   // the rotated accumulator and the fft accumulator, since we know that the
@@ -179,15 +181,10 @@ __global__ void device_bootstrap_low_latency(
   rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
                         2 * params::degree);

-  if (blockIdx.y == 0) {
-    divide_by_monomial_negacyclic_inplace<Torus, params::opt,
-                                          params::degree / params::opt>(
-        accumulator, block_lut_vector, b_hat, false, 1);
-  } else {
-    divide_by_monomial_negacyclic_inplace<Torus, params::opt,
-                                          params::degree / params::opt>(
-        accumulator, &block_lut_vector[params::degree], b_hat, false, 1);
-  }
+  divide_by_monomial_negacyclic_inplace<Torus, params::opt,
+                                        params::degree / params::opt>(
+      accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
+      false);

   for (int i = 0; i < lwe_dimension; i++) {
     synchronize_threads_in_block();
@@ -200,13 +197,13 @@ __global__ void device_bootstrap_low_latency(
     // Perform ACC * (X^ä - 1)
     multiply_by_monomial_negacyclic_and_sub_polynomial<
         Torus, params::opt, params::degree / params::opt>(
-        accumulator, accumulator_rotated, a_hat, 1);
+        accumulator, accumulator_rotated, a_hat);

     // Perform a rounding to increase the accuracy of the
     // bootstrapped ciphertext
     round_to_closest_multiple_inplace<Torus, params::opt,
                                       params::degree / params::opt>(
-        accumulator_rotated, base_log, level_count, 1);
+        accumulator_rotated, base_log, level_count);

     synchronize_threads_in_block();
@@ -214,29 +211,33 @@ __global__ void device_bootstrap_low_latency(
     // decomposition, for the mask and the body (so block 0 will have the
     // accumulator decomposed at level 0, 1 at 1, etc.)
     GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
-                                           accumulator_rotated, 1);
+                                           accumulator_rotated);
     gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);

     // We are using the same memory space for accumulator_fft and
     // accumulator_rotated, so we need to synchronize here to make sure they
     // don't modify the same memory space at the same time
     synchronize_threads_in_block();
+
     // Perform G^-1(ACC) * GGSW -> GLWE
-    mul_ggsw_glwe<Torus, params>(accumulator, accumulator_fft,
-                                 block_mask_join_buffer, block_body_join_buffer,
-                                 bootstrapping_key, polynomial_size,
-                                 level_count, i, grid);
+    mul_ggsw_glwe<Torus, params>(
+        accumulator, accumulator_fft, block_join_buffer, bootstrapping_key,
+        polynomial_size, glwe_dimension, level_count, i, grid);
+
+    synchronize_threads_in_block();
   }

-  auto block_lwe_array_out = &lwe_array_out[blockIdx.z * (polynomial_size + 1)];
+  auto block_lwe_array_out =
+      &lwe_array_out[blockIdx.z * (glwe_dimension * polynomial_size + 1) +
+                     blockIdx.y * polynomial_size];

-  if (blockIdx.x == 0 && blockIdx.y == 0) {
+  if (blockIdx.x == 0 && blockIdx.y < glwe_dimension) {
     // Perform a sample extract. At this point, all blocks have the result, but
     // we do the computation at block 0 to avoid waiting for extra blocks, in
     // case they're not synchronized
-    sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator, 1);
-  } else if (blockIdx.x == 0 && blockIdx.y == 1) {
-    block_lwe_array_out[params::degree] = accumulator[0];
+    sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
+  } else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
+    sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
   }
 }
@@ -248,7 +249,7 @@ template <typename Torus, class params>
 __host__ void host_bootstrap_low_latency(
     void *v_stream, uint32_t gpu_index, Torus *lwe_array_out, Torus *lut_vector,
     uint32_t *lut_vector_indexes, Torus *lwe_array_in,
-    double2 *bootstrapping_key, uint32_t lwe_dimension,
+    double2 *bootstrapping_key, uint32_t glwe_dimension, uint32_t lwe_dimension,
     uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
     uint32_t input_lwe_ciphertext_count, uint32_t num_lut_vectors,
     uint32_t max_shared_memory) {
@@ -258,10 +259,8 @@ __host__ void host_bootstrap_low_latency(

   int buffer_size_per_gpu = level_count * input_lwe_ciphertext_count *
                             polynomial_size / 2 * sizeof(double2);
-  double2 *mask_buffer_fft =
-      (double2 *)cuda_malloc_async(buffer_size_per_gpu, stream, gpu_index);
-  double2 *body_buffer_fft =
-      (double2 *)cuda_malloc_async(buffer_size_per_gpu, stream, gpu_index);
+  double2 *buffer_fft = (double2 *)cuda_malloc_async(
+      (glwe_dimension + 1) * buffer_size_per_gpu, stream, gpu_index);

   // With SM each block corresponds to either the mask or body, no need to
   // duplicate data for each
@@ -279,35 +278,34 @@ __host__ void host_bootstrap_low_latency(
   char *d_mem;

   int thds = polynomial_size / params::opt;
-  dim3 grid(level_count, 2, input_lwe_ciphertext_count);
+  dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count);

-  void *kernel_args[12];
+  void *kernel_args[11];
   kernel_args[0] = &lwe_array_out;
   kernel_args[1] = &lut_vector;
   kernel_args[2] = &lwe_array_in;
   kernel_args[3] = &bootstrapping_key;
-  kernel_args[4] = &mask_buffer_fft;
-  kernel_args[5] = &body_buffer_fft;
-  kernel_args[6] = &lwe_dimension;
-  kernel_args[7] = &polynomial_size;
-  kernel_args[8] = &base_log;
-  kernel_args[9] = &level_count;
-  kernel_args[10] = &d_mem;
+  kernel_args[4] = &buffer_fft;
+  kernel_args[5] = &lwe_dimension;
+  kernel_args[6] = &polynomial_size;
+  kernel_args[7] = &base_log;
+  kernel_args[8] = &level_count;
+  kernel_args[9] = &d_mem;

   if (max_shared_memory < SM_PART) {
-    kernel_args[11] = &DM_FULL;
+    kernel_args[10] = &DM_FULL;
     check_cuda_error(cudaGetLastError());
     d_mem = (char *)cuda_malloc_async(DM_FULL * input_lwe_ciphertext_count *
-                                          level_count * 2,
+                                          level_count * (glwe_dimension + 1),
                                       stream, gpu_index);
     check_cuda_error(cudaGetLastError());
     check_cuda_error(cudaLaunchCooperativeKernel(
         (void *)device_bootstrap_low_latency<Torus, params, NOSM>, grid, thds,
         (void **)kernel_args, 0, *stream));
   } else if (max_shared_memory < SM_FULL) {
-    kernel_args[11] = &DM_PART;
+    kernel_args[10] = &DM_PART;
     d_mem = (char *)cuda_malloc_async(DM_PART * input_lwe_ciphertext_count *
-                                          level_count * 2,
+                                          level_count * (glwe_dimension + 1),
                                       stream, gpu_index);
     check_cuda_error(cudaFuncSetAttribute(
         device_bootstrap_low_latency<Torus, params, PARTIALSM>,
@@ -322,7 +320,7 @@ __host__ void host_bootstrap_low_latency(
   } else {
     int DM_NONE = 0;
-    kernel_args[11] = &DM_NONE;
+    kernel_args[10] = &DM_NONE;
     d_mem = (char *)cuda_malloc_async(0, stream, gpu_index);
     check_cuda_error(cudaFuncSetAttribute(
         device_bootstrap_low_latency<Torus, params, FULLSM>,
@@ -337,8 +335,7 @@ __host__ void host_bootstrap_low_latency(
   check_cuda_error(cudaGetLastError());

   // Synchronize the streams before copying the result to lwe_array_out at the
   // right place
-  cuda_drop_async(mask_buffer_fft, stream, gpu_index);
-  cuda_drop_async(body_buffer_fft, stream, gpu_index);
+  cuda_drop_async(buffer_fft, stream, gpu_index);
   cuda_drop_async(d_mem, stream, gpu_index);
 }
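The subtle part of the rewritten `mul_ggsw_glwe` is the index rotation `(j + blockIdx.y) % (glwe_dimension + 1)`: between two `grid.sync()` calls, the k + 1 y-blocks of a decomposition level write to pairwise distinct polynomials of `level_join_buffer`, so the (k + 1)^2 GGSW-row-by-GLWE-polynomial products are accumulated without atomics. A cleartext check of that schedule (my own sketch, not code from the patch):

```cpp
// Verifies the round-robin write schedule used by mul_ggsw_glwe: at step j,
// y-block b targets polynomial (j + b) % (k + 1). Every step covers all
// targets exactly once, so accumulation between grid syncs is race-free.
#include <cassert>
#include <set>

int main() {
  const int k_plus_1 = 3; // glwe_dimension + 1, e.g. k = 2
  for (int j = 0; j < k_plus_1; j++) { // steps separated by grid.sync()
    std::set<int> targets;
    for (int b = 0; b < k_plus_1; b++) // one y-block per GGSW row
      targets.insert((j + b) % k_plus_1);
    assert((int)targets.size() == k_plus_1); // pairwise-disjoint writes
  }
  return 0;
}
```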
diff --git a/src/circuit_bootstrap.cu b/src/circuit_bootstrap.cu
index df3622ec7..417084a9c 100644
--- a/src/circuit_bootstrap.cu
+++ b/src/circuit_bootstrap.cu
@@ -13,8 +13,6 @@ void cuda_circuit_bootstrap_32(
     uint32_t level_bsk, uint32_t base_log_bsk, uint32_t level_pksk,
     uint32_t base_log_pksk, uint32_t level_cbs, uint32_t base_log_cbs,
     uint32_t number_of_samples, uint32_t max_shared_memory) {
-  assert(("Error (GPU circuit bootstrap): glwe_dimension should be equal to 1",
-          glwe_dimension == 1));
   assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
          "512, 1024, 2048, 4096, 8192",
          polynomial_size == 512 || polynomial_size == 1024 ||
@@ -137,8 +135,6 @@ void cuda_circuit_bootstrap_64(
     uint32_t level_bsk, uint32_t base_log_bsk, uint32_t level_pksk,
     uint32_t base_log_pksk, uint32_t level_cbs, uint32_t base_log_cbs,
     uint32_t number_of_samples, uint32_t max_shared_memory) {
-  assert(("Error (GPU circuit bootstrap): glwe_dimension should be equal to 1",
-          glwe_dimension == 1));
   assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
          "512, 1024, 2048, 4096, 8192",
          polynomial_size == 512 || polynomial_size == 1024 ||
diff --git a/src/crypto/gadget.cuh b/src/crypto/gadget.cuh
index 863288884..51fe9d2eb 100644
--- a/src/crypto/gadget.cuh
+++ b/src/crypto/gadget.cuh
@@ -12,6 +12,7 @@
 * arbitrary amount of levels by using decompose_and_compress_level().
 *
 * This class always decomposes the entire set of num_poly polynomials.
+ * By default, it works on a single polynomial.
 */
 #pragma once
 template <typename T, class params> class GadgetMatrix {
@@ -28,7 +29,7 @@ private:

 public:
   __device__ GadgetMatrix(uint32_t base_log, uint32_t level_count, T *state,
-                          uint32_t num_poly)
+                          uint32_t num_poly = 1)
       : base_log(base_log), level_count(level_count), num_poly(num_poly),
         state(state) {
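`GadgetMatrix`, with its new `num_poly = 1` default, still performs the usual balanced signed decomposition underneath. For reference, a scalar model of what one coefficient goes through — closest-multiple rounding as in `round_to_closest_multiple_inplace`, then one signed digit per level. This is my own sketch of the standard technique, not code from the repository:

```cpp
// Scalar sketch of gadget decomposition: round a 64-bit torus value to the
// closest multiple of 2^(64 - B*L), then emit L signed digits in
// [-2^(B-1), 2^(B-1)) for base 2^B. All arithmetic is mod 2^64.
#include <cassert>
#include <cstdint>
#include <vector>

std::vector<int64_t> gadget_decompose(uint64_t v, int base_log, int levels) {
  const int shift = 64 - base_log * levels;
  uint64_t state = (v + (1ull << (shift - 1))) >> shift; // closest multiple
  const uint64_t base = 1ull << base_log, half = base / 2;
  std::vector<int64_t> digits(levels);
  uint64_t carry = 0;
  for (int i = levels - 1; i >= 0; --i) { // least significant level first
    uint64_t d = (state & (base - 1)) + carry;
    state >>= base_log;
    carry = d >= half;                                    // balance the digit
    digits[i] = (int64_t)d - (carry ? (int64_t)base : 0); // in [-B/2, B/2)
  }
  return digits; // digits[i] scales 2^(64 - B*(i+1))
}

int main() {
  const int B = 4, L = 3;
  uint64_t v = 0x123456789abcdef0ull;
  auto d = gadget_decompose(v, B, L);
  uint64_t recon = 0;
  for (int i = 0; i < L; ++i)
    recon += (uint64_t)d[i] << (64 - B * (i + 1));
  uint64_t err = recon - v; // mod 2^64, so check both directions
  uint64_t bound = 1ull << (64 - B * L - 1);
  assert(err <= bound || (0 - err) <= bound); // only rounding error remains
  return 0;
}
```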
diff --git a/src/polynomial/functions.cuh b/src/polynomial/functions.cuh
index 1076bf755..dc86be2e5 100644
--- a/src/polynomial/functions.cuh
+++ b/src/polynomial/functions.cuh
@@ -61,11 +61,13 @@ __device__ void add_polynomial_inplace_low_lat(T *source, T *dst, int p_id) {
 * Performs acc = acc * (X^ä + 1) if zeroAcc = false
 * Performs acc = 0 if zeroAcc
 * takes single buffer and calculates inplace.
+ *
+ * By default, it works on a single polynomial.
 */
 template <typename T, int elems_per_thread, int block_size>
 __device__ void divide_by_monomial_negacyclic_inplace(T *accumulator, T *input,
                                                       uint32_t j, bool zeroAcc,
-                                                      uint32_t num_poly) {
+                                                      uint32_t num_poly = 1) {
   constexpr int degree = block_size * elems_per_thread;
   for (int z = 0; z < num_poly; z++) {
     T *accumulator_slice = (T *)accumulator + (ptrdiff_t)(z * degree);
@@ -105,10 +107,12 @@ __device__ void divide_by_monomial_negacyclic_inplace(T *accumulator, T *input,
 *
 * Performs result_acc = acc * (X^ä - 1) - acc
 * takes single buffer as input and returns a single rotated buffer
+ *
+ * By default, it works on a single polynomial.
 */
 template <typename T, int elems_per_thread, int block_size>
 __device__ void multiply_by_monomial_negacyclic_and_sub_polynomial(
-    T *acc, T *result_acc, uint32_t j, uint32_t num_poly) {
+    T *acc, T *result_acc, uint32_t j, uint32_t num_poly = 1) {
   constexpr int degree = block_size * elems_per_thread;
   for (int z = 0; z < num_poly; z++) {
     T *acc_slice = (T *)acc + (ptrdiff_t)(z * degree);
@@ -138,11 +142,13 @@ __device__ void multiply_by_monomial_negacyclic_and_sub_polynomial(
 /*
 * Receives num_poly concatenated polynomials of type T. For each performs a
 * rounding to increase accuracy of the PBS. Calculates inplace.
+ *
+ * By default, it works on a single polynomial.
 */
 template <typename T, int elems_per_thread, int block_size>
 __device__ void round_to_closest_multiple_inplace(T *rotated_acc, int base_log,
                                                   int level_count,
-                                                  uint32_t num_poly) {
+                                                  uint32_t num_poly = 1) {
   constexpr int degree = block_size * elems_per_thread;
   for (int z = 0; z < num_poly; z++) {
     T *rotated_acc_slice = (T *)rotated_acc + (ptrdiff_t)(z * degree);
@@ -192,20 +198,21 @@ __device__ void add_to_torus(double2 *m_values, Torus *result) {
   }
 }

-// Extracts the body of a GLWE with dimension glwe_dimension
+// Extracts the body of a GLWE.
+// k is the offset to find the body element / polynomial in the lwe_array_out /
+// accumulator
 template <typename Torus, class params>
 __device__ void sample_extract_body(Torus *lwe_array_out, Torus *accumulator,
-                                    uint32_t glwe_dimension) {
+                                    uint32_t k) {
   // Set first coefficient of the accumulator as the body of the LWE sample
-  lwe_array_out[glwe_dimension * params::degree] =
-      accumulator[glwe_dimension * params::degree];
+  lwe_array_out[k * params::degree] = accumulator[k * params::degree];
 }

-// Extracts the mask of a GLWE with dimension glwe_dimension
+// Extracts the mask from num_poly polynomials individually
 template <typename Torus, class params>
 __device__ void sample_extract_mask(Torus *lwe_array_out, Torus *accumulator,
-                                    uint32_t glwe_dimension) {
-  for (int z = 0; z < glwe_dimension; z++) {
+                                    uint32_t num_poly = 1) {
+  for (int z = 0; z < num_poly; z++) {
     Torus *lwe_array_out_slice =
         (Torus *)lwe_array_out + (ptrdiff_t)(z * params::degree);
     Torus *accumulator_slice =
diff --git a/src/vertical_packing.cu b/src/vertical_packing.cu
index 9f148f9fd..6c6a9fb2d 100644
--- a/src/vertical_packing.cu
+++ b/src/vertical_packing.cu
@@ -17,8 +17,6 @@ void cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
          polynomial_size == 2048 || polynomial_size == 4096 ||
          polynomial_size == 8192));
   // For larger k we will need to adjust the mask size
-  assert(("Error (GPU Cmux tree): glwe_dimension should be equal to 1",
-          glwe_dimension == 1));
   assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should "
          "be >= 1 ",
          r >= 1));
@@ -99,8 +97,6 @@ void cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
          polynomial_size == 2048 || polynomial_size == 4096 ||
          polynomial_size == 8192));
   // For larger k we will need to adjust the mask size
-  assert(("Error (GPU Cmux tree): glwe_dimension should be equal to 1",
-          glwe_dimension == 1));
   assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should "
          "be >= 1 ",
          r >= 1));
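For context on the `r >= 1` check kept above: the cmux tree consumes 2^r lookup-table polynomials and r selector bits (GGSW ciphertexts on the GPU), halving the candidate list at each level. A cleartext analogue of that data flow, with plain integer polynomials standing in for ciphertexts — my own sketch, not library code:

```cpp
// Cleartext cmux tree: r LSB-first selector bits pick one of 2^r LUT
// polynomials by folding the list pairwise, level by level. On the GPU each
// fold is an encrypted CMUX; the selection logic is the same.
#include <cassert>
#include <cstdint>
#include <vector>

using Poly = std::vector<uint64_t>;

Poly cmux_tree(std::vector<Poly> luts, const std::vector<int> &bits) {
  assert(luts.size() == (size_t)1 << bits.size()); // needs 2^r inputs, r >= 1
  for (int b : bits) { // one tree level per selector bit
    std::vector<Poly> next;
    for (size_t i = 0; i + 1 < luts.size(); i += 2)
      next.push_back(b ? luts[i + 1] : luts[i]); // CMUX(b, hi, lo)
    luts = std::move(next);
  }
  return luts[0]; // the LUT whose index has binary digits `bits`
}
```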
assert(("Error (GPU circuit bootstrap): polynomial_size should be one of " "512, 1024, 2048, 4096, 8192", polynomial_size == 512 || polynomial_size == 1024 || @@ -149,8 +147,6 @@ void cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index, void *lwe_array_out, uint32_t number_of_bits_of_message_including_padding, uint32_t number_of_bits_to_extract, uint32_t number_of_inputs, uint32_t max_shared_memory) { - assert(("Error (GPU WOP PBS): glwe_dimension should be equal to 1", - glwe_dimension == 1)); assert(("Error (GPU WOP PBS): polynomial_size should be one of " "512, 1024, 2048, 4096, 8192", polynomial_size == 512 || polynomial_size == 1024 || diff --git a/src/wop_bootstrap.cuh b/src/wop_bootstrap.cuh index 9a8bb9acb..1760cad22 100644 --- a/src/wop_bootstrap.cuh +++ b/src/wop_bootstrap.cuh @@ -174,8 +174,8 @@ __host__ void host_wop_pbs( lwe_array_in_buffer, lwe_array_in_shifted_buffer, lwe_array_out_ks_buffer, lwe_array_out_pbs_buffer, lut_pbs, lut_vector_indexes, ksk, fourier_bsk, number_of_bits_to_extract, delta_log, polynomial_size, lwe_dimension, - base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk, - number_of_inputs, max_shared_memory); + glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk, + level_count_ksk, number_of_inputs, max_shared_memory); check_cuda_error(cudaGetLastError()); cuda_drop_async(lut_pbs, stream, gpu_index); cuda_drop_async(lut_vector_indexes, stream, gpu_index);