fix(gpu): general fixes and improvements to PBS

- update pbs test parameters to match tfhe-rs' integer tests - refactor mul_ggsw_glwe to make it easier to read - fix the way we accumulate the external product result on multi-bit PBS
2026-01-10 07:08:03 -05:00 · 2024-11-04 15:12:28 -03:00
parent eac30027e9
commit b041608d25
15 changed files with 406 additions and 561 deletions
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h
@@ -106,7 +106,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
  uint32_t lwe_chunk_size;
  double2 *keybundle_fft;
  Torus *global_accumulator;
-  double2 *global_accumulator_fft;
+  double2 *global_join_buffer;

  PBS_VARIANT pbs_variant;

@@ -225,10 +225,12 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
          num_blocks_keybundle * (polynomial_size / 2) * sizeof(double2),
          stream, gpu_index);
      global_accumulator = (Torus *)cuda_malloc_async(
-          num_blocks_acc_step_one * polynomial_size * sizeof(Torus), stream,
-          gpu_index);
-      global_accumulator_fft = (double2 *)cuda_malloc_async(
-          num_blocks_acc_step_one * (polynomial_size / 2) * sizeof(double2),
+          input_lwe_ciphertext_count * (glwe_dimension + 1) * polynomial_size *
+              sizeof(Torus),
+          stream, gpu_index);
+      global_join_buffer = (double2 *)cuda_malloc_async(
+          level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count *
+              (polynomial_size / 2) * sizeof(double2),
          stream, gpu_index);
    }
  }
@@ -260,7 +262,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {

    cuda_drop_async(keybundle_fft, stream, gpu_index);
    cuda_drop_async(global_accumulator, stream, gpu_index);
-    cuda_drop_async(global_accumulator_fft, stream, gpu_index);
+    cuda_drop_async(global_join_buffer, stream, gpu_index);
  }
 };

--- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h
@@ -69,7 +69,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
  int8_t *d_mem;

  Torus *global_accumulator;
-  double2 *global_accumulator_fft;
+  double2 *global_join_buffer;

  PBS_VARIANT pbs_variant;

@@ -114,7 +114,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
        // Otherwise, both kernels run all in shared memory
        d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);

-        global_accumulator_fft = (double2 *)cuda_malloc_async(
+        global_join_buffer = (double2 *)cuda_malloc_async(
            (glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
                (polynomial_size / 2) * sizeof(double2),
            stream, gpu_index);
@@ -147,7 +147,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
        // Otherwise, both kernels run all in shared memory
        d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);

-        global_accumulator_fft = (double2 *)cuda_malloc_async(
+        global_join_buffer = (double2 *)cuda_malloc_async(
            (glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
                polynomial_size / 2 * sizeof(double2),
            stream, gpu_index);
@@ -194,7 +194,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
        // Otherwise, both kernels run all in shared memory
        d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);

-        global_accumulator_fft = (double2 *)cuda_malloc_async(
+        global_join_buffer = (double2 *)cuda_malloc_async(
            (glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
                polynomial_size / 2 * sizeof(double2),
            stream, gpu_index);
@@ -208,7 +208,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {

  void release(cudaStream_t stream, uint32_t gpu_index) {
    cuda_drop_async(d_mem, stream, gpu_index);
-    cuda_drop_async(global_accumulator_fft, stream, gpu_index);
+    cuda_drop_async(global_join_buffer, stream, gpu_index);

    if (pbs_variant == DEFAULT)
      cuda_drop_async(global_accumulator, stream, gpu_index);
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/gadget.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/gadget.cuh
@@ -1,6 +1,7 @@
 #ifndef CNCRT_CRYPTO_CUH
 #define CNCRT_CRPYTO_CUH

+#include "crypto/torus.cuh"
 #include "device.h"
 #include <cstdint>

@@ -21,7 +22,6 @@ private:
  uint32_t base_log;
  uint32_t mask;
  uint32_t num_poly;
-  int current_level;
  T mask_mod_b;
  T *state;

@@ -32,7 +32,6 @@ public:
        state(state) {

    mask_mod_b = (1ll << base_log) - 1ll;
-    current_level = level_count;
    int tid = threadIdx.x;
    for (int i = 0; i < num_poly * params::opt; i++) {
      state[tid] >>= (sizeof(T) * 8 - base_log * level_count);
@@ -52,8 +51,6 @@ public:
  // Decomposes a single polynomial
  __device__ void decompose_and_compress_next_polynomial(double2 *result,
                                                         int j) {
-    if (j == 0)
-      current_level -= 1;

    int tid = threadIdx.x;
    auto state_slice = state + j * params::degree;
@@ -72,8 +69,8 @@ public:
      res_re -= carry_re << base_log;
      res_im -= carry_im << base_log;

-      result[tid].x = (int32_t)res_re;
-      result[tid].y = (int32_t)res_im;
+      typecast_torus_to_double(res_re, result[tid].x);
+      typecast_torus_to_double(res_im, result[tid].y);

      tid += params::degree / params::opt;
    }
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
@@ -1,6 +1,7 @@
 #ifndef CNCRT_TORUS_CUH
 #define CNCRT_TORUS_CUH

+#include "device.h"
 #include "polynomial/parameters.cuh"
 #include "types/int128.cuh"
 #include "utils/kernel_dimensions.cuh"
@@ -43,6 +44,21 @@ __device__ inline void typecast_double_round_to_torus(double x, T &r) {
  typecast_double_to_torus(round(frac), r);
 }

+template <typename T>
+__device__ inline void typecast_torus_to_double(T x, double &r);
+
+template <>
+__device__ inline void typecast_torus_to_double<uint32_t>(uint32_t x,
+                                                          double &r) {
+  r = __int2double_rn(x);
+}
+
+template <>
+__device__ inline void typecast_torus_to_double<uint64_t>(uint64_t x,
+                                                          double &r) {
+  r = __ll2double_rn(x);
+}
+
 template <typename T>
 __device__ inline T round_to_closest_multiple(T x, uint32_t base_log,
                                              uint32_t level_count) {
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
@@ -7,6 +7,7 @@
 #include "fft/bnsmfft.cuh"
 #include "helper_multi_gpu.h"
 #include "pbs/programmable_bootstrap_multibit.h"
+#include "polynomial/polynomial_math.cuh"

 using namespace cooperative_groups;
 namespace cg = cooperative_groups;
@@ -20,59 +21,43 @@ get_join_buffer_element(int level_id, int glwe_id, G &group,
                        double2 *global_memory_buffer, uint32_t polynomial_size,
                        uint32_t glwe_dimension, bool support_dsm);

-template <typename Torus, typename G, class params>
+/** Perform the matrix multiplication between the GGSW and the GLWE,
+ * each block operating on a single level for mask and body.
+ * Both operands should be at fourier domain
+ *
+ * This function assumes:
+ *  - Thread blocks at dimension x relates to the decomposition level.
+ *  - Thread blocks at dimension y relates to the glwe dimension.
+ *  - polynomial_size / params::opt threads are available per block
+ */
+template <typename G, class params>
 __device__ void
-mul_ggsw_glwe(Torus *accumulator, double2 *fft, double2 *join_buffer,
-              const double2 *__restrict__ bootstrapping_key,
-              int polynomial_size, uint32_t glwe_dimension, int level_count,
-              int iteration, G &group, bool support_dsm = false) {
-
-  // Switch to the FFT space
-  NSMFFT_direct<HalfDegree<params>>(fft);
-  synchronize_threads_in_block();
-
-  // Get the pieces of the bootstrapping key that will be needed for the
-  // external product; blockIdx.x is the ID of the block that's executing
-  // this function, so we end up getting the lines of the bootstrapping key
-  // needed to perform the external product in this block (corresponding to
-  // the same decomposition level)
-  auto bsk_slice = get_ith_mask_kth_block(
-      bootstrapping_key, iteration, blockIdx.y, blockIdx.x, polynomial_size,
-      glwe_dimension, level_count);
-
-  // Perform the matrix multiplication between the GGSW and the GLWE,
-  // each block operating on a single level for mask and body
+mul_ggsw_glwe_in_fourier_domain(double2 *fft, double2 *join_buffer,
+                                const double2 *__restrict__ bootstrapping_key,
+                                int iteration, G &group,
+                                bool support_dsm = false) {
+  const uint32_t polynomial_size = params::degree;
+  const uint32_t glwe_dimension = gridDim.y - 1;
+  const uint32_t level_count = gridDim.x;

  // The first product is used to initialize level_join_buffer
-  auto bsk_poly = bsk_slice + blockIdx.y * params::degree / 2;
  auto this_block_rank = get_this_block_rank<G>(group, support_dsm);
-  auto buffer_slice =
-      get_join_buffer_element<G>(blockIdx.x, blockIdx.y, group, join_buffer,
-                                 polynomial_size, glwe_dimension, support_dsm);
-
-  int tid = threadIdx.x;
-  for (int i = 0; i < params::opt / 2; i++) {
-    buffer_slice[tid] = fft[tid] * bsk_poly[tid];
-    tid += params::degree / params::opt;
-  }
-
-  group.sync();

  // Continues multiplying fft by every polynomial in that particular bsk level
  // Each y-block accumulates in a different polynomial at each iteration
-  for (int j = 1; j < (glwe_dimension + 1); j++) {
+  auto bsk_slice = get_ith_mask_kth_block(
+      bootstrapping_key, iteration, blockIdx.y, blockIdx.x, polynomial_size,
+      glwe_dimension, level_count);
+  for (int j = 0; j < glwe_dimension + 1; j++) {
    int idx = (j + this_block_rank) % (glwe_dimension + 1);

-    auto bsk_poly = bsk_slice + idx * params::degree / 2;
+    auto bsk_poly = bsk_slice + idx * polynomial_size / 2;
    auto buffer_slice = get_join_buffer_element<G>(blockIdx.x, idx, group,
                                                   join_buffer, polynomial_size,
                                                   glwe_dimension, support_dsm);

-    int tid = threadIdx.x;
-    for (int i = 0; i < params::opt / 2; i++) {
-      buffer_slice[tid] += fft[tid] * bsk_poly[tid];
-      tid += params::degree / params::opt;
-    }
+    polynomial_product_accumulate_in_fourier_domain<params, double2>(
+        buffer_slice, fft, bsk_poly, j == 0);
    group.sync();
  }

@@ -80,40 +65,16 @@ mul_ggsw_glwe(Torus *accumulator, double2 *fft, double2 *join_buffer,
  // All blocks are synchronized here; after this sync, level_join_buffer has
  // the values needed from every other block

-  auto src_acc =
-      get_join_buffer_element<G>(0, blockIdx.y, group, join_buffer,
-                                 polynomial_size, glwe_dimension, support_dsm);
-
-  // copy first product into fft buffer
-  tid = threadIdx.x;
-  for (int i = 0; i < params::opt / 2; i++) {
-    fft[tid] = src_acc[tid];
-    tid += params::degree / params::opt;
-  }
-  synchronize_threads_in_block();
-
  // accumulate rest of the products into fft buffer
-  for (int l = 1; l < gridDim.x; l++) {
+  for (int l = 0; l < level_count; l++) {
    auto cur_src_acc = get_join_buffer_element<G>(l, blockIdx.y, group,
                                                  join_buffer, polynomial_size,
                                                  glwe_dimension, support_dsm);
-    tid = threadIdx.x;
-    for (int i = 0; i < params::opt / 2; i++) {
-      fft[tid] += cur_src_acc[tid];
-      tid += params::degree / params::opt;
-    }
+
+    polynomial_accumulate_in_fourier_domain<params>(fft, cur_src_acc, l == 0);
  }

  synchronize_threads_in_block();
-
-  // Perform the inverse FFT on the result of the GGSW x GLWE and add to the
-  // accumulator
-  NSMFFT_inverse<HalfDegree<params>>(fft);
-  synchronize_threads_in_block();
-
-  add_to_torus<Torus, params>(fft, accumulator);
-
-  __syncthreads();
 }

 template <typename Torus>
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
@@ -129,18 +129,16 @@ __global__ void device_programmable_bootstrap_cg(
    GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
                                           accumulator_rotated);
    gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
-
-    // We are using the same memory space for accumulator_fft and
-    // accumulator_rotated, so we need to synchronize here to make sure they
-    // don't modify the same memory space at the same time
+    NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
    synchronize_threads_in_block();

    // Perform G^-1(ACC) * GGSW -> GLWE
-    mul_ggsw_glwe<Torus, grid_group, params>(
-        accumulator, accumulator_fft, block_join_buffer, bootstrapping_key,
-        polynomial_size, glwe_dimension, level_count, i, grid);
-
+    mul_ggsw_glwe_in_fourier_domain<grid_group, params>(
+        accumulator_fft, block_join_buffer, bootstrapping_key, i, grid);
+    NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
    synchronize_threads_in_block();
+
+    add_to_torus<Torus, params>(accumulator_fft, accumulator);
  }

  auto block_lwe_array_out =
@@ -148,40 +146,42 @@ __global__ void device_programmable_bootstrap_cg(
                         (glwe_dimension * polynomial_size + 1) +
                     blockIdx.y * polynomial_size];

-  if (blockIdx.x == 0 && blockIdx.y < glwe_dimension) {
-    // Perform a sample extract. At this point, all blocks have the result, but
-    // we do the computation at block 0 to avoid waiting for extra blocks, in
-    // case they're not synchronized
-    sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
-    if (lut_count > 1) {
-      for (int i = 1; i < lut_count; i++) {
-        auto next_lwe_array_out =
-            lwe_array_out +
-            (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
-        auto next_block_lwe_array_out =
-            &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
-                                    (glwe_dimension * polynomial_size + 1) +
-                                blockIdx.y * polynomial_size];
+  if (blockIdx.x == 0) {
+    if (blockIdx.y < glwe_dimension) {
+      // Perform a sample extract. At this point, all blocks have the result,
+      // but we do the computation at block 0 to avoid waiting for extra blocks,
+      // in case they're not synchronized
+      sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
+      if (lut_count > 1) {
+        for (int i = 1; i < lut_count; i++) {
+          auto next_lwe_array_out =
+              lwe_array_out +
+              (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
+          auto next_block_lwe_array_out =
+              &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
+                                      (glwe_dimension * polynomial_size + 1) +
+                                  blockIdx.y * polynomial_size];

-        sample_extract_mask<Torus, params>(next_block_lwe_array_out,
-                                           accumulator, 1, i * lut_stride);
+          sample_extract_mask<Torus, params>(next_block_lwe_array_out,
+                                             accumulator, 1, i * lut_stride);
+        }
      }
-    }
-  } else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
-    sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
-    if (lut_count > 1) {
-      for (int i = 1; i < lut_count; i++) {
+    } else if (blockIdx.y == glwe_dimension) {
+      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
+      if (lut_count > 1) {
+        for (int i = 1; i < lut_count; i++) {

-        auto next_lwe_array_out =
-            lwe_array_out +
-            (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
-        auto next_block_lwe_array_out =
-            &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
-                                    (glwe_dimension * polynomial_size + 1) +
-                                blockIdx.y * polynomial_size];
+          auto next_lwe_array_out =
+              lwe_array_out +
+              (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
+          auto next_block_lwe_array_out =
+              &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
+                                      (glwe_dimension * polynomial_size + 1) +
+                                  blockIdx.y * polynomial_size];

-        sample_extract_body<Torus, params>(next_block_lwe_array_out,
-                                           accumulator, 0, i * lut_stride);
+          sample_extract_body<Torus, params>(next_block_lwe_array_out,
+                                             accumulator, 0, i * lut_stride);
+        }
      }
    }
  }
@@ -254,7 +254,7 @@ __host__ void host_programmable_bootstrap_cg(
  uint64_t partial_dm = full_dm - partial_sm;

  int8_t *d_mem = buffer->d_mem;
-  double2 *buffer_fft = buffer->global_accumulator_fft;
+  double2 *buffer_fft = buffer->global_join_buffer;

  int thds = polynomial_size / params::opt;
  dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count);
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
@@ -33,7 +33,6 @@ __global__ void __launch_bounds__(params::degree / params::opt)
        uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input,
        int8_t *device_mem, uint64_t device_memory_size_per_block,
        uint32_t lut_count, uint32_t lut_stride) {
-
  grid_group grid = this_grid();

  // We use shared memory for the polynomials that are used often during the
@@ -50,9 +49,9 @@ __global__ void __launch_bounds__(params::degree / params::opt)
    selected_memory = &device_mem[block_index * device_memory_size_per_block];
  }

-  Torus *accumulator = (Torus *)selected_memory;
+  Torus *accumulator_rotated = (Torus *)selected_memory;
  double2 *accumulator_fft =
-      (double2 *)accumulator +
+      (double2 *)accumulator_rotated +
      (ptrdiff_t)(sizeof(Torus) * polynomial_size / sizeof(double2));

  if constexpr (SMD == PARTIALSM)
@@ -71,13 +70,12 @@ __global__ void __launch_bounds__(params::degree / params::opt)
      &join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
                   params::degree / 2];

-  Torus *global_slice =
-      global_accumulator +
-      (blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree;
+  Torus *global_accumulator_slice =
+      &global_accumulator[(blockIdx.y + blockIdx.z * (glwe_dimension + 1)) *
+                          params::degree];

-  const double2 *keybundle = keybundle_array +
-                             // select the input
-                             blockIdx.z * keybundle_size_per_input;
+  const double2 *keybundle =
+      &keybundle_array[blockIdx.z * keybundle_size_per_input];

  if (lwe_offset == 0) {
    // Put "b" in [0, 2N[
@@ -87,12 +85,12 @@ __global__ void __launch_bounds__(params::degree / params::opt)

    divide_by_monomial_negacyclic_inplace<Torus, params::opt,
                                          params::degree / params::opt>(
-        accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
-        false);
+        accumulator_rotated, &block_lut_vector[blockIdx.y * params::degree],
+        b_hat, false);
  } else {
-    // Load the accumulator calculated in previous iterations
+    // Load the accumulator_rotated calculated in previous iterations
    copy_polynomial<Torus, params::opt, params::degree / params::opt>(
-        global_slice, accumulator);
+        global_accumulator_slice, accumulator_rotated);
  }

  for (int i = 0; (i + lwe_offset) < lwe_dimension && i < lwe_chunk_size; i++) {
@@ -100,79 +98,82 @@ __global__ void __launch_bounds__(params::degree / params::opt)
    // bootstrapped ciphertext
    round_to_closest_multiple_inplace<Torus, params::opt,
                                      params::degree / params::opt>(
-        accumulator, base_log, level_count);
+        accumulator_rotated, base_log, level_count);

-    // Decompose the accumulator. Each block gets one level of the
+    // Decompose the accumulator_rotated. Each block gets one level of the
    // decomposition, for the mask and the body (so block 0 will have the
-    // accumulator decomposed at level 0, 1 at 1, etc.)
-    GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
+    // accumulator_rotated decomposed at level 0, 1 at 1, etc.)
+    GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
+                                           accumulator_rotated);
    gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
-
-    // We are using the same memory space for accumulator_fft and
-    // accumulator_rotated, so we need to synchronize here to make sure they
-    // don't modify the same memory space at the same time
+    NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
    synchronize_threads_in_block();

    // Perform G^-1(ACC) * GGSW -> GLWE
-    mul_ggsw_glwe<Torus, grid_group, params>(
-        accumulator, accumulator_fft, block_join_buffer, keybundle,
-        polynomial_size, glwe_dimension, level_count, i, grid);
-
+    mul_ggsw_glwe_in_fourier_domain<grid_group, params>(
+        accumulator_fft, block_join_buffer, keybundle, i, grid);
+    NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
    synchronize_threads_in_block();
+
+    add_to_torus<Torus, params>(accumulator_fft, accumulator_rotated, true);
  }

-  if (lwe_offset + lwe_chunk_size >= (lwe_dimension / grouping_factor)) {
-    auto block_lwe_array_out =
-        &lwe_array_out[lwe_output_indexes[blockIdx.z] *
-                           (glwe_dimension * polynomial_size + 1) +
-                       blockIdx.y * polynomial_size];
+  auto accumulator = accumulator_rotated;

-    if (blockIdx.x == 0 && blockIdx.y < glwe_dimension) {
-      // Perform a sample extract. At this point, all blocks have the result,
-      // but we do the computation at block 0 to avoid waiting for extra blocks,
-      // in case they're not synchronized
-      // Always extract one by default
-      sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
+  if (blockIdx.x == 0) {
+    if (lwe_offset + lwe_chunk_size >= (lwe_dimension / grouping_factor)) {
+      auto block_lwe_array_out =
+          &lwe_array_out[lwe_output_indexes[blockIdx.z] *
+                             (glwe_dimension * polynomial_size + 1) +
+                         blockIdx.y * polynomial_size];

-      if (lut_count > 1) {
-        for (int i = 1; i < lut_count; i++) {
-          auto next_lwe_array_out =
-              lwe_array_out +
-              (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
-          auto next_block_lwe_array_out =
-              &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
-                                      (glwe_dimension * polynomial_size + 1) +
-                                  blockIdx.y * polynomial_size];
+      if (blockIdx.y < glwe_dimension) {
+        // Perform a sample extract. At this point, all blocks have the result,
+        // but we do the computation at block 0 to avoid waiting for extra
+        // blocks, in case they're not synchronized Always extract one by
+        // default
+        sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);

-          sample_extract_mask<Torus, params>(next_block_lwe_array_out,
-                                             accumulator, 1, i * lut_stride);
-        }
-      }
-
-    } else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
-
-      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
-
-      if (lut_count > 1) {
-        for (int i = 1; i < lut_count; i++) {
-
-          auto next_lwe_array_out =
-              lwe_array_out +
-              (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
-          auto next_block_lwe_array_out =
-              &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
-                                      (glwe_dimension * polynomial_size + 1) +
-                                  blockIdx.y * polynomial_size];
-
-          sample_extract_body<Torus, params>(next_block_lwe_array_out,
-                                             accumulator, 0, i * lut_stride);
+        if (lut_count > 1) {
+          for (int i = 1; i < lut_count; i++) {
+            auto next_lwe_array_out =
+                lwe_array_out +
+                (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
+            auto next_block_lwe_array_out =
+                &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
+                                        (glwe_dimension * polynomial_size + 1) +
+                                    blockIdx.y * polynomial_size];
+
+            sample_extract_mask<Torus, params>(next_block_lwe_array_out,
+                                               accumulator, 1, i * lut_stride);
+          }
+        }
+
+      } else if (blockIdx.y == glwe_dimension) {
+
+        sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
+
+        if (lut_count > 1) {
+          for (int i = 1; i < lut_count; i++) {
+
+            auto next_lwe_array_out =
+                lwe_array_out +
+                (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
+            auto next_block_lwe_array_out =
+                &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
+                                        (glwe_dimension * polynomial_size + 1) +
+                                    blockIdx.y * polynomial_size];
+
+            sample_extract_body<Torus, params>(next_block_lwe_array_out,
+                                               accumulator, 0, i * lut_stride);
+          }
        }
      }
+    } else {
+      // Load the accumulator calculated in previous iterations
+      copy_polynomial<Torus, params::opt, params::degree / params::opt>(
+          accumulator, global_accumulator_slice);
    }
-  } else {
-    // Load the accumulator calculated in previous iterations
-    copy_polynomial<Torus, params::opt, params::degree / params::opt>(
-        accumulator, global_slice);
  }
 }

@@ -295,15 +296,18 @@ __host__ void execute_cg_external_product_loop(
    uint32_t level_count, uint32_t lwe_offset, uint32_t lut_count,
    uint32_t lut_stride) {

-  auto lwe_chunk_size = buffer->lwe_chunk_size;
-  uint64_t full_dm =
+  uint64_t full_sm =
      get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<Torus>(
          polynomial_size);
-  uint64_t partial_dm =
+  uint64_t partial_sm =
      get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap<Torus>(
          polynomial_size);
+
+  auto full_dm = full_sm;
+  auto partial_dm = full_sm - partial_sm;
  uint64_t no_dm = 0;

+  auto lwe_chunk_size = buffer->lwe_chunk_size;
  int max_shared_memory = cuda_get_max_shared_memory(0);
  cudaSetDevice(gpu_index);

@@ -313,13 +317,11 @@ __host__ void execute_cg_external_product_loop(

  uint32_t chunk_size =
      std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
-  if (chunk_size == 0)
-    return;

  auto d_mem = buffer->d_mem_acc_cg;
  auto keybundle_fft = buffer->keybundle_fft;
  auto global_accumulator = buffer->global_accumulator;
-  auto buffer_fft = buffer->global_accumulator_fft;
+  auto join_buffer = buffer->global_join_buffer;

  void *kernel_args[22];
  kernel_args[0] = &lwe_array_out;
@@ -329,7 +331,7 @@ __host__ void execute_cg_external_product_loop(
  kernel_args[4] = &lwe_array_in;
  kernel_args[5] = &lwe_input_indexes;
  kernel_args[6] = &keybundle_fft;
-  kernel_args[7] = &buffer_fft;
+  kernel_args[7] = &join_buffer;
  kernel_args[8] = &global_accumulator;
  kernel_args[9] = &lwe_dimension;
  kernel_args[10] = &glwe_dimension;
@@ -358,13 +360,13 @@ __host__ void execute_cg_external_product_loop(
    check_cuda_error(cudaLaunchCooperativeKernel(
        (void *)device_multi_bit_programmable_bootstrap_cg_accumulate<
            Torus, params, PARTIALSM>,
-        grid_accumulate, thds, (void **)kernel_args, partial_dm, stream));
+        grid_accumulate, thds, (void **)kernel_args, partial_sm, stream));
  } else {
    kernel_args[19] = &no_dm;
    check_cuda_error(cudaLaunchCooperativeKernel(
        (void *)device_multi_bit_programmable_bootstrap_cg_accumulate<
            Torus, params, FULLSM>,
-        grid_accumulate, thds, (void **)kernel_args, full_dm, stream));
+        grid_accumulate, thds, (void **)kernel_args, full_sm, stream));
  }
 }

--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
@@ -25,7 +25,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
        const Torus *__restrict__ lwe_array_in,
        const Torus *__restrict__ lwe_input_indexes,
        const double2 *__restrict__ bootstrapping_key,
-        Torus *global_accumulator, double2 *global_accumulator_fft,
+        Torus *global_accumulator, double2 *global_join_buffer,
        uint32_t lwe_iteration, uint32_t lwe_dimension,
        uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
        int8_t *device_mem, uint64_t device_memory_size_per_block) {
@@ -67,10 +67,9 @@ __global__ void __launch_bounds__(params::degree / params::opt)
      (blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree;

  double2 *global_fft_slice =
-      global_accumulator_fft +
-      (blockIdx.y + blockIdx.x * (glwe_dimension + 1) +
-       blockIdx.z * level_count * (glwe_dimension + 1)) *
-          (polynomial_size / 2);
+      global_join_buffer + (blockIdx.y + blockIdx.x * (glwe_dimension + 1) +
+                            blockIdx.z * level_count * (glwe_dimension + 1)) *
+                               (polynomial_size / 2);

  if (lwe_iteration == 0) {
    // First iteration
@@ -139,7 +138,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
        const Torus *__restrict__ lut_vector,
        const Torus *__restrict__ lut_vector_indexes,
        const double2 *__restrict__ bootstrapping_key,
-        Torus *global_accumulator, double2 *global_accumulator_fft,
+        Torus *global_accumulator, double2 *global_join_buffer,
        uint32_t lwe_iteration, uint32_t lwe_dimension,
        uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
        int8_t *device_mem, uint64_t device_memory_size_per_block,
@@ -171,9 +170,9 @@ __global__ void __launch_bounds__(params::degree / params::opt)
    accumulator_fft = (double2 *)sharedmem;

  for (int level = 0; level < level_count; level++) {
-    double2 *global_fft_slice = global_accumulator_fft +
-                                (level + blockIdx.x * level_count) *
-                                    (glwe_dimension + 1) * (params::degree / 2);
+    double2 *global_fft_slice =
+        global_join_buffer + (level + blockIdx.x * level_count) *
+                                 (glwe_dimension + 1) * (params::degree / 2);

    for (int j = 0; j < (glwe_dimension + 1); j++) {
      double2 *fft = global_fft_slice + j * params::degree / 2;
@@ -292,7 +291,7 @@ uint64_t get_buffer_size_programmable_bootstrap(
  }
  // Otherwise, both kernels run all in shared memory
  uint64_t buffer_size = device_mem +
-                         // global_accumulator_fft
+                         // global_join_buffer
                         (glwe_dimension + 1) * level_count *
                             input_lwe_ciphertext_count *
                             (polynomial_size / 2) * sizeof(double2) +
@@ -368,7 +367,7 @@ __host__ void execute_step_one(
    cudaStream_t stream, uint32_t gpu_index, Torus const *lut_vector,
    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
    Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
-    Torus *global_accumulator, double2 *global_accumulator_fft,
+    Torus *global_accumulator, double2 *global_join_buffer,
    uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm,
@@ -383,21 +382,21 @@ __host__ void execute_step_one(
    device_programmable_bootstrap_step_one<Torus, params, NOSM>
        <<<grid, thds, 0, stream>>>(
            lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
-            bootstrapping_key, global_accumulator, global_accumulator_fft,
+            bootstrapping_key, global_accumulator, global_join_buffer,
            lwe_iteration, lwe_dimension, polynomial_size, base_log,
            level_count, d_mem, full_dm);
  } else if (max_shared_memory < full_sm) {
    device_programmable_bootstrap_step_one<Torus, params, PARTIALSM>
        <<<grid, thds, partial_sm, stream>>>(
            lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
-            bootstrapping_key, global_accumulator, global_accumulator_fft,
+            bootstrapping_key, global_accumulator, global_join_buffer,
            lwe_iteration, lwe_dimension, polynomial_size, base_log,
            level_count, d_mem, partial_dm);
  } else {
    device_programmable_bootstrap_step_one<Torus, params, FULLSM>
        <<<grid, thds, full_sm, stream>>>(
            lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
-            bootstrapping_key, global_accumulator, global_accumulator_fft,
+            bootstrapping_key, global_accumulator, global_join_buffer,
            lwe_iteration, lwe_dimension, polynomial_size, base_log,
            level_count, d_mem, 0);
  }
@@ -409,7 +408,7 @@ __host__ void execute_step_two(
    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus const *lwe_output_indexes, Torus const *lut_vector,
    Torus const *lut_vector_indexes, double2 const *bootstrapping_key,
-    Torus *global_accumulator, double2 *global_accumulator_fft,
+    Torus *global_accumulator, double2 *global_join_buffer,
    uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm,
@@ -425,21 +424,21 @@ __host__ void execute_step_two(
    device_programmable_bootstrap_step_two<Torus, params, NOSM>
        <<<grid, thds, 0, stream>>>(
            lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
-            bootstrapping_key, global_accumulator, global_accumulator_fft,
+            bootstrapping_key, global_accumulator, global_join_buffer,
            lwe_iteration, lwe_dimension, polynomial_size, base_log,
            level_count, d_mem, full_dm, lut_count, lut_stride);
  } else if (max_shared_memory < full_sm) {
    device_programmable_bootstrap_step_two<Torus, params, PARTIALSM>
        <<<grid, thds, partial_sm, stream>>>(
            lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
-            bootstrapping_key, global_accumulator, global_accumulator_fft,
+            bootstrapping_key, global_accumulator, global_join_buffer,
            lwe_iteration, lwe_dimension, polynomial_size, base_log,
            level_count, d_mem, partial_dm, lut_count, lut_stride);
  } else {
    device_programmable_bootstrap_step_two<Torus, params, FULLSM>
        <<<grid, thds, full_sm, stream>>>(
            lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
-            bootstrapping_key, global_accumulator, global_accumulator_fft,
+            bootstrapping_key, global_accumulator, global_join_buffer,
            lwe_iteration, lwe_dimension, polynomial_size, base_log,
            level_count, d_mem, 0, lut_count, lut_stride);
  }
@@ -478,20 +477,20 @@ __host__ void host_programmable_bootstrap(
  uint64_t full_dm_step_two = full_sm_step_two;

  Torus *global_accumulator = pbs_buffer->global_accumulator;
-  double2 *global_accumulator_fft = pbs_buffer->global_accumulator_fft;
+  double2 *global_join_buffer = pbs_buffer->global_join_buffer;
  int8_t *d_mem = pbs_buffer->d_mem;

  for (int i = 0; i < lwe_dimension; i++) {
    execute_step_one<Torus, params>(
        stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, global_accumulator,
-        global_accumulator_fft, input_lwe_ciphertext_count, lwe_dimension,
+        global_join_buffer, input_lwe_ciphertext_count, lwe_dimension,
        glwe_dimension, polynomial_size, base_log, level_count, d_mem, i,
        partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one);
    execute_step_two<Torus, params>(
        stream, gpu_index, lwe_array_out, lwe_output_indexes, lut_vector,
        lut_vector_indexes, bootstrapping_key, global_accumulator,
-        global_accumulator_fft, input_lwe_ciphertext_count, lwe_dimension,
+        global_join_buffer, input_lwe_ciphertext_count, lwe_dimension,
        glwe_dimension, polynomial_size, base_log, level_count, d_mem, i,
        partial_sm, partial_dm_step_two, full_sm_step_two, full_dm_step_two,
        lut_count, lut_stride);
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
@@ -50,7 +50,7 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
    uint64_t device_memory_size_per_block) {

  extern __shared__ int8_t sharedmem[];
-  int8_t *selected_memory = sharedmem;
+  int8_t *selected_memory;

  if constexpr (SMD == FULLSM) {
    selected_memory = sharedmem;
@@ -190,14 +190,14 @@ __global__ void __launch_bounds__(params::degree / params::opt)
                  (glwe_dimension + 1)];

  Torus *global_slice =
-      global_accumulator +
-      (blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree;
+      &global_accumulator[(blockIdx.y + blockIdx.z * (glwe_dimension + 1)) *
+                          params::degree];

  double2 *global_fft_slice =
-      global_accumulator_fft +
-      (blockIdx.y + blockIdx.x * (glwe_dimension + 1) +
-       blockIdx.z * level_count * (glwe_dimension + 1)) *
-          (polynomial_size / 2);
+      &global_accumulator_fft[(blockIdx.y + blockIdx.x * (glwe_dimension + 1) +
+                               blockIdx.z * level_count *
+                                   (glwe_dimension + 1)) *
+                              (polynomial_size / 2)];

  if (lwe_iteration == 0) {
    // First iteration
@@ -249,8 +249,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
    device_multi_bit_programmable_bootstrap_accumulate_step_two(
        Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
        const double2 *__restrict__ keybundle_array, Torus *global_accumulator,
-        double2 *global_accumulator_fft, uint32_t lwe_dimension,
-        uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
+        double2 *join_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
+        uint32_t polynomial_size, uint32_t level_count,
        uint32_t grouping_factor, uint32_t iteration, uint32_t lwe_offset,
        uint32_t lwe_chunk_size, int8_t *device_mem,
        uint64_t device_memory_size_per_block, uint32_t lut_count,
@@ -274,30 +274,29 @@ __global__ void __launch_bounds__(params::degree / params::opt)
  double2 *accumulator_fft = (double2 *)selected_memory;

  //
-  const double2 *keybundle = keybundle_array +
-                             // select the input
-                             blockIdx.x * lwe_chunk_size * level_count *
-                                 (glwe_dimension + 1) * (glwe_dimension + 1) *
-                                 (polynomial_size / 2);
+  const double2 *keybundle =
+      &keybundle_array[blockIdx.x * lwe_chunk_size * level_count *
+                       (glwe_dimension + 1) * (glwe_dimension + 1) *
+                       (polynomial_size / 2)];

-  double2 *global_accumulator_fft_input =
-      global_accumulator_fft +
-      blockIdx.x * level_count * (glwe_dimension + 1) * (polynomial_size / 2);
+  double2 *join_buffer_slice =
+      &join_buffer[blockIdx.x * level_count * (glwe_dimension + 1) *
+                   (polynomial_size / 2)];

  for (int level = 0; level < level_count; level++) {
    double2 *global_fft_slice =
-        global_accumulator_fft_input +
-        level * (glwe_dimension + 1) * (polynomial_size / 2);
+        &join_buffer_slice[level * (glwe_dimension + 1) *
+                           (polynomial_size / 2)];

    for (int j = 0; j < (glwe_dimension + 1); j++) {
-      double2 *fft = global_fft_slice + j * params::degree / 2;
+      double2 *fft = &global_fft_slice[j * params::degree / 2];

      // Get the bootstrapping key piece necessary for the multiplication
      // It is already in the Fourier domain
      auto bsk_slice =
          get_ith_mask_kth_block(keybundle, iteration, j, level,
                                 polynomial_size, glwe_dimension, level_count);
-      auto bsk_poly = bsk_slice + blockIdx.y * params::degree / 2;
+      auto bsk_poly = &bsk_slice[blockIdx.y * params::degree / 2];

      polynomial_product_accumulate_in_fourier_domain<params, double2>(
          accumulator_fft, fft, bsk_poly, !level && !j);
@@ -308,8 +307,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
  // accumulator
  NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
  Torus *global_slice =
-      global_accumulator +
-      (blockIdx.y + blockIdx.x * (glwe_dimension + 1)) * params::degree;
+      &global_accumulator[(blockIdx.y + blockIdx.x * (glwe_dimension + 1)) *
+                          params::degree];

  add_to_torus<Torus, params>(accumulator_fft, global_slice, true);
  synchronize_threads_in_block();
@@ -499,8 +498,6 @@ __host__ void execute_compute_keybundle(
  auto lwe_chunk_size = buffer->lwe_chunk_size;
  uint32_t chunk_size =
      std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
-  if (chunk_size == 0)
-    return;

  uint32_t keybundle_size_per_input =
      lwe_chunk_size * level_count * (glwe_dimension + 1) *
@@ -559,7 +556,7 @@ execute_step_one(cudaStream_t stream, uint32_t gpu_index,
  //
  auto d_mem = buffer->d_mem_acc_step_one;
  auto global_accumulator = buffer->global_accumulator;
-  auto global_accumulator_fft = buffer->global_accumulator_fft;
+  auto global_accumulator_fft = buffer->global_join_buffer;

  dim3 grid_accumulate_step_one(level_count, glwe_dimension + 1, num_samples);
  dim3 thds(polynomial_size / params::opt, 1, 1);
@@ -611,7 +608,7 @@ __host__ void execute_step_two(
  auto d_mem = buffer->d_mem_acc_step_two;
  auto keybundle_fft = buffer->keybundle_fft;
  auto global_accumulator = buffer->global_accumulator;
-  auto global_accumulator_fft = buffer->global_accumulator_fft;
+  auto global_accumulator_fft = buffer->global_join_buffer;

  dim3 grid_accumulate_step_two(num_samples, glwe_dimension + 1);
  dim3 thds(polynomial_size / params::opt, 1, 1);
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
@@ -133,18 +133,17 @@ __global__ void device_programmable_bootstrap_tbc(
    GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
                                           accumulator_rotated);
    gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
-
-    // We are using the same memory space for accumulator_fft and
-    // accumulator_rotated, so we need to synchronize here to make sure they
-    // don't modify the same memory space at the same time
+    NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
    synchronize_threads_in_block();

    // Perform G^-1(ACC) * GGSW -> GLWE
-    mul_ggsw_glwe<Torus, cluster_group, params>(
-        accumulator, accumulator_fft, block_join_buffer, bootstrapping_key,
-        polynomial_size, glwe_dimension, level_count, i, cluster, support_dsm);
-
+    mul_ggsw_glwe_in_fourier_domain<cluster_group, params>(
+        accumulator_fft, block_join_buffer, bootstrapping_key, i, cluster,
+        support_dsm);
+    NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
    synchronize_threads_in_block();
+
+    add_to_torus<Torus, params>(accumulator_fft, accumulator);
  }

  auto block_lwe_array_out =
@@ -152,42 +151,44 @@ __global__ void device_programmable_bootstrap_tbc(
                         (glwe_dimension * polynomial_size + 1) +
                     blockIdx.y * polynomial_size];

-  if (blockIdx.x == 0 && blockIdx.y < glwe_dimension) {
-    // Perform a sample extract. At this point, all blocks have the result, but
-    // we do the computation at block 0 to avoid waiting for extra blocks, in
-    // case they're not synchronized
-    sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
+  if (blockIdx.x == 0) {
+    if (blockIdx.y < glwe_dimension) {
+      // Perform a sample extract. At this point, all blocks have the result,
+      // but we do the computation at block 0 to avoid waiting for extra blocks,
+      // in case they're not synchronized
+      sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);

-    if (lut_count > 1) {
-      for (int i = 1; i < lut_count; i++) {
-        auto next_lwe_array_out =
-            lwe_array_out +
-            (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
-        auto next_block_lwe_array_out =
-            &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
-                                    (glwe_dimension * polynomial_size + 1) +
-                                blockIdx.y * polynomial_size];
+      if (lut_count > 1) {
+        for (int i = 1; i < lut_count; i++) {
+          auto next_lwe_array_out =
+              lwe_array_out +
+              (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
+          auto next_block_lwe_array_out =
+              &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
+                                      (glwe_dimension * polynomial_size + 1) +
+                                  blockIdx.y * polynomial_size];

-        sample_extract_mask<Torus, params>(next_block_lwe_array_out,
-                                           accumulator, 1, i * lut_stride);
+          sample_extract_mask<Torus, params>(next_block_lwe_array_out,
+                                             accumulator, 1, i * lut_stride);
+        }
      }
-    }
-  } else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
-    sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
+    } else if (blockIdx.y == glwe_dimension) {
+      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);

-    if (lut_count > 1) {
-      for (int i = 1; i < lut_count; i++) {
+      if (lut_count > 1) {
+        for (int i = 1; i < lut_count; i++) {

-        auto next_lwe_array_out =
-            lwe_array_out +
-            (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
-        auto next_block_lwe_array_out =
-            &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
-                                    (glwe_dimension * polynomial_size + 1) +
-                                blockIdx.y * polynomial_size];
+          auto next_lwe_array_out =
+              lwe_array_out +
+              (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
+          auto next_block_lwe_array_out =
+              &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
+                                      (glwe_dimension * polynomial_size + 1) +
+                                  blockIdx.y * polynomial_size];

-        sample_extract_body<Torus, params>(next_block_lwe_array_out,
-                                           accumulator, 0, i * lut_stride);
+          sample_extract_body<Torus, params>(next_block_lwe_array_out,
+                                             accumulator, 0, i * lut_stride);
+        }
      }
    }
  }
@@ -287,7 +288,7 @@ __host__ void host_programmable_bootstrap_tbc(
  uint64_t partial_dm = full_dm - partial_sm;

  int8_t *d_mem = buffer->d_mem;
-  double2 *buffer_fft = buffer->global_accumulator_fft;
+  double2 *buffer_fft = buffer->global_join_buffer;

  int thds = polynomial_size / params::opt;
  dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count);
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
@@ -54,9 +54,9 @@ __global__ void __launch_bounds__(params::degree / params::opt)
    selected_memory = &device_mem[block_index * device_memory_size_per_block];
  }

-  Torus *accumulator = (Torus *)selected_memory;
+  Torus *accumulator_rotated = (Torus *)selected_memory;
  double2 *accumulator_fft =
-      (double2 *)accumulator +
+      (double2 *)accumulator_rotated +
      (ptrdiff_t)(sizeof(Torus) * polynomial_size / sizeof(double2));

  if constexpr (SMD == PARTIALSM) {
@@ -78,13 +78,12 @@ __global__ void __launch_bounds__(params::degree / params::opt)
      &join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
                   params::degree / 2];

-  Torus *global_slice =
-      global_accumulator +
-      (blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree;
+  Torus *global_accumulator_slice =
+      &global_accumulator[(blockIdx.y + blockIdx.z * (glwe_dimension + 1)) *
+                          params::degree];

-  const double2 *keybundle = keybundle_array +
-                             // select the input
-                             blockIdx.z * keybundle_size_per_input;
+  const double2 *keybundle =
+      &keybundle_array[blockIdx.z * keybundle_size_per_input];

  if (lwe_offset == 0) {
    // Put "b" in [0, 2N[
@@ -94,12 +93,12 @@ __global__ void __launch_bounds__(params::degree / params::opt)

    divide_by_monomial_negacyclic_inplace<Torus, params::opt,
                                          params::degree / params::opt>(
-        accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
-        false);
+        accumulator_rotated, &block_lut_vector[blockIdx.y * params::degree],
+        b_hat, false);
  } else {
    // Load the accumulator calculated in previous iterations
    copy_polynomial<Torus, params::opt, params::degree / params::opt>(
-        global_slice, accumulator);
+        global_accumulator_slice, accumulator_rotated);
  }

  for (int i = 0; (i + lwe_offset) < lwe_dimension && i < lwe_chunk_size; i++) {
@@ -107,75 +106,78 @@ __global__ void __launch_bounds__(params::degree / params::opt)
    // bootstrapped ciphertext
    round_to_closest_multiple_inplace<Torus, params::opt,
                                      params::degree / params::opt>(
-        accumulator, base_log, level_count);
+        accumulator_rotated, base_log, level_count);

    // Decompose the accumulator. Each block gets one level of the
    // decomposition, for the mask and the body (so block 0 will have the
    // accumulator decomposed at level 0, 1 at 1, etc.)
-    GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
+    GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
+                                           accumulator_rotated);
    gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
-
-    // We are using the same memory space for accumulator_fft and
-    // accumulator_rotated, so we need to synchronize here to make sure they
-    // don't modify the same memory space at the same time
+    NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
    synchronize_threads_in_block();

    // Perform G^-1(ACC) * GGSW -> GLWE
-    mul_ggsw_glwe<Torus, cluster_group, params>(
-        accumulator, accumulator_fft, block_join_buffer, keybundle,
-        polynomial_size, glwe_dimension, level_count, i, cluster, support_dsm);
-
+    mul_ggsw_glwe_in_fourier_domain<cluster_group, params>(
+        accumulator_fft, block_join_buffer, keybundle, i, cluster, support_dsm);
+    NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
    synchronize_threads_in_block();
+
+    add_to_torus<Torus, params>(accumulator_fft, accumulator_rotated, true);
  }

-  if (lwe_offset + lwe_chunk_size >= (lwe_dimension / grouping_factor)) {
-    auto block_lwe_array_out =
-        &lwe_array_out[lwe_output_indexes[blockIdx.z] *
-                           (glwe_dimension * polynomial_size + 1) +
-                       blockIdx.y * polynomial_size];
+  auto accumulator = accumulator_rotated;

-    if (blockIdx.x == 0 && blockIdx.y < glwe_dimension) {
-      // Perform a sample extract. At this point, all blocks have the result,
-      // but we do the computation at block 0 to avoid waiting for extra blocks,
-      // in case they're not synchronized
-      sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
+  if (blockIdx.x == 0) {
+    if (lwe_offset + lwe_chunk_size >= (lwe_dimension / grouping_factor)) {
+      auto block_lwe_array_out =
+          &lwe_array_out[lwe_output_indexes[blockIdx.z] *
+                             (glwe_dimension * polynomial_size + 1) +
+                         blockIdx.y * polynomial_size];

-      if (lut_count > 1) {
-        for (int i = 1; i < lut_count; i++) {
-          auto next_lwe_array_out =
-              lwe_array_out +
-              (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
-          auto next_block_lwe_array_out =
-              &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
-                                      (glwe_dimension * polynomial_size + 1) +
-                                  blockIdx.y * polynomial_size];
+      if (blockIdx.y < glwe_dimension) {
+        // Perform a sample extract. At this point, all blocks have the result,
+        // but we do the computation at block 0 to avoid waiting for extra
+        // blocks, in case they're not synchronized
+        sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);

-          sample_extract_mask<Torus, params>(next_block_lwe_array_out,
-                                             accumulator, 1, i * lut_stride);
-        }
-      }
-    } else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
-      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
-      if (lut_count > 1) {
-        for (int i = 1; i < lut_count; i++) {
-
-          auto next_lwe_array_out =
-              lwe_array_out +
-              (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
-          auto next_block_lwe_array_out =
-              &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
-                                      (glwe_dimension * polynomial_size + 1) +
-                                  blockIdx.y * polynomial_size];
-
-          sample_extract_body<Torus, params>(next_block_lwe_array_out,
-                                             accumulator, 0, i * lut_stride);
+        if (lut_count > 1) {
+          for (int i = 1; i < lut_count; i++) {
+            auto next_lwe_array_out =
+                lwe_array_out +
+                (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
+            auto next_block_lwe_array_out =
+                &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
+                                        (glwe_dimension * polynomial_size + 1) +
+                                    blockIdx.y * polynomial_size];
+
+            sample_extract_mask<Torus, params>(next_block_lwe_array_out,
+                                               accumulator, 1, i * lut_stride);
+          }
+        }
+      } else if (blockIdx.y == glwe_dimension) {
+        sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
+        if (lut_count > 1) {
+          for (int i = 1; i < lut_count; i++) {
+
+            auto next_lwe_array_out =
+                lwe_array_out +
+                (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
+            auto next_block_lwe_array_out =
+                &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
+                                        (glwe_dimension * polynomial_size + 1) +
+                                    blockIdx.y * polynomial_size];
+
+            sample_extract_body<Torus, params>(next_block_lwe_array_out,
+                                               accumulator, 0, i * lut_stride);
+          }
        }
      }
+    } else {
+      // Load the accumulator calculated in previous iterations
+      copy_polynomial<Torus, params::opt, params::degree / params::opt>(
+          accumulator, global_accumulator_slice);
    }
-  } else {
-    // Load the accumulator calculated in previous iterations
-    copy_polynomial<Torus, params::opt, params::degree / params::opt>(
-        accumulator, global_slice);
  }
 }

@@ -326,13 +328,11 @@ __host__ void execute_tbc_external_product_loop(

  uint32_t chunk_size =
      std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
-  if (chunk_size == 0)
-    return;

  auto d_mem = buffer->d_mem_acc_tbc;
  auto keybundle_fft = buffer->keybundle_fft;
  auto global_accumulator = buffer->global_accumulator;
-  auto buffer_fft = buffer->global_accumulator_fft;
+  auto buffer_fft = buffer->global_join_buffer;

  dim3 grid_accumulate(level_count, glwe_dimension + 1, num_samples);
  dim3 thds(polynomial_size / params::opt, 1, 1);
--- a/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh
@@ -56,8 +56,8 @@ divide_by_monomial_negacyclic_inplace(T *accumulator,
                                      bool zeroAcc, uint32_t num_poly = 1) {
  constexpr int degree = block_size * elems_per_thread;
  for (int z = 0; z < num_poly; z++) {
-    T *accumulator_slice = (T *)accumulator + (ptrdiff_t)(z * degree);
-    const T *input_slice = (T *)input + (ptrdiff_t)(z * degree);
+    T *accumulator_slice = &accumulator[z * degree];
+    const T *input_slice = &input[z * degree];

    int tid = threadIdx.x;
    if (zeroAcc) {
@@ -66,9 +66,8 @@ divide_by_monomial_negacyclic_inplace(T *accumulator,
        tid += block_size;
      }
    } else {
-      tid = threadIdx.x;
-      for (int i = 0; i < elems_per_thread; i++) {
-        if (j < degree) {
+      if (j < degree) {
+        for (int i = 0; i < elems_per_thread; i++) {
          // if (tid < degree - j)
          //  accumulator_slice[tid] = input_slice[tid + j];
          // else
@@ -76,8 +75,11 @@ divide_by_monomial_negacyclic_inplace(T *accumulator,
          int x = tid + j - SEL(degree, 0, tid < degree - j);
          accumulator_slice[tid] =
              SEL(-1, 1, tid < degree - j) * input_slice[x];
-        } else {
-          int32_t jj = j - degree;
+          tid += block_size;
+        }
+      } else {
+        int32_t jj = j - degree;
+        for (int i = 0; i < elems_per_thread; i++) {
          // if (tid < degree - jj)
          //  accumulator_slice[tid] = -input_slice[tid + jj];
          // else
@@ -85,8 +87,8 @@ divide_by_monomial_negacyclic_inplace(T *accumulator,
          int x = tid + jj - SEL(degree, 0, tid < degree - jj);
          accumulator_slice[tid] =
              SEL(1, -1, tid < degree - jj) * input_slice[x];
+          tid += block_size;
        }
-        tid += block_size;
      }
    }
  }
@@ -160,9 +162,13 @@ __device__ void round_to_closest_multiple_inplace(T *rotated_acc, int base_log,
  }
 }

+/**
+ * In case of classical PBS, this method should accumulate the result.
+ * In case of multi-bit PBS, it should overwrite.
+ */
 template <typename Torus, class params>
 __device__ void add_to_torus(double2 *m_values, Torus *result,
-                             bool init_torus = false) {
+                             bool overwrite_result = false) {
  int tid = threadIdx.x;
 #pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
@@ -175,7 +181,7 @@ __device__ void add_to_torus(double2 *m_values, Torus *result,
    Torus torus_imag = 0;
    typecast_double_round_to_torus<Torus>(double_imag, torus_imag);

-    if (init_torus) {
+    if (overwrite_result) {
      result[tid] = torus_real;
      result[tid + params::degree / 2] = torus_imag;
    } else {
--- a/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh
@@ -3,6 +3,7 @@

 #include "crypto/torus.cuh"
 #include "parameters.cuh"
+#include "types/complex/operations.cuh"

 template <typename T>
 __device__ T *get_chunk(T *data, int chunk_num, int chunk_size) {
@@ -55,6 +56,27 @@ __device__ void polynomial_product_accumulate_in_fourier_domain(
  }
 }

+// Computes result += x
+// If init_accumulator is set, assumes that result was not initialized and does
+// that with the outcome of first * second
+template <class params>
+__device__ void
+polynomial_accumulate_in_fourier_domain(double2 *result, double2 *x,
+                                        bool init_accumulator = false) {
+  auto tid = threadIdx.x;
+  if (init_accumulator) {
+    for (int i = 0; i < params::opt / 2; i++) {
+      result[tid] = x[tid];
+      tid += params::degree / params::opt;
+    }
+  } else {
+    for (int i = 0; i < params::opt / 2; i++) {
+      result[tid] += x[tid];
+      tid += params::degree / params::opt;
+    }
+  }
+}
+
 // This method expects to work with polynomial_size / compression_params::opt
 // threads in the x-block If init_accumulator is set, assumes that result was
 // not initialized and does that with the outcome of first * second
--- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp
+++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp
@@ -233,147 +233,15 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, bootstrap) {
        // n, k, N, lwe_variance, glwe_variance, pbs_base_log, pbs_level,
        // message_modulus, carry_modulus, number_of_inputs, repetitions,
        // samples
-        // BOOLEAN_DEFAULT_PARAMETERS
+        // PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64
        (ClassicalProgrammableBootstrapTestParams){
-            777, 3, 512, new_gaussian_from_std_dev(sqrt(1.3880686109937e-11)),
-            new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 18, 1, 2,
-            2, 2, 2, 40},
-        // BOOLEAN_TFHE_LIB_PARAMETERS
+            887, 1, 2048, new_t_uniform(46), new_t_uniform(17), 22, 1, 4, 4,
+            100, 1, 1},
+        // PARAM_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M64
        (ClassicalProgrammableBootstrapTestParams){
-            830, 2, 1024,
-            new_gaussian_from_std_dev(sqrt(1.994564705573226e-12)),
-            new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 2,
-            2, 2, 40},
-        // SHORTINT_PARAM_MESSAGE_1_CARRY_0
-        (ClassicalProgrammableBootstrapTestParams){
-            678, 5, 256, new_gaussian_from_std_dev(sqrt(5.203010004723453e-10)),
-            new_gaussian_from_std_dev(sqrt(1.3996292326131784e-19)), 15, 1, 2,
-            1, 2, 2, 40},
-        // SHORTINT_PARAM_MESSAGE_1_CARRY_1
-        (ClassicalProgrammableBootstrapTestParams){
-            684, 3, 512, new_gaussian_from_std_dev(sqrt(4.177054989616946e-10)),
-            new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 18, 1, 2,
-            2, 2, 2, 40},
-        // SHORTINT_PARAM_MESSAGE_2_CARRY_0
-        (ClassicalProgrammableBootstrapTestParams){
-            656, 2, 512,
-            new_gaussian_from_std_dev(sqrt(1.1641198952558192e-09)),
-            new_gaussian_from_std_dev(sqrt(1.6434266310406663e-15)), 8, 2, 4, 1,
-            2, 2, 40},
-        // SHORTINT_PARAM_MESSAGE_1_CARRY_2
-        // SHORTINT_PARAM_MESSAGE_2_CARRY_1
-        // SHORTINT_PARAM_MESSAGE_3_CARRY_0
-        (ClassicalProgrammableBootstrapTestParams){
-            742, 2, 1024,
-            new_gaussian_from_std_dev(sqrt(4.998277131225527e-11)),
-            new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 4,
-            2, 2, 40},
-        // SHORTINT_PARAM_MESSAGE_1_CARRY_3
-        // SHORTINT_PARAM_MESSAGE_2_CARRY_2
-        // SHORTINT_PARAM_MESSAGE_3_CARRY_1
-        // SHORTINT_PARAM_MESSAGE_4_CARRY_0
-        (ClassicalProgrammableBootstrapTestParams){
-            745, 1, 2048,
-            new_gaussian_from_std_dev(sqrt(4.478453795193731e-11)),
-            new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 8,
-            2, 2, 40},
-        // SHORTINT_PARAM_MESSAGE_5_CARRY_0
-        // SHORTINT_PARAM_MESSAGE_3_CARRY_2
-        (ClassicalProgrammableBootstrapTestParams){
-            807, 1, 4096,
-            new_gaussian_from_std_dev(sqrt(4.629015039118823e-12)),
-            new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 22, 1, 32, 1,
-            2, 1, 40},
-        // SHORTINT_PARAM_MESSAGE_6_CARRY_0
-        (ClassicalProgrammableBootstrapTestParams){
-            915, 1, 8192,
-            new_gaussian_from_std_dev(sqrt(8.883173851180252e-14)),
-            new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 22, 1, 64, 1,
-            2, 1, 2},
-        // SHORTINT_PARAM_MESSAGE_3_CARRY_3
-        (ClassicalProgrammableBootstrapTestParams){
-            864, 1, 8192,
-            new_gaussian_from_std_dev(sqrt(1.5843564961097632e-15)),
-            new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 15, 2, 8, 8,
-            2, 1, 2},
-        // SHORTINT_PARAM_MESSAGE_4_CARRY_3
-        // SHORTINT_PARAM_MESSAGE_7_CARRY_0
-        (ClassicalProgrammableBootstrapTestParams){
-            930, 1, 16384,
-            new_gaussian_from_std_dev(sqrt(5.129877458078009e-14)),
-            new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 15, 2, 128,
-            1, 2, 1, 1},
-
-        // BOOLEAN_DEFAULT_PARAMETERS
-        (ClassicalProgrammableBootstrapTestParams){
-            777, 3, 512, new_gaussian_from_std_dev(sqrt(1.3880686109937e-11)),
-            new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 18, 1, 2,
-            2, 100, 2, 40},
-        // BOOLEAN_TFHE_LIB_PARAMETERS
-        (ClassicalProgrammableBootstrapTestParams){
-            830, 2, 1024,
-            new_gaussian_from_std_dev(sqrt(1.994564705573226e-12)),
-            new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 2,
-            100, 2, 40},
-        // SHORTINT_PARAM_MESSAGE_1_CARRY_0
-        (ClassicalProgrammableBootstrapTestParams){
-            678, 5, 256, new_gaussian_from_std_dev(sqrt(5.203010004723453e-10)),
-            new_gaussian_from_std_dev(sqrt(1.3996292326131784e-19)), 15, 1, 2,
-            1, 100, 2, 40},
-        // SHORTINT_PARAM_MESSAGE_1_CARRY_1
-        (ClassicalProgrammableBootstrapTestParams){
-            684, 3, 512, new_gaussian_from_std_dev(sqrt(4.177054989616946e-10)),
-            new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 18, 1, 2,
-            2, 100, 2, 40},
-        // SHORTINT_PARAM_MESSAGE_2_CARRY_0
-        (ClassicalProgrammableBootstrapTestParams){
-            656, 2, 512,
-            new_gaussian_from_std_dev(sqrt(1.1641198952558192e-09)),
-            new_gaussian_from_std_dev(sqrt(1.6434266310406663e-15)), 8, 2, 4, 1,
-            100, 2, 40},
-        // SHORTINT_PARAM_MESSAGE_1_CARRY_2
-        // SHORTINT_PARAM_MESSAGE_2_CARRY_1
-        // SHORTINT_PARAM_MESSAGE_3_CARRY_0
-        (ClassicalProgrammableBootstrapTestParams){
-            742, 2, 1024,
-            new_gaussian_from_std_dev(sqrt(4.998277131225527e-11)),
-            new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 4,
-            100, 2, 40},
-        // SHORTINT_PARAM_MESSAGE_1_CARRY_3
-        // SHORTINT_PARAM_MESSAGE_2_CARRY_2
-        // SHORTINT_PARAM_MESSAGE_3_CARRY_1
-        // SHORTINT_PARAM_MESSAGE_4_CARRY_0
-        (ClassicalProgrammableBootstrapTestParams){
-            745, 1, 2048,
-            new_gaussian_from_std_dev(sqrt(4.478453795193731e-11)),
-            new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 8,
-            100, 2, 40},
-        // SHORTINT_PARAM_MESSAGE_5_CARRY_0
-        // SHORTINT_PARAM_MESSAGE_3_CARRY_2
-        (ClassicalProgrammableBootstrapTestParams){
-            807, 1, 4096,
-            new_gaussian_from_std_dev(sqrt(4.629015039118823e-12)),
-            new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 22, 1, 32, 1,
-            100, 1, 40},
-        // SHORTINT_PARAM_MESSAGE_6_CARRY_0
-        (ClassicalProgrammableBootstrapTestParams){
-            915, 1, 8192,
-            new_gaussian_from_std_dev(sqrt(8.883173851180252e-14)),
-            new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 22, 1, 64, 1,
-            100, 1, 2},
-        // SHORTINT_PARAM_MESSAGE_3_CARRY_3
-        (ClassicalProgrammableBootstrapTestParams){
-            864, 1, 8192,
-            new_gaussian_from_std_dev(sqrt(1.5843564961097632e-15)),
-            new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 15, 2, 8, 8,
-            100, 1, 2},
-        // SHORTINT_PARAM_MESSAGE_4_CARRY_3
-        // SHORTINT_PARAM_MESSAGE_7_CARRY_0
-        (ClassicalProgrammableBootstrapTestParams){
-            930, 1, 16384,
-            new_gaussian_from_std_dev(sqrt(5.129877458078009e-14)),
-            new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 15, 2, 128,
-            1, 100, 1, 1});
+            977, 1, 8192, new_gaussian_from_std_dev(3.0144389706858286e-07),
+            new_gaussian_from_std_dev(2.168404344971009e-19), 16, 2, 8, 8, 100,
+            1, 1});
 std::string printParamName(
    ::testing::TestParamInfo<ClassicalProgrammableBootstrapTestParams> p) {
  ClassicalProgrammableBootstrapTestParams params = p.param;
--- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_multibit_pbs.cpp
+++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_multibit_pbs.cpp
@@ -171,70 +171,44 @@ TEST_P(MultiBitProgrammableBootstrapTestPrimitives_u64,
  }
 }

+/**
+  int lwe_dimension;
+  int glwe_dimension;
+  int polynomial_size;
+  DynamicDistribution lwe_noise_distribution;
+  DynamicDistribution glwe_noise_distribution;
+  int pbs_base_log;
+  int pbs_level;
+  int message_modulus;
+  int carry_modulus;
+  int number_of_inputs;
+  int grouping_factor;
+  int repetitions;
+  int samples;
+ */
 // Defines for which parameters set the PBS will be tested.
 // It executes each src for all pairs on phis X qs (Cartesian product)
 ::testing::internal::ParamGenerator<MultiBitProgrammableBootstrapTestParams>
    multipbs_params_u64 = ::testing::Values(
-        // fast src
+        // PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64
        (MultiBitProgrammableBootstrapTestParams){
-            16, 1, 256, new_gaussian_from_std_dev(sqrt(1.3880686109937e-11)),
-            new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 23, 1, 2,
-            2, 1, 2, 1, 10},
+            882, 1, 2048, new_t_uniform(46), new_t_uniform(17), 14, 2, 8, 8,
+            100, 3, 1, 1},
+        // PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M64
        (MultiBitProgrammableBootstrapTestParams){
-            16, 1, 256, new_gaussian_from_std_dev(sqrt(1.3880686109937e-11)),
-            new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 23, 1, 2,
-            2, 128, 2, 1, 10},
-        // 4_bits_multi_bit_group_2
+            978, 1, 8192, new_gaussian_from_std_dev((2.962875621642539e-07)),
+            new_gaussian_from_std_dev((2.168404344971009e-19)), 14, 2, 8, 8,
+            100, 3, 1, 1},
+        // PARAM_GPU_MULTI_BIT_GROUP_2_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64
        (MultiBitProgrammableBootstrapTestParams){
-            818, 1, 2048, new_gaussian_from_std_dev(sqrt(1.3880686109937e-11)),
-            new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 22, 1, 2,
-            2, 1, 2, 1, 10},
+            836, 1, 2048, new_gaussian_from_std_dev((3.433444883863949e-06)),
+            new_gaussian_from_std_dev((2.845267479601915e-15)), 22, 2, 4, 4,
+            100, 2, 1, 1},
+        // PARAM_GPU_MULTI_BIT_GROUP_2_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64
        (MultiBitProgrammableBootstrapTestParams){
-            818, 1, 2048, new_gaussian_from_std_dev(sqrt(1.3880686109937e-15)),
-            new_gaussian_from_std_dev(sqrt(1.1919984450689246e-24)), 22, 1, 2,
-            2, 128, 2, 1, 10},
-        // 4_bits_multi_bit_group_3
-        (MultiBitProgrammableBootstrapTestParams){
-            888, 1, 2048,
-            new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)),
-            new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 21, 1, 2, 2,
-            1, 3, 1, 10},
-        (MultiBitProgrammableBootstrapTestParams){
-            888, 1, 16384,
-            new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)),
-            new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 21, 1, 2, 2,
-            1, 3, 1, 1},
-
-        (MultiBitProgrammableBootstrapTestParams){
-            888, 1, 1024,
-            new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)),
-            new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 21, 1, 2, 2,
-            128, 3, 1, 10},
-        (MultiBitProgrammableBootstrapTestParams){
-            888, 1, 2048,
-            new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)),
-            new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 21, 1, 2, 2,
-            128, 3, 1, 10},
-        (MultiBitProgrammableBootstrapTestParams){
-            888, 1, 4096,
-            new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)),
-            new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 21, 1, 2, 2,
-            128, 3, 1, 10},
-        (MultiBitProgrammableBootstrapTestParams){
-            888, 1, 8192,
-            new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)),
-            new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 21, 1, 2, 2,
-            128, 3, 1, 1},
-        (MultiBitProgrammableBootstrapTestParams){
-            888, 1, 16384,
-            new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)),
-            new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 21, 1, 2, 2,
-            128, 3, 1, 1},
-        (MultiBitProgrammableBootstrapTestParams){
-            972, 1, 8192,
-            new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)),
-            new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 14, 2, 8, 8,
-            68, 3, 1, 1});
+            978, 1, 8192, new_gaussian_from_std_dev((2.962875621642539e-07)),
+            new_gaussian_from_std_dev((2.168404344971009e-19)), 14, 2, 8, 8,
+            100, 2, 1, 1});

 std::string printParamName(
    ::testing::TestParamInfo<MultiBitProgrammableBootstrapTestParams> p) {