feat(gpu): implement 128-bit multi-bit PBS

Pedro Alves
2025-06-11 11:56:06 -03:00
committed by Pedro Alves
parent e1beea5ecb
commit 8c88678ee8
35 changed files with 2751 additions and 156 deletions

View File

@@ -66,6 +66,9 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
uint32_t num_many_lut, uint32_t lut_stride);
template <typename Torus>
uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_128_keybundle(
uint32_t polynomial_size);
template <typename Torus>
uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle(
uint32_t polynomial_size);
template <typename Torus>
@@ -95,8 +98,12 @@ uint64_t get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
template <typename Torus, class params>
uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
uint32_t polynomial_size);
uint32_t polynomial_size,
uint64_t full_sm_keybundle);
template <typename Torus, class params>
uint32_t get_lwe_chunk_size_128(uint32_t gpu_index, uint32_t max_num_pbs,
uint32_t polynomial_size,
uint64_t full_sm_keybundle);
template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
int8_t *d_mem_keybundle = NULL;
int8_t *d_mem_acc_step_one = NULL;
@@ -281,4 +288,146 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
}
};
template <typename InputTorus>
struct pbs_buffer_128<InputTorus, PBS_TYPE::MULTI_BIT> {
int8_t *d_mem_keybundle = NULL;
int8_t *d_mem_acc_step_one = NULL;
int8_t *d_mem_acc_step_two = NULL;
int8_t *d_mem_acc_cg = NULL;
int8_t *d_mem_acc_tbc = NULL;
uint32_t lwe_chunk_size;
double *keybundle_fft;
__uint128_t *global_accumulator;
double *global_join_buffer;
PBS_VARIANT pbs_variant;
bool gpu_memory_allocated;
pbs_buffer_128(cudaStream_t stream, uint32_t gpu_index,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
uint32_t lwe_chunk_size, PBS_VARIANT pbs_variant,
bool allocate_gpu_memory, uint64_t *size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
cuda_set_device(gpu_index);
this->pbs_variant = pbs_variant;
this->lwe_chunk_size = lwe_chunk_size;
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
// default
uint64_t full_sm_keybundle =
get_buffer_size_full_sm_multibit_programmable_bootstrap_128_keybundle<
__uint128_t>(polynomial_size);
uint64_t full_sm_accumulate_step_one =
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one<
__uint128_t>(polynomial_size);
uint64_t full_sm_accumulate_step_two =
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two<
__uint128_t>(polynomial_size);
uint64_t partial_sm_accumulate_step_one =
get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one<
__uint128_t>(polynomial_size);
// cg
uint64_t full_sm_cg_accumulate =
get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<__uint128_t>(
polynomial_size);
uint64_t partial_sm_cg_accumulate =
get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap<
__uint128_t>(polynomial_size);
auto num_blocks_keybundle = input_lwe_ciphertext_count * lwe_chunk_size *
(glwe_dimension + 1) * (glwe_dimension + 1) *
level_count;
auto num_blocks_acc_step_one =
level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count;
auto num_blocks_acc_step_two =
input_lwe_ciphertext_count * (glwe_dimension + 1);
auto num_blocks_acc_cg =
level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count;
// Keybundle
if (max_shared_memory < full_sm_keybundle)
d_mem_keybundle = (int8_t *)cuda_malloc_with_size_tracking_async(
num_blocks_keybundle * full_sm_keybundle, stream, gpu_index,
size_tracker, allocate_gpu_memory);
switch (pbs_variant) {
case PBS_VARIANT::CG:
// Accumulator CG
if (max_shared_memory < partial_sm_cg_accumulate)
d_mem_acc_cg = (int8_t *)cuda_malloc_with_size_tracking_async(
num_blocks_acc_cg * full_sm_cg_accumulate, stream, gpu_index,
size_tracker, allocate_gpu_memory);
else if (max_shared_memory < full_sm_cg_accumulate)
d_mem_acc_cg = (int8_t *)cuda_malloc_with_size_tracking_async(
num_blocks_acc_cg * partial_sm_cg_accumulate, stream, gpu_index,
size_tracker, allocate_gpu_memory);
break;
case PBS_VARIANT::DEFAULT:
// Accumulator step one
if (max_shared_memory < partial_sm_accumulate_step_one)
d_mem_acc_step_one = (int8_t *)cuda_malloc_with_size_tracking_async(
num_blocks_acc_step_one * full_sm_accumulate_step_one, stream,
gpu_index, size_tracker, allocate_gpu_memory);
else if (max_shared_memory < full_sm_accumulate_step_one)
d_mem_acc_step_one = (int8_t *)cuda_malloc_with_size_tracking_async(
num_blocks_acc_step_one * partial_sm_accumulate_step_one, stream,
gpu_index, size_tracker, allocate_gpu_memory);
// Accumulator step two
if (max_shared_memory < full_sm_accumulate_step_two)
d_mem_acc_step_two = (int8_t *)cuda_malloc_with_size_tracking_async(
num_blocks_acc_step_two * full_sm_accumulate_step_two, stream,
gpu_index, size_tracker, allocate_gpu_memory);
break;
default:
PANIC("Cuda error (PBS): unsupported implementation variant.")
}
keybundle_fft = (double *)cuda_malloc_with_size_tracking_async(
num_blocks_keybundle * (polynomial_size / 2) * 4 * sizeof(double),
stream, gpu_index, size_tracker, allocate_gpu_memory);
global_accumulator = (__uint128_t *)cuda_malloc_with_size_tracking_async(
input_lwe_ciphertext_count * (glwe_dimension + 1) * polynomial_size *
sizeof(__uint128_t),
stream, gpu_index, size_tracker, allocate_gpu_memory);
global_join_buffer = (double *)cuda_malloc_with_size_tracking_async(
level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count *
(polynomial_size / 2) * 4 * sizeof(double),
stream, gpu_index, size_tracker, allocate_gpu_memory);
}
void release(cudaStream_t stream, uint32_t gpu_index) {
if (d_mem_keybundle)
cuda_drop_with_size_tracking_async(d_mem_keybundle, stream, gpu_index,
gpu_memory_allocated);
switch (pbs_variant) {
case DEFAULT:
if (d_mem_acc_step_one)
cuda_drop_with_size_tracking_async(d_mem_acc_step_one, stream,
gpu_index, gpu_memory_allocated);
if (d_mem_acc_step_two)
cuda_drop_with_size_tracking_async(d_mem_acc_step_two, stream,
gpu_index, gpu_memory_allocated);
break;
case CG:
if (d_mem_acc_cg)
cuda_drop_with_size_tracking_async(d_mem_acc_cg, stream, gpu_index,
gpu_memory_allocated);
break;
default:
PANIC("Cuda error (PBS): unsupported implementation variant.")
}
cuda_drop_with_size_tracking_async(keybundle_fft, stream, gpu_index,
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(global_accumulator, stream, gpu_index,
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(global_join_buffer, stream, gpu_index,
gpu_memory_allocated);
}
};
#endif // CUDA_MULTI_BIT_UTILITIES_H
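The constructor above follows a three-tier shared-memory strategy: when the device cannot hold even the partial layout, the kernel runs with no shared memory and a full-sized scratch buffer is allocated in global memory per block; when it holds the partial but not the full layout, a partial-sized scratch is allocated instead; otherwise no global fallback is needed. A minimal sketch of that decision, reusing the backend's sharedMemDegree notion (the PARTIALSM spelling is an assumption here, only NOSM and FULLSM appear in this commit):

enum sharedMemDegree { NOSM, PARTIALSM, FULLSM }; // as used by the kernels

inline sharedMemDegree pick_tier(uint64_t max_shared_memory,
                                 uint64_t partial_sm, uint64_t full_sm) {
  if (max_shared_memory < partial_sm)
    return NOSM;      // full-sized scratch per block lives in global memory
  if (max_shared_memory < full_sm)
    return PARTIALSM; // partial-sized scratch per block in global memory
  return FULLSM;      // kernel runs entirely out of shared memory
}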

View File

@@ -240,7 +240,10 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
}
};
template <typename InputTorus, PBS_TYPE pbs_type> struct pbs_buffer_128 {
template <typename Torus, PBS_TYPE pbs_type> struct pbs_buffer_128;
template <typename InputTorus>
struct pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> {
int8_t *d_mem;
__uint128_t *global_accumulator;

View File

@@ -15,6 +15,11 @@ void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size, uint32_t grouping_factor);
void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_128(
void *stream, uint32_t gpu_index, void *dest, void const *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size, uint32_t grouping_factor);
uint64_t scratch_cuda_multi_bit_programmable_bootstrap_64(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
@@ -33,6 +38,25 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
uint32_t gpu_index,
int8_t **pbs_buffer);
uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride);
void cleanup_cuda_multi_bit_programmable_bootstrap_128(void *stream,
const uint32_t gpu_index,
int8_t **buffer);
}
#endif // CUDA_MULTI_BIT_H
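Taken together, the new entry points define the buffer lifecycle for the 128-bit multi-bit PBS: scratch, bootstrap, cleanup. A hedged sketch of the intended call sequence; all device pointers are caller-provided placeholders, and the numeric values merely mirror the GPU noise-squashing parameters added later in this commit rather than anything mandated by the API:

#include <cstdint>

void run_multi_bit_pbs_128(void *stream, uint32_t gpu_index, void *d_bsk_128,
                           void *d_out, const void *d_out_idx,
                           const void *d_lut, const void *d_lut_idx,
                           const void *d_in, const void *d_in_idx) {
  // Placeholder parameter shape (from the squashing parameter set below):
  uint32_t lwe_dim = 920, glwe_dim = 2, poly_size = 2048, grouping = 4;
  uint32_t base_log = 23, levels = 3, num_samples = 1;

  int8_t *buffer = nullptr;
  // 1) Allocate scratch; the return value reports bytes reserved on the GPU.
  scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64(
      stream, gpu_index, &buffer, glwe_dim, poly_size, levels, num_samples,
      /*allocate_gpu_memory=*/true);
  // 2) Bootstrap the 64-bit inputs into 128-bit outputs.
  cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
      stream, gpu_index, d_out, d_out_idx, d_lut, d_lut_idx, d_in, d_in_idx,
      d_bsk_128, buffer, lwe_dim, glwe_dim, poly_size, grouping, base_log,
      levels, num_samples, /*num_many_lut=*/1, /*lut_stride=*/0);
  // 3) Release the scratch buffer on the same stream.
  cleanup_cuda_multi_bit_programmable_bootstrap_128(stream, gpu_index, &buffer);
}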

View File

@@ -1,5 +1,6 @@
file(GLOB_RECURSE SOURCES "*.cu")
add_library(tfhe_cuda_backend STATIC ${SOURCES})
add_library(tfhe_cuda_backend STATIC ${SOURCES} pbs/programmable_bootstrap_multibit_128.cuh
pbs/programmable_bootstrap_multibit_128.cu)
set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(tfhe_cuda_backend PUBLIC cudart OpenMP::OpenMP_CXX)
target_include_directories(tfhe_cuda_backend PRIVATE .)

View File

@@ -66,6 +66,13 @@ __device__ inline void typecast_torus_to_double<uint64_t>(uint64_t x,
r = __ll2double_rn(x);
}
template <>
__device__ inline void typecast_torus_to_double<__uint128_t>(__uint128_t x,
double &r) {
// We truncate x, keeping only its low 64 bits
r = __ll2double_rn(static_cast<uint64_t>(x));
}
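The truncation keeps only the low 64 bits of the 128-bit torus value before the signed conversion, mirroring the uint64_t overload above. A quick illustration (not part of the commit):

__uint128_t x = (static_cast<__uint128_t>(1) << 64) | 5u; // 2^64 + 5
uint64_t low = static_cast<uint64_t>(x);                  // == 5
// typecast_torus_to_double<__uint128_t>(x, r) therefore writes the same r
// as typecast_torus_to_double<uint64_t>(5, r).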
template <typename T>
__device__ inline T init_decomposer_state(T input, uint32_t base_log,
uint32_t level_count) {

View File

@@ -234,6 +234,29 @@ __device__ void convert_u128_to_f128_as_torus(
}
}
// params is expected to be full degree not half degree
// same as convert_u128_to_f128_as_torus() but expects input to be on registers
template <class params>
__device__ void convert_u128_on_regs_to_f128_as_torus(
double *out_re_hi, double *out_re_lo, double *out_im_hi, double *out_im_lo,
const __uint128_t *in_re_on_regs, const __uint128_t *in_im_on_regs) {
const double normalization = pow(2., -128.);
Index tid = threadIdx.x;
// #pragma unroll
for (Index i = 0; i < params::opt / 2; i++) {
auto out_re = u128_to_signed_to_f128(in_re_on_regs[i]);
auto out_im = u128_to_signed_to_f128(in_im_on_regs[i]);
out_re_hi[tid] = out_re.hi * normalization;
out_re_lo[tid] = out_re.lo * normalization;
out_im_hi[tid] = out_im.hi * normalization;
out_im_lo[tid] = out_im.lo * normalization;
tid += params::degree / params::opt;
}
}
template <class params>
__device__ void
convert_f128_to_u128_as_torus(__uint128_t *out_re, __uint128_t *out_im,
@@ -272,7 +295,7 @@ batch_convert_u128_to_f128_as_integer(double *out_re_hi, double *out_re_lo,
}
// params is expected to be full degree not half degree
// converts standqard input into complex<128> represented by 4 double
// converts standard input into complex<128> represented by 4 double
// with following pattern: [re_hi_0, re_hi_1, ... re_hi_n, re_lo_0, re_lo_1,
// ... re_lo_n, im_hi_0, im_hi_1, ..., im_hi_n, im_lo_0, im_lo_1, ..., im_lo_n]
template <class params>
@@ -291,7 +314,7 @@ batch_convert_u128_to_f128_as_torus(double *out_re_hi, double *out_re_lo,
}
// params is expected to be full degree not half degree
// converts standqard input into complex<128> represented by 4 double
// converts standard input into complex<128> represented by 4 double
// with following pattern: [re_hi_0, re_lo_0, im_hi_0, im_lo_0, re_hi_1,
// re_lo_1, im_hi_1, im_lo_1,
// ...,re_hi_n, re_lo_n, im_hi_n, im_lo_n, ]
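These converters rely on a double-double ("f128") representation: each 128-bit torus coefficient becomes a hi/lo pair of doubles scaled by 2^-128 so it lands on the real torus. A minimal host-side sketch of that split; the behavior attributed to u128_to_signed_to_f128 is an assumption (the helper is defined elsewhere in the backend), and overflow at the ±2^127 boundary is ignored:

#include <cmath>
struct f128 { double hi, lo; };

f128 u128_to_signed_to_f128_sketch(__uint128_t x) {
  __int128 s = static_cast<__int128>(x); // torus values are read as signed
  double hi = static_cast<double>(s);    // nearest double to s
  // Residual of the first conversion; exactly representable away from the
  // +/-2^127 boundary (edge cases ignored in this sketch).
  double lo = static_cast<double>(s - static_cast<__int128>(hi));
  return {hi, lo};
}

void u128_to_torus_doubles(double &out_hi, double &out_lo, __uint128_t in) {
  const double normalization = std::pow(2., -128.); // same factor as above
  f128 v = u128_to_signed_to_f128_sketch(in);
  out_hi = v.hi * normalization; // high part of the torus value
  out_lo = v.lo * normalization; // low-order correction term
}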

View File

@@ -35,6 +35,20 @@ void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
static_cast<cudaStream_t>(stream), gpu_index);
}
void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_128(
void *stream, uint32_t gpu_index, void *dest, void const *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size, uint32_t grouping_factor) {
uint32_t total_polynomials = input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) *
level_count * (1 << grouping_factor) /
grouping_factor;
size_t buffer_size =
total_polynomials * polynomial_size * sizeof(__uint128_t);
cuda_memcpy_async_to_gpu((__uint128_t *)dest, (__uint128_t *)src, buffer_size,
static_cast<cudaStream_t>(stream), gpu_index);
}
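The polynomial count scales with 2^grouping_factor / grouping_factor, so 128-bit multi-bit keys grow quickly. A worked example of the formula above (illustrative only), plugging in the GPU multi-bit noise-squashing parameters added later in this commit:

// input_lwe_dim = 920, glwe_dim = 2, level_count = 3,
// polynomial_size = 2048, grouping_factor = 4
uint32_t total_polynomials = 920 * (2 + 1) * (2 + 1) * 3 * (1 << 4) / 4;
// = 920 * 9 * 3 * 4 = 99,360 polynomials
size_t buffer_size = (size_t)total_polynomials * 2048 * sizeof(__uint128_t);
// = 99,360 * 2048 * 16 bytes, about 3.03 GiB copied to the GPU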
// We need these lines so the compiler knows how to specialize these functions
template __device__ const uint64_t *
get_ith_mask_kth_block(const uint64_t *ptr, int i, int k, int level,
@@ -80,6 +94,14 @@ template __device__ double2 *get_ith_body_kth_block(double2 *ptr, int i, int k,
int glwe_dimension,
uint32_t level_count);
template __device__ const __uint128_t *
get_multi_bit_ith_lwe_gth_group_kth_block(const __uint128_t *ptr, int g, int i,
int k, int level,
uint32_t grouping_factor,
uint32_t polynomial_size,
uint32_t glwe_dimension,
uint32_t level_count);
template __device__ const uint64_t *get_multi_bit_ith_lwe_gth_group_kth_block(
const uint64_t *ptr, int g, int i, int k, int level,
uint32_t grouping_factor, uint32_t polynomial_size, uint32_t glwe_dimension,

View File

@@ -83,6 +83,62 @@ mul_ggsw_glwe_in_fourier_domain(double2 *fft, double2 *join_buffer,
__syncthreads();
}
/** Perform the matrix multiplication between the GGSW and the GLWE,
* each block operating on a single level for mask and body.
* Both operands should be in the Fourier domain.
*
* This function assumes:
* - Thread blocks at dimension z relate to the decomposition level.
* - Thread blocks at dimension y relate to the glwe dimension.
* - polynomial_size / params::opt threads are available per block
*/
template <typename G, class params>
__device__ void mul_ggsw_glwe_in_fourier_domain_128(
double *fft, double *join_buffer,
const double *__restrict__ bootstrapping_key, int iteration, G &group,
bool support_dsm = false) {
const uint32_t polynomial_size = params::degree;
const uint32_t glwe_dimension = gridDim.y - 1;
const uint32_t level_count = gridDim.z;
// The first product is used to initialize level_join_buffer
auto this_block_rank = get_this_block_rank<G>(group, support_dsm);
// Continues multiplying fft by every polynomial in that particular bsk level
// Each y-block accumulates in a different polynomial at each iteration
auto bsk_slice = get_ith_mask_kth_block_128(
bootstrapping_key, iteration, blockIdx.y, blockIdx.z, polynomial_size,
glwe_dimension, level_count);
for (int j = 0; j < glwe_dimension + 1; j++) {
int idx = (j + this_block_rank) % (glwe_dimension + 1);
auto bsk_poly = bsk_slice + idx * polynomial_size / 2 * 4;
auto buffer_slice = get_join_buffer_element_128<G>(
blockIdx.z, idx, group, join_buffer, polynomial_size, glwe_dimension,
support_dsm);
polynomial_product_accumulate_in_fourier_domain_128<params>(
buffer_slice, fft, bsk_poly, j == 0);
group.sync();
}
// -----------------------------------------------------------------
// All blocks are synchronized here; after this sync, level_join_buffer has
// the values needed from every other block
// accumulate rest of the products into fft buffer
for (int l = 0; l < level_count; l++) {
auto cur_src_acc = get_join_buffer_element_128<G>(
l, blockIdx.y, group, join_buffer, polynomial_size, glwe_dimension,
support_dsm);
polynomial_accumulate_in_fourier_domain_128<params>(fft, cur_src_acc,
l == 0);
}
__syncthreads();
}
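The launch geometry the documentation block pins down is worth making explicit. A sketch of a caller-side configuration consistent with those assumptions; the kernel name and the shared_bytes variable are hypothetical:

dim3 grid(num_samples,          // x: one slab per input ciphertext
          glwe_dimension + 1,   // y: spans the GLWE size
          level_count);         // z: spans the decomposition levels
dim3 block(polynomial_size / params::opt);
accumulate_kernel_128<params><<<grid, block, shared_bytes, stream>>>(/* ... */);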
template <typename Torus>
void execute_pbs_async(
cudaStream_t const *streams, uint32_t const *gpu_indexes,

View File

@@ -280,8 +280,9 @@ __host__ uint64_t scratch_cg_multi_bit_programmable_bootstrap(
check_cuda_error(cudaGetLastError());
}
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
gpu_index, input_lwe_ciphertext_count, polynomial_size);
auto lwe_chunk_size =
get_lwe_chunk_size<Torus, params>(gpu_index, input_lwe_ciphertext_count,
polynomial_size, full_sm_keybundle);
uint64_t size_tracker = 0;
*buffer = new pbs_buffer<Torus, MULTI_BIT>(
stream, gpu_index, glwe_dimension, polynomial_size, level_count,

View File

@@ -18,62 +18,6 @@
#include "programmable_bootstrap.cuh"
#include "types/complex/operations.cuh"
/** Perform the matrix multiplication between the GGSW and the GLWE,
* each block operating on a single level for mask and body.
* Both operands should be at fourier domain
*
* This function assumes:
* - Thread blocks at dimension z relates to the decomposition level.
* - Thread blocks at dimension y relates to the glwe dimension.
* - polynomial_size / params::opt threads are available per block
*/
template <typename G, class params>
__device__ void mul_ggsw_glwe_in_fourier_domain_128(
double *fft, double *join_buffer,
const double *__restrict__ bootstrapping_key, int iteration, G &group,
bool support_dsm = false) {
const uint32_t polynomial_size = params::degree;
const uint32_t glwe_dimension = gridDim.y - 1;
const uint32_t level_count = gridDim.z;
// The first product is used to initialize level_join_buffer
auto this_block_rank = get_this_block_rank<G>(group, support_dsm);
// Continues multiplying fft by every polynomial in that particular bsk level
// Each y-block accumulates in a different polynomial at each iteration
auto bsk_slice = get_ith_mask_kth_block_128(
bootstrapping_key, iteration, blockIdx.y, blockIdx.z, polynomial_size,
glwe_dimension, level_count);
for (int j = 0; j < glwe_dimension + 1; j++) {
int idx = (j + this_block_rank) % (glwe_dimension + 1);
auto bsk_poly = bsk_slice + idx * polynomial_size / 2 * 4;
auto buffer_slice = get_join_buffer_element_128<G>(
blockIdx.z, idx, group, join_buffer, polynomial_size, glwe_dimension,
support_dsm);
polynomial_product_accumulate_in_fourier_domain_128<params>(
buffer_slice, fft, bsk_poly, j == 0);
group.sync();
}
// -----------------------------------------------------------------
// All blocks are synchronized here; after this sync, level_join_buffer has
// the values needed from every other block
// accumulate rest of the products into fft buffer
for (int l = 0; l < level_count; l++) {
auto cur_src_acc = get_join_buffer_element_128<G>(
l, blockIdx.y, group, join_buffer, polynomial_size, glwe_dimension,
support_dsm);
polynomial_accumulate_in_fourier_domain_128<params>(fft, cur_src_acc,
l == 0);
}
__syncthreads();
}
template <typename InputTorus, class params, sharedMemDegree SMD,
bool first_iter>
__global__ void __launch_bounds__(params::degree / params::opt)
@@ -174,9 +118,6 @@ __global__ void __launch_bounds__(params::degree / params::opt)
accumulator);
gadget_acc.decompose_and_compress_level_128(accumulator_fft, blockIdx.z);
// We are using the same memory space for accumulator_fft and
// accumulator_rotated, so we need to synchronize here to make sure they
// don't modify the same memory space at the same time
// Switch to the FFT space
auto acc_fft_re_hi = accumulator_fft + 0 * params::degree / 2;
auto acc_fft_re_lo = accumulator_fft + 1 * params::degree / 2;

View File

@@ -455,11 +455,8 @@ void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
*/
template <typename Torus, class params>
uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
uint32_t polynomial_size) {
uint64_t full_sm_keybundle =
get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
polynomial_size);
uint32_t polynomial_size,
uint64_t full_sm_keybundle) {
int max_blocks_per_sm;
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);

View File

@@ -521,8 +521,9 @@ __host__ uint64_t scratch_multi_bit_programmable_bootstrap(
check_cuda_error(cudaGetLastError());
}
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
gpu_index, input_lwe_ciphertext_count, polynomial_size);
auto lwe_chunk_size =
get_lwe_chunk_size<Torus, params>(gpu_index, input_lwe_ciphertext_count,
polynomial_size, full_sm_keybundle);
uint64_t size_tracker = 0;
*buffer = new pbs_buffer<Torus, MULTI_BIT>(
stream, gpu_index, glwe_dimension, polynomial_size, level_count,

View File

@@ -0,0 +1,361 @@
#include "programmable_bootstrap_cg_multibit.cuh"
#include "programmable_bootstrap_multibit_128.cuh"
template <typename InputTorus>
uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128(
void *stream, uint32_t gpu_index,
pbs_buffer_128<InputTorus, MULTI_BIT> **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
switch (polynomial_size) {
case 256:
return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 512:
return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 1024:
return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 2048:
return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 4096:
return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
"N's are powers of two"
" in the interval [256..4096].")
}
}
template <typename InputTorus>
uint64_t scratch_cuda_cg_multi_bit_programmable_bootstrap_128(
void *stream, uint32_t gpu_index,
pbs_buffer_128<InputTorus, MULTI_BIT> **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
switch (polynomial_size) {
case 256:
return scratch_cg_multi_bit_programmable_bootstrap_128<
InputTorus, AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 512:
return scratch_cg_multi_bit_programmable_bootstrap_128<
InputTorus, AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 1024:
return scratch_cg_multi_bit_programmable_bootstrap_128<
InputTorus, AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 2048:
return scratch_cg_multi_bit_programmable_bootstrap_128<
InputTorus, AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 4096:
return scratch_cg_multi_bit_programmable_bootstrap_128<
InputTorus, AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
"N's are powers of two"
" in the interval [256..4096].")
}
}
uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
bool supports_cg =
supports_cooperative_groups_on_multibit_programmable_bootstrap<
__uint128_t>(glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count,
cuda_get_max_shared_memory(gpu_index));
if (supports_cg)
return scratch_cuda_cg_multi_bit_programmable_bootstrap_128<uint64_t>(
stream, gpu_index,
reinterpret_cast<pbs_buffer_128<uint64_t, MULTI_BIT> **>(buffer),
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory);
else
return scratch_cuda_multi_bit_programmable_bootstrap_128<uint64_t>(
stream, gpu_index,
reinterpret_cast<pbs_buffer_128<uint64_t, MULTI_BIT> **>(buffer),
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory);
}
template <typename InputTorus>
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
void *stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector,
InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in,
InputTorus const *lwe_input_indexes, __uint128_t const *bootstrapping_key,
pbs_buffer_128<InputTorus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_many_lut, uint32_t lut_stride) {
switch (polynomial_size) {
case 256:
host_multi_bit_programmable_bootstrap_128<InputTorus, AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
case 512:
host_multi_bit_programmable_bootstrap_128<InputTorus, AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
case 1024:
host_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
case 2048:
host_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
case 4096:
host_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
"N's are powers of two"
" in the interval [256..4096].")
}
}
template <typename InputTorus>
void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
void *stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector,
InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in,
InputTorus const *lwe_input_indexes, __uint128_t const *bootstrapping_key,
pbs_buffer_128<InputTorus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_many_lut, uint32_t lut_stride) {
switch (polynomial_size) {
case 256:
host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
case 512:
host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
case 1024:
host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
case 2048:
host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
case 4096:
host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
"N's are powers of two"
" in the interval [256..4096].")
}
}
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride) {
if (base_log > 64)
PANIC("Cuda error (multi-bit PBS): base log should be <= 64")
auto *buffer =
reinterpret_cast<pbs_buffer_128<uint64_t, MULTI_BIT> *>(mem_ptr);
switch (buffer->pbs_variant) {
case PBS_VARIANT::CG:
cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128<
uint64_t>(stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const __uint128_t *>(lut_vector),
static_cast<const uint64_t *>(lut_vector_indexes),
static_cast<const uint64_t *>(lwe_array_in),
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const __uint128_t *>(bootstrapping_key), buffer,
lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, num_samples,
num_many_lut, lut_stride);
break;
case PBS_VARIANT::DEFAULT:
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128<uint64_t>(
stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const __uint128_t *>(lut_vector),
static_cast<const uint64_t *>(lut_vector_indexes),
static_cast<const uint64_t *>(lwe_array_in),
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const __uint128_t *>(bootstrapping_key), buffer,
lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_many_lut, lut_stride);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.")
}
}
void cleanup_cuda_multi_bit_programmable_bootstrap_128(void *stream,
const uint32_t gpu_index,
int8_t **buffer) {
const auto x =
reinterpret_cast<pbs_buffer_128<uint64_t, MULTI_BIT> *>(*buffer);
x->release(static_cast<cudaStream_t>(stream), gpu_index);
}
/**
* Computes divisors of the product of num_sms (streaming multiprocessors on the
* GPU) and max_blocks_per_sm (maximum active blocks per SM when launching
* device_multi_bit_programmable_bootstrap_keybundle_128) that are no larger
* than the square root of that product, scanning downwards. If
* log2(max_num_pbs) <= 13, the first (largest) such divisor is selected. If
* greater, and only on architectures below SM 9.0, the scan skips
* log2(max_num_pbs) - 11 divisors before selecting, yielding a smaller LWE
* chunk size.
*
* The value 13 was empirically determined based on memory requirements for
* benchmarking on an RTX 4090 GPU, balancing performance and resource use.
*/
template <typename Torus, class params>
uint32_t get_lwe_chunk_size_128(uint32_t gpu_index, uint32_t max_num_pbs,
uint32_t polynomial_size,
uint64_t full_sm_keybundle) {
int max_blocks_per_sm;
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
cuda_set_device(gpu_index);
if (max_shared_memory < full_sm_keybundle)
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_blocks_per_sm,
device_multi_bit_programmable_bootstrap_keybundle_128<Torus, params,
NOSM>,
polynomial_size / params::opt, full_sm_keybundle);
else
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_blocks_per_sm,
device_multi_bit_programmable_bootstrap_keybundle_128<Torus, params,
FULLSM>,
polynomial_size / params::opt, 0);
int num_sms = 0;
check_cuda_error(cudaDeviceGetAttribute(
&num_sms, cudaDevAttrMultiProcessorCount, gpu_index));
int x = num_sms * max_blocks_per_sm;
int count = 0;
int divisor = 1;
int ith_divisor = 0;
#if CUDA_ARCH < 900
// We pick a smaller divisor on GPUs other than H100 so that 256-bit
// integer multiplication can still run within memory limits
int log2_max_num_pbs = log2_int(max_num_pbs);
if (log2_max_num_pbs > 13)
ith_divisor = log2_max_num_pbs - 11;
#endif
for (int i = sqrt(x); i >= 1; i--) {
if (x % i == 0) {
if (count == ith_divisor) {
divisor = i;
break;
} else {
count++;
}
}
}
return divisor;
}
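A worked example of the scan above, on a hypothetical device: with num_sms = 132 and max_blocks_per_sm = 2, x = 264 = 2^3 * 3 * 11, and the loop visits i = 16 down to 1, hitting the divisors 12, 11, 8, 6, 4, 3, 2, 1 in that order. The default ith_divisor = 0 returns 12; on a pre-SM90 GPU with log2(max_num_pbs) = 14, ith_divisor = 3 skips three hits and returns 6, so the keybundle kernel works on smaller LWE chunks.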

File diff suppressed because it is too large

View File

@@ -283,8 +283,9 @@ __host__ uint64_t scratch_tbc_multi_bit_programmable_bootstrap(
check_cuda_error(cudaGetLastError());
}
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
gpu_index, input_lwe_ciphertext_count, polynomial_size);
auto lwe_chunk_size =
get_lwe_chunk_size<Torus, params>(gpu_index, input_lwe_ciphertext_count,
polynomial_size, full_sm_keybundle);
uint64_t size_tracker = 0;
*buffer = new pbs_buffer<uint64_t, MULTI_BIT>(
stream, gpu_index, glwe_dimension, polynomial_size, level_count,

View File

@@ -5,15 +5,15 @@
#include <stdio.h>
#include <type_traits>
template <typename T> inline __device__ const char *get_format();
template <typename T> __device__ inline const char *get_format();
template <> inline __device__ const char *get_format<int>() { return "%d, "; }
template <> __device__ inline const char *get_format<int>() { return "%d, "; }
template <> inline __device__ const char *get_format<unsigned int>() {
template <> __device__ inline const char *get_format<unsigned int>() {
return "%u, ";
}
template <> inline __device__ const char *get_format<uint64_t>() {
template <> __device__ inline const char *get_format<uint64_t>() {
return "%lu, ";
}
@@ -23,6 +23,15 @@ template <typename T> __global__ void print_debug_kernel(const T *src, int N) {
}
}
template <>
__global__ inline void print_debug_kernel(const __uint128_t *src, int N) {
for (int i = 0; i < N; i++) {
uint64_t low = static_cast<uint64_t>(src[i]);
uint64_t high = static_cast<uint64_t>(src[i] >> 64);
printf("(%llu, %llu), ", high, low);
}
}
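Since printf has no native 128-bit conversion, the specialization prints each value as a (high, low) pair of 64-bit halves. A hedged usage sketch; the helper name is hypothetical:

void dump_u128(const __uint128_t *d_data, int n) {
  print_debug_kernel<<<1, 1>>>(d_data, n); // single thread walks the array
  cudaDeviceSynchronize();                 // flush device-side printf output
}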
template <>
__global__ inline void print_debug_kernel(const double2 *src, int N) {
for (int i = 0; i < N; i++) {

View File

@@ -2188,6 +2188,19 @@ unsafe extern "C" {
grouping_factor: u32,
);
}
unsafe extern "C" {
pub fn cuda_convert_lwe_multi_bit_programmable_bootstrap_key_128(
stream: *mut ffi::c_void,
gpu_index: u32,
dest: *mut ffi::c_void,
src: *const ffi::c_void,
input_lwe_dim: u32,
glwe_dim: u32,
level_count: u32,
polynomial_size: u32,
grouping_factor: u32,
);
}
unsafe extern "C" {
pub fn scratch_cuda_multi_bit_programmable_bootstrap_64(
stream: *mut ffi::c_void,
@@ -2230,3 +2243,45 @@ unsafe extern "C" {
pbs_buffer: *mut *mut i8,
);
}
unsafe extern "C" {
pub fn scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64(
stream: *mut ffi::c_void,
gpu_index: u32,
buffer: *mut *mut i8,
glwe_dimension: u32,
polynomial_size: u32,
level_count: u32,
input_lwe_ciphertext_count: u32,
allocate_gpu_memory: bool,
) -> u64;
}
unsafe extern "C" {
pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
stream: *mut ffi::c_void,
gpu_index: u32,
lwe_array_out: *mut ffi::c_void,
lwe_output_indexes: *const ffi::c_void,
lut_vector: *const ffi::c_void,
lut_vector_indexes: *const ffi::c_void,
lwe_array_in: *const ffi::c_void,
lwe_input_indexes: *const ffi::c_void,
bootstrapping_key: *const ffi::c_void,
mem_ptr: *mut i8,
lwe_dimension: u32,
glwe_dimension: u32,
polynomial_size: u32,
grouping_factor: u32,
base_log: u32,
level_count: u32,
num_samples: u32,
num_many_lut: u32,
lut_stride: u32,
);
}
unsafe extern "C" {
pub fn cleanup_cuda_multi_bit_programmable_bootstrap_128(
stream: *mut ffi::c_void,
gpu_index: u32,
buffer: *mut *mut i8,
);
}

View File

@@ -165,7 +165,7 @@ fn pbs_128(c: &mut Criterion) {
mod cuda {
use benchmark::utilities::{
cuda_local_keys_core, cuda_local_streams_core, get_bench_type, throughput_num_threads,
write_to_json, BenchmarkType, CpuKeys, CpuKeysBuilder, CryptoParametersRecord,
write_to_json, BenchmarkType, CpuKeys, CpuKeysBuilder, CryptoParametersRecord, CudaIndexes,
CudaLocalKeys, OperatorType,
};
use criterion::{black_box, Criterion, Throughput};
@@ -173,12 +173,14 @@ mod cuda {
use tfhe::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
use tfhe::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
use tfhe::core_crypto::gpu::{
cuda_multi_bit_programmable_bootstrap_128_lwe_ciphertext,
cuda_programmable_bootstrap_128_lwe_ciphertext, get_number_of_gpus, CudaStreams,
};
use tfhe::core_crypto::prelude::*;
use tfhe::shortint::engine::ShortintEngine;
use tfhe::shortint::parameters::{
ModulusSwitchType, NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
};
use tfhe::shortint::server_key::ModulusSwitchNoiseReductionKey;
@@ -441,14 +443,281 @@ mod cuda {
);
}
fn cuda_multi_bit_pbs_128(c: &mut Criterion) {
let bench_name = "core_crypto::cuda::multi_bit_pbs128";
let mut bench_group = c.benchmark_group(bench_name);
bench_group
.sample_size(10)
.measurement_time(std::time::Duration::from_secs(30));
type Scalar = u128;
let input_params = PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
let squash_params =
NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
let lwe_noise_distribution_u64 = DynamicDistribution::new_t_uniform(46);
let ct_modulus_u64: CiphertextModulus<u64> = CiphertextModulus::new_native();
let params_name = "PARAMS_SWITCH_SQUASH";
let mut boxed_seeder = new_seeder();
let seeder = boxed_seeder.as_mut();
let mut secret_generator =
SecretRandomGenerator::<DefaultRandomGenerator>::new(seeder.seed());
let mut encryption_generator =
EncryptionRandomGenerator::<DefaultRandomGenerator>::new(seeder.seed(), seeder);
let input_lwe_secret_key =
LweSecretKey::generate_new_binary(input_params.lwe_dimension, &mut secret_generator);
let output_glwe_secret_key = GlweSecretKey::<Vec<Scalar>>::generate_new_binary(
squash_params.glwe_dimension,
squash_params.polynomial_size,
&mut secret_generator,
);
let output_lwe_secret_key = output_glwe_secret_key.clone().into_lwe_secret_key();
let multi_bit_bsk = LweMultiBitBootstrapKey::new(
Scalar::ZERO,
squash_params.glwe_dimension.to_glwe_size(),
squash_params.polynomial_size,
squash_params.decomp_base_log,
squash_params.decomp_level_count,
input_params.lwe_dimension,
squash_params.grouping_factor,
squash_params.ciphertext_modulus,
);
let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new()
.multi_bit_bootstrap_key(multi_bit_bsk)
.build();
let message_modulus: u64 = 1 << 4;
let input_message: u64 = 3;
let delta: u64 = (1 << (u64::BITS - 1)) / message_modulus;
let plaintext = Plaintext(input_message * delta);
let bench_id;
match get_bench_type() {
BenchmarkType::Latency => {
let streams = CudaStreams::new_multi_gpu();
let gpu_keys = CudaLocalKeys::from_cpu_keys(&cpu_keys, None, &streams);
let lwe_ciphertext_in: LweCiphertextOwned<u64> =
allocate_and_encrypt_new_lwe_ciphertext(
&input_lwe_secret_key,
plaintext,
lwe_noise_distribution_u64,
ct_modulus_u64,
&mut encryption_generator,
);
let lwe_ciphertext_in_gpu =
CudaLweCiphertextList::from_lwe_ciphertext(&lwe_ciphertext_in, &streams);
let accumulator: GlweCiphertextOwned<Scalar> = GlweCiphertextOwned::new(
Scalar::ONE,
squash_params.glwe_dimension.to_glwe_size(),
squash_params.polynomial_size,
squash_params.ciphertext_modulus,
);
let accumulator_gpu =
CudaGlweCiphertextList::from_glwe_ciphertext(&accumulator, &streams);
let out_pbs_ct = LweCiphertext::new(
Scalar::ZERO,
output_lwe_secret_key.lwe_dimension().to_lwe_size(),
squash_params.ciphertext_modulus,
);
let mut out_pbs_ct_gpu =
CudaLweCiphertextList::from_lwe_ciphertext(&out_pbs_ct, &streams);
let h_indexes = [0];
let cuda_indexes = CudaIndexes::new(&h_indexes, &streams, 0);
bench_id = format!("{bench_name}::{params_name}");
{
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
cuda_multi_bit_programmable_bootstrap_128_lwe_ciphertext(
&lwe_ciphertext_in_gpu,
&mut out_pbs_ct_gpu,
&accumulator_gpu,
&cuda_indexes.d_lut,
&cuda_indexes.d_output,
&cuda_indexes.d_input,
gpu_keys.multi_bit_bsk.as_ref().unwrap(),
&streams,
);
black_box(&mut out_pbs_ct_gpu);
})
});
}
}
BenchmarkType::Throughput => {
let gpu_keys_vec = cuda_local_keys_core(&cpu_keys, None);
let gpu_count = get_number_of_gpus() as usize;
bench_id = format!("{bench_name}::throughput::{params_name}");
let blocks: usize = 1;
let elements = throughput_num_threads(blocks, 1);
let elements_per_stream = elements as usize / gpu_count;
bench_group.throughput(Throughput::Elements(elements));
bench_group.bench_function(&bench_id, |b| {
let setup_encrypted_values = || {
let local_streams = cuda_local_streams_core();
let plaintext_list =
PlaintextList::new(u64::ZERO, PlaintextCount(elements_per_stream));
let input_cts = (0..gpu_count)
.map(|i| {
let mut input_ct_list = LweCiphertextList::new(
u64::ZERO,
input_lwe_secret_key.lwe_dimension().to_lwe_size(),
LweCiphertextCount(elements_per_stream),
ct_modulus_u64,
);
encrypt_lwe_ciphertext_list(
&input_lwe_secret_key,
&mut input_ct_list,
&plaintext_list,
lwe_noise_distribution_u64,
&mut encryption_generator,
);
CudaLweCiphertextList::from_lwe_ciphertext_list(
&input_ct_list,
&local_streams[i],
)
})
.collect::<Vec<_>>();
let accumulators = (0..gpu_count)
.map(|i| {
let accumulator = GlweCiphertextOwned::new(
Scalar::ONE,
squash_params.glwe_dimension.to_glwe_size(),
squash_params.polynomial_size,
squash_params.ciphertext_modulus,
);
CudaGlweCiphertextList::from_glwe_ciphertext(
&accumulator,
&local_streams[i],
)
})
.collect::<Vec<_>>();
// Allocate the LweCiphertext to store the result of the PBS
let output_cts = (0..gpu_count)
.map(|i| {
let output_ct_list = LweCiphertextList::new(
Scalar::ZERO,
output_lwe_secret_key.lwe_dimension().to_lwe_size(),
LweCiphertextCount(elements_per_stream),
squash_params.ciphertext_modulus,
);
CudaLweCiphertextList::from_lwe_ciphertext_list(
&output_ct_list,
&local_streams[i],
)
})
.collect::<Vec<_>>();
let h_indexes = (0..(elements / gpu_count as u64))
.map(CastFrom::cast_from)
.collect::<Vec<_>>();
let cuda_indexes_vec = (0..gpu_count)
.map(|i| CudaIndexes::new(&h_indexes, &local_streams[i], 0))
.collect::<Vec<_>>();
local_streams.iter().for_each(|stream| stream.synchronize());
(
input_cts,
output_cts,
accumulators,
cuda_indexes_vec,
local_streams,
)
};
b.iter_batched(
setup_encrypted_values,
|(
input_cts,
mut output_cts,
accumulators,
cuda_indexes_vec,
local_streams,
)| {
(0..gpu_count)
.into_par_iter()
.zip(input_cts.par_iter())
.zip(output_cts.par_iter_mut())
.zip(accumulators.par_iter())
.zip(local_streams.par_iter())
.for_each(
|((((i, input_ct), output_ct), accumulator), local_stream)| {
cuda_multi_bit_programmable_bootstrap_128_lwe_ciphertext(
input_ct,
output_ct,
accumulator,
&cuda_indexes_vec[i].d_lut,
&cuda_indexes_vec[i].d_output,
&cuda_indexes_vec[i].d_input,
gpu_keys_vec[i].multi_bit_bsk.as_ref().unwrap(),
local_stream,
);
},
)
},
criterion::BatchSize::SmallInput,
);
});
}
};
let params_record = CryptoParametersRecord {
lwe_dimension: Some(input_params.lwe_dimension),
glwe_dimension: Some(squash_params.glwe_dimension),
polynomial_size: Some(squash_params.polynomial_size),
lwe_noise_distribution: Some(lwe_noise_distribution_u64),
glwe_noise_distribution: Some(input_params.glwe_noise_distribution),
pbs_base_log: Some(squash_params.decomp_base_log),
pbs_level: Some(squash_params.decomp_level_count),
ciphertext_modulus: Some(input_params.ciphertext_modulus),
..Default::default()
};
let bit_size = (message_modulus as u32).ilog2();
write_to_json(
&bench_id,
params_record,
params_name,
"pbs",
&OperatorType::Atomic,
bit_size,
vec![bit_size],
);
}
pub fn cuda_pbs128_group() {
let mut criterion: Criterion<_> = Criterion::default().configure_from_args();
cuda_pbs_128(&mut criterion);
}
pub fn cuda_multi_bit_pbs128_group() {
let mut criterion: Criterion<_> = Criterion::default().configure_from_args();
cuda_multi_bit_pbs_128(&mut criterion);
}
}
#[cfg(feature = "gpu")]
use cuda::cuda_pbs128_group;
use cuda::{cuda_multi_bit_pbs128_group, cuda_pbs128_group};
pub fn pbs128_group() {
let mut criterion: Criterion<_> = Criterion::default().configure_from_args();
@@ -458,6 +727,7 @@ pub fn pbs128_group() {
#[cfg(feature = "gpu")]
fn go_through_gpu_bench_groups() {
cuda_pbs128_group();
cuda_multi_bit_pbs128_group();
}
#[cfg(not(feature = "gpu"))]

View File

@@ -521,7 +521,7 @@ mod cuda_utils {
pub ksk: Option<CudaLweKeyswitchKey<T>>,
pub pksk: Option<CudaLwePackingKeyswitchKey<T>>,
pub bsk: Option<CudaLweBootstrapKey>,
pub multi_bit_bsk: Option<CudaLweMultiBitBootstrapKey>,
pub multi_bit_bsk: Option<CudaLweMultiBitBootstrapKey<T>>,
}
#[allow(dead_code)]

View File

@@ -13,6 +13,8 @@ use tfhe::shortint::parameters::current_params::{
VEC_ALL_COMPRESSION_PARAMETERS, VEC_ALL_HPU_PARAMETERS, VEC_ALL_KS32_PARAMETERS,
VEC_ALL_MULTI_BIT_PBS_PARAMETERS, VEC_ALL_NOISE_SQUASHING_PARAMETERS,
};
use tfhe::shortint::parameters::noise_squashing::NoiseSquashingMultiBitParameters;
use tfhe::shortint::parameters::v1_3::VEC_ALL_NOISE_SQUASHING_MULTI_BIT_PARAMETERS;
use tfhe::shortint::parameters::{
CompactPublicKeyEncryptionParameters, CompressionParameters, NoiseSquashingParameters,
};
@@ -214,6 +216,36 @@ impl ParamDetails<u128> for NoiseSquashingParameters {
}
}
impl ParamDetails<u128> for NoiseSquashingMultiBitParameters {
fn lwe_dimension(&self) -> LweDimension {
panic!("lwe_dimension not applicable for NoiseSquashingMultiBitParameters")
}
fn glwe_dimension(&self) -> GlweDimension {
self.glwe_dimension
}
fn lwe_noise_distribution(&self) -> DynamicDistribution<u128> {
panic!("lwe_noise_distribution not applicable for NoiseSquashingMultiBitParameters")
}
fn glwe_noise_distribution(&self) -> DynamicDistribution<u128> {
self.glwe_noise_distribution
}
fn polynomial_size(&self) -> PolynomialSize {
self.polynomial_size
}
fn lwe_ciphertext_modulus(&self) -> ParamModulus {
panic!("lwe_ciphertext_modulus not applicable for NoiseSquashingMultiBitParameters")
}
fn glwe_ciphertext_modulus(&self) -> ParamModulus {
ParamModulus::from_ciphertext_modulus(self.ciphertext_modulus)
}
}
#[derive(Eq, PartialEq, Hash)]
enum ParametersFormat {
Lwe,
@@ -493,6 +525,16 @@ fn main() {
ParametersFormat::Glwe,
);
let noise_squashing_multi_bit_params: Vec<_> = VEC_ALL_NOISE_SQUASHING_MULTI_BIT_PARAMETERS
.into_iter()
.map(|p| (*p.0, Some(p.1)))
.collect();
write_all_params_in_file(
"shortint_noise_squashing_multi_bit_parameters_lattice_estimator.sage",
&noise_squashing_multi_bit_params,
ParametersFormat::Glwe,
);
let ks32_params: Vec<_> = VEC_ALL_KS32_PARAMETERS
.into_iter()
.map(|p| (AtomicPatternParameters::from(*p.0), Some(p.1)))

View File

@@ -178,6 +178,36 @@ pub const DUMMY_31_U32: ClassicTestParams<u32> = ClassicTestParams {
ciphertext_modulus: CiphertextModulus::new(1 << 31),
};
#[cfg(feature = "gpu")]
pub const NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128:
NoiseSquashingMultiBitTestParameters<u128> = NoiseSquashingMultiBitTestParameters {
glwe_dimension: GlweDimension(2),
polynomial_size: PolynomialSize(2048),
glwe_noise_distribution: DynamicDistribution::new_t_uniform(30),
decomp_base_log: DecompositionBaseLog(23),
decomp_level_count: DecompositionLevelCount(3),
grouping_factor: LweBskGroupingFactor(4),
message_modulus_log: MessageModulusLog(4),
ciphertext_modulus: CiphertextModulus::<u128>::new_native(),
};
#[cfg(feature = "gpu")]
pub const PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128: MultiBitTestParams<
u64,
> = MultiBitTestParams {
input_lwe_dimension: LweDimension(920),
lwe_noise_distribution: DynamicDistribution::new_t_uniform(45),
decomp_base_log: DecompositionBaseLog(22),
decomp_level_count: DecompositionLevelCount(1),
glwe_dimension: GlweDimension(1),
polynomial_size: PolynomialSize(2048),
glwe_noise_distribution: DynamicDistribution::new_t_uniform(17),
message_modulus_log: MessageModulusLog(4),
ciphertext_modulus: CiphertextModulus::new_native(),
grouping_factor: LweBskGroupingFactor(4),
thread_count: ThreadCount(5),
};
pub const MULTI_BIT_2_2_2_PARAMS: MultiBitTestParams<u64> = MultiBitTestParams {
input_lwe_dimension: LweDimension(818),
lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(

View File

@@ -3,6 +3,8 @@ use crate::core_crypto::entities::*;
use crate::core_crypto::prelude::{CastFrom, CastInto, UnsignedInteger};
use crate::keycache::NamedParam;
#[cfg(feature = "gpu")]
use crate::shortint::parameters::ModulusSwitchNoiseReductionParams;
#[cfg(feature = "gpu")]
use crate::shortint::parameters::ModulusSwitchType;
use serde::{Deserialize, Serialize};
@@ -22,6 +24,13 @@ pub struct MultiBitBootstrapKeys<Scalar: UnsignedInteger> {
pub fbsk: FourierLweMultiBitBootstrapKeyOwned,
}
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct MultiBitStdBootstrapKeys<Scalar: UnsignedInteger> {
pub small_lwe_sk: LweSecretKey<Vec<Scalar>>,
pub big_lwe_sk: LweSecretKey<Vec<Scalar>>,
pub bsk: LweMultiBitBootstrapKeyOwned<Scalar>,
}
// Fourier key is generated afterward in order to use generic test function
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct FftBootstrapKeys<Scalar: UnsignedInteger> {
@@ -81,6 +90,18 @@ pub struct MultiBitTestParams<Scalar: UnsignedInteger> {
pub thread_count: ThreadCount,
}
#[derive(Clone, Copy, Debug, Serialize, Deserialize)]
pub struct NoiseSquashingMultiBitTestParameters<Scalar: UnsignedInteger> {
pub glwe_dimension: GlweDimension,
pub polynomial_size: PolynomialSize,
pub glwe_noise_distribution: DynamicDistribution<Scalar>,
pub decomp_base_log: DecompositionBaseLog,
pub decomp_level_count: DecompositionLevelCount,
pub grouping_factor: LweBskGroupingFactor,
pub message_modulus_log: MessageModulusLog,
pub ciphertext_modulus: CiphertextModulus<Scalar>,
}
// PartialEq is implemented manually because thread_count doesn't affect key generation and we want
// to change its value in test without the need of regenerating keys in the key cache.
impl<Scalar: UnsignedInteger> PartialEq for MultiBitTestParams<Scalar> {
@@ -141,6 +162,21 @@ pub struct NoiseSquashingTestParams<Scalar: UnsignedInteger> {
pub modulus_switch_noise_reduction_params: ModulusSwitchType,
pub ciphertext_modulus: CiphertextModulus<Scalar>,
}
// Parameters to test NoiseSquashing implementation
#[cfg(feature = "gpu")]
#[derive(Clone, Copy, Debug, PartialEq, Serialize, Deserialize)]
pub struct NoiseSquashingMultiBitTestParams<Scalar: UnsignedInteger> {
pub lwe_dimension: LweDimension,
pub glwe_dimension: GlweDimension,
pub polynomial_size: PolynomialSize,
pub lwe_noise_distribution: DynamicDistribution<Scalar>,
pub glwe_noise_distribution: DynamicDistribution<Scalar>,
pub pbs_base_log: DecompositionBaseLog,
pub pbs_level: DecompositionLevelCount,
pub grouping_factor: LweBskGroupingFactor,
pub modulus_switch_noise_reduction_params: Option<ModulusSwitchNoiseReductionParams>,
pub ciphertext_modulus: CiphertextModulus<Scalar>,
}
#[derive(Copy, Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct PackingKeySwitchTestParams<Scalar: UnsignedInteger> {

View File

@@ -17,7 +17,7 @@ pub unsafe fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_async<Scalar>
lut_indexes: &CudaVec<Scalar>,
output_indexes: &CudaVec<Scalar>,
input_indexes: &CudaVec<Scalar>,
multi_bit_bsk: &CudaLweMultiBitBootstrapKey,
multi_bit_bsk: &CudaLweMultiBitBootstrapKey<Scalar>,
streams: &CudaStreams,
) where
// CastInto required for PBS modulus switch which returns a usize
@@ -151,7 +151,7 @@ pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext<Scalar>(
lut_indexes: &CudaVec<Scalar>,
output_indexes: &CudaVec<Scalar>,
input_indexes: &CudaVec<Scalar>,
multi_bit_bsk: &CudaLweMultiBitBootstrapKey,
multi_bit_bsk: &CudaLweMultiBitBootstrapKey<Scalar>,
streams: &CudaStreams,
) where
// CastInto required for PBS modulus switch which returns a usize
@@ -171,3 +171,162 @@ pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext<Scalar>(
}
streams.synchronize();
}
/// # Safety
///
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must not
/// be dropped until streams is synchronised
#[allow(clippy::too_many_arguments)]
pub unsafe fn cuda_multi_bit_programmable_bootstrap_128_lwe_ciphertext_async<OutputScalar>(
input: &CudaLweCiphertextList<u64>,
output: &mut CudaLweCiphertextList<OutputScalar>,
accumulator: &CudaGlweCiphertextList<OutputScalar>,
lut_indexes: &CudaVec<u64>,
output_indexes: &CudaVec<u64>,
input_indexes: &CudaVec<u64>,
multi_bit_bsk: &CudaLweMultiBitBootstrapKey<OutputScalar>,
streams: &CudaStreams,
) where
// CastInto required for PBS modulus switch which returns a usize
OutputScalar: UnsignedTorus + CastInto<usize>,
{
assert_eq!(
input.lwe_dimension(),
multi_bit_bsk.input_lwe_dimension(),
"Mismatched input LweDimension. LweCiphertext input LweDimension {:?}. \
FourierLweMultiBitBootstrapKey input LweDimension {:?}.",
input.lwe_dimension(),
multi_bit_bsk.input_lwe_dimension(),
);
assert_eq!(
output.lwe_dimension(),
multi_bit_bsk.output_lwe_dimension(),
"Mismatched output LweDimension. LweCiphertext output LweDimension {:?}. \
FourierLweMultiBitBootstrapKey output LweDimension {:?}.",
output.lwe_dimension(),
multi_bit_bsk.output_lwe_dimension(),
);
assert_eq!(
accumulator.glwe_dimension(),
multi_bit_bsk.glwe_dimension(),
"Mismatched GlweSize. Accumulator GlweSize {:?}. \
FourierLweMultiBitBootstrapKey GlweSize {:?}.",
accumulator.glwe_dimension(),
multi_bit_bsk.glwe_dimension(),
);
assert_eq!(
accumulator.polynomial_size(),
multi_bit_bsk.polynomial_size(),
"Mismatched PolynomialSize. Accumulator PolynomialSize {:?}. \
FourierLweMultiBitBootstrapKey PolynomialSize {:?}.",
accumulator.polynomial_size(),
multi_bit_bsk.polynomial_size(),
);
assert_eq!(
output.ciphertext_modulus(),
accumulator.ciphertext_modulus(),
"Mismatched CiphertextModulus between output ({:?}) and accumulator ({:?})",
output.ciphertext_modulus(),
accumulator.ciphertext_modulus(),
);
assert_eq!(
streams.gpu_indexes[0],
multi_bit_bsk.d_vec.gpu_index(0),
"GPU error: first stream is on GPU {}, first bsk pointer is on GPU {}",
streams.gpu_indexes[0].get(),
multi_bit_bsk.d_vec.gpu_index(0).get(),
);
assert_eq!(
streams.gpu_indexes[0],
input.0.d_vec.gpu_index(0),
"GPU error: first stream is on GPU {}, first input pointer is on GPU {}",
streams.gpu_indexes[0].get(),
input.0.d_vec.gpu_index(0).get(),
);
assert_eq!(
streams.gpu_indexes[0],
output.0.d_vec.gpu_index(0),
"GPU error: first stream is on GPU {}, first output pointer is on GPU {}",
streams.gpu_indexes[0].get(),
output.0.d_vec.gpu_index(0).get(),
);
assert_eq!(
streams.gpu_indexes[0],
accumulator.0.d_vec.gpu_index(0),
"GPU error: first stream is on GPU {}, first accumulator pointer is on GPU {}",
streams.gpu_indexes[0].get(),
accumulator.0.d_vec.gpu_index(0).get(),
);
assert_eq!(
streams.gpu_indexes[0],
input_indexes.gpu_index(0),
"GPU error: first stream is on GPU {}, first input indexes pointer is on GPU {}",
streams.gpu_indexes[0].get(),
input_indexes.gpu_index(0).get(),
);
assert_eq!(
streams.gpu_indexes[0],
output_indexes.gpu_index(0),
"GPU error: first stream is on GPU {}, first output indexes pointer is on GPU {}",
streams.gpu_indexes[0].get(),
output_indexes.gpu_index(0).get(),
);
assert_eq!(
streams.gpu_indexes[0],
lut_indexes.gpu_index(0),
"GPU error: first stream is on GPU {}, first lut indexes pointer is on GPU {}",
streams.gpu_indexes[0].get(),
lut_indexes.gpu_index(0).get(),
);
programmable_bootstrap_multi_bit_async(
streams,
&mut output.0.d_vec,
output_indexes,
&accumulator.0.d_vec,
lut_indexes,
&input.0.d_vec,
input_indexes,
&multi_bit_bsk.d_vec,
input.lwe_dimension(),
multi_bit_bsk.glwe_dimension(),
multi_bit_bsk.polynomial_size(),
multi_bit_bsk.decomp_base_log(),
multi_bit_bsk.decomp_level_count(),
multi_bit_bsk.grouping_factor(),
input.lwe_ciphertext_count().0 as u32,
);
}
#[allow(clippy::too_many_arguments)]
pub fn cuda_multi_bit_programmable_bootstrap_128_lwe_ciphertext<Scalar>(
input: &CudaLweCiphertextList<u64>,
output: &mut CudaLweCiphertextList<Scalar>,
accumulator: &CudaGlweCiphertextList<Scalar>,
lut_indexes: &CudaVec<u64>,
output_indexes: &CudaVec<u64>,
input_indexes: &CudaVec<u64>,
multi_bit_bsk: &CudaLweMultiBitBootstrapKey<Scalar>,
streams: &CudaStreams,
) where
// CastInto required for PBS modulus switch which returns a usize
Scalar: UnsignedTorus + CastInto<usize>,
{
unsafe {
cuda_multi_bit_programmable_bootstrap_128_lwe_ciphertext_async(
input,
output,
accumulator,
lut_indexes,
output_indexes,
input_indexes,
multi_bit_bsk,
streams,
);
}
streams.synchronize();
}
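The safe wrapper above upholds the `# Safety` contract by pairing the async launch with an unconditional `streams.synchronize()` before returning. A minimal standalone sketch of that pattern, with a hypothetical `Stream` type standing in for `CudaStreams`:
struct Stream;
impl Stream {
    fn synchronize(&self) { /* would block until queued device work completes */ }
}
// Hypothetical async FFI launch; callers must keep inputs alive until sync.
unsafe fn launch_async(_streams: &Stream) { /* would enqueue device work */ }
fn launch(streams: &Stream) {
    unsafe { launch_async(streams) };
    streams.synchronize(); // inputs may only be dropped after this returns
}
fn main() {
    launch(&Stream);
}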

View File

@@ -0,0 +1,187 @@
use super::*;
use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
use crate::core_crypto::gpu::lwe_multi_bit_bootstrap_key::CudaLweMultiBitBootstrapKey;
use crate::core_crypto::gpu::vec::{CudaVec, GpuIndex};
use crate::core_crypto::gpu::{
cuda_multi_bit_programmable_bootstrap_128_lwe_ciphertext, CudaStreams,
};
use crate::core_crypto::prelude::misc::check_encrypted_content_respects_mod;
use itertools::Itertools;
fn execute_multibit_bootstrap_u128(
squash_params: NoiseSquashingMultiBitTestParameters<u128>,
input_params: MultiBitTestParams<u64>,
) {
let input_lwe_dimension = input_params.input_lwe_dimension;
let lwe_noise_distribution = input_params.lwe_noise_distribution;
let glwe_noise_distribution = squash_params.glwe_noise_distribution;
let ciphertext_modulus = squash_params.ciphertext_modulus;
let ciphertext_modulus_64 = CiphertextModulus::new_native();
let msg_modulus = input_params.message_modulus_log;
let encoding_with_padding = get_encoding_with_padding(ciphertext_modulus);
let encoding_with_padding_64: u64 = get_encoding_with_padding(ciphertext_modulus_64);
let glwe_dimension = squash_params.glwe_dimension;
let polynomial_size = squash_params.polynomial_size;
let decomp_base_log = squash_params.decomp_base_log;
let decomp_level_count = squash_params.decomp_level_count;
let grouping_factor = squash_params.grouping_factor;
let gpu_index = 0;
let stream = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
let mut rsc = TestResources::new();
let f = |x: u128| x % msg_modulus.0 as u128;
let delta = encoding_with_padding / msg_modulus.0 as u128;
let delta_64 = encoding_with_padding_64 / msg_modulus.0 as u64;
let mut msg = msg_modulus.0 as u64;
const NB_TESTS: usize = 10;
let number_of_messages = 1;
let accumulator = generate_programmable_bootstrap_glwe_lut(
polynomial_size,
glwe_dimension.to_glwe_size(),
msg_modulus.0.cast_into(),
ciphertext_modulus,
delta,
f,
);
assert!(check_encrypted_content_respects_mod(
&accumulator,
ciphertext_modulus
));
// Create the LweSecretKey
let small_lwe_sk: LweSecretKeyOwned<u128> = allocate_and_generate_new_binary_lwe_secret_key(
input_lwe_dimension,
&mut rsc.secret_random_generator,
);
let input_lwe_secret_key = LweSecretKey::from_container(
small_lwe_sk
.clone()
.into_container()
.iter()
.copied()
.map(|x| x as u64)
.collect::<Vec<_>>(),
);
let output_glwe_secret_key: GlweSecretKeyOwned<u128> =
allocate_and_generate_new_binary_glwe_secret_key(
glwe_dimension,
polynomial_size,
&mut rsc.secret_random_generator,
);
let output_lwe_secret_key = output_glwe_secret_key.clone().into_lwe_secret_key();
let output_lwe_dimension = output_lwe_secret_key.lwe_dimension();
let mut bsk = LweMultiBitBootstrapKey::new(
0u128,
glwe_dimension.to_glwe_size(),
polynomial_size,
decomp_base_log,
decomp_level_count,
input_lwe_dimension,
grouping_factor,
ciphertext_modulus,
);
par_generate_lwe_multi_bit_bootstrap_key(
&small_lwe_sk,
&output_glwe_secret_key,
&mut bsk,
glwe_noise_distribution,
&mut rsc.encryption_random_generator,
);
assert!(check_encrypted_content_respects_mod(
&*bsk,
ciphertext_modulus
));
let d_bsk = CudaLweMultiBitBootstrapKey::from_lwe_multi_bit_bootstrap_key(&bsk, &stream);
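// Sweep every message value from msg_modulus - 1 down to 0, bootstrapping each NB_TESTS times.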
while msg != 0 {
msg -= 1;
for _ in 0..NB_TESTS {
let plaintext = Plaintext(msg * delta_64);
let lwe_ciphertext_in = allocate_and_encrypt_new_lwe_ciphertext(
&input_lwe_secret_key,
plaintext,
lwe_noise_distribution,
ciphertext_modulus_64,
&mut rsc.encryption_random_generator,
);
assert!(check_encrypted_content_respects_mod(
&lwe_ciphertext_in,
ciphertext_modulus_64
));
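// Move the 64-bit input ciphertext to the GPU and allocate the 128-bit output list.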
let d_lwe_ciphertext_in =
CudaLweCiphertextList::from_lwe_ciphertext(&lwe_ciphertext_in, &stream);
let mut d_out_pbs_ct = CudaLweCiphertextList::new(
output_lwe_dimension,
LweCiphertextCount(1),
ciphertext_modulus,
&stream,
);
let d_accumulator = CudaGlweCiphertextList::from_glwe_ciphertext(&accumulator, &stream);
let mut test_vector_indexes: Vec<u64> = vec![0; number_of_messages];
for (i, ind) in test_vector_indexes.iter_mut().enumerate() {
*ind = <usize as CastInto<u64>>::cast_into(i);
}
let mut d_test_vector_indexes =
unsafe { CudaVec::<u64>::new_async(number_of_messages, &stream, 0) };
unsafe { d_test_vector_indexes.copy_from_cpu_async(&test_vector_indexes, &stream, 0) };
let num_blocks = d_lwe_ciphertext_in.0.lwe_ciphertext_count.0;
let lwe_indexes_usize: Vec<usize> = (0..num_blocks).collect_vec();
let lwe_indexes = lwe_indexes_usize
.iter()
.map(|&x| <usize as CastInto<u64>>::cast_into(x))
.collect_vec();
let mut d_output_indexes = unsafe { CudaVec::<u64>::new_async(num_blocks, &stream, 0) };
let mut d_input_indexes = unsafe { CudaVec::<u64>::new_async(num_blocks, &stream, 0) };
unsafe {
d_input_indexes.copy_from_cpu_async(&lwe_indexes, &stream, 0);
d_output_indexes.copy_from_cpu_async(&lwe_indexes, &stream, 0);
}
cuda_multi_bit_programmable_bootstrap_128_lwe_ciphertext(
&d_lwe_ciphertext_in,
&mut d_out_pbs_ct,
&d_accumulator,
&d_test_vector_indexes,
&d_output_indexes,
&d_input_indexes,
&d_bsk,
&stream,
);
let out_pbs_ct = d_out_pbs_ct.into_lwe_ciphertext(&stream);
assert!(check_encrypted_content_respects_mod(
&out_pbs_ct,
ciphertext_modulus
));
let decrypted = decrypt_lwe_ciphertext(&output_lwe_secret_key, &out_pbs_ct);
let decoded = round_decode(decrypted.0, delta) % msg_modulus.0 as u128;
assert_eq!(decoded, f(msg as u128));
}
}
}
#[test]
fn test_multibit_bootstrap_u128_with_squashing() {
execute_multibit_bootstrap_u128(
NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
);
}
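The test above keeps two deltas because messages are encrypted on the 64-bit torus and decoded on the 128-bit one after squashing. A minimal standalone sketch of that arithmetic, assuming `get_encoding_with_padding` returns half the native modulus (one bit of padding) and a message modulus of 4:
fn main() {
    let msg_modulus: u64 = 4; // MessageModulus(4): 2-bit messages
    let encoding_with_padding_64: u64 = 1u64 << 63; // assumed: half the native u64 modulus
    let encoding_with_padding_128: u128 = 1u128 << 127; // assumed: half the native u128 modulus
    let delta_64 = encoding_with_padding_64 / msg_modulus; // 2^61
    let delta_128 = encoding_with_padding_128 / msg_modulus as u128; // 2^125
    // A message m sits in the top bits of each torus, encoded as m * delta.
    let m: u64 = 3;
    assert_eq!((m * delta_64) >> 61, m);
    assert_eq!((m as u128 * delta_128) >> 125, m as u128);
}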

View File

@@ -7,11 +7,13 @@ mod glwe_sample_extraction;
mod lwe_keyswitch;
mod lwe_linear_algebra;
mod lwe_multi_bit_programmable_bootstrapping;
mod lwe_multi_bit_programmable_bootstrapping_128;
mod lwe_packing_keyswitch;
mod lwe_programmable_bootstrapping;
mod lwe_programmable_bootstrapping_128;
mod modulus_switch_noise_reduction;
mod noise_distribution;
pub struct CudaPackingKeySwitchKeys<Scalar: UnsignedInteger> {
pub lwe_sk: LweSecretKey<Vec<Scalar>>,
pub glwe_sk: GlweSecretKey<Vec<Scalar>>,

View File

@@ -10,9 +10,9 @@ use crate::core_crypto::prelude::{
/// A structure representing an LWE multi-bit bootstrap key on the GPU, generic over its torus precision.
#[derive(Debug)]
pub struct CudaLweMultiBitBootstrapKey {
pub struct CudaLweMultiBitBootstrapKey<Scalar: UnsignedInteger> {
// Pointers to GPU data
pub(crate) d_vec: CudaVec<u64>,
pub(crate) d_vec: CudaVec<Scalar>,
// Lwe dimension
pub(crate) input_lwe_dimension: LweDimension,
// Glwe dimension
@@ -27,14 +27,11 @@ pub struct CudaLweMultiBitBootstrapKey {
pub(crate) grouping_factor: LweBskGroupingFactor,
}
impl CudaLweMultiBitBootstrapKey {
pub fn from_lwe_multi_bit_bootstrap_key<InputBskCont: Container>(
impl<Scalar: UnsignedInteger> CudaLweMultiBitBootstrapKey<Scalar> {
pub fn from_lwe_multi_bit_bootstrap_key<InputBskCont: Container<Element = Scalar>>(
bsk: &LweMultiBitBootstrapKey<InputBskCont>,
streams: &CudaStreams,
) -> Self
where
InputBskCont::Element: UnsignedInteger,
{
) -> Self {
let input_lwe_dimension = bsk.input_lwe_dimension();
let polynomial_size = bsk.polynomial_size();
let decomp_level_count = bsk.decomposition_level_count();
@@ -43,7 +40,7 @@ impl CudaLweMultiBitBootstrapKey {
let grouping_factor = bsk.grouping_factor();
// Allocate memory
let mut d_vec = CudaVec::<u64>::new_multi_gpu(
let mut d_vec = CudaVec::<InputBskCont::Element>::new_multi_gpu(
lwe_multi_bit_bootstrap_key_size(
input_lwe_dimension,
glwe_dimension.to_glwe_size(),

View File

@@ -14,9 +14,11 @@ use crate::core_crypto::prelude::{
};
pub use algorithms::*;
pub use entities::*;
use std::any::{Any, TypeId};
use std::ffi::c_void;
use tfhe_cuda_backend::bindings::*;
use tfhe_cuda_backend::cuda_bind::*;
pub struct CudaStreams {
pub ptr: Vec<*mut c_void>,
pub gpu_indexes: Vec<GpuIndex>,
@@ -311,15 +313,18 @@ pub unsafe fn programmable_bootstrap_128_async<T: UnsignedInteger>(
/// [CudaStreams::synchronize] __must__ be called as soon as synchronization is
/// required
#[allow(clippy::too_many_arguments)]
pub unsafe fn programmable_bootstrap_multi_bit_async<T: UnsignedInteger>(
pub unsafe fn programmable_bootstrap_multi_bit_async<
T: UnsignedInteger,
B: Any + UnsignedInteger,
>(
streams: &CudaStreams,
lwe_array_out: &mut CudaVec<T>,
lwe_array_out: &mut CudaVec<B>,
output_indexes: &CudaVec<T>,
test_vector: &CudaVec<T>,
test_vector: &CudaVec<B>,
test_vector_indexes: &CudaVec<T>,
lwe_array_in: &CudaVec<T>,
input_indexes: &CudaVec<T>,
bootstrapping_key: &CudaVec<u64>,
bootstrapping_key: &CudaVec<B>,
lwe_dimension: LweDimension,
glwe_dimension: GlweDimension,
polynomial_size: PolynomialSize,
@@ -331,42 +336,83 @@ pub unsafe fn programmable_bootstrap_multi_bit_async<T: UnsignedInteger>(
let num_many_lut = 1u32;
let lut_stride = 0u32;
let mut pbs_buffer: *mut i8 = std::ptr::null_mut();
scratch_cuda_multi_bit_programmable_bootstrap_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
level.0 as u32,
num_samples,
true,
);
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
lwe_array_out.as_mut_c_ptr(0),
output_indexes.as_c_ptr(0),
test_vector.as_c_ptr(0),
test_vector_indexes.as_c_ptr(0),
lwe_array_in.as_c_ptr(0),
input_indexes.as_c_ptr(0),
bootstrapping_key.as_c_ptr(0),
pbs_buffer,
lwe_dimension.0 as u32,
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
grouping_factor.0 as u32,
base_log.0 as u32,
level.0 as u32,
num_samples,
num_many_lut,
lut_stride,
);
cleanup_cuda_multi_bit_programmable_bootstrap(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
);
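// Dispatch on the monomorphized torus type: stable Rust has no specialization,
// so the 128-bit and 64-bit backends are selected at runtime.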
if TypeId::of::<B>() == TypeId::of::<u128>() {
scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
level.0 as u32,
num_samples,
true,
);
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
streams.ptr[0],
streams.gpu_indexes[0].get(),
lwe_array_out.as_mut_c_ptr(0),
output_indexes.as_c_ptr(0),
test_vector.as_c_ptr(0),
test_vector_indexes.as_c_ptr(0),
lwe_array_in.as_c_ptr(0),
input_indexes.as_c_ptr(0),
bootstrapping_key.as_c_ptr(0),
pbs_buffer,
lwe_dimension.0 as u32,
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
grouping_factor.0 as u32,
base_log.0 as u32,
level.0 as u32,
num_samples,
num_many_lut,
lut_stride,
);
cleanup_cuda_multi_bit_programmable_bootstrap_128(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
);
} else if TypeId::of::<B>() == TypeId::of::<u64>() {
scratch_cuda_multi_bit_programmable_bootstrap_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
level.0 as u32,
num_samples,
true,
);
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
lwe_array_out.as_mut_c_ptr(0),
output_indexes.as_c_ptr(0),
test_vector.as_c_ptr(0),
test_vector_indexes.as_c_ptr(0),
lwe_array_in.as_c_ptr(0),
input_indexes.as_c_ptr(0),
bootstrapping_key.as_c_ptr(0),
pbs_buffer,
lwe_dimension.0 as u32,
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
grouping_factor.0 as u32,
base_log.0 as u32,
level.0 as u32,
num_samples,
num_many_lut,
lut_stride,
);
cleanup_cuda_multi_bit_programmable_bootstrap(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
);
} else {
panic!("Unsupported torus size")
}
}
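`programmable_bootstrap_multi_bit_async` now picks its CUDA entry points from the monomorphized type `B` at runtime. A standalone sketch of the `TypeId` dispatch pattern, with strings standing in for the FFI calls:
use std::any::{Any, TypeId};
// Recover the concrete torus type at runtime and route to the matching
// backend entry point; the returned strings stand in for the FFI calls.
fn select_pbs_path<B: Any>() -> &'static str {
    if TypeId::of::<B>() == TypeId::of::<u128>() {
        "cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128"
    } else if TypeId::of::<B>() == TypeId::of::<u64>() {
        "cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64"
    } else {
        panic!("Unsupported torus size")
    }
}
fn main() {
    assert_eq!(
        select_pbs_path::<u64>(),
        "cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64"
    );
    assert_eq!(
        select_pbs_path::<u128>(),
        "cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128"
    );
}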
#[allow(clippy::too_many_arguments)]
@@ -607,9 +653,9 @@ pub unsafe fn convert_lwe_programmable_bootstrap_key_async<T: UnsignedInteger>(
/// [CudaStreams::synchronize] __must__ be called as soon as synchronization is
/// required
#[allow(clippy::too_many_arguments)]
pub unsafe fn convert_lwe_multi_bit_programmable_bootstrap_key_async<T: UnsignedInteger>(
pub unsafe fn convert_lwe_multi_bit_programmable_bootstrap_key_async<T: Any + UnsignedInteger>(
streams: &CudaStreams,
dest: &mut CudaVec<u64>,
dest: &mut CudaVec<T>,
src: &[T],
input_lwe_dim: LweDimension,
glwe_dim: GlweDimension,
@@ -620,17 +666,34 @@ pub unsafe fn convert_lwe_multi_bit_programmable_bootstrap_key_async<T: Unsigned
let size = std::mem::size_of_val(src);
for (i, &stream_ptr) in streams.ptr.iter().enumerate() {
assert_eq!(dest.len() * std::mem::size_of::<T>(), size);
cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
stream_ptr,
streams.gpu_indexes[i].get(),
dest.as_mut_c_ptr(i as u32),
src.as_ptr().cast(),
input_lwe_dim.0 as u32,
glwe_dim.0 as u32,
l_gadget.0 as u32,
polynomial_size.0 as u32,
grouping_factor.0 as u32,
);
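// As above, route the key conversion to the entry point matching the concrete torus type T.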
if TypeId::of::<T>() == TypeId::of::<u128>() {
cuda_convert_lwe_multi_bit_programmable_bootstrap_key_128(
stream_ptr,
streams.gpu_indexes[i].get(),
dest.as_mut_c_ptr(i as u32),
src.as_ptr().cast(),
input_lwe_dim.0 as u32,
glwe_dim.0 as u32,
l_gadget.0 as u32,
polynomial_size.0 as u32,
grouping_factor.0 as u32,
);
} else if TypeId::of::<T>() == TypeId::of::<u64>() {
cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
stream_ptr,
streams.gpu_indexes[i].get(),
dest.as_mut_c_ptr(i as u32),
src.as_ptr().cast(),
input_lwe_dim.0 as u32,
glwe_dim.0 as u32,
l_gadget.0 as u32,
polynomial_size.0 as u32,
grouping_factor.0 as u32,
);
} else {
panic!("Unsupported torus size for bsk conversion")
}
}
}

View File

@@ -30,7 +30,7 @@ pub struct CudaCompressionKey {
}
pub struct CudaDecompressionKey {
pub blind_rotate_key: CudaBootstrappingKey,
pub blind_rotate_key: CudaBootstrappingKey<u64>,
pub lwe_per_glwe: LweCiphertextCount,
pub glwe_dimension: GlweDimension,
pub polynomial_size: PolynomialSize,

View File

@@ -22,9 +22,9 @@ use crate::shortint::{CarryModulus, CiphertextModulus, MessageModulus, PBSOrder}
mod radix;
pub enum CudaBootstrappingKey {
pub enum CudaBootstrappingKey<Scalar: UnsignedInteger> {
Classic(CudaLweBootstrapKey),
MultiBit(CudaLweMultiBitBootstrapKey),
MultiBit(CudaLweMultiBitBootstrapKey<Scalar>),
}
/// A structure containing the server public key.
@@ -34,7 +34,7 @@ pub enum CudaBootstrappingKey {
// #[derive(PartialEq, Serialize, Deserialize)]
pub struct CudaServerKey {
pub key_switching_key: CudaLweKeyswitchKey<u64>,
pub bootstrapping_key: CudaBootstrappingKey,
pub bootstrapping_key: CudaBootstrappingKey<u64>,
// Size of the message buffer
pub message_modulus: MessageModulus,
// Size of the carry buffer

View File

@@ -1,6 +1,6 @@
use crate::core_crypto::prelude::*;
use crate::shortint::parameters::noise_squashing::{
NoiseSquashingCompressionParameters, NoiseSquashingParameters,
NoiseSquashingCompressionParameters, NoiseSquashingMultiBitParameters, NoiseSquashingParameters,
};
use crate::shortint::parameters::{
CoreCiphertextModulus, ModulusSwitchNoiseReductionParams, ModulusSwitchType,
@@ -69,3 +69,8 @@ pub enum NoiseSquashingParametersVersions {
pub enum NoiseSquashingCompressionParametersVersions {
V0(NoiseSquashingCompressionParameters),
}
#[derive(VersionsDispatch)]
pub enum NoiseSquashingMultiBitParametersVersions {
V0(NoiseSquashingMultiBitParameters),
}

View File

@@ -1,18 +1,19 @@
use std::sync::LazyLock;
use super::atomic_pattern::AtomicPatternParameters;
use crate::keycache::utils::named_params_impl;
use crate::keycache::*;
#[cfg(tarpaulin)]
use crate::shortint::parameters::coverage_parameters::*;
use crate::shortint::parameters::current_params::*;
use crate::shortint::parameters::noise_squashing::NoiseSquashingMultiBitParameters;
use crate::shortint::parameters::parameters_wopbs::*;
use crate::shortint::parameters::v1_3::V1_3_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
use crate::shortint::parameters::*;
use crate::shortint::wopbs::WopbsKey;
use crate::shortint::{ClientKey, KeySwitchingKey, ServerKey};
use serde::{Deserialize, Serialize};
named_params_impl!( ShortintParameterSet =>
V1_3_PARAM_MESSAGE_1_CARRY_0_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
@@ -492,6 +493,10 @@ named_params_impl!( NoiseSquashingParameters =>
V1_3_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
);
named_params_impl!( NoiseSquashingMultiBitParameters =>
V1_3_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
);
impl From<AtomicPatternParameters> for (ClientKey, ServerKey) {
fn from(param: AtomicPatternParameters) -> Self {
let param_set = ShortintParameterSet::from(param);

View File

@@ -46,8 +46,13 @@ use current_params::multi_bit::tuniform::p_fail_2_minus_64::ks_pbs_gpu::{
};
use current_params::noise_squashing::p_fail_2_minus_128::V1_3_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
use super::current_params::V1_3_NOISE_SQUASHING_COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
use super::current_params::{
V1_3_NOISE_SQUASHING_COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
V1_3_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
};
use super::NoiseSquashingCompressionParameters;
use crate::shortint::parameters::noise_squashing::NoiseSquashingMultiBitParameters;
// Aliases
// Compute Gaussian
@@ -125,6 +130,10 @@ pub const NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128:
NoiseSquashingParameters =
V1_3_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
pub const NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128:
NoiseSquashingMultiBitParameters =
V1_3_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
// GPU 2^-64
// GPU TUniform
pub const PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_1_CARRY_1_KS_PBS_TUNIFORM_2M64:

View File

@@ -1,3 +1,4 @@
use crate::core_crypto::prelude::LweBskGroupingFactor;
use crate::shortint::backward_compatibility::parameters::noise_squashing::*;
use crate::shortint::parameters::{
CarryModulus, CoreCiphertextModulus, DecompositionBaseLog, DecompositionLevelCount,
@@ -34,3 +35,17 @@ pub struct NoiseSquashingCompressionParameters {
pub carry_modulus: CarryModulus,
pub ciphertext_modulus: CoreCiphertextModulus<u128>,
}
#[derive(Copy, Clone, Debug, PartialEq, Serialize, Deserialize, Versionize)]
#[versionize(NoiseSquashingMultiBitParametersVersions)]
pub struct NoiseSquashingMultiBitParameters {
pub glwe_dimension: GlweDimension,
pub polynomial_size: PolynomialSize,
pub glwe_noise_distribution: DynamicDistribution<u128>,
pub decomp_base_log: DecompositionBaseLog,
pub decomp_level_count: DecompositionLevelCount,
pub grouping_factor: LweBskGroupingFactor,
pub message_modulus: MessageModulus,
pub carry_modulus: CarryModulus,
pub ciphertext_modulus: CoreCiphertextModulus<u128>,
}

View File

@@ -43,6 +43,7 @@ pub use noise_squashing::p_fail_2_minus_128::*;
#[cfg(feature = "hpu")]
pub use hpu::*;
use crate::shortint::parameters::noise_squashing::NoiseSquashingMultiBitParameters;
use crate::shortint::parameters::{
ClassicPBSParameters, CompactPublicKeyEncryptionParameters, CompressionParameters,
KeySwitch32PBSParameters, MultiBitPBSParameters, NoiseSquashingCompressionParameters,
@@ -1700,6 +1701,14 @@ pub const VEC_ALL_NOISE_SQUASHING_PARAMETERS: [(&NoiseSquashingParameters, &str)
),
];
pub const VEC_ALL_NOISE_SQUASHING_MULTI_BIT_PARAMETERS: [(
&NoiseSquashingMultiBitParameters,
&str,
); 1] = [(
&V1_3_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
"V1_3_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128",
)];
/// All [`NoiseSquashingCompressionParameters`] in this module.
pub const VEC_ALL_NOISE_SQUASHING_COMPRESSION_PARAMETERS: [(
&NoiseSquashingCompressionParameters,

View File

@@ -1,6 +1,7 @@
use crate::shortint::parameters::noise_squashing::NoiseSquashingMultiBitParameters;
use crate::shortint::parameters::{
CarryModulus, CoreCiphertextModulus, DecompositionBaseLog, DecompositionLevelCount,
DynamicDistribution, GlweDimension, LweCiphertextCount, MessageModulus,
DynamicDistribution, GlweDimension, LweBskGroupingFactor, LweCiphertextCount, MessageModulus,
ModulusSwitchNoiseReductionParams, ModulusSwitchType, NoiseEstimationMeasureBound,
NoiseSquashingCompressionParameters, NoiseSquashingParameters, PolynomialSize, RSigmaFactor,
Variance,
@@ -58,3 +59,16 @@ pub const V1_3_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128
carry_modulus: CarryModulus(4),
ciphertext_modulus: CoreCiphertextModulus::<u128>::new_native(),
};
pub const V1_3_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128:
NoiseSquashingMultiBitParameters = NoiseSquashingMultiBitParameters {
glwe_dimension: GlweDimension(2),
polynomial_size: PolynomialSize(2048),
glwe_noise_distribution: DynamicDistribution::new_t_uniform(30),
decomp_base_log: DecompositionBaseLog(23),
decomp_level_count: DecompositionLevelCount(3),
grouping_factor: LweBskGroupingFactor(4),
message_modulus: MessageModulus(4),
carry_modulus: CarryModulus(4),
ciphertext_modulus: CoreCiphertextModulus::<u128>::new_native(),
};