diff --git a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h index 0e31e2c0c..d6f08067d 100644 --- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h +++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h @@ -66,6 +66,9 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( uint32_t num_many_lut, uint32_t lut_stride); template +uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_128_keybundle( + uint32_t polynomial_size); +template uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle( uint32_t polynomial_size); template @@ -95,8 +98,12 @@ uint64_t get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap( template uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs, - uint32_t polynomial_size); - + uint32_t polynomial_size, + uint64_t full_sm_keybundle); +template +uint32_t get_lwe_chunk_size_128(uint32_t gpu_index, uint32_t max_num_pbs, + uint32_t polynomial_size, + uint64_t full_sm_keybundle); template struct pbs_buffer { int8_t *d_mem_keybundle = NULL; int8_t *d_mem_acc_step_one = NULL; @@ -281,4 +288,146 @@ template struct pbs_buffer { } }; +template +struct pbs_buffer_128 { + int8_t *d_mem_keybundle = NULL; + int8_t *d_mem_acc_step_one = NULL; + int8_t *d_mem_acc_step_two = NULL; + int8_t *d_mem_acc_cg = NULL; + int8_t *d_mem_acc_tbc = NULL; + uint32_t lwe_chunk_size; + double *keybundle_fft; + __uint128_t *global_accumulator; + double *global_join_buffer; + + PBS_VARIANT pbs_variant; + bool gpu_memory_allocated; + + pbs_buffer_128(cudaStream_t stream, uint32_t gpu_index, + uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t level_count, uint32_t input_lwe_ciphertext_count, + uint32_t lwe_chunk_size, PBS_VARIANT pbs_variant, + bool allocate_gpu_memory, uint64_t *size_tracker) { + gpu_memory_allocated = allocate_gpu_memory; + cuda_set_device(gpu_index); + + this->pbs_variant = pbs_variant; + this->lwe_chunk_size = lwe_chunk_size; + auto max_shared_memory = cuda_get_max_shared_memory(gpu_index); + + // default + uint64_t full_sm_keybundle = + get_buffer_size_full_sm_multibit_programmable_bootstrap_128_keybundle< + __uint128_t>(polynomial_size); + uint64_t full_sm_accumulate_step_one = + get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one< + __uint128_t>(polynomial_size); + uint64_t full_sm_accumulate_step_two = + get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two< + __uint128_t>(polynomial_size); + uint64_t partial_sm_accumulate_step_one = + get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one< + __uint128_t>(polynomial_size); + // cg + uint64_t full_sm_cg_accumulate = + get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<__uint128_t>( + polynomial_size); + uint64_t partial_sm_cg_accumulate = + get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap< + __uint128_t>(polynomial_size); + + auto num_blocks_keybundle = input_lwe_ciphertext_count * lwe_chunk_size * + (glwe_dimension + 1) * (glwe_dimension + 1) * + level_count; + auto num_blocks_acc_step_one = + level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count; + auto num_blocks_acc_step_two = + input_lwe_ciphertext_count * (glwe_dimension + 1); + auto num_blocks_acc_cg = + level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count; + + // Keybundle + if (max_shared_memory < full_sm_keybundle) + d_mem_keybundle = (int8_t *)cuda_malloc_with_size_tracking_async( + 
num_blocks_keybundle * full_sm_keybundle, stream, gpu_index, + size_tracker, allocate_gpu_memory); + + switch (pbs_variant) { + case PBS_VARIANT::CG: + // Accumulator CG + if (max_shared_memory < partial_sm_cg_accumulate) + d_mem_acc_cg = (int8_t *)cuda_malloc_with_size_tracking_async( + num_blocks_acc_cg * full_sm_cg_accumulate, stream, gpu_index, + size_tracker, allocate_gpu_memory); + else if (max_shared_memory < full_sm_cg_accumulate) + d_mem_acc_cg = (int8_t *)cuda_malloc_with_size_tracking_async( + num_blocks_acc_cg * partial_sm_cg_accumulate, stream, gpu_index, + size_tracker, allocate_gpu_memory); + break; + case PBS_VARIANT::DEFAULT: + // Accumulator step one + if (max_shared_memory < partial_sm_accumulate_step_one) + d_mem_acc_step_one = (int8_t *)cuda_malloc_with_size_tracking_async( + num_blocks_acc_step_one * full_sm_accumulate_step_one, stream, + gpu_index, size_tracker, allocate_gpu_memory); + else if (max_shared_memory < full_sm_accumulate_step_one) + d_mem_acc_step_one = (int8_t *)cuda_malloc_with_size_tracking_async( + num_blocks_acc_step_one * partial_sm_accumulate_step_one, stream, + gpu_index, size_tracker, allocate_gpu_memory); + + // Accumulator step two + if (max_shared_memory < full_sm_accumulate_step_two) + d_mem_acc_step_two = (int8_t *)cuda_malloc_with_size_tracking_async( + num_blocks_acc_step_two * full_sm_accumulate_step_two, stream, + gpu_index, size_tracker, allocate_gpu_memory); + break; + default: + PANIC("Cuda error (PBS): unsupported implementation variant.") + } + + keybundle_fft = (double *)cuda_malloc_with_size_tracking_async( + num_blocks_keybundle * (polynomial_size / 2) * 4 * sizeof(double), + stream, gpu_index, size_tracker, allocate_gpu_memory); + global_accumulator = (__uint128_t *)cuda_malloc_with_size_tracking_async( + input_lwe_ciphertext_count * (glwe_dimension + 1) * polynomial_size * + sizeof(__uint128_t), + stream, gpu_index, size_tracker, allocate_gpu_memory); + global_join_buffer = (double *)cuda_malloc_with_size_tracking_async( + level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count * + (polynomial_size / 2) * 4 * sizeof(double), + stream, gpu_index, size_tracker, allocate_gpu_memory); + } + + void release(cudaStream_t stream, uint32_t gpu_index) { + + if (d_mem_keybundle) + cuda_drop_with_size_tracking_async(d_mem_keybundle, stream, gpu_index, + gpu_memory_allocated); + switch (pbs_variant) { + case DEFAULT: + if (d_mem_acc_step_one) + cuda_drop_with_size_tracking_async(d_mem_acc_step_one, stream, + gpu_index, gpu_memory_allocated); + if (d_mem_acc_step_two) + cuda_drop_with_size_tracking_async(d_mem_acc_step_two, stream, + gpu_index, gpu_memory_allocated); + break; + case CG: + if (d_mem_acc_cg) + cuda_drop_with_size_tracking_async(d_mem_acc_cg, stream, gpu_index, + gpu_memory_allocated); + break; + default: + PANIC("Cuda error (PBS): unsupported implementation variant.") + } + + cuda_drop_with_size_tracking_async(keybundle_fft, stream, gpu_index, + gpu_memory_allocated); + cuda_drop_with_size_tracking_async(global_accumulator, stream, gpu_index, + gpu_memory_allocated); + cuda_drop_with_size_tracking_async(global_join_buffer, stream, gpu_index, + gpu_memory_allocated); + } +}; + #endif // CUDA_MULTI_BIT_UTILITIES_H diff --git a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h index e9db8c86e..4b35c8acd 100644 --- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h +++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h @@ -240,7 
+240,10 @@ template struct pbs_buffer { } }; -template struct pbs_buffer_128 { +template struct pbs_buffer_128; + +template +struct pbs_buffer_128 { int8_t *d_mem; __uint128_t *global_accumulator; diff --git a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h index fcca1ce5e..68f2e2719 100644 --- a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h +++ b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h @@ -15,6 +15,11 @@ void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64( uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size, uint32_t grouping_factor); +void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_128( + void *stream, uint32_t gpu_index, void *dest, void const *src, + uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count, + uint32_t polynomial_size, uint32_t grouping_factor); + uint64_t scratch_cuda_multi_bit_programmable_bootstrap_64( void *stream, uint32_t gpu_index, int8_t **pbs_buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, @@ -33,6 +38,25 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream, uint32_t gpu_index, int8_t **pbs_buffer); + +uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64( + void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t level_count, + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory); + +void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128( + void *stream, uint32_t gpu_index, void *lwe_array_out, + void const *lwe_output_indexes, void const *lut_vector, + void const *lut_vector_indexes, void const *lwe_array_in, + void const *lwe_input_indexes, void const *bootstrapping_key, + int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, + uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, + uint32_t lut_stride); + +void cleanup_cuda_multi_bit_programmable_bootstrap_128(void *stream, + const uint32_t gpu_index, + int8_t **buffer); } #endif // CUDA_MULTI_BIT_H diff --git a/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt b/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt index 5fec699fb..f0924d411 100644 --- a/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt +++ b/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt @@ -1,5 +1,6 @@ file(GLOB_RECURSE SOURCES "*.cu") -add_library(tfhe_cuda_backend STATIC ${SOURCES}) +add_library(tfhe_cuda_backend STATIC ${SOURCES} pbs/programmable_bootstrap_multibit_128.cuh + pbs/programmable_bootstrap_multibit_128.cu) set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(tfhe_cuda_backend PUBLIC cudart OpenMP::OpenMP_CXX) target_include_directories(tfhe_cuda_backend PRIVATE .) 
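The new extern entry points declared in programmable_bootstrap_multibit.h follow the same scratch/run/cleanup lifecycle as the existing 64-bit multi-bit PBS. Below is a minimal host-side sketch of the intended call order, assuming the device buffers (d_bsk, d_lwe_in, d_lwe_out, d_lut and the index arrays) and the scalar parameters are prepared elsewhere; the variable names are illustrative only and not part of the backend API.

    // Illustrative call sequence only; buffer setup and error handling omitted.
    int8_t *pbs_scratch = nullptr;

    // 1) Upload the multi-bit bootstrapping key in its 128-bit torus layout.
    cuda_convert_lwe_multi_bit_programmable_bootstrap_key_128(
        stream, gpu_index, d_bsk, h_bsk, lwe_dimension, glwe_dimension,
        level_count, polynomial_size, grouping_factor);

    // 2) Allocate the PBS working buffer; the uint64_t return reports the
    //    tracked allocation size.
    scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64(
        stream, gpu_index, &pbs_scratch, glwe_dimension, polynomial_size,
        level_count, num_samples, /*allocate_gpu_memory=*/true);

    // 3) Bootstrap a batch of ciphertexts (64-bit input torus, 128-bit output).
    cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
        stream, gpu_index, d_lwe_out, d_lwe_out_indexes, d_lut, d_lut_indexes,
        d_lwe_in, d_lwe_in_indexes, d_bsk, pbs_scratch, lwe_dimension,
        glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
        num_samples, num_many_lut, lut_stride);

    // 4) Release the working buffer.
    cleanup_cuda_multi_bit_programmable_bootstrap_128(stream, gpu_index,
                                                      &pbs_scratch);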
diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh index 6080a5ffc..9d0205197 100644 --- a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh @@ -66,6 +66,13 @@ __device__ inline void typecast_torus_to_double(uint64_t x, r = __ll2double_rn(x); } +template <> +__device__ inline void typecast_torus_to_double<__uint128_t>(__uint128_t x, + double &r) { + // We truncate x + r = __ll2double_rn(static_cast(x)); +} + template __device__ inline T init_decomposer_state(T input, uint32_t base_log, uint32_t level_count) { diff --git a/backends/tfhe-cuda-backend/cuda/src/fft128/fft128.cuh b/backends/tfhe-cuda-backend/cuda/src/fft128/fft128.cuh index ab6ad0484..15bbd03c6 100644 --- a/backends/tfhe-cuda-backend/cuda/src/fft128/fft128.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/fft128/fft128.cuh @@ -234,6 +234,29 @@ __device__ void convert_u128_to_f128_as_torus( } } +// params is expected to be full degree not half degree +// same as convert_u128_to_f128_as_torus() but expects input to be on registers +template +__device__ void convert_u128_on_regs_to_f128_as_torus( + double *out_re_hi, double *out_re_lo, double *out_im_hi, double *out_im_lo, + const __uint128_t *in_re_on_regs, const __uint128_t *in_im_on_regs) { + + const double normalization = pow(2., -128.); + Index tid = threadIdx.x; + // #pragma unroll + for (Index i = 0; i < params::opt / 2; i++) { + auto out_re = u128_to_signed_to_f128(in_re_on_regs[i]); + auto out_im = u128_to_signed_to_f128(in_im_on_regs[i]); + + out_re_hi[tid] = out_re.hi * normalization; + out_re_lo[tid] = out_re.lo * normalization; + out_im_hi[tid] = out_im.hi * normalization; + out_im_lo[tid] = out_im.lo * normalization; + + tid += params::degree / params::opt; + } +} + template __device__ void convert_f128_to_u128_as_torus(__uint128_t *out_re, __uint128_t *out_im, @@ -272,7 +295,7 @@ batch_convert_u128_to_f128_as_integer(double *out_re_hi, double *out_re_lo, } // params is expected to be full degree not half degree -// converts standqard input into complex<128> represented by 4 double +// converts standard input into complex<128> represented by 4 double // with following pattern: [re_hi_0, re_hi_1, ... re_hi_n, re_lo_0, re_lo_1, // ... 
re_lo_n, im_hi_0, im_hi_1, ..., im_hi_n, im_lo_0, im_lo_1, ..., im_lo_n] template @@ -291,7 +314,7 @@ batch_convert_u128_to_f128_as_torus(double *out_re_hi, double *out_re_lo, } // params is expected to be full degree not half degree -// converts standqard input into complex<128> represented by 4 double +// converts standard input into complex<128> represented by 4 double // with following pattern: [re_hi_0, re_lo_0, im_hi_0, im_lo_0, re_hi_1, // re_lo_1, im_hi_1, im_lo_1, // ...,re_hi_n, re_lo_n, im_hi_n, im_lo_n, ] diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cu index ef6e651a7..b00182131 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cu +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cu @@ -35,6 +35,20 @@ void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64( static_cast(stream), gpu_index); } +void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_128( + void *stream, uint32_t gpu_index, void *dest, void const *src, + uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count, + uint32_t polynomial_size, uint32_t grouping_factor) { + uint32_t total_polynomials = input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * + level_count * (1 << grouping_factor) / + grouping_factor; + size_t buffer_size = + total_polynomials * polynomial_size * sizeof(__uint128_t); + + cuda_memcpy_async_to_gpu((__uint128_t *)dest, (__uint128_t *)src, buffer_size, + static_cast(stream), gpu_index); +} + // We need these lines so the compiler knows how to specialize these functions template __device__ const uint64_t * get_ith_mask_kth_block(const uint64_t *ptr, int i, int k, int level, @@ -80,6 +94,14 @@ template __device__ double2 *get_ith_body_kth_block(double2 *ptr, int i, int k, int glwe_dimension, uint32_t level_count); +template __device__ const __uint128_t * +get_multi_bit_ith_lwe_gth_group_kth_block(const __uint128_t *ptr, int g, int i, + int k, int level, + uint32_t grouping_factor, + uint32_t polynomial_size, + uint32_t glwe_dimension, + uint32_t level_count); + template __device__ const uint64_t *get_multi_bit_ith_lwe_gth_group_kth_block( const uint64_t *ptr, int g, int i, int k, int level, uint32_t grouping_factor, uint32_t polynomial_size, uint32_t glwe_dimension, diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh index 3137b0f87..17235618b 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh @@ -83,6 +83,62 @@ mul_ggsw_glwe_in_fourier_domain(double2 *fft, double2 *join_buffer, __syncthreads(); } +/** Perform the matrix multiplication between the GGSW and the GLWE, + * each block operating on a single level for mask and body. + * Both operands should be at fourier domain + * + * This function assumes: + * - Thread blocks at dimension z relates to the decomposition level. + * - Thread blocks at dimension y relates to the glwe dimension. 
+ * - polynomial_size / params::opt threads are available per block + */ +template +__device__ void mul_ggsw_glwe_in_fourier_domain_128( + double *fft, double *join_buffer, + const double *__restrict__ bootstrapping_key, int iteration, G &group, + bool support_dsm = false) { + const uint32_t polynomial_size = params::degree; + const uint32_t glwe_dimension = gridDim.y - 1; + const uint32_t level_count = gridDim.z; + + // The first product is used to initialize level_join_buffer + auto this_block_rank = get_this_block_rank(group, support_dsm); + + // Continues multiplying fft by every polynomial in that particular bsk level + // Each y-block accumulates in a different polynomial at each iteration + auto bsk_slice = get_ith_mask_kth_block_128( + bootstrapping_key, iteration, blockIdx.y, blockIdx.z, polynomial_size, + glwe_dimension, level_count); + for (int j = 0; j < glwe_dimension + 1; j++) { + int idx = (j + this_block_rank) % (glwe_dimension + 1); + + auto bsk_poly = bsk_slice + idx * polynomial_size / 2 * 4; + auto buffer_slice = get_join_buffer_element_128( + blockIdx.z, idx, group, join_buffer, polynomial_size, glwe_dimension, + support_dsm); + + polynomial_product_accumulate_in_fourier_domain_128( + buffer_slice, fft, bsk_poly, j == 0); + group.sync(); + } + + // ----------------------------------------------------------------- + // All blocks are synchronized here; after this sync, level_join_buffer has + // the values needed from every other block + + // accumulate rest of the products into fft buffer + for (int l = 0; l < level_count; l++) { + auto cur_src_acc = get_join_buffer_element_128( + l, blockIdx.y, group, join_buffer, polynomial_size, glwe_dimension, + support_dsm); + + polynomial_accumulate_in_fourier_domain_128(fft, cur_src_acc, + l == 0); + } + + __syncthreads(); +} + template void execute_pbs_async( cudaStream_t const *streams, uint32_t const *gpu_indexes, diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh index 57e37b424..e9362114b 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh @@ -280,8 +280,9 @@ __host__ uint64_t scratch_cg_multi_bit_programmable_bootstrap( check_cuda_error(cudaGetLastError()); } - auto lwe_chunk_size = get_lwe_chunk_size( - gpu_index, input_lwe_ciphertext_count, polynomial_size); + auto lwe_chunk_size = + get_lwe_chunk_size(gpu_index, input_lwe_ciphertext_count, + polynomial_size, full_sm_keybundle); uint64_t size_tracker = 0; *buffer = new pbs_buffer( stream, gpu_index, glwe_dimension, polynomial_size, level_count, diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic_128.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic_128.cuh index 73770f39e..dc3068cc8 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic_128.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic_128.cuh @@ -18,62 +18,6 @@ #include "programmable_bootstrap.cuh" #include "types/complex/operations.cuh" -/** Perform the matrix multiplication between the GGSW and the GLWE, - * each block operating on a single level for mask and body. - * Both operands should be at fourier domain - * - * This function assumes: - * - Thread blocks at dimension z relates to the decomposition level. 
- * - Thread blocks at dimension y relates to the glwe dimension. - * - polynomial_size / params::opt threads are available per block - */ -template -__device__ void mul_ggsw_glwe_in_fourier_domain_128( - double *fft, double *join_buffer, - const double *__restrict__ bootstrapping_key, int iteration, G &group, - bool support_dsm = false) { - const uint32_t polynomial_size = params::degree; - const uint32_t glwe_dimension = gridDim.y - 1; - const uint32_t level_count = gridDim.z; - - // The first product is used to initialize level_join_buffer - auto this_block_rank = get_this_block_rank(group, support_dsm); - - // Continues multiplying fft by every polynomial in that particular bsk level - // Each y-block accumulates in a different polynomial at each iteration - auto bsk_slice = get_ith_mask_kth_block_128( - bootstrapping_key, iteration, blockIdx.y, blockIdx.z, polynomial_size, - glwe_dimension, level_count); - for (int j = 0; j < glwe_dimension + 1; j++) { - int idx = (j + this_block_rank) % (glwe_dimension + 1); - - auto bsk_poly = bsk_slice + idx * polynomial_size / 2 * 4; - auto buffer_slice = get_join_buffer_element_128( - blockIdx.z, idx, group, join_buffer, polynomial_size, glwe_dimension, - support_dsm); - - polynomial_product_accumulate_in_fourier_domain_128( - buffer_slice, fft, bsk_poly, j == 0); - group.sync(); - } - - // ----------------------------------------------------------------- - // All blocks are synchronized here; after this sync, level_join_buffer has - // the values needed from every other block - - // accumulate rest of the products into fft buffer - for (int l = 0; l < level_count; l++) { - auto cur_src_acc = get_join_buffer_element_128( - l, blockIdx.y, group, join_buffer, polynomial_size, glwe_dimension, - support_dsm); - - polynomial_accumulate_in_fourier_domain_128(fft, cur_src_acc, - l == 0); - } - - __syncthreads(); -} - template __global__ void __launch_bounds__(params::degree / params::opt) @@ -174,9 +118,6 @@ __global__ void __launch_bounds__(params::degree / params::opt) accumulator); gadget_acc.decompose_and_compress_level_128(accumulator_fft, blockIdx.z); - // We are using the same memory space for accumulator_fft and - // accumulator_rotated, so we need to synchronize here to make sure they - // don't modify the same memory space at the same time // Switch to the FFT space auto acc_fft_re_hi = accumulator_fft + 0 * params::degree / 2; auto acc_fft_re_lo = accumulator_fft + 1 * params::degree / 2; diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu index a1d1dc5af..ef4506ed9 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu @@ -455,11 +455,8 @@ void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream, */ template uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs, - uint32_t polynomial_size) { - - uint64_t full_sm_keybundle = - get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle( - polynomial_size); + uint32_t polynomial_size, + uint64_t full_sm_keybundle) { int max_blocks_per_sm; auto max_shared_memory = cuda_get_max_shared_memory(gpu_index); diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh index f80ca5f16..44d31fa84 100644 --- 
a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh @@ -521,8 +521,9 @@ __host__ uint64_t scratch_multi_bit_programmable_bootstrap( check_cuda_error(cudaGetLastError()); } - auto lwe_chunk_size = get_lwe_chunk_size( - gpu_index, input_lwe_ciphertext_count, polynomial_size); + auto lwe_chunk_size = + get_lwe_chunk_size(gpu_index, input_lwe_ciphertext_count, + polynomial_size, full_sm_keybundle); uint64_t size_tracker = 0; *buffer = new pbs_buffer( stream, gpu_index, glwe_dimension, polynomial_size, level_count, diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cu new file mode 100644 index 000000000..3297f0761 --- /dev/null +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cu @@ -0,0 +1,361 @@ +#include "programmable_bootstrap_cg_multibit.cuh" +#include "programmable_bootstrap_multibit_128.cuh" + +template +uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128( + void *stream, uint32_t gpu_index, + pbs_buffer_128 **buffer, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t level_count, + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { + + switch (polynomial_size) { + case 256: + return scratch_multi_bit_programmable_bootstrap_128>( + static_cast(stream), gpu_index, buffer, glwe_dimension, + polynomial_size, level_count, input_lwe_ciphertext_count, + allocate_gpu_memory); + case 512: + return scratch_multi_bit_programmable_bootstrap_128>( + static_cast(stream), gpu_index, buffer, glwe_dimension, + polynomial_size, level_count, input_lwe_ciphertext_count, + allocate_gpu_memory); + case 1024: + return scratch_multi_bit_programmable_bootstrap_128>( + static_cast(stream), gpu_index, buffer, glwe_dimension, + polynomial_size, level_count, input_lwe_ciphertext_count, + allocate_gpu_memory); + case 2048: + return scratch_multi_bit_programmable_bootstrap_128>( + static_cast(stream), gpu_index, buffer, glwe_dimension, + polynomial_size, level_count, input_lwe_ciphertext_count, + allocate_gpu_memory); + case 4096: + return scratch_multi_bit_programmable_bootstrap_128>( + static_cast(stream), gpu_index, buffer, glwe_dimension, + polynomial_size, level_count, input_lwe_ciphertext_count, + allocate_gpu_memory); + default: + PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. 
Supported " + "N's are powers of two" + " in the interval [256..4096].") + } +} + +template +uint64_t scratch_cuda_cg_multi_bit_programmable_bootstrap_128( + void *stream, uint32_t gpu_index, + pbs_buffer_128 **buffer, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t level_count, + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { + + switch (polynomial_size) { + case 256: + return scratch_cg_multi_bit_programmable_bootstrap_128< + InputTorus, AmortizedDegree<256>>( + static_cast(stream), gpu_index, buffer, glwe_dimension, + polynomial_size, level_count, input_lwe_ciphertext_count, + allocate_gpu_memory); + case 512: + return scratch_cg_multi_bit_programmable_bootstrap_128< + InputTorus, AmortizedDegree<512>>( + static_cast(stream), gpu_index, buffer, glwe_dimension, + polynomial_size, level_count, input_lwe_ciphertext_count, + allocate_gpu_memory); + case 1024: + return scratch_cg_multi_bit_programmable_bootstrap_128< + InputTorus, AmortizedDegree<1024>>( + static_cast(stream), gpu_index, buffer, glwe_dimension, + polynomial_size, level_count, input_lwe_ciphertext_count, + allocate_gpu_memory); + case 2048: + return scratch_cg_multi_bit_programmable_bootstrap_128< + InputTorus, AmortizedDegree<2048>>( + static_cast(stream), gpu_index, buffer, glwe_dimension, + polynomial_size, level_count, input_lwe_ciphertext_count, + allocate_gpu_memory); + case 4096: + return scratch_cg_multi_bit_programmable_bootstrap_128< + InputTorus, AmortizedDegree<4096>>( + static_cast(stream), gpu_index, buffer, glwe_dimension, + polynomial_size, level_count, input_lwe_ciphertext_count, + allocate_gpu_memory); + default: + PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported " + "N's are powers of two" + " in the interval [256..4096].") + } +} + +uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64( + void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t level_count, + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { + + bool supports_cg = + supports_cooperative_groups_on_multibit_programmable_bootstrap< + __uint128_t>(glwe_dimension, polynomial_size, level_count, + input_lwe_ciphertext_count, + cuda_get_max_shared_memory(gpu_index)); + + if (supports_cg) + return scratch_cuda_cg_multi_bit_programmable_bootstrap_128( + stream, gpu_index, + reinterpret_cast **>(buffer), + glwe_dimension, polynomial_size, level_count, + input_lwe_ciphertext_count, allocate_gpu_memory); + else + return scratch_cuda_multi_bit_programmable_bootstrap_128( + stream, gpu_index, + reinterpret_cast **>(buffer), + glwe_dimension, polynomial_size, level_count, + input_lwe_ciphertext_count, allocate_gpu_memory); +} + +template +void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128( + void *stream, uint32_t gpu_index, __uint128_t *lwe_array_out, + InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector, + InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in, + InputTorus const *lwe_input_indexes, __uint128_t const *bootstrapping_key, + pbs_buffer_128 *pbs_buffer, uint32_t lwe_dimension, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, + uint32_t base_log, uint32_t level_count, uint32_t num_samples, + uint32_t num_many_lut, uint32_t lut_stride) { + + switch (polynomial_size) { + case 256: + host_multi_bit_programmable_bootstrap_128>( + static_cast(stream), gpu_index, lwe_array_out, + lwe_output_indexes, lut_vector, lut_vector_indexes, 
lwe_array_in, + lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, + lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, + num_samples, num_many_lut, lut_stride); + break; + case 512: + host_multi_bit_programmable_bootstrap_128>( + static_cast(stream), gpu_index, lwe_array_out, + lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, + lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, + lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, + num_samples, num_many_lut, lut_stride); + break; + case 1024: + host_multi_bit_programmable_bootstrap_128>( + static_cast(stream), gpu_index, lwe_array_out, + lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, + lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, + lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, + num_samples, num_many_lut, lut_stride); + break; + case 2048: + host_multi_bit_programmable_bootstrap_128>( + static_cast(stream), gpu_index, lwe_array_out, + lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, + lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, + lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, + num_samples, num_many_lut, lut_stride); + break; + case 4096: + host_multi_bit_programmable_bootstrap_128>( + static_cast(stream), gpu_index, lwe_array_out, + lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, + lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, + lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, + num_samples, num_many_lut, lut_stride); + break; + default: + PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported " + "N's are powers of two" + " in the interval [256..4096].") + } +} + +template +void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128( + void *stream, uint32_t gpu_index, __uint128_t *lwe_array_out, + InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector, + InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in, + InputTorus const *lwe_input_indexes, __uint128_t const *bootstrapping_key, + pbs_buffer_128 *pbs_buffer, uint32_t lwe_dimension, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, + uint32_t base_log, uint32_t level_count, uint32_t num_samples, + uint32_t num_many_lut, uint32_t lut_stride) { + + switch (polynomial_size) { + case 256: + host_cg_multi_bit_programmable_bootstrap_128>( + static_cast(stream), gpu_index, lwe_array_out, + lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, + lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, + lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, + num_samples, num_many_lut, lut_stride); + break; + case 512: + host_cg_multi_bit_programmable_bootstrap_128>( + static_cast(stream), gpu_index, lwe_array_out, + lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, + lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, + lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, + num_samples, num_many_lut, lut_stride); + break; + case 1024: + host_cg_multi_bit_programmable_bootstrap_128>( + static_cast(stream), gpu_index, lwe_array_out, + lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, + lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, + lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, + 
num_samples, num_many_lut, lut_stride); + break; + case 2048: + host_cg_multi_bit_programmable_bootstrap_128>( + static_cast(stream), gpu_index, lwe_array_out, + lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, + lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, + lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, + num_samples, num_many_lut, lut_stride); + break; + case 4096: + host_cg_multi_bit_programmable_bootstrap_128>( + static_cast(stream), gpu_index, lwe_array_out, + lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, + lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, + lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, + num_samples, num_many_lut, lut_stride); + break; + default: + PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported " + "N's are powers of two" + " in the interval [256..4096].") + } +} + +void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128( + void *stream, uint32_t gpu_index, void *lwe_array_out, + void const *lwe_output_indexes, void const *lut_vector, + void const *lut_vector_indexes, void const *lwe_array_in, + void const *lwe_input_indexes, void const *bootstrapping_key, + int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, + uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, + uint32_t lut_stride) { + + if (base_log > 64) + PANIC("Cuda error (multi-bit PBS): base log should be <= 64") + + auto *buffer = + reinterpret_cast *>(mem_ptr); + switch (buffer->pbs_variant) { + case PBS_VARIANT::CG: + cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128< + uint64_t>(stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out), + static_cast(lwe_output_indexes), + static_cast(lut_vector), + static_cast(lut_vector_indexes), + static_cast(lwe_array_in), + static_cast(lwe_input_indexes), + static_cast(bootstrapping_key), buffer, + lwe_dimension, glwe_dimension, polynomial_size, + grouping_factor, base_log, level_count, num_samples, + num_many_lut, lut_stride); + break; + case PBS_VARIANT::DEFAULT: + cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128( + stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out), + static_cast(lwe_output_indexes), + static_cast(lut_vector), + static_cast(lut_vector_indexes), + static_cast(lwe_array_in), + static_cast(lwe_input_indexes), + static_cast(bootstrapping_key), buffer, + lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, + base_log, level_count, num_samples, num_many_lut, lut_stride); + break; + default: + PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.") + } +} + +void cleanup_cuda_multi_bit_programmable_bootstrap_128(void *stream, + const uint32_t gpu_index, + int8_t **buffer) { + const auto x = + reinterpret_cast *>(*buffer); + x->release(static_cast(stream), gpu_index); +} + +/** + * Computes divisors of the product of num_sms (streaming multiprocessors on the + * GPU) and max_blocks_per_sm (maximum active blocks per SM to launch + * device_multi_bit_programmable_bootstrap_keybundle) smaller than its square + * root, based on max_num_pbs. If log2(max_num_pbs) <= 13, selects the first + * suitable divisor. If greater, calculates an offset as max(1,log2(max_num_pbs) + * - 13) for additional logic. 
+ * + * The value 13 was empirically determined based on memory requirements for + * benchmarking on an RTX 4090 GPU, balancing performance and resource use. + */ +template +uint32_t get_lwe_chunk_size_128(uint32_t gpu_index, uint32_t max_num_pbs, + uint32_t polynomial_size, + uint64_t full_sm_keybundle) { + + int max_blocks_per_sm; + auto max_shared_memory = cuda_get_max_shared_memory(gpu_index); + cuda_set_device(gpu_index); + if (max_shared_memory < full_sm_keybundle) + cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_blocks_per_sm, + device_multi_bit_programmable_bootstrap_keybundle_128, + polynomial_size / params::opt, full_sm_keybundle); + else + cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_blocks_per_sm, + device_multi_bit_programmable_bootstrap_keybundle_128, + polynomial_size / params::opt, 0); + + int num_sms = 0; + check_cuda_error(cudaDeviceGetAttribute( + &num_sms, cudaDevAttrMultiProcessorCount, gpu_index)); + + int x = num_sms * max_blocks_per_sm; + int count = 0; + + int divisor = 1; + int ith_divisor = 0; + +#if CUDA_ARCH < 900 + // We pick a smaller divisor on GPUs other than H100, so 256-bit integer + // multiplication can run + int log2_max_num_pbs = log2_int(max_num_pbs); + if (log2_max_num_pbs > 13) + ith_divisor = log2_max_num_pbs - 11; +#endif + + for (int i = sqrt(x); i >= 1; i--) { + if (x % i == 0) { + if (count == ith_divisor) { + divisor = i; + break; + } else { + count++; + } + } + } + + return divisor; +} diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cuh new file mode 100644 index 000000000..ca2f2c604 --- /dev/null +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cuh @@ -0,0 +1,1101 @@ +#ifndef PROGRAMMABLE_BOOTSTRAP_MULTIBIT_128_CUH +#define PROGRAMMABLE_BOOTSTRAP_MULTIBIT_128_CUH + +#ifdef __CDT_PARSER__ +#undef __CUDA_RUNTIME_H__ +#include +#endif + +#include "fft128/fft128.cuh" +#include "pbs/pbs_multibit_utilities.h" +#include "programmable_bootstrap_multibit.cuh" +#include "utils/helper.cuh" + +template +uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_128_keybundle( + uint32_t polynomial_size) { + return sizeof(__uint128_t) * polynomial_size * 2; // accumulator +} + +template +__global__ void device_multi_bit_programmable_bootstrap_keybundle_128( + const InputTorus *__restrict__ lwe_array_in, + const InputTorus *__restrict__ lwe_input_indexes, double *keybundle_array, + const __uint128_t *__restrict__ bootstrapping_key, uint32_t lwe_dimension, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, + uint32_t level_count, uint32_t lwe_offset, uint32_t lwe_chunk_size, + uint32_t keybundle_size_per_input, int8_t *device_mem, + uint64_t device_memory_size_per_block) { + + extern __shared__ int8_t sharedmem[]; + int8_t *selected_memory; + + if constexpr (SMD == FULLSM) { + selected_memory = sharedmem; + } else { + int block_index = blockIdx.x + blockIdx.y * gridDim.x + + blockIdx.z * gridDim.x * gridDim.y; + selected_memory = &device_mem[block_index * device_memory_size_per_block]; + } + + // Ids + uint32_t level_id = blockIdx.z; + uint32_t glwe_id = blockIdx.y / (glwe_dimension + 1); + uint32_t poly_id = blockIdx.y % (glwe_dimension + 1); + uint32_t lwe_iteration = (blockIdx.x % lwe_chunk_size + lwe_offset); + uint32_t input_idx = blockIdx.x / lwe_chunk_size; + + if (lwe_iteration < (lwe_dimension / grouping_factor)) { + + auto block_lwe_array_in = 
+ &lwe_array_in[lwe_input_indexes[input_idx] * (lwe_dimension + 1)]; + + auto keybundle = &keybundle_array[ + // select the input + input_idx * keybundle_size_per_input]; + + //////////////////////////////////////////////////////////// + // Computes all keybundles + uint32_t rev_lwe_iteration = + (lwe_dimension / grouping_factor) - lwe_iteration - 1; + + // //////////////////////////////// + // Keygen guarantees the first term is a constant term of the polynomial, no + // polynomial multiplication required + auto bsk_slice = get_multi_bit_ith_lwe_gth_group_kth_block( + bootstrapping_key, 0, rev_lwe_iteration, glwe_id, level_id, + grouping_factor, 2 * polynomial_size, glwe_dimension, level_count); + auto bsk_poly_ini = bsk_slice + poly_id * params::degree; + + __uint128_t reg_acc[params::opt]; + + copy_polynomial_in_regs<__uint128_t, params::opt, + params::degree / params::opt>(bsk_poly_ini, + reg_acc); + + int offset = + get_start_ith_ggsw_offset(polynomial_size, glwe_dimension, level_count); + + // Precalculate the monomial degrees and store them in shared memory + uint32_t *monomial_degrees = (uint32_t *)selected_memory; + if (threadIdx.x < (1 << grouping_factor)) { + auto lwe_array_group = + block_lwe_array_in + rev_lwe_iteration * grouping_factor; + monomial_degrees[threadIdx.x] = + calculates_monomial_degree( + lwe_array_group, threadIdx.x, grouping_factor); + } + __syncthreads(); + + // Accumulate the other terms + for (int g = 1; g < (1 << grouping_factor); g++) { + uint32_t monomial_degree = monomial_degrees[g]; + + auto bsk_poly = bsk_poly_ini + g * offset; + // Multiply by the bsk element + polynomial_accumulate_monic_monomial_mul_on_regs<__uint128_t, params>( + reg_acc, bsk_poly, monomial_degree); + } + __syncthreads(); // needed because we are going to reuse the + // shared memory for the fft + + // Move from local memory back to shared memory but as complex + double *fft = (double *)selected_memory; + + auto fft_re_hi = &fft[0 * params::degree / 2]; + auto fft_re_lo = &fft[1 * params::degree / 2]; + auto fft_im_hi = &fft[2 * params::degree / 2]; + auto fft_im_lo = &fft[3 * params::degree / 2]; + + convert_u128_on_regs_to_f128_as_torus( + fft_re_hi, fft_re_lo, fft_im_hi, fft_im_lo, ®_acc[0], + ®_acc[params::opt / 2]); + + __syncthreads(); // TODO: Do we need this sync? 
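    // Note: the conversion above writes the four f128 component arrays
    // (re/im split into hi/lo limbs) to shared memory with each thread
    // covering a strided subset of coefficients, and the block-cooperative
    // forward FFT below reads coefficients written by other threads, so a
    // block-wide barrier is most likely still required at this point.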
+ + negacyclic_forward_fft_f128>(fft_re_hi, fft_re_lo, + fft_im_hi, fft_im_lo); + + // lwe iteration + auto keybundle_out = get_ith_mask_kth_block_128( + keybundle, blockIdx.x % lwe_chunk_size, glwe_id, level_id, + polynomial_size, glwe_dimension, level_count); + auto keybundle_poly = keybundle_out + poly_id * (params::degree / 2) * 4; + + copy_polynomial( + fft, keybundle_poly); + } +} + +////////////////////////////////////////////////////////////////// +/////////////////// START DEFAULT /////////////////////////////// +template +__global__ void __launch_bounds__(params::degree / params::opt) + device_multi_bit_programmable_bootstrap_accumulate_step_one_128( + const InputTorus *__restrict__ lwe_array_in, + const InputTorus *__restrict__ lwe_input_indexes, + const __uint128_t *__restrict__ lut_vector, + const InputTorus *__restrict__ lut_vector_indexes, + __uint128_t *global_accumulator, double *global_accumulator_fft, + uint32_t lwe_dimension, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, + int8_t *device_mem, uint64_t device_memory_size_per_block) { + + // We use shared memory for the polynomials that are used often during the + // bootstrap, since shared memory is kept in L1 cache and accessing it is + // much faster than global memory + extern __shared__ int8_t sharedmem[]; + + int8_t *selected_memory = sharedmem; + + if constexpr (SMD == FULLSM) { + selected_memory = sharedmem; + } else { + int block_index = blockIdx.z + blockIdx.y * gridDim.z + + blockIdx.x * gridDim.z * gridDim.y; + selected_memory = &device_mem[block_index * device_memory_size_per_block]; + } + + auto accumulator = reinterpret_cast<__uint128_t *>(selected_memory); + auto accumulator_fft = + reinterpret_cast(accumulator) + + static_cast(sizeof(__uint128_t) * polynomial_size / + sizeof(double)); + + if constexpr (SMD == PARTIALSM) + accumulator_fft = reinterpret_cast(sharedmem); + + auto block_lwe_array_in = + &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)]; + + auto block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.x] * + params::degree * (glwe_dimension + 1)]; + + auto global_slice = + &global_accumulator[(blockIdx.y + blockIdx.x * (glwe_dimension + 1)) * + params::degree]; + + auto global_fft_slice = + &global_accumulator_fft[(blockIdx.y + blockIdx.z * (glwe_dimension + 1) + + blockIdx.x * level_count * + (glwe_dimension + 1)) * + (params::degree / 2) * 4]; + + if constexpr (is_first_iter) { + // First iteration + //////////////////////////////////////////////////////////// + // Initializes the accumulator with the body of LWE + // Put "b" in [0, 2N[ + InputTorus b_hat = 0; + modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, + params::log2_degree + 1); + + divide_by_monomial_negacyclic_inplace<__uint128_t, params::opt, + params::degree / params::opt>( + accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat, + false); + + // Persist + copy_polynomial<__uint128_t, params::opt, params::degree / params::opt>( + accumulator, global_slice); + } else { + // Load the accumulator calculated in previous iterations + copy_polynomial<__uint128_t, params::opt, params::degree / params::opt>( + global_slice, accumulator); + } + + // Perform a rounding to increase the accuracy of the + // bootstrapped ciphertext + init_decomposer_state_inplace<__uint128_t, params::opt, + params::degree / params::opt>( + accumulator, base_log, level_count); + + // Decompose the accumulator. 
Each block gets one level of the + // decomposition, for the mask and the body (so block 0 will have the + // accumulator decomposed at level 0, 1 at 1, etc.) + GadgetMatrix<__uint128_t, params> gadget_acc(base_log, level_count, + accumulator); + gadget_acc.decompose_and_compress_level_128(accumulator_fft, blockIdx.z); + + // Switch to the FFT space + auto acc_fft_re_hi = &accumulator_fft[0 * params::degree / 2]; + auto acc_fft_re_lo = &accumulator_fft[1 * params::degree / 2]; + auto acc_fft_im_hi = &accumulator_fft[2 * params::degree / 2]; + auto acc_fft_im_lo = &accumulator_fft[3 * params::degree / 2]; + + negacyclic_forward_fft_f128>(acc_fft_re_hi, acc_fft_re_lo, + acc_fft_im_hi, acc_fft_im_lo); + + copy_polynomial( + accumulator_fft, global_fft_slice); +} +template +__global__ void __launch_bounds__(params::degree / params::opt) + device_multi_bit_programmable_bootstrap_accumulate_step_two_128( + __uint128_t *lwe_array_out, + const InputTorus *__restrict__ lwe_output_indexes, + const double *__restrict__ keybundle_array, + __uint128_t *global_accumulator, double *global_accumulator_fft, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, + uint32_t iteration, uint32_t lwe_chunk_size, int8_t *device_mem, + uint64_t device_memory_size_per_block, uint32_t num_many_lut, + uint32_t lut_stride) { + // We use shared memory for the polynomials that are used often during the + // bootstrap, since shared memory is kept in L1 cache and accessing it is + // much faster than global memory + extern __shared__ int8_t sharedmem[]; + int8_t *selected_memory; + + selected_memory = sharedmem; + + if constexpr (SMD == FULLSM) { + selected_memory = sharedmem; + } else { + int block_index = blockIdx.x + blockIdx.y * gridDim.x + + blockIdx.z * gridDim.x * gridDim.y; + selected_memory = &device_mem[block_index * device_memory_size_per_block]; + } + + auto accumulator_fft = reinterpret_cast(selected_memory); + + auto keybundle = + &keybundle_array[blockIdx.x * lwe_chunk_size * level_count * + (glwe_dimension + 1) * (glwe_dimension + 1) * + (params::degree / 2) * 4]; + + auto global_accumulator_fft_chunk = + &global_accumulator_fft[blockIdx.x * level_count * (glwe_dimension + 1) * + (params::degree / 2) * 4]; + + for (int level = 0; level < level_count; level++) { + auto global_fft_slice = + &global_accumulator_fft_chunk[level * (glwe_dimension + 1) * + (params::degree / 2) * 4]; + + for (int j = 0; j < (glwe_dimension + 1); j++) { + auto fft = &global_fft_slice[j * (params::degree / 2) * 4]; + + // Get the bootstrapping key piece necessary for the multiplication + // It is already in the Fourier domain + auto bsk_slice = get_ith_mask_kth_block_128(keybundle, iteration, j, + level, params::degree, + glwe_dimension, level_count); + auto bsk_poly = &bsk_slice[blockIdx.y * params::degree / 2 * 4]; + + polynomial_product_accumulate_in_fourier_domain_128( + accumulator_fft, fft, bsk_poly, !level && !j); + } + } + + // Perform the inverse FFT on the result of the GGSW x GLWE and add to the + // accumulator + auto acc_fft_re_hi = &accumulator_fft[0 * params::degree / 2]; + auto acc_fft_re_lo = &accumulator_fft[1 * params::degree / 2]; + auto acc_fft_im_hi = &accumulator_fft[2 * params::degree / 2]; + auto acc_fft_im_lo = &accumulator_fft[3 * params::degree / 2]; + + negacyclic_backward_fft_f128>( + acc_fft_re_hi, acc_fft_re_lo, acc_fft_im_hi, acc_fft_im_lo); + auto global_slice = + &global_accumulator[(blockIdx.y + blockIdx.x * (glwe_dimension + 1)) * + params::degree]; + + 
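  // global_slice addresses sample blockIdx.x's accumulator polynomial for
  // GLWE component blockIdx.y; at this point the inverse-FFT result is still
  // split across the four double arrays (re/im as hi/lo limbs) computed above.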
add_to_torus_128<__uint128_t, params>(acc_fft_re_hi, acc_fft_re_lo, + acc_fft_im_hi, acc_fft_im_lo, + global_slice, true); + __syncthreads(); + + if constexpr (is_last_iter) { + // Last iteration + auto block_lwe_array_out = + &lwe_array_out[lwe_output_indexes[blockIdx.x] * + (glwe_dimension * polynomial_size + 1) + + blockIdx.y * polynomial_size]; + + if (blockIdx.y < glwe_dimension) { + // Perform a sample extract. At this point, all blocks have the result, + // but we do the computation at block 0 to avoid waiting for extra blocks, + // in case they're not synchronized + sample_extract_mask<__uint128_t, params>(block_lwe_array_out, + global_slice); + if (num_many_lut > 1) { + for (int i = 1; i < num_many_lut; i++) { + auto next_lwe_array_out = + lwe_array_out + + (i * gridDim.x * (glwe_dimension * polynomial_size + 1)); + auto next_block_lwe_array_out = + &next_lwe_array_out[lwe_output_indexes[blockIdx.x] * + (glwe_dimension * polynomial_size + 1) + + blockIdx.y * polynomial_size]; + + sample_extract_mask<__uint128_t, params>( + next_block_lwe_array_out, global_slice, 1, i * lut_stride); + } + } + } else if (blockIdx.y == glwe_dimension) { + sample_extract_body<__uint128_t, params>(block_lwe_array_out, + global_slice, 0); + if (num_many_lut > 1) { + for (int i = 1; i < num_many_lut; i++) { + + auto next_lwe_array_out = + lwe_array_out + + (i * gridDim.x * (glwe_dimension * polynomial_size + 1)); + auto next_block_lwe_array_out = + &next_lwe_array_out[lwe_output_indexes[blockIdx.x] * + (glwe_dimension * polynomial_size + 1) + + blockIdx.y * polynomial_size]; + + sample_extract_body<__uint128_t, params>( + next_block_lwe_array_out, global_slice, 0, i * lut_stride); + } + } + } + } +} + +////////////////////////////////////////////////////////////////// +/////////////////// END DEFAULT ////////////////////////////////// + +////////////////////////////////////////////////////////////////// +/////////////////// START CG ///////////////////////////////////// + +template +__global__ void __launch_bounds__(params::degree / params::opt) + device_multi_bit_programmable_bootstrap_cg_accumulate_128( + __uint128_t *lwe_array_out, + const InputTorus *__restrict__ lwe_output_indexes, + const __uint128_t *__restrict__ lut_vector, + const InputTorus *__restrict__ lut_vector_indexes, + const InputTorus *__restrict__ lwe_array_in, + const InputTorus *__restrict__ lwe_input_indexes, + const double *__restrict__ keybundle_array, double *join_buffer, + __uint128_t *global_accumulator, uint32_t lwe_dimension, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, + uint32_t level_count, uint32_t grouping_factor, uint32_t lwe_offset, + uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input, + int8_t *device_mem, uint64_t device_memory_size_per_block, + uint32_t num_many_lut, uint32_t lut_stride) { + + grid_group grid = this_grid(); + + // We use shared memory for the polynomials that are used often during the + // bootstrap, since shared memory is kept in L1 cache and accessing it is + // much faster than global memory + extern __shared__ int8_t sharedmem[]; + int8_t *selected_memory; + + if constexpr (SMD == FULLSM) { + selected_memory = sharedmem; + } else { + int block_index = blockIdx.z + blockIdx.y * gridDim.z + + blockIdx.x * gridDim.z * gridDim.y; + selected_memory = &device_mem[block_index * device_memory_size_per_block]; + } + + auto accumulator_rotated = reinterpret_cast<__uint128_t *>(selected_memory); + auto accumulator_fft = + reinterpret_cast(accumulator_rotated) + + 
static_cast(sizeof(__uint128_t) * polynomial_size / + sizeof(double)); + + if constexpr (SMD == PARTIALSM) + accumulator_fft = reinterpret_cast(sharedmem); + + // The third dimension of the block is used to determine on which ciphertext + // this block is operating, in the case of batch bootstraps + auto block_lwe_array_in = + &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)]; + + auto block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.x] * + params::degree * (glwe_dimension + 1)]; + + auto block_join_buffer = + &join_buffer[blockIdx.x * level_count * (glwe_dimension + 1) * + (params::degree / 2) * 4]; + + auto global_accumulator_slice = + &global_accumulator[(blockIdx.y + blockIdx.x * (glwe_dimension + 1)) * + params::degree]; + + auto keybundle = &keybundle_array[blockIdx.x * keybundle_size_per_input]; + + if (lwe_offset == 0) { + // Put "b" in [0, 2N[ + InputTorus b_hat = 0; + modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, + params::log2_degree + 1); + + divide_by_monomial_negacyclic_inplace<__uint128_t, params::opt, + params::degree / params::opt>( + accumulator_rotated, &block_lut_vector[blockIdx.y * params::degree], + b_hat, false); + } else { + // Load the accumulator_rotated calculated in previous iterations + copy_polynomial<__uint128_t, params::opt, params::degree / params::opt>( + global_accumulator_slice, accumulator_rotated); + } + + for (int i = 0; (i + lwe_offset) < lwe_dimension && i < lwe_chunk_size; i++) { + // Perform a rounding to increase the accuracy of the + // bootstrapped ciphertext + init_decomposer_state_inplace<__uint128_t, params::opt, + params::degree / params::opt>( + accumulator_rotated, base_log, level_count); + + // Decompose the accumulator_rotated. Each block gets one level of the + // decomposition, for the mask and the body (so block 0 will have the + // accumulator_rotated decomposed at level 0, 1 at 1, etc.) + GadgetMatrix<__uint128_t, params> gadget_acc(base_log, level_count, + accumulator_rotated); + gadget_acc.decompose_and_compress_level_128(accumulator_fft, blockIdx.z); + + // Switch to the FFT space + auto acc_fft_re_hi = &accumulator_fft[0 * params::degree / 2]; + auto acc_fft_re_lo = &accumulator_fft[1 * params::degree / 2]; + auto acc_fft_im_hi = &accumulator_fft[2 * params::degree / 2]; + auto acc_fft_im_lo = &accumulator_fft[3 * params::degree / 2]; + + negacyclic_forward_fft_f128>( + acc_fft_re_hi, acc_fft_re_lo, acc_fft_im_hi, acc_fft_im_lo); + + __syncthreads(); + + // Perform G^-1(ACC) * GGSW -> GLWE + mul_ggsw_glwe_in_fourier_domain_128( + accumulator_fft, block_join_buffer, keybundle, i, grid); + + negacyclic_backward_fft_f128>( + acc_fft_re_hi, acc_fft_re_lo, acc_fft_im_hi, acc_fft_im_lo); + __syncthreads(); + + add_to_torus_128<__uint128_t, params>(acc_fft_re_hi, acc_fft_re_lo, + acc_fft_im_hi, acc_fft_im_lo, + accumulator_rotated, true); + } + + auto accumulator = accumulator_rotated; + + if (blockIdx.z == 0) { + if (lwe_offset + lwe_chunk_size >= (lwe_dimension / grouping_factor)) { + auto block_lwe_array_out = + &lwe_array_out[lwe_output_indexes[blockIdx.x] * + (glwe_dimension * polynomial_size + 1) + + blockIdx.y * polynomial_size]; + + if (blockIdx.y < glwe_dimension) { + // Perform a sample extract. 
At this point, all blocks have the result, + // but we do the computation at block 0 to avoid waiting for extra + // blocks, in case they're not synchronized Always extract one by + // default + sample_extract_mask<__uint128_t, params>(block_lwe_array_out, + accumulator); + + if (num_many_lut > 1) { + for (int i = 1; i < num_many_lut; i++) { + auto next_lwe_array_out = + lwe_array_out + + (i * gridDim.x * (glwe_dimension * polynomial_size + 1)); + auto next_block_lwe_array_out = + &next_lwe_array_out[lwe_output_indexes[blockIdx.x] * + (glwe_dimension * polynomial_size + 1) + + blockIdx.y * polynomial_size]; + + sample_extract_mask<__uint128_t, params>( + next_block_lwe_array_out, accumulator, 1, i * lut_stride); + } + } + + } else if (blockIdx.y == glwe_dimension) { + + sample_extract_body<__uint128_t, params>(block_lwe_array_out, + accumulator, 0); + + if (num_many_lut > 1) { + for (int i = 1; i < num_many_lut; i++) { + + auto next_lwe_array_out = + lwe_array_out + + (i * gridDim.x * (glwe_dimension * polynomial_size + 1)); + auto next_block_lwe_array_out = + &next_lwe_array_out[lwe_output_indexes[blockIdx.x] * + (glwe_dimension * polynomial_size + 1) + + blockIdx.y * polynomial_size]; + + sample_extract_body<__uint128_t, params>( + next_block_lwe_array_out, accumulator, 0, i * lut_stride); + } + } + } + } else { + // Load the accumulator calculated in previous iterations + copy_polynomial<__uint128_t, params::opt, params::degree / params::opt>( + accumulator, global_accumulator_slice); + } + } +} + +////////////////////////////////////////////////////////////////// +/////////////////// END CG /////////////////////////////////////// + +template +__host__ void execute_compute_keybundle_128( + cudaStream_t stream, uint32_t gpu_index, InputTorus const *lwe_array_in, + InputTorus const *lwe_input_indexes, __uint128_t const *bootstrapping_key, + pbs_buffer_128 *buffer, uint32_t num_samples, + uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t grouping_factor, uint32_t level_count, uint32_t lwe_offset) { + cuda_set_device(gpu_index); + + auto lwe_chunk_size = buffer->lwe_chunk_size; + uint32_t chunk_size = + std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset); + + uint32_t keybundle_size_per_input = + lwe_chunk_size * level_count * (glwe_dimension + 1) * + (glwe_dimension + 1) * (polynomial_size / 2) * 4; + + uint64_t full_sm_keybundle = + get_buffer_size_full_sm_multibit_programmable_bootstrap_128_keybundle< + __uint128_t>(polynomial_size); + auto max_shared_memory = cuda_get_max_shared_memory(gpu_index); + + auto d_mem = buffer->d_mem_keybundle; + auto keybundle_fft = buffer->keybundle_fft; + + // Compute a keybundle + dim3 grid_keybundle(num_samples * chunk_size, + (glwe_dimension + 1) * (glwe_dimension + 1), level_count); + dim3 thds(polynomial_size / params::opt, 1, 1); + if (max_shared_memory < full_sm_keybundle) + device_multi_bit_programmable_bootstrap_keybundle_128 + <<>>( + lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key, + lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, + level_count, lwe_offset, chunk_size, keybundle_size_per_input, + d_mem, full_sm_keybundle); + else + device_multi_bit_programmable_bootstrap_keybundle_128 + <<>>( + lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key, + lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, + level_count, lwe_offset, chunk_size, keybundle_size_per_input, + d_mem, 0); + check_cuda_error(cudaGetLastError()); +} + 
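For sizing intuition, the two quantities driving execute_compute_keybundle_128 are the per-block shared-memory requirement returned by get_buffer_size_full_sm_multibit_programmable_bootstrap_128_keybundle (two 128-bit polynomials) and the per-input Fourier-domain stride keybundle_size_per_input (counted in doubles). The following standalone back-of-the-envelope check uses purely illustrative parameters, not a shipped TFHE-rs parameter set; whether the FULLSM or NOSM launch is taken then depends on how the first number compares with the device's reported maximum shared memory per block.

    // Back-of-the-envelope footprint check for the keybundle stage.
    // Hypothetical parameter values; compiles as a host program with a
    // compiler that provides __uint128_t (gcc/clang).
    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint64_t polynomial_size = 2048;
      const uint64_t glwe_dimension = 1;
      const uint64_t level_count = 2;
      const uint64_t lwe_chunk_size = 1;
      const uint64_t num_samples = 4;

      // Shared memory per keybundle block: two 128-bit polynomials, mirroring
      // get_buffer_size_full_sm_multibit_programmable_bootstrap_128_keybundle.
      const uint64_t full_sm_keybundle =
          sizeof(__uint128_t) * polynomial_size * 2;

      // Fourier-domain keybundle per input, counted in doubles: one f128
      // polynomial (4 doubles per complex coefficient, N/2 coefficients) for
      // every (chunk element, level, k, k') combination.
      const uint64_t keybundle_doubles_per_input =
          lwe_chunk_size * level_count * (glwe_dimension + 1) *
          (glwe_dimension + 1) * (polynomial_size / 2) * 4;

      std::printf("full_sm_keybundle      : %llu bytes\n",
                  (unsigned long long)full_sm_keybundle);
      std::printf("keybundle_fft per input: %llu bytes\n",
                  (unsigned long long)(keybundle_doubles_per_input *
                                       sizeof(double)));
      std::printf("keybundle_fft total    : %llu bytes\n",
                  (unsigned long long)(num_samples *
                                       keybundle_doubles_per_input *
                                       sizeof(double)));
      return 0;
    }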
+template +__host__ void execute_step_one_128( + cudaStream_t stream, uint32_t gpu_index, __uint128_t const *lut_vector, + InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in, + InputTorus const *lwe_input_indexes, + pbs_buffer_128 *buffer, uint32_t num_samples, + uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t base_log, uint32_t level_count) { + cuda_set_device(gpu_index); + + uint64_t full_sm_accumulate_step_one = + get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one< + __uint128_t>(polynomial_size); + uint64_t partial_sm_accumulate_step_one = + get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one< + __uint128_t>(polynomial_size); + auto max_shared_memory = cuda_get_max_shared_memory(gpu_index); + + // + auto d_mem = buffer->d_mem_acc_step_one; + auto global_accumulator = buffer->global_accumulator; + auto global_accumulator_fft = buffer->global_join_buffer; + + dim3 grid_accumulate_step_one(num_samples, glwe_dimension + 1, level_count); + dim3 thds(polynomial_size / params::opt, 1, 1); + + if (max_shared_memory < partial_sm_accumulate_step_one) + device_multi_bit_programmable_bootstrap_accumulate_step_one_128< + InputTorus, params, NOSM, is_first_iter> + <<>>( + lwe_array_in, lwe_input_indexes, lut_vector, lut_vector_indexes, + global_accumulator, global_accumulator_fft, lwe_dimension, + glwe_dimension, polynomial_size, base_log, level_count, d_mem, + full_sm_accumulate_step_one); + else if (max_shared_memory < full_sm_accumulate_step_one) + device_multi_bit_programmable_bootstrap_accumulate_step_one_128< + InputTorus, params, PARTIALSM, is_first_iter> + <<>>(lwe_array_in, lwe_input_indexes, lut_vector, + lut_vector_indexes, global_accumulator, + global_accumulator_fft, lwe_dimension, glwe_dimension, + polynomial_size, base_log, level_count, d_mem, + partial_sm_accumulate_step_one); + else + device_multi_bit_programmable_bootstrap_accumulate_step_one_128< + InputTorus, params, FULLSM, is_first_iter> + <<>>(lwe_array_in, lwe_input_indexes, lut_vector, + lut_vector_indexes, global_accumulator, + global_accumulator_fft, lwe_dimension, glwe_dimension, + polynomial_size, base_log, level_count, d_mem, 0); + check_cuda_error(cudaGetLastError()); +} + +template +__host__ void execute_step_two_128( + cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out, + InputTorus const *lwe_output_indexes, + pbs_buffer_128 *buffer, uint32_t num_samples, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, + uint32_t j, uint32_t num_many_lut, uint32_t lut_stride) { + cuda_set_device(gpu_index); + + auto lwe_chunk_size = buffer->lwe_chunk_size; + uint64_t full_sm_accumulate_step_two = + get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two< + __uint128_t>(polynomial_size); + auto max_shared_memory = cuda_get_max_shared_memory(gpu_index); + + auto d_mem = buffer->d_mem_acc_step_two; + auto keybundle_fft = buffer->keybundle_fft; + auto global_accumulator = buffer->global_accumulator; + auto global_accumulator_fft = buffer->global_join_buffer; + + dim3 grid_accumulate_step_two(num_samples, glwe_dimension + 1); + dim3 thds(polynomial_size / params::opt, 1, 1); + + if (max_shared_memory < full_sm_accumulate_step_two) + device_multi_bit_programmable_bootstrap_accumulate_step_two_128< + InputTorus, params, NOSM, is_last_iter> + <<>>( + lwe_array_out, lwe_output_indexes, keybundle_fft, + global_accumulator, global_accumulator_fft, glwe_dimension, + polynomial_size, level_count, j, 
lwe_chunk_size, d_mem, + full_sm_accumulate_step_two, num_many_lut, lut_stride); + else + device_multi_bit_programmable_bootstrap_accumulate_step_two_128< + InputTorus, params, FULLSM, is_last_iter> + <<>>(lwe_array_out, lwe_output_indexes, keybundle_fft, + global_accumulator, global_accumulator_fft, glwe_dimension, + polynomial_size, level_count, j, lwe_chunk_size, d_mem, 0, + num_many_lut, lut_stride); + check_cuda_error(cudaGetLastError()); +} + +/* + * Host wrapper to the multi-bit programmable bootstrap 128 + */ +template +__host__ void host_multi_bit_programmable_bootstrap_128( + cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out, + InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector, + InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in, + InputTorus const *lwe_input_indexes, __uint128_t const *bootstrapping_key, + pbs_buffer_128 *buffer, uint32_t glwe_dimension, + uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, + uint32_t base_log, uint32_t level_count, uint32_t num_samples, + uint32_t num_many_lut, uint32_t lut_stride) { + + auto lwe_chunk_size = buffer->lwe_chunk_size; + + for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor); + lwe_offset += lwe_chunk_size) { + + // Compute a keybundle + execute_compute_keybundle_128( + stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key, + buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, + grouping_factor, level_count, lwe_offset); + // Accumulate + uint32_t chunk_size = std::min( + lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset); + for (uint32_t j = 0; j < chunk_size; j++) { + bool is_first_iter = (j + lwe_offset) == 0; + bool is_last_iter = + (j + lwe_offset) + 1 == (lwe_dimension / grouping_factor); + if (is_first_iter) { + execute_step_one_128( + stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in, + lwe_input_indexes, buffer, num_samples, lwe_dimension, + glwe_dimension, polynomial_size, base_log, level_count); + } else { + execute_step_one_128( + stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in, + lwe_input_indexes, buffer, num_samples, lwe_dimension, + glwe_dimension, polynomial_size, base_log, level_count); + } + + if (is_last_iter) { + execute_step_two_128( + stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer, + num_samples, glwe_dimension, polynomial_size, level_count, j, + num_many_lut, lut_stride); + } else { + execute_step_two_128( + stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer, + num_samples, glwe_dimension, polynomial_size, level_count, j, + num_many_lut, lut_stride); + } + } + } +} + +template +__host__ void execute_cg_external_product_loop_128( + cudaStream_t stream, uint32_t gpu_index, __uint128_t const *lut_vector, + InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in, + InputTorus const *lwe_input_indexes, __uint128_t *lwe_array_out, + InputTorus const *lwe_output_indexes, + pbs_buffer_128 *buffer, uint32_t num_samples, + uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, + uint32_t lwe_offset, uint32_t num_many_lut, uint32_t lut_stride) { + cuda_set_device(gpu_index); + + const uint64_t full_sm = + get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<__uint128_t>( + polynomial_size); + const uint64_t partial_sm = + get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap< + 
__uint128_t>(polynomial_size); + + auto full_dm = full_sm; + auto partial_dm = full_sm - partial_sm; + uint64_t no_dm = 0; + + auto lwe_chunk_size = buffer->lwe_chunk_size; + auto max_shared_memory = cuda_get_max_shared_memory(gpu_index); + + uint32_t keybundle_size_per_input = + lwe_chunk_size * level_count * (glwe_dimension + 1) * + (glwe_dimension + 1) * (polynomial_size / 2) * 4; + + uint32_t chunk_size = + std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset); + + auto d_mem = buffer->d_mem_acc_cg; + auto keybundle_fft = buffer->keybundle_fft; + auto global_accumulator = buffer->global_accumulator; + auto join_buffer = buffer->global_join_buffer; + + void *kernel_args[22]; + kernel_args[0] = &lwe_array_out; + kernel_args[1] = &lwe_output_indexes; + kernel_args[2] = &lut_vector; + kernel_args[3] = &lut_vector_indexes; + kernel_args[4] = &lwe_array_in; + kernel_args[5] = &lwe_input_indexes; + kernel_args[6] = &keybundle_fft; + kernel_args[7] = &join_buffer; + kernel_args[8] = &global_accumulator; + kernel_args[9] = &lwe_dimension; + kernel_args[10] = &glwe_dimension; + kernel_args[11] = &polynomial_size; + kernel_args[12] = &base_log; + kernel_args[13] = &level_count; + kernel_args[14] = &grouping_factor; + kernel_args[15] = &lwe_offset; + kernel_args[16] = &chunk_size; + kernel_args[17] = &keybundle_size_per_input; + kernel_args[18] = &d_mem; + kernel_args[20] = &num_many_lut; + kernel_args[21] = &lut_stride; + + dim3 grid_accumulate(num_samples, glwe_dimension + 1, level_count); + dim3 thds(polynomial_size / params::opt, 1, 1); + + if (max_shared_memory < partial_dm) { + kernel_args[19] = &full_dm; + check_cuda_error(cudaLaunchCooperativeKernel( + (void *)device_multi_bit_programmable_bootstrap_cg_accumulate_128< + InputTorus, params, NOSM>, + grid_accumulate, thds, (void **)kernel_args, 0, stream)); + } else if (max_shared_memory < full_dm) { + kernel_args[19] = &partial_dm; + check_cuda_error(cudaLaunchCooperativeKernel( + (void *)device_multi_bit_programmable_bootstrap_cg_accumulate_128< + InputTorus, params, PARTIALSM>, + grid_accumulate, thds, (void **)kernel_args, partial_sm, stream)); + } else { + kernel_args[19] = &no_dm; + check_cuda_error(cudaLaunchCooperativeKernel( + (void *)device_multi_bit_programmable_bootstrap_cg_accumulate_128< + InputTorus, params, FULLSM>, + grid_accumulate, thds, (void **)kernel_args, full_sm, stream)); + } +} + +template +__host__ void host_cg_multi_bit_programmable_bootstrap_128( + cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out, + InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector, + InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in, + InputTorus const *lwe_input_indexes, __uint128_t const *bootstrapping_key, + pbs_buffer_128 *buffer, uint32_t glwe_dimension, + uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, + uint32_t base_log, uint32_t level_count, uint32_t num_samples, + uint32_t num_many_lut, uint32_t lut_stride) { + + auto lwe_chunk_size = buffer->lwe_chunk_size; + + for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor); + lwe_offset += lwe_chunk_size) { + + // Compute a keybundle + execute_compute_keybundle_128( + stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key, + buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, + grouping_factor, level_count, lwe_offset); + + // Accumulate + execute_cg_external_product_loop_128( + stream, gpu_index, lut_vector, lut_vector_indexes, 
lwe_array_in, + lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer, + num_samples, lwe_dimension, glwe_dimension, polynomial_size, + grouping_factor, base_log, level_count, lwe_offset, num_many_lut, + lut_stride); + } +} + +template +__host__ uint64_t scratch_multi_bit_programmable_bootstrap_128( + cudaStream_t stream, uint32_t gpu_index, + pbs_buffer_128 **buffer, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t level_count, + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { + + cuda_set_device(gpu_index); + + uint64_t full_sm_keybundle = + get_buffer_size_full_sm_multibit_programmable_bootstrap_128_keybundle< + __uint128_t>(polynomial_size); + uint64_t full_sm_accumulate_step_one = + get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one< + __uint128_t>(polynomial_size); + uint64_t full_sm_accumulate_step_two = + get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two< + __uint128_t>(polynomial_size); + uint64_t partial_sm_accumulate_step_one = + get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one< + __uint128_t>(polynomial_size); + + auto max_shared_memory = cuda_get_max_shared_memory(gpu_index); + if (max_shared_memory < full_sm_keybundle) { + check_cuda_error(cudaFuncSetAttribute( + device_multi_bit_programmable_bootstrap_keybundle_128, + cudaFuncAttributeMaxDynamicSharedMemorySize, 0)); + check_cuda_error(cudaFuncSetCacheConfig( + device_multi_bit_programmable_bootstrap_keybundle_128, + cudaFuncCachePreferShared)); + } else { + check_cuda_error(cudaFuncSetAttribute( + device_multi_bit_programmable_bootstrap_keybundle_128, + cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle)); + check_cuda_error(cudaFuncSetCacheConfig( + device_multi_bit_programmable_bootstrap_keybundle_128, + cudaFuncCachePreferShared)); + } + + if (max_shared_memory < partial_sm_accumulate_step_one) { + check_cuda_error(cudaFuncSetAttribute( + device_multi_bit_programmable_bootstrap_accumulate_step_one_128< + InputTorus, params, NOSM, false>, + cudaFuncAttributeMaxDynamicSharedMemorySize, 0)); + check_cuda_error(cudaFuncSetCacheConfig( + device_multi_bit_programmable_bootstrap_accumulate_step_one_128< + InputTorus, params, NOSM, false>, + cudaFuncCachePreferShared)); + check_cuda_error(cudaFuncSetAttribute( + device_multi_bit_programmable_bootstrap_accumulate_step_one_128< + InputTorus, params, NOSM, true>, + cudaFuncAttributeMaxDynamicSharedMemorySize, 0)); + check_cuda_error(cudaFuncSetCacheConfig( + device_multi_bit_programmable_bootstrap_accumulate_step_one_128< + InputTorus, params, NOSM, true>, + cudaFuncCachePreferShared)); + check_cuda_error(cudaGetLastError()); + } else if (max_shared_memory < full_sm_accumulate_step_one) { + check_cuda_error(cudaFuncSetAttribute( + device_multi_bit_programmable_bootstrap_accumulate_step_one_128< + InputTorus, params, PARTIALSM, false>, + cudaFuncAttributeMaxDynamicSharedMemorySize, + partial_sm_accumulate_step_one)); + check_cuda_error(cudaFuncSetCacheConfig( + device_multi_bit_programmable_bootstrap_accumulate_step_one_128< + InputTorus, params, PARTIALSM, false>, + cudaFuncCachePreferShared)); + check_cuda_error(cudaFuncSetAttribute( + device_multi_bit_programmable_bootstrap_accumulate_step_one_128< + InputTorus, params, PARTIALSM, true>, + cudaFuncAttributeMaxDynamicSharedMemorySize, + partial_sm_accumulate_step_one)); + check_cuda_error(cudaFuncSetCacheConfig( + device_multi_bit_programmable_bootstrap_accumulate_step_one_128< + InputTorus, params, PARTIALSM, true>, + 
cudaFuncCachePreferShared)); + check_cuda_error(cudaGetLastError()); + } else { + check_cuda_error(cudaFuncSetAttribute( + device_multi_bit_programmable_bootstrap_accumulate_step_one_128< + InputTorus, params, FULLSM, false>, + cudaFuncAttributeMaxDynamicSharedMemorySize, + full_sm_accumulate_step_one)); + check_cuda_error(cudaFuncSetCacheConfig( + device_multi_bit_programmable_bootstrap_accumulate_step_one_128< + InputTorus, params, FULLSM, false>, + cudaFuncCachePreferShared)); + check_cuda_error(cudaFuncSetAttribute( + device_multi_bit_programmable_bootstrap_accumulate_step_one_128< + InputTorus, params, FULLSM, true>, + cudaFuncAttributeMaxDynamicSharedMemorySize, + full_sm_accumulate_step_one)); + check_cuda_error(cudaFuncSetCacheConfig( + device_multi_bit_programmable_bootstrap_accumulate_step_one_128< + InputTorus, params, FULLSM, true>, + cudaFuncCachePreferShared)); + check_cuda_error(cudaGetLastError()); + } + + if (max_shared_memory < full_sm_accumulate_step_two) { + check_cuda_error(cudaFuncSetAttribute( + device_multi_bit_programmable_bootstrap_accumulate_step_two_128< + InputTorus, params, NOSM, false>, + cudaFuncAttributeMaxDynamicSharedMemorySize, 0)); + check_cuda_error(cudaFuncSetCacheConfig( + device_multi_bit_programmable_bootstrap_accumulate_step_two_128< + InputTorus, params, NOSM, false>, + cudaFuncCachePreferShared)); + check_cuda_error(cudaFuncSetAttribute( + device_multi_bit_programmable_bootstrap_accumulate_step_two_128< + InputTorus, params, NOSM, true>, + cudaFuncAttributeMaxDynamicSharedMemorySize, 0)); + check_cuda_error(cudaFuncSetCacheConfig( + device_multi_bit_programmable_bootstrap_accumulate_step_two_128< + InputTorus, params, NOSM, true>, + cudaFuncCachePreferShared)); + check_cuda_error(cudaGetLastError()); + } else { + check_cuda_error(cudaFuncSetAttribute( + device_multi_bit_programmable_bootstrap_accumulate_step_two_128< + InputTorus, params, FULLSM, false>, + cudaFuncAttributeMaxDynamicSharedMemorySize, + full_sm_accumulate_step_two)); + check_cuda_error(cudaFuncSetCacheConfig( + device_multi_bit_programmable_bootstrap_accumulate_step_two_128< + InputTorus, params, FULLSM, false>, + cudaFuncCachePreferShared)); + check_cuda_error(cudaFuncSetAttribute( + device_multi_bit_programmable_bootstrap_accumulate_step_two_128< + InputTorus, params, FULLSM, true>, + cudaFuncAttributeMaxDynamicSharedMemorySize, + full_sm_accumulate_step_two)); + check_cuda_error(cudaFuncSetCacheConfig( + device_multi_bit_programmable_bootstrap_accumulate_step_two_128< + InputTorus, params, FULLSM, true>, + cudaFuncCachePreferShared)); + check_cuda_error(cudaGetLastError()); + } + + auto lwe_chunk_size = get_lwe_chunk_size_128( + gpu_index, input_lwe_ciphertext_count, polynomial_size, + full_sm_keybundle); + uint64_t size_tracker = 0; + *buffer = new pbs_buffer_128( + stream, gpu_index, glwe_dimension, polynomial_size, level_count, + input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::DEFAULT, + allocate_gpu_memory, &size_tracker); + return size_tracker; +} + +template +__host__ uint64_t scratch_cg_multi_bit_programmable_bootstrap_128( + cudaStream_t stream, uint32_t gpu_index, + pbs_buffer_128 **buffer, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t level_count, + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { + + cuda_set_device(gpu_index); + + uint64_t full_sm_keybundle = + get_buffer_size_full_sm_multibit_programmable_bootstrap_128_keybundle< + __uint128_t>(polynomial_size); + uint64_t full_sm_cg_accumulate = + 
get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<__uint128_t>( + polynomial_size); + uint64_t partial_sm_cg_accumulate = + get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap< + __uint128_t>(polynomial_size); + + auto max_shared_memory = cuda_get_max_shared_memory(gpu_index); + if (max_shared_memory < full_sm_keybundle) { + check_cuda_error(cudaFuncSetAttribute( + device_multi_bit_programmable_bootstrap_keybundle_128, + cudaFuncAttributeMaxDynamicSharedMemorySize, 0)); + cudaFuncSetCacheConfig( + device_multi_bit_programmable_bootstrap_keybundle_128, + cudaFuncCachePreferShared); + check_cuda_error(cudaGetLastError()); + } else { + check_cuda_error(cudaFuncSetAttribute( + device_multi_bit_programmable_bootstrap_keybundle_128, + cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle)); + cudaFuncSetCacheConfig( + device_multi_bit_programmable_bootstrap_keybundle_128, + cudaFuncCachePreferShared); + check_cuda_error(cudaGetLastError()); + } + + if (max_shared_memory < partial_sm_cg_accumulate) { + check_cuda_error(cudaFuncSetAttribute( + device_multi_bit_programmable_bootstrap_cg_accumulate_128, + cudaFuncAttributeMaxDynamicSharedMemorySize, 0)); + cudaFuncSetCacheConfig( + device_multi_bit_programmable_bootstrap_cg_accumulate_128, + cudaFuncCachePreferShared); + check_cuda_error(cudaGetLastError()); + } else if (max_shared_memory < full_sm_cg_accumulate) { + check_cuda_error(cudaFuncSetAttribute( + device_multi_bit_programmable_bootstrap_cg_accumulate_128< + InputTorus, params, PARTIALSM>, + cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm_cg_accumulate)); + cudaFuncSetCacheConfig( + device_multi_bit_programmable_bootstrap_cg_accumulate_128< + InputTorus, params, PARTIALSM>, + cudaFuncCachePreferShared); + check_cuda_error(cudaGetLastError()); + } else { + check_cuda_error(cudaFuncSetAttribute( + device_multi_bit_programmable_bootstrap_cg_accumulate_128< + InputTorus, params, FULLSM>, + cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_cg_accumulate)); + cudaFuncSetCacheConfig( + device_multi_bit_programmable_bootstrap_cg_accumulate_128< + InputTorus, params, FULLSM>, + cudaFuncCachePreferShared); + check_cuda_error(cudaGetLastError()); + } + + auto lwe_chunk_size = get_lwe_chunk_size_128( + gpu_index, input_lwe_ciphertext_count, polynomial_size, + full_sm_keybundle); + uint64_t size_tracker = 0; + *buffer = new pbs_buffer_128( + stream, gpu_index, glwe_dimension, polynomial_size, level_count, + input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::CG, + allocate_gpu_memory, &size_tracker); + return size_tracker; +} + +#endif // PROGRAMMABLE_BOOTSTRAP_MULTIBIT_128_CUH diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh index ea64a2889..582a0f1cb 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh @@ -283,8 +283,9 @@ __host__ uint64_t scratch_tbc_multi_bit_programmable_bootstrap( check_cuda_error(cudaGetLastError()); } - auto lwe_chunk_size = get_lwe_chunk_size( - gpu_index, input_lwe_ciphertext_count, polynomial_size); + auto lwe_chunk_size = + get_lwe_chunk_size(gpu_index, input_lwe_ciphertext_count, + polynomial_size, full_sm_keybundle); uint64_t size_tracker = 0; *buffer = new pbs_buffer( stream, gpu_index, glwe_dimension, polynomial_size, level_count, diff --git 
a/backends/tfhe-cuda-backend/cuda/src/utils/helper.cuh b/backends/tfhe-cuda-backend/cuda/src/utils/helper.cuh index 040d8631c..1e45784f7 100644 --- a/backends/tfhe-cuda-backend/cuda/src/utils/helper.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/utils/helper.cuh @@ -5,15 +5,15 @@ #include #include -template inline __device__ const char *get_format(); +template __device__ inline const char *get_format(); -template <> inline __device__ const char *get_format() { return "%d, "; } +template <> __device__ inline const char *get_format() { return "%d, "; } -template <> inline __device__ const char *get_format() { +template <> __device__ inline const char *get_format() { return "%u, "; } -template <> inline __device__ const char *get_format() { +template <> __device__ inline const char *get_format() { return "%lu, "; } @@ -23,6 +23,15 @@ template __global__ void print_debug_kernel(const T *src, int N) { } } +template <> +__global__ inline void print_debug_kernel(const __uint128_t *src, int N) { + for (int i = 0; i < N; i++) { + uint64_t low = static_cast(src[i]); + uint64_t high = static_cast(src[i] >> 64); + printf("(%llu, %llu), ", high, low); + } +} + template <> __global__ inline void print_debug_kernel(const double2 *src, int N) { for (int i = 0; i < N; i++) { diff --git a/backends/tfhe-cuda-backend/src/bindings.rs b/backends/tfhe-cuda-backend/src/bindings.rs index 2041d9235..27816bfa6 100644 --- a/backends/tfhe-cuda-backend/src/bindings.rs +++ b/backends/tfhe-cuda-backend/src/bindings.rs @@ -2188,6 +2188,19 @@ unsafe extern "C" { grouping_factor: u32, ); } +unsafe extern "C" { + pub fn cuda_convert_lwe_multi_bit_programmable_bootstrap_key_128( + stream: *mut ffi::c_void, + gpu_index: u32, + dest: *mut ffi::c_void, + src: *const ffi::c_void, + input_lwe_dim: u32, + glwe_dim: u32, + level_count: u32, + polynomial_size: u32, + grouping_factor: u32, + ); +} unsafe extern "C" { pub fn scratch_cuda_multi_bit_programmable_bootstrap_64( stream: *mut ffi::c_void, @@ -2230,3 +2243,45 @@ unsafe extern "C" { pbs_buffer: *mut *mut i8, ); } +unsafe extern "C" { + pub fn scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64( + stream: *mut ffi::c_void, + gpu_index: u32, + buffer: *mut *mut i8, + glwe_dimension: u32, + polynomial_size: u32, + level_count: u32, + input_lwe_ciphertext_count: u32, + allocate_gpu_memory: bool, + ) -> u64; +} +unsafe extern "C" { + pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128( + stream: *mut ffi::c_void, + gpu_index: u32, + lwe_array_out: *mut ffi::c_void, + lwe_output_indexes: *const ffi::c_void, + lut_vector: *const ffi::c_void, + lut_vector_indexes: *const ffi::c_void, + lwe_array_in: *const ffi::c_void, + lwe_input_indexes: *const ffi::c_void, + bootstrapping_key: *const ffi::c_void, + mem_ptr: *mut i8, + lwe_dimension: u32, + glwe_dimension: u32, + polynomial_size: u32, + grouping_factor: u32, + base_log: u32, + level_count: u32, + num_samples: u32, + num_many_lut: u32, + lut_stride: u32, + ); +} +unsafe extern "C" { + pub fn cleanup_cuda_multi_bit_programmable_bootstrap_128( + stream: *mut ffi::c_void, + gpu_index: u32, + buffer: *mut *mut i8, + ); +} diff --git a/tfhe-benchmark/benches/core_crypto/pbs128_bench.rs b/tfhe-benchmark/benches/core_crypto/pbs128_bench.rs index 1b09ee34c..d55fb3b9c 100644 --- a/tfhe-benchmark/benches/core_crypto/pbs128_bench.rs +++ b/tfhe-benchmark/benches/core_crypto/pbs128_bench.rs @@ -165,7 +165,7 @@ fn pbs_128(c: &mut Criterion) { mod cuda { use benchmark::utilities::{ cuda_local_keys_core, 
cuda_local_streams_core, get_bench_type, throughput_num_threads, - write_to_json, BenchmarkType, CpuKeys, CpuKeysBuilder, CryptoParametersRecord, + write_to_json, BenchmarkType, CpuKeys, CpuKeysBuilder, CryptoParametersRecord, CudaIndexes, CudaLocalKeys, OperatorType, }; use criterion::{black_box, Criterion, Throughput}; @@ -173,12 +173,14 @@ mod cuda { use tfhe::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList; use tfhe::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList; use tfhe::core_crypto::gpu::{ + cuda_multi_bit_programmable_bootstrap_128_lwe_ciphertext, cuda_programmable_bootstrap_128_lwe_ciphertext, get_number_of_gpus, CudaStreams, }; use tfhe::core_crypto::prelude::*; use tfhe::shortint::engine::ShortintEngine; use tfhe::shortint::parameters::{ ModulusSwitchType, NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128, + NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128, PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128, }; use tfhe::shortint::server_key::ModulusSwitchNoiseReductionKey; @@ -441,14 +443,281 @@ mod cuda { ); } + fn cuda_multi_bit_pbs_128(c: &mut Criterion) { + let bench_name = "core_crypto::cuda::multi_bit_pbs128"; + let mut bench_group = c.benchmark_group(bench_name); + bench_group + .sample_size(10) + .measurement_time(std::time::Duration::from_secs(30)); + + type Scalar = u128; + let input_params = PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128; + let squash_params = + NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128; + + let lwe_noise_distribution_u64 = DynamicDistribution::new_t_uniform(46); + let ct_modulus_u64: CiphertextModulus = CiphertextModulus::new_native(); + + let params_name = "PARAMS_SWITCH_SQUASH"; + + let mut boxed_seeder = new_seeder(); + let seeder = boxed_seeder.as_mut(); + + let mut secret_generator = + SecretRandomGenerator::::new(seeder.seed()); + + let mut encryption_generator = + EncryptionRandomGenerator::::new(seeder.seed(), seeder); + + let input_lwe_secret_key = + LweSecretKey::generate_new_binary(input_params.lwe_dimension, &mut secret_generator); + + let output_glwe_secret_key = GlweSecretKey::>::generate_new_binary( + squash_params.glwe_dimension, + squash_params.polynomial_size, + &mut secret_generator, + ); + + let output_lwe_secret_key = output_glwe_secret_key.clone().into_lwe_secret_key(); + + let multi_bit_bsk = LweMultiBitBootstrapKey::new( + Scalar::ZERO, + squash_params.glwe_dimension.to_glwe_size(), + squash_params.polynomial_size, + squash_params.decomp_base_log, + squash_params.decomp_level_count, + input_params.lwe_dimension, + squash_params.grouping_factor, + squash_params.ciphertext_modulus, + ); + + let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new() + .multi_bit_bootstrap_key(multi_bit_bsk) + .build(); + + let message_modulus: u64 = 1 << 4; + let input_message: u64 = 3; + let delta: u64 = (1 << (u64::BITS - 1)) / message_modulus; + let plaintext = Plaintext(input_message * delta); + + let bench_id; + + match get_bench_type() { + BenchmarkType::Latency => { + let streams = CudaStreams::new_multi_gpu(); + let gpu_keys = CudaLocalKeys::from_cpu_keys(&cpu_keys, None, &streams); + + let lwe_ciphertext_in: LweCiphertextOwned = + allocate_and_encrypt_new_lwe_ciphertext( + &input_lwe_secret_key, + plaintext, + lwe_noise_distribution_u64, + ct_modulus_u64, + &mut encryption_generator, + ); + let lwe_ciphertext_in_gpu = + CudaLweCiphertextList::from_lwe_ciphertext(&lwe_ciphertext_in, 
&streams); + + let accumulator: GlweCiphertextOwned = GlweCiphertextOwned::new( + Scalar::ONE, + squash_params.glwe_dimension.to_glwe_size(), + squash_params.polynomial_size, + squash_params.ciphertext_modulus, + ); + let accumulator_gpu = + CudaGlweCiphertextList::from_glwe_ciphertext(&accumulator, &streams); + + let out_pbs_ct = LweCiphertext::new( + Scalar::ZERO, + output_lwe_secret_key.lwe_dimension().to_lwe_size(), + squash_params.ciphertext_modulus, + ); + let mut out_pbs_ct_gpu = + CudaLweCiphertextList::from_lwe_ciphertext(&out_pbs_ct, &streams); + + let h_indexes = [0]; + let cuda_indexes = CudaIndexes::new(&h_indexes, &streams, 0); + + bench_id = format!("{bench_name}::{params_name}"); + { + bench_group.bench_function(&bench_id, |b| { + b.iter(|| { + cuda_multi_bit_programmable_bootstrap_128_lwe_ciphertext( + &lwe_ciphertext_in_gpu, + &mut out_pbs_ct_gpu, + &accumulator_gpu, + &cuda_indexes.d_lut, + &cuda_indexes.d_output, + &cuda_indexes.d_input, + gpu_keys.multi_bit_bsk.as_ref().unwrap(), + &streams, + ); + black_box(&mut out_pbs_ct_gpu); + }) + }); + } + } + BenchmarkType::Throughput => { + let gpu_keys_vec = cuda_local_keys_core(&cpu_keys, None); + let gpu_count = get_number_of_gpus() as usize; + + bench_id = format!("{bench_name}::throughput::{params_name}"); + let blocks: usize = 1; + let elements = throughput_num_threads(blocks, 1); + let elements_per_stream = elements as usize / gpu_count; + bench_group.throughput(Throughput::Elements(elements)); + bench_group.bench_function(&bench_id, |b| { + let setup_encrypted_values = || { + let local_streams = cuda_local_streams_core(); + + let plaintext_list = + PlaintextList::new(u64::ZERO, PlaintextCount(elements_per_stream)); + + let input_cts = (0..gpu_count) + .map(|i| { + let mut input_ct_list = LweCiphertextList::new( + u64::ZERO, + input_lwe_secret_key.lwe_dimension().to_lwe_size(), + LweCiphertextCount(elements_per_stream), + ct_modulus_u64, + ); + + encrypt_lwe_ciphertext_list( + &input_lwe_secret_key, + &mut input_ct_list, + &plaintext_list, + lwe_noise_distribution_u64, + &mut encryption_generator, + ); + + CudaLweCiphertextList::from_lwe_ciphertext_list( + &input_ct_list, + &local_streams[i], + ) + }) + .collect::>(); + + let accumulators = (0..gpu_count) + .map(|i| { + let accumulator = GlweCiphertextOwned::new( + Scalar::ONE, + squash_params.glwe_dimension.to_glwe_size(), + squash_params.polynomial_size, + squash_params.ciphertext_modulus, + ); + CudaGlweCiphertextList::from_glwe_ciphertext( + &accumulator, + &local_streams[i], + ) + }) + .collect::>(); + + // Allocate the LweCiphertext to store the result of the PBS + let output_cts = (0..gpu_count) + .map(|i| { + let output_ct_list = LweCiphertextList::new( + Scalar::ZERO, + output_lwe_secret_key.lwe_dimension().to_lwe_size(), + LweCiphertextCount(elements_per_stream), + squash_params.ciphertext_modulus, + ); + CudaLweCiphertextList::from_lwe_ciphertext_list( + &output_ct_list, + &local_streams[i], + ) + }) + .collect::>(); + + let h_indexes = (0..(elements / gpu_count as u64)) + .map(CastFrom::cast_from) + .collect::>(); + let cuda_indexes_vec = (0..gpu_count) + .map(|i| CudaIndexes::new(&h_indexes, &local_streams[i], 0)) + .collect::>(); + local_streams.iter().for_each(|stream| stream.synchronize()); + + ( + input_cts, + output_cts, + accumulators, + cuda_indexes_vec, + local_streams, + ) + }; + + b.iter_batched( + setup_encrypted_values, + |( + input_cts, + mut output_cts, + accumulators, + cuda_indexes_vec, + local_streams, + )| { + (0..gpu_count) + 
.into_par_iter() + .zip(input_cts.par_iter()) + .zip(output_cts.par_iter_mut()) + .zip(accumulators.par_iter()) + .zip(local_streams.par_iter()) + .for_each( + |((((i, input_ct), output_ct), accumulator), local_stream)| { + cuda_multi_bit_programmable_bootstrap_128_lwe_ciphertext( + input_ct, + output_ct, + accumulator, + &cuda_indexes_vec[i].d_lut, + &cuda_indexes_vec[i].d_output, + &cuda_indexes_vec[i].d_input, + gpu_keys_vec[i].multi_bit_bsk.as_ref().unwrap(), + local_stream, + ); + }, + ) + }, + criterion::BatchSize::SmallInput, + ); + }); + } + }; + + let params_record = CryptoParametersRecord { + lwe_dimension: Some(input_params.lwe_dimension), + glwe_dimension: Some(squash_params.glwe_dimension), + polynomial_size: Some(squash_params.polynomial_size), + lwe_noise_distribution: Some(lwe_noise_distribution_u64), + glwe_noise_distribution: Some(input_params.glwe_noise_distribution), + pbs_base_log: Some(squash_params.decomp_base_log), + pbs_level: Some(squash_params.decomp_level_count), + ciphertext_modulus: Some(input_params.ciphertext_modulus), + ..Default::default() + }; + + let bit_size = (message_modulus as u32).ilog2(); + write_to_json( + &bench_id, + params_record, + params_name, + "pbs", + &OperatorType::Atomic, + bit_size, + vec![bit_size], + ); + } + pub fn cuda_pbs128_group() { let mut criterion: Criterion<_> = Criterion::default().configure_from_args(); cuda_pbs_128(&mut criterion); } + + pub fn cuda_multi_bit_pbs128_group() { + let mut criterion: Criterion<_> = Criterion::default().configure_from_args(); + cuda_multi_bit_pbs_128(&mut criterion); + } } #[cfg(feature = "gpu")] -use cuda::cuda_pbs128_group; +use cuda::{cuda_multi_bit_pbs128_group, cuda_pbs128_group}; pub fn pbs128_group() { let mut criterion: Criterion<_> = Criterion::default().configure_from_args(); @@ -458,6 +727,7 @@ pub fn pbs128_group() { #[cfg(feature = "gpu")] fn go_through_gpu_bench_groups() { cuda_pbs128_group(); + cuda_multi_bit_pbs128_group(); } #[cfg(not(feature = "gpu"))] diff --git a/tfhe-benchmark/src/utilities.rs b/tfhe-benchmark/src/utilities.rs index 4330e07ad..873740311 100644 --- a/tfhe-benchmark/src/utilities.rs +++ b/tfhe-benchmark/src/utilities.rs @@ -521,7 +521,7 @@ mod cuda_utils { pub ksk: Option>, pub pksk: Option>, pub bsk: Option, - pub multi_bit_bsk: Option, + pub multi_bit_bsk: Option>, } #[allow(dead_code)] diff --git a/tfhe/examples/utilities/params_to_file.rs b/tfhe/examples/utilities/params_to_file.rs index 8fa1b9fa9..9cfdd4347 100644 --- a/tfhe/examples/utilities/params_to_file.rs +++ b/tfhe/examples/utilities/params_to_file.rs @@ -13,6 +13,8 @@ use tfhe::shortint::parameters::current_params::{ VEC_ALL_COMPRESSION_PARAMETERS, VEC_ALL_HPU_PARAMETERS, VEC_ALL_KS32_PARAMETERS, VEC_ALL_MULTI_BIT_PBS_PARAMETERS, VEC_ALL_NOISE_SQUASHING_PARAMETERS, }; +use tfhe::shortint::parameters::noise_squashing::NoiseSquashingMultiBitParameters; +use tfhe::shortint::parameters::v1_3::VEC_ALL_NOISE_SQUASHING_MULTI_BIT_PARAMETERS; use tfhe::shortint::parameters::{ CompactPublicKeyEncryptionParameters, CompressionParameters, NoiseSquashingParameters, }; @@ -214,6 +216,36 @@ impl ParamDetails for NoiseSquashingParameters { } } +impl ParamDetails for NoiseSquashingMultiBitParameters { + fn lwe_dimension(&self) -> LweDimension { + panic!("lwe_dimension not applicable for NoiseSquashingMultiBitParameters") + } + + fn glwe_dimension(&self) -> GlweDimension { + self.glwe_dimension + } + + fn lwe_noise_distribution(&self) -> DynamicDistribution { + panic!("lwe_noise_distribution not applicable for 
NoiseSquashingMultiBitParameters") + } + + fn glwe_noise_distribution(&self) -> DynamicDistribution { + self.glwe_noise_distribution + } + + fn polynomial_size(&self) -> PolynomialSize { + self.polynomial_size + } + + fn lwe_ciphertext_modulus(&self) -> ParamModulus { + panic!("lwe_ciphertext_modulus not applicable for NoiseSquashingMultiBitParameters") + } + + fn glwe_ciphertext_modulus(&self) -> ParamModulus { + ParamModulus::from_ciphertext_modulus(self.ciphertext_modulus) + } +} + #[derive(Eq, PartialEq, Hash)] enum ParametersFormat { Lwe, @@ -493,6 +525,16 @@ fn main() { ParametersFormat::Glwe, ); + let noise_squasing_multi_bit_params: Vec<_> = VEC_ALL_NOISE_SQUASHING_MULTI_BIT_PARAMETERS + .into_iter() + .map(|p| (*p.0, Some(p.1))) + .collect(); + write_all_params_in_file( + "shortint_noise_squashing_multi_bit_parameters_lattice_estimator.sage", + &noise_squasing_multi_bit_params, + ParametersFormat::Glwe, + ); + let ks32_params: Vec<_> = VEC_ALL_KS32_PARAMETERS .into_iter() .map(|p| (AtomicPatternParameters::from(*p.0), Some(p.1))) diff --git a/tfhe/src/core_crypto/algorithms/test/mod.rs b/tfhe/src/core_crypto/algorithms/test/mod.rs index 5df7246d6..6df987a47 100644 --- a/tfhe/src/core_crypto/algorithms/test/mod.rs +++ b/tfhe/src/core_crypto/algorithms/test/mod.rs @@ -178,6 +178,36 @@ pub const DUMMY_31_U32: ClassicTestParams = ClassicTestParams { ciphertext_modulus: CiphertextModulus::new(1 << 31), }; +#[cfg(feature = "gpu")] +pub const NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128: + NoiseSquashingMultiBitTestParameters = NoiseSquashingMultiBitTestParameters { + glwe_dimension: GlweDimension(2), + polynomial_size: PolynomialSize(2048), + glwe_noise_distribution: DynamicDistribution::new_t_uniform(30), + decomp_base_log: DecompositionBaseLog(23), + decomp_level_count: DecompositionLevelCount(3), + grouping_factor: LweBskGroupingFactor(4), + message_modulus_log: MessageModulusLog(4), + ciphertext_modulus: CiphertextModulus::::new_native(), +}; + +#[cfg(feature = "gpu")] +pub const PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128: MultiBitTestParams< + u64, +> = MultiBitTestParams { + input_lwe_dimension: LweDimension(920), + lwe_noise_distribution: DynamicDistribution::new_t_uniform(45), + decomp_base_log: DecompositionBaseLog(22), + decomp_level_count: DecompositionLevelCount(1), + glwe_dimension: GlweDimension(1), + polynomial_size: PolynomialSize(2048), + glwe_noise_distribution: DynamicDistribution::new_t_uniform(17), + message_modulus_log: MessageModulusLog(4), + ciphertext_modulus: CiphertextModulus::new_native(), + grouping_factor: LweBskGroupingFactor(4), + thread_count: ThreadCount(5), +}; + pub const MULTI_BIT_2_2_2_PARAMS: MultiBitTestParams = MultiBitTestParams { input_lwe_dimension: LweDimension(818), lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev( diff --git a/tfhe/src/core_crypto/algorithms/test/params.rs b/tfhe/src/core_crypto/algorithms/test/params.rs index cbdb90223..89e18632f 100644 --- a/tfhe/src/core_crypto/algorithms/test/params.rs +++ b/tfhe/src/core_crypto/algorithms/test/params.rs @@ -3,6 +3,8 @@ use crate::core_crypto::entities::*; use crate::core_crypto::prelude::{CastFrom, CastInto, UnsignedInteger}; use crate::keycache::NamedParam; #[cfg(feature = "gpu")] +use crate::shortint::parameters::ModulusSwitchNoiseReductionParams; +#[cfg(feature = "gpu")] use crate::shortint::parameters::ModulusSwitchType; use serde::{Deserialize, Serialize}; @@ -22,6 +24,13 @@ pub 
struct MultiBitBootstrapKeys { pub fbsk: FourierLweMultiBitBootstrapKeyOwned, } +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub struct MultiBitStdBootstrapKeys { + pub small_lwe_sk: LweSecretKey>, + pub big_lwe_sk: LweSecretKey>, + pub bsk: LweMultiBitBootstrapKeyOwned, +} + // Fourier key is generated afterward in order to use generic test function #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct FftBootstrapKeys { @@ -81,6 +90,18 @@ pub struct MultiBitTestParams { pub thread_count: ThreadCount, } +#[derive(Clone, Copy, Debug, Serialize, Deserialize)] +pub struct NoiseSquashingMultiBitTestParameters { + pub glwe_dimension: GlweDimension, + pub polynomial_size: PolynomialSize, + pub glwe_noise_distribution: DynamicDistribution, + pub decomp_base_log: DecompositionBaseLog, + pub decomp_level_count: DecompositionLevelCount, + pub grouping_factor: LweBskGroupingFactor, + pub message_modulus_log: MessageModulusLog, + pub ciphertext_modulus: CiphertextModulus, +} + // PartialEq is implemented manually because thread_count doesn't affect key generation and we want // to change its value in test without the need of regenerating keys in the key cache. impl PartialEq for MultiBitTestParams { @@ -141,6 +162,21 @@ pub struct NoiseSquashingTestParams { pub modulus_switch_noise_reduction_params: ModulusSwitchType, pub ciphertext_modulus: CiphertextModulus, } +// Parameters to test NoiseSquashing implementation +#[cfg(feature = "gpu")] +#[derive(Clone, Copy, Debug, PartialEq, Serialize, Deserialize)] +pub struct NoiseSquashingMultiBitTestParams { + pub lwe_dimension: LweDimension, + pub glwe_dimension: GlweDimension, + pub polynomial_size: PolynomialSize, + pub lwe_noise_distribution: DynamicDistribution, + pub glwe_noise_distribution: DynamicDistribution, + pub pbs_base_log: DecompositionBaseLog, + pub pbs_level: DecompositionLevelCount, + pub grouping_factor: LweBskGroupingFactor, + pub modulus_switch_noise_reduction_params: Option, + pub ciphertext_modulus: CiphertextModulus, +} #[derive(Copy, Clone, Debug, PartialEq, Serialize, Deserialize)] pub struct PackingKeySwitchTestParams { diff --git a/tfhe/src/core_crypto/gpu/algorithms/lwe_multi_bit_programmable_bootstrapping.rs b/tfhe/src/core_crypto/gpu/algorithms/lwe_multi_bit_programmable_bootstrapping.rs index 775cce447..326ad192c 100644 --- a/tfhe/src/core_crypto/gpu/algorithms/lwe_multi_bit_programmable_bootstrapping.rs +++ b/tfhe/src/core_crypto/gpu/algorithms/lwe_multi_bit_programmable_bootstrapping.rs @@ -17,7 +17,7 @@ pub unsafe fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_async lut_indexes: &CudaVec, output_indexes: &CudaVec, input_indexes: &CudaVec, - multi_bit_bsk: &CudaLweMultiBitBootstrapKey, + multi_bit_bsk: &CudaLweMultiBitBootstrapKey, streams: &CudaStreams, ) where // CastInto required for PBS modulus switch which returns a usize @@ -151,7 +151,7 @@ pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext( lut_indexes: &CudaVec, output_indexes: &CudaVec, input_indexes: &CudaVec, - multi_bit_bsk: &CudaLweMultiBitBootstrapKey, + multi_bit_bsk: &CudaLweMultiBitBootstrapKey, streams: &CudaStreams, ) where // CastInto required for PBS modulus switch which returns a usize @@ -171,3 +171,162 @@ pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext( } streams.synchronize(); } + +/// # Safety +/// +/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must not +/// be dropped until streams is synchronised +#[allow(clippy::too_many_arguments)] 
+pub unsafe fn cuda_multi_bit_programmable_bootstrap_128_lwe_ciphertext_async( + input: &CudaLweCiphertextList, + output: &mut CudaLweCiphertextList, + accumulator: &CudaGlweCiphertextList, + lut_indexes: &CudaVec, + output_indexes: &CudaVec, + input_indexes: &CudaVec, + multi_bit_bsk: &CudaLweMultiBitBootstrapKey, + streams: &CudaStreams, +) where + // CastInto required for PBS modulus switch which returns a usize + OutputScalar: UnsignedTorus + CastInto, +{ + assert_eq!( + input.lwe_dimension(), + multi_bit_bsk.input_lwe_dimension(), + "Mismatched input LweDimension. LweCiphertext input LweDimension {:?}. \ + FourierLweMultiBitBootstrapKey input LweDimension {:?}.", + input.lwe_dimension(), + multi_bit_bsk.input_lwe_dimension(), + ); + + assert_eq!( + output.lwe_dimension(), + multi_bit_bsk.output_lwe_dimension(), + "Mismatched output LweDimension. LweCiphertext output LweDimension {:?}. \ + FourierLweMultiBitBootstrapKey output LweDimension {:?}.", + output.lwe_dimension(), + multi_bit_bsk.output_lwe_dimension(), + ); + + assert_eq!( + accumulator.glwe_dimension(), + multi_bit_bsk.glwe_dimension(), + "Mismatched GlweSize. Accumulator GlweSize {:?}. \ + FourierLweMultiBitBootstrapKey GlweSize {:?}.", + accumulator.glwe_dimension(), + multi_bit_bsk.glwe_dimension(), + ); + + assert_eq!( + accumulator.polynomial_size(), + multi_bit_bsk.polynomial_size(), + "Mismatched PolynomialSize. Accumulator PolynomialSize {:?}. \ + FourierLweMultiBitBootstrapKey PolynomialSize {:?}.", + accumulator.polynomial_size(), + multi_bit_bsk.polynomial_size(), + ); + + assert_eq!( + output.ciphertext_modulus(), + accumulator.ciphertext_modulus(), + "Mismatched CiphertextModulus between output ({:?}) and accumulator ({:?})", + input.ciphertext_modulus(), + accumulator.ciphertext_modulus(), + ); + assert_eq!( + streams.gpu_indexes[0], + multi_bit_bsk.d_vec.gpu_index(0), + "GPU error: first stream is on GPU {}, first bsk pointer is on GPU {}", + streams.gpu_indexes[0].get(), + multi_bit_bsk.d_vec.gpu_index(0).get(), + ); + assert_eq!( + streams.gpu_indexes[0], + input.0.d_vec.gpu_index(0), + "GPU error: first stream is on GPU {}, first input pointer is on GPU {}", + streams.gpu_indexes[0].get(), + input.0.d_vec.gpu_index(0).get(), + ); + assert_eq!( + streams.gpu_indexes[0], + output.0.d_vec.gpu_index(0), + "GPU error: first stream is on GPU {}, first output pointer is on GPU {}", + streams.gpu_indexes[0].get(), + output.0.d_vec.gpu_index(0).get(), + ); + assert_eq!( + streams.gpu_indexes[0], + accumulator.0.d_vec.gpu_index(0), + "GPU error: first stream is on GPU {}, first accumulator pointer is on GPU {}", + streams.gpu_indexes[0].get(), + accumulator.0.d_vec.gpu_index(0).get(), + ); + assert_eq!( + streams.gpu_indexes[0], + input_indexes.gpu_index(0), + "GPU error: first stream is on GPU {}, first input indexes pointer is on GPU {}", + streams.gpu_indexes[0].get(), + input_indexes.gpu_index(0).get(), + ); + assert_eq!( + streams.gpu_indexes[0], + output_indexes.gpu_index(0), + "GPU error: first stream is on GPU {}, first output indexes pointer is on GPU {}", + streams.gpu_indexes[0].get(), + output_indexes.gpu_index(0).get(), + ); + assert_eq!( + streams.gpu_indexes[0], + lut_indexes.gpu_index(0), + "GPU error: first stream is on GPU {}, first lut indexes pointer is on GPU {}", + streams.gpu_indexes[0].get(), + lut_indexes.gpu_index(0).get(), + ); + + programmable_bootstrap_multi_bit_async( + streams, + &mut output.0.d_vec, + output_indexes, + &accumulator.0.d_vec, + lut_indexes, + &input.0.d_vec, + 
input_indexes, + &multi_bit_bsk.d_vec, + input.lwe_dimension(), + multi_bit_bsk.glwe_dimension(), + multi_bit_bsk.polynomial_size(), + multi_bit_bsk.decomp_base_log(), + multi_bit_bsk.decomp_level_count(), + multi_bit_bsk.grouping_factor(), + input.lwe_ciphertext_count().0 as u32, + ); +} + +#[allow(clippy::too_many_arguments)] +pub fn cuda_multi_bit_programmable_bootstrap_128_lwe_ciphertext( + input: &CudaLweCiphertextList, + output: &mut CudaLweCiphertextList, + accumulator: &CudaGlweCiphertextList, + lut_indexes: &CudaVec, + output_indexes: &CudaVec, + input_indexes: &CudaVec, + multi_bit_bsk: &CudaLweMultiBitBootstrapKey, + streams: &CudaStreams, +) where + // CastInto required for PBS modulus switch which returns a usize + Scalar: UnsignedTorus + CastInto, +{ + unsafe { + cuda_multi_bit_programmable_bootstrap_128_lwe_ciphertext_async( + input, + output, + accumulator, + lut_indexes, + output_indexes, + input_indexes, + multi_bit_bsk, + streams, + ); + } + streams.synchronize(); +} diff --git a/tfhe/src/core_crypto/gpu/algorithms/test/lwe_multi_bit_programmable_bootstrapping_128.rs b/tfhe/src/core_crypto/gpu/algorithms/test/lwe_multi_bit_programmable_bootstrapping_128.rs new file mode 100644 index 000000000..e329d8d5c --- /dev/null +++ b/tfhe/src/core_crypto/gpu/algorithms/test/lwe_multi_bit_programmable_bootstrapping_128.rs @@ -0,0 +1,187 @@ +use super::*; +use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList; +use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList; +use crate::core_crypto::gpu::lwe_multi_bit_bootstrap_key::CudaLweMultiBitBootstrapKey; +use crate::core_crypto::gpu::vec::{CudaVec, GpuIndex}; +use crate::core_crypto::gpu::{ + cuda_multi_bit_programmable_bootstrap_128_lwe_ciphertext, CudaStreams, +}; +use crate::core_crypto::prelude::misc::check_encrypted_content_respects_mod; +use itertools::Itertools; + +fn execute_multibit_bootstrap_u128( + squash_params: NoiseSquashingMultiBitTestParameters, + input_params: MultiBitTestParams, +) { + let input_lwe_dimension = input_params.input_lwe_dimension; + let lwe_noise_distribution = input_params.lwe_noise_distribution; + let glwe_noise_distribution = squash_params.glwe_noise_distribution; + let ciphertext_modulus = squash_params.ciphertext_modulus; + let ciphertext_modulus_64 = CiphertextModulus::new_native(); + let msg_modulus = input_params.message_modulus_log; + let encoding_with_padding = get_encoding_with_padding(ciphertext_modulus); + let encoding_with_padding_64: u64 = get_encoding_with_padding(ciphertext_modulus_64); + let glwe_dimension = squash_params.glwe_dimension; + let polynomial_size = squash_params.polynomial_size; + let decomp_base_log = squash_params.decomp_base_log; + let decomp_level_count = squash_params.decomp_level_count; + let grouping_factor = squash_params.grouping_factor; + + let gpu_index = 0; + let stream = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index)); + + let mut rsc = TestResources::new(); + + let f = |x: u128| x % msg_modulus.0 as u128; + + let delta = encoding_with_padding / msg_modulus.0 as u128; + let delta_64 = encoding_with_padding_64 / msg_modulus.0 as u64; + let mut msg = msg_modulus.0 as u64; + const NB_TESTS: usize = 10; + let number_of_messages = 1; + + let accumulator = generate_programmable_bootstrap_glwe_lut( + polynomial_size, + glwe_dimension.to_glwe_size(), + msg_modulus.0.cast_into(), + ciphertext_modulus, + delta, + f, + ); + + assert!(check_encrypted_content_respects_mod( + &accumulator, + ciphertext_modulus + )); + + // Create 
the LweSecretKey + let small_lwe_sk: LweSecretKeyOwned = allocate_and_generate_new_binary_lwe_secret_key( + input_lwe_dimension, + &mut rsc.secret_random_generator, + ); + let input_lwe_secret_key = LweSecretKey::from_container( + small_lwe_sk + .clone() + .into_container() + .iter() + .copied() + .map(|x| x as u64) + .collect::>(), + ); + let output_glwe_secret_key: GlweSecretKeyOwned = + allocate_and_generate_new_binary_glwe_secret_key( + glwe_dimension, + polynomial_size, + &mut rsc.secret_random_generator, + ); + let output_lwe_secret_key = output_glwe_secret_key.clone().into_lwe_secret_key(); + let output_lwe_dimension = output_lwe_secret_key.lwe_dimension(); + + let mut bsk = LweMultiBitBootstrapKey::new( + 0u128, + glwe_dimension.to_glwe_size(), + polynomial_size, + decomp_base_log, + decomp_level_count, + input_lwe_dimension, + grouping_factor, + ciphertext_modulus, + ); + + par_generate_lwe_multi_bit_bootstrap_key( + &small_lwe_sk, + &output_glwe_secret_key, + &mut bsk, + glwe_noise_distribution, + &mut rsc.encryption_random_generator, + ); + + assert!(check_encrypted_content_respects_mod( + &*bsk, + ciphertext_modulus + )); + + let d_bsk = CudaLweMultiBitBootstrapKey::from_lwe_multi_bit_bootstrap_key(&bsk, &stream); + + while msg != 0 { + msg -= 1; + for _ in 0..NB_TESTS { + let plaintext = Plaintext(msg * delta_64); + + let lwe_ciphertext_in = allocate_and_encrypt_new_lwe_ciphertext( + &input_lwe_secret_key, + plaintext, + lwe_noise_distribution, + ciphertext_modulus_64, + &mut rsc.encryption_random_generator, + ); + + assert!(check_encrypted_content_respects_mod( + &lwe_ciphertext_in, + ciphertext_modulus_64 + )); + + let d_lwe_ciphertext_in = + CudaLweCiphertextList::from_lwe_ciphertext(&lwe_ciphertext_in, &stream); + let mut d_out_pbs_ct = CudaLweCiphertextList::new( + output_lwe_dimension, + LweCiphertextCount(1), + ciphertext_modulus, + &stream, + ); + let d_accumulator = CudaGlweCiphertextList::from_glwe_ciphertext(&accumulator, &stream); + + let mut test_vector_indexes: Vec = vec![0; number_of_messages]; + for (i, ind) in test_vector_indexes.iter_mut().enumerate() { + *ind = >::cast_into(i); + } + + let mut d_test_vector_indexes = + unsafe { CudaVec::::new_async(number_of_messages, &stream, 0) }; + unsafe { d_test_vector_indexes.copy_from_cpu_async(&test_vector_indexes, &stream, 0) }; + + let num_blocks = d_lwe_ciphertext_in.0.lwe_ciphertext_count.0; + let lwe_indexes_usize: Vec = (0..num_blocks).collect_vec(); + let lwe_indexes = lwe_indexes_usize + .iter() + .map(|&x| >::cast_into(x)) + .collect_vec(); + let mut d_output_indexes = unsafe { CudaVec::::new_async(num_blocks, &stream, 0) }; + let mut d_input_indexes = unsafe { CudaVec::::new_async(num_blocks, &stream, 0) }; + unsafe { + d_input_indexes.copy_from_cpu_async(&lwe_indexes, &stream, 0); + d_output_indexes.copy_from_cpu_async(&lwe_indexes, &stream, 0); + } + + cuda_multi_bit_programmable_bootstrap_128_lwe_ciphertext( + &d_lwe_ciphertext_in, + &mut d_out_pbs_ct, + &d_accumulator, + &d_test_vector_indexes, + &d_output_indexes, + &d_input_indexes, + &d_bsk, + &stream, + ); + + let out_pbs_ct = d_out_pbs_ct.into_lwe_ciphertext(&stream); + assert!(check_encrypted_content_respects_mod( + &out_pbs_ct, + ciphertext_modulus + )); + + let decrypted = decrypt_lwe_ciphertext(&output_lwe_secret_key, &out_pbs_ct); + + let decoded = round_decode(decrypted.0, delta) % msg_modulus.0 as u128; + assert_eq!(decoded, f(msg as u128)); + } + } +} + +#[test] +fn test_multibit_bootstrap_u128_with_squashing() { + 
execute_multibit_bootstrap_u128( + NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128, + PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128, + ); +} diff --git a/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs b/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs index 9847661ad..d8d986245 100644 --- a/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs +++ b/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs @@ -7,11 +7,13 @@ mod glwe_sample_extraction; mod lwe_keyswitch; mod lwe_linear_algebra; mod lwe_multi_bit_programmable_bootstrapping; +mod lwe_multi_bit_programmable_bootstrapping_128; mod lwe_packing_keyswitch; mod lwe_programmable_bootstrapping; mod lwe_programmable_bootstrapping_128; mod modulus_switch_noise_reduction; mod noise_distribution; + pub struct CudaPackingKeySwitchKeys { pub lwe_sk: LweSecretKey>, pub glwe_sk: GlweSecretKey>, diff --git a/tfhe/src/core_crypto/gpu/entities/lwe_multi_bit_bootstrap_key.rs b/tfhe/src/core_crypto/gpu/entities/lwe_multi_bit_bootstrap_key.rs index 3d23c68ea..0ad4bc183 100644 --- a/tfhe/src/core_crypto/gpu/entities/lwe_multi_bit_bootstrap_key.rs +++ b/tfhe/src/core_crypto/gpu/entities/lwe_multi_bit_bootstrap_key.rs @@ -10,9 +10,9 @@ use crate::core_crypto::prelude::{ /// A structure representing a vector of GLWE ciphertexts with 64 bits of precision on the GPU. #[derive(Debug)] -pub struct CudaLweMultiBitBootstrapKey { +pub struct CudaLweMultiBitBootstrapKey { // Pointers to GPU data - pub(crate) d_vec: CudaVec, + pub(crate) d_vec: CudaVec, // Lwe dimension pub(crate) input_lwe_dimension: LweDimension, // Glwe dimension @@ -27,14 +27,11 @@ pub struct CudaLweMultiBitBootstrapKey { pub(crate) grouping_factor: LweBskGroupingFactor, } -impl CudaLweMultiBitBootstrapKey { - pub fn from_lwe_multi_bit_bootstrap_key( +impl CudaLweMultiBitBootstrapKey { + pub fn from_lwe_multi_bit_bootstrap_key>( bsk: &LweMultiBitBootstrapKey, streams: &CudaStreams, - ) -> Self - where - InputBskCont::Element: UnsignedInteger, - { + ) -> Self { let input_lwe_dimension = bsk.input_lwe_dimension(); let polynomial_size = bsk.polynomial_size(); let decomp_level_count = bsk.decomposition_level_count(); @@ -43,7 +40,7 @@ impl CudaLweMultiBitBootstrapKey { let grouping_factor = bsk.grouping_factor(); // Allocate memory - let mut d_vec = CudaVec::::new_multi_gpu( + let mut d_vec = CudaVec::::new_multi_gpu( lwe_multi_bit_bootstrap_key_size( input_lwe_dimension, glwe_dimension.to_glwe_size(), diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs index ff119c24b..1803a11d8 100644 --- a/tfhe/src/core_crypto/gpu/mod.rs +++ b/tfhe/src/core_crypto/gpu/mod.rs @@ -14,9 +14,11 @@ use crate::core_crypto::prelude::{ }; pub use algorithms::*; pub use entities::*; +use std::any::{Any, TypeId}; use std::ffi::c_void; use tfhe_cuda_backend::bindings::*; use tfhe_cuda_backend::cuda_bind::*; + pub struct CudaStreams { pub ptr: Vec<*mut c_void>, pub gpu_indexes: Vec, @@ -311,15 +313,18 @@ pub unsafe fn programmable_bootstrap_128_async( /// [CudaStreams::synchronize] __must__ be called as soon as synchronization is /// required #[allow(clippy::too_many_arguments)] -pub unsafe fn programmable_bootstrap_multi_bit_async( +pub unsafe fn programmable_bootstrap_multi_bit_async< + T: UnsignedInteger, + B: Any + UnsignedInteger, +>( streams: &CudaStreams, - lwe_array_out: &mut CudaVec, + lwe_array_out: &mut CudaVec, output_indexes: &CudaVec, - test_vector: &CudaVec, + test_vector: &CudaVec, test_vector_indexes: &CudaVec, lwe_array_in: &CudaVec, 
input_indexes: &CudaVec, - bootstrapping_key: &CudaVec, + bootstrapping_key: &CudaVec, lwe_dimension: LweDimension, glwe_dimension: GlweDimension, polynomial_size: PolynomialSize, @@ -331,42 +336,83 @@ pub unsafe fn programmable_bootstrap_multi_bit_async( let num_many_lut = 1u32; let lut_stride = 0u32; let mut pbs_buffer: *mut i8 = std::ptr::null_mut(); - scratch_cuda_multi_bit_programmable_bootstrap_64( - streams.ptr[0], - streams.gpu_indexes[0].get(), - std::ptr::addr_of_mut!(pbs_buffer), - glwe_dimension.0 as u32, - polynomial_size.0 as u32, - level.0 as u32, - num_samples, - true, - ); - cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( - streams.ptr[0], - streams.gpu_indexes[0].get(), - lwe_array_out.as_mut_c_ptr(0), - output_indexes.as_c_ptr(0), - test_vector.as_c_ptr(0), - test_vector_indexes.as_c_ptr(0), - lwe_array_in.as_c_ptr(0), - input_indexes.as_c_ptr(0), - bootstrapping_key.as_c_ptr(0), - pbs_buffer, - lwe_dimension.0 as u32, - glwe_dimension.0 as u32, - polynomial_size.0 as u32, - grouping_factor.0 as u32, - base_log.0 as u32, - level.0 as u32, - num_samples, - num_many_lut, - lut_stride, - ); - cleanup_cuda_multi_bit_programmable_bootstrap( - streams.ptr[0], - streams.gpu_indexes[0].get(), - std::ptr::addr_of_mut!(pbs_buffer), - ); + if TypeId::of::() == TypeId::of::() { + scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64( + streams.ptr[0], + streams.gpu_indexes[0].get(), + std::ptr::addr_of_mut!(pbs_buffer), + glwe_dimension.0 as u32, + polynomial_size.0 as u32, + level.0 as u32, + num_samples, + true, + ); + cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128( + streams.ptr[0], + streams.gpu_indexes[0].get(), + lwe_array_out.as_mut_c_ptr(0), + output_indexes.as_c_ptr(0), + test_vector.as_c_ptr(0), + test_vector_indexes.as_c_ptr(0), + lwe_array_in.as_c_ptr(0), + input_indexes.as_c_ptr(0), + bootstrapping_key.as_c_ptr(0), + pbs_buffer, + lwe_dimension.0 as u32, + glwe_dimension.0 as u32, + polynomial_size.0 as u32, + grouping_factor.0 as u32, + base_log.0 as u32, + level.0 as u32, + num_samples, + num_many_lut, + lut_stride, + ); + cleanup_cuda_multi_bit_programmable_bootstrap_128( + streams.ptr[0], + streams.gpu_indexes[0].get(), + std::ptr::addr_of_mut!(pbs_buffer), + ); + } else if TypeId::of::() == TypeId::of::() { + scratch_cuda_multi_bit_programmable_bootstrap_64( + streams.ptr[0], + streams.gpu_indexes[0].get(), + std::ptr::addr_of_mut!(pbs_buffer), + glwe_dimension.0 as u32, + polynomial_size.0 as u32, + level.0 as u32, + num_samples, + true, + ); + cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( + streams.ptr[0], + streams.gpu_indexes[0].get(), + lwe_array_out.as_mut_c_ptr(0), + output_indexes.as_c_ptr(0), + test_vector.as_c_ptr(0), + test_vector_indexes.as_c_ptr(0), + lwe_array_in.as_c_ptr(0), + input_indexes.as_c_ptr(0), + bootstrapping_key.as_c_ptr(0), + pbs_buffer, + lwe_dimension.0 as u32, + glwe_dimension.0 as u32, + polynomial_size.0 as u32, + grouping_factor.0 as u32, + base_log.0 as u32, + level.0 as u32, + num_samples, + num_many_lut, + lut_stride, + ); + cleanup_cuda_multi_bit_programmable_bootstrap( + streams.ptr[0], + streams.gpu_indexes[0].get(), + std::ptr::addr_of_mut!(pbs_buffer), + ); + } else { + panic!("Unsupported torus size") + } } #[allow(clippy::too_many_arguments)] @@ -607,9 +653,9 @@ pub unsafe fn convert_lwe_programmable_bootstrap_key_async( /// [CudaStreams::synchronize] __must__ be called as soon as synchronization is /// required #[allow(clippy::too_many_arguments)] -pub unsafe fn 
+pub unsafe fn convert_lwe_multi_bit_programmable_bootstrap_key_async<T: UnsignedInteger + Any>(
     streams: &CudaStreams,
-    dest: &mut CudaVec<u64>,
+    dest: &mut CudaVec<T>,
     src: &[T],
     input_lwe_dim: LweDimension,
     glwe_dim: GlweDimension,
@@ -620,17 +666,34 @@ pub unsafe fn convert_lwe_multi_bit_programmable_bootstrap_key_async(
         (), size);
-        cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
-            stream_ptr,
-            streams.gpu_indexes[i].get(),
-            dest.as_mut_c_ptr(i as u32),
-            src.as_ptr().cast(),
-            input_lwe_dim.0 as u32,
-            glwe_dim.0 as u32,
-            l_gadget.0 as u32,
-            polynomial_size.0 as u32,
-            grouping_factor.0 as u32,
-        );
+
+        if TypeId::of::<T>() == TypeId::of::<u128>() {
+            cuda_convert_lwe_multi_bit_programmable_bootstrap_key_128(
+                stream_ptr,
+                streams.gpu_indexes[i].get(),
+                dest.as_mut_c_ptr(i as u32),
+                src.as_ptr().cast(),
+                input_lwe_dim.0 as u32,
+                glwe_dim.0 as u32,
+                l_gadget.0 as u32,
+                polynomial_size.0 as u32,
+                grouping_factor.0 as u32,
+            );
+        } else if TypeId::of::<T>() == TypeId::of::<u64>() {
+            cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
+                stream_ptr,
+                streams.gpu_indexes[i].get(),
+                dest.as_mut_c_ptr(i as u32),
+                src.as_ptr().cast(),
+                input_lwe_dim.0 as u32,
+                glwe_dim.0 as u32,
+                l_gadget.0 as u32,
+                polynomial_size.0 as u32,
+                grouping_factor.0 as u32,
+            );
+        } else {
+            panic!("Unsupported torus size for bsk conversion")
+        }
     }
 }
diff --git a/tfhe/src/integer/gpu/list_compression/server_keys.rs b/tfhe/src/integer/gpu/list_compression/server_keys.rs
index 18d1acfd5..244d12706 100644
--- a/tfhe/src/integer/gpu/list_compression/server_keys.rs
+++ b/tfhe/src/integer/gpu/list_compression/server_keys.rs
@@ -30,7 +30,7 @@ pub struct CudaCompressionKey {
 }
 
 pub struct CudaDecompressionKey {
-    pub blind_rotate_key: CudaBootstrappingKey,
+    pub blind_rotate_key: CudaBootstrappingKey<u64>,
     pub lwe_per_glwe: LweCiphertextCount,
     pub glwe_dimension: GlweDimension,
     pub polynomial_size: PolynomialSize,
diff --git a/tfhe/src/integer/gpu/server_key/mod.rs b/tfhe/src/integer/gpu/server_key/mod.rs
index 6d37feac0..4920e2ed1 100644
--- a/tfhe/src/integer/gpu/server_key/mod.rs
+++ b/tfhe/src/integer/gpu/server_key/mod.rs
@@ -22,9 +22,9 @@ use crate::shortint::{CarryModulus, CiphertextModulus, MessageModulus, PBSOrder}
 
 mod radix;
 
-pub enum CudaBootstrappingKey {
+pub enum CudaBootstrappingKey<T: UnsignedInteger> {
     Classic(CudaLweBootstrapKey),
-    MultiBit(CudaLweMultiBitBootstrapKey),
+    MultiBit(CudaLweMultiBitBootstrapKey<T>),
 }
 
 /// A structure containing the server public key.
@@ -34,7 +34,7 @@ pub enum CudaBootstrappingKey {
 // #[derive(PartialEq, Serialize, Deserialize)]
 pub struct CudaServerKey {
     pub key_switching_key: CudaLweKeyswitchKey<u64>,
-    pub bootstrapping_key: CudaBootstrappingKey,
+    pub bootstrapping_key: CudaBootstrappingKey<u64>,
     // Size of the message buffer
     pub message_modulus: MessageModulus,
     // Size of the carry buffer
diff --git a/tfhe/src/shortint/backward_compatibility/parameters/noise_squashing.rs b/tfhe/src/shortint/backward_compatibility/parameters/noise_squashing.rs
index 7a05295e7..3cd79fa62 100644
--- a/tfhe/src/shortint/backward_compatibility/parameters/noise_squashing.rs
+++ b/tfhe/src/shortint/backward_compatibility/parameters/noise_squashing.rs
@@ -1,6 +1,6 @@
 use crate::core_crypto::prelude::*;
 use crate::shortint::parameters::noise_squashing::{
-    NoiseSquashingCompressionParameters, NoiseSquashingParameters,
+    NoiseSquashingCompressionParameters, NoiseSquashingMultiBitParameters, NoiseSquashingParameters,
 };
 use crate::shortint::parameters::{
     CoreCiphertextModulus, ModulusSwitchNoiseReductionParams, ModulusSwitchType,
@@ -69,3 +69,8 @@ pub enum NoiseSquashingParametersVersions {
 pub enum NoiseSquashingCompressionParametersVersions {
     V0(NoiseSquashingCompressionParameters),
 }
+
+#[derive(VersionsDispatch)]
+pub enum NoiseSquashingMultiBitParametersVersions {
+    V0(NoiseSquashingMultiBitParameters),
+}
diff --git a/tfhe/src/shortint/keycache.rs b/tfhe/src/shortint/keycache.rs
index 770de5e5d..7513a7288 100644
--- a/tfhe/src/shortint/keycache.rs
+++ b/tfhe/src/shortint/keycache.rs
@@ -1,18 +1,19 @@
 use std::sync::LazyLock;
 
+use super::atomic_pattern::AtomicPatternParameters;
 use crate::keycache::utils::named_params_impl;
 use crate::keycache::*;
 #[cfg(tarpaulin)]
 use crate::shortint::parameters::coverage_parameters::*;
 use crate::shortint::parameters::current_params::*;
+use crate::shortint::parameters::noise_squashing::NoiseSquashingMultiBitParameters;
 use crate::shortint::parameters::parameters_wopbs::*;
+use crate::shortint::parameters::v1_3::V1_3_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
 use crate::shortint::parameters::*;
 use crate::shortint::wopbs::WopbsKey;
 use crate::shortint::{ClientKey, KeySwitchingKey, ServerKey};
 use serde::{Deserialize, Serialize};
 
-use super::atomic_pattern::AtomicPatternParameters;
-
 named_params_impl!( ShortintParameterSet =>
     V1_3_PARAM_MESSAGE_1_CARRY_0_KS_PBS_GAUSSIAN_2M128,
     V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
@@ -492,6 +493,10 @@ named_params_impl!( NoiseSquashingParameters =>
     V1_3_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
 );
 
+named_params_impl!( NoiseSquashingMultiBitParameters =>
+    V1_3_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
+);
+
 impl From<AtomicPatternParameters> for (ClientKey, ServerKey) {
     fn from(param: AtomicPatternParameters) -> Self {
         let param_set = ShortintParameterSet::from(param);
diff --git a/tfhe/src/shortint/parameters/aliases.rs b/tfhe/src/shortint/parameters/aliases.rs
index 42740f460..802ade433 100644
--- a/tfhe/src/shortint/parameters/aliases.rs
+++ b/tfhe/src/shortint/parameters/aliases.rs
@@ -46,8 +46,13 @@ use current_params::multi_bit::tuniform::p_fail_2_minus_64::ks_pbs_gpu::{
 };
 use current_params::noise_squashing::p_fail_2_minus_128::V1_3_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
 
-use super::current_params::V1_3_NOISE_SQUASHING_COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
+use super::current_params::{
+    V1_3_NOISE_SQUASHING_COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
+    V1_3_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
+};
 use super::NoiseSquashingCompressionParameters;
+use crate::shortint::parameters::noise_squashing::NoiseSquashingMultiBitParameters;
+
 // Aliases
 
 // Compute Gaussian
@@ -125,6 +130,10 @@ pub const NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128:
     NoiseSquashingParameters = V1_3_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
 
+pub const NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128:
+    NoiseSquashingMultiBitParameters =
+    V1_3_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
+
 // GPU 2^-64
 // GPU TUniform
 pub const PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_1_CARRY_1_KS_PBS_TUNIFORM_2M64:
diff --git a/tfhe/src/shortint/parameters/noise_squashing.rs b/tfhe/src/shortint/parameters/noise_squashing.rs
index d736de54c..5f79451ce 100644
--- a/tfhe/src/shortint/parameters/noise_squashing.rs
+++ b/tfhe/src/shortint/parameters/noise_squashing.rs
@@ -1,3 +1,4 @@
+use crate::core_crypto::prelude::LweBskGroupingFactor;
 use crate::shortint::backward_compatibility::parameters::noise_squashing::*;
 use crate::shortint::parameters::{
     CarryModulus, CoreCiphertextModulus, DecompositionBaseLog, DecompositionLevelCount,
@@ -34,3 +35,17 @@ pub struct NoiseSquashingCompressionParameters {
     pub carry_modulus: CarryModulus,
     pub ciphertext_modulus: CoreCiphertextModulus<u128>,
 }
+
+#[derive(Copy, Clone, Debug, PartialEq, Serialize, Deserialize, Versionize)]
+#[versionize(NoiseSquashingMultiBitParametersVersions)]
+pub struct NoiseSquashingMultiBitParameters {
+    pub glwe_dimension: GlweDimension,
+    pub polynomial_size: PolynomialSize,
+    pub glwe_noise_distribution: DynamicDistribution<u128>,
+    pub decomp_base_log: DecompositionBaseLog,
+    pub decomp_level_count: DecompositionLevelCount,
+    pub grouping_factor: LweBskGroupingFactor,
+    pub message_modulus: MessageModulus,
+    pub carry_modulus: CarryModulus,
+    pub ciphertext_modulus: CoreCiphertextModulus<u128>,
+}
diff --git a/tfhe/src/shortint/parameters/v1_3/mod.rs b/tfhe/src/shortint/parameters/v1_3/mod.rs
index 4c4810a73..91958138b 100644
--- a/tfhe/src/shortint/parameters/v1_3/mod.rs
+++ b/tfhe/src/shortint/parameters/v1_3/mod.rs
@@ -43,6 +43,7 @@ pub use noise_squashing::p_fail_2_minus_128::*;
 #[cfg(feature = "hpu")]
 pub use hpu::*;
 
+use crate::shortint::parameters::noise_squashing::NoiseSquashingMultiBitParameters;
 use crate::shortint::parameters::{
     ClassicPBSParameters, CompactPublicKeyEncryptionParameters, CompressionParameters,
     KeySwitch32PBSParameters, MultiBitPBSParameters, NoiseSquashingCompressionParameters,
@@ -1700,6 +1701,14 @@ pub const VEC_ALL_NOISE_SQUASHING_PARAMETERS: [(&NoiseSquashingParameters, &str)
     ),
 ];
 
+pub const VEC_ALL_NOISE_SQUASHING_MULTI_BIT_PARAMETERS: [(
+    &NoiseSquashingMultiBitParameters,
+    &str,
+); 1] = [(
+    &V1_3_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
+    "V1_3_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128",
+)];
+
 /// All [`NoiseSquashingCompressionParameters`] in this module.
 pub const VEC_ALL_NOISE_SQUASHING_COMPRESSION_PARAMETERS: [(
     &NoiseSquashingCompressionParameters,
diff --git a/tfhe/src/shortint/parameters/v1_3/noise_squashing/p_fail_2_minus_128/mod.rs b/tfhe/src/shortint/parameters/v1_3/noise_squashing/p_fail_2_minus_128/mod.rs
index 12a5ca832..d78e5ae52 100644
--- a/tfhe/src/shortint/parameters/v1_3/noise_squashing/p_fail_2_minus_128/mod.rs
+++ b/tfhe/src/shortint/parameters/v1_3/noise_squashing/p_fail_2_minus_128/mod.rs
@@ -1,6 +1,7 @@
+use crate::shortint::parameters::noise_squashing::NoiseSquashingMultiBitParameters;
 use crate::shortint::parameters::{
     CarryModulus, CoreCiphertextModulus, DecompositionBaseLog, DecompositionLevelCount,
-    DynamicDistribution, GlweDimension, LweCiphertextCount, MessageModulus,
+    DynamicDistribution, GlweDimension, LweBskGroupingFactor, LweCiphertextCount, MessageModulus,
     ModulusSwitchNoiseReductionParams, ModulusSwitchType, NoiseEstimationMeasureBound,
     NoiseSquashingCompressionParameters, NoiseSquashingParameters, PolynomialSize, RSigmaFactor,
     Variance,
@@ -58,3 +59,16 @@ pub const V1_3_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128
     carry_modulus: CarryModulus(4),
     ciphertext_modulus: CoreCiphertextModulus::<u128>::new_native(),
 };
+
+pub const V1_3_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128:
+    NoiseSquashingMultiBitParameters = NoiseSquashingMultiBitParameters {
+    glwe_dimension: GlweDimension(2),
+    polynomial_size: PolynomialSize(2048),
+    glwe_noise_distribution: DynamicDistribution::new_t_uniform(30),
+    decomp_base_log: DecompositionBaseLog(23),
+    decomp_level_count: DecompositionLevelCount(3),
+    grouping_factor: LweBskGroupingFactor(4),
+    message_modulus: MessageModulus(4),
+    carry_modulus: CarryModulus(4),
+    ciphertext_modulus: CoreCiphertextModulus::<u128>::new_native(),
+};
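
Note on the dispatch style used above (not part of the diff): both programmable_bootstrap_multi_bit_async and the bootstrap-key conversion helper pick the 64-bit or 128-bit CUDA entry point at runtime by comparing TypeIds of the scalar type, with an explicit panic for any other width. The self-contained sketch below uses illustrative stand-in names (dispatch_on_torus_width, launch_multi_bit_pbs_64/128 are not tfhe-rs APIs) and only shows the shape of that dispatch and why the trailing else arm is needed.

use std::any::{Any, TypeId};

// Stand-ins for the 64-bit and 128-bit CUDA launch functions; the real code
// calls into tfhe-cuda-backend instead.
fn launch_multi_bit_pbs_64() {
    println!("64-bit multi-bit PBS path");
}
fn launch_multi_bit_pbs_128() {
    println!("128-bit multi-bit PBS path");
}

// Same shape as the new dispatch: the scalar type parameter only selects which
// foreign function family is called, so unsupported widths must be rejected at
// runtime rather than by the type system.
fn dispatch_on_torus_width<B: Any>() {
    if TypeId::of::<B>() == TypeId::of::<u128>() {
        launch_multi_bit_pbs_128();
    } else if TypeId::of::<B>() == TypeId::of::<u64>() {
        launch_multi_bit_pbs_64();
    } else {
        panic!("Unsupported torus size");
    }
}

fn main() {
    dispatch_on_torus_width::<u64>(); // takes the 64-bit branch
    dispatch_on_torus_width::<u128>(); // takes the 128-bit branch
}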