mirror of https://github.com/zama-ai/tfhe-rs.git (synced 2026-01-09 14:47:56 -05:00)

feat(gpu): add squash noise in the hlapi
@@ -24,7 +24,15 @@ using LweArrayVariant = std::variant<std::vector<Torus *>, Torus *>;
      return std::get<Torus *>(variant); \
    } \
  }()

// Macro to define the visitor logic using std::holds_alternative for vectors
#define GET_VARIANT_ELEMENT_64BIT(variant, index) \
  [&] { \
    if (std::holds_alternative<std::vector<uint64_t *>>(variant)) { \
      return std::get<std::vector<uint64_t *>>(variant)[index]; \
    } else { \
      return std::get<uint64_t *>(variant); \
    } \
  }()
int get_active_gpu_count(int num_inputs, int gpu_count);

int get_num_inputs_on_gpu(int total_num_inputs, int gpu_index, int gpu_count);
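An aside on the macro above, for readers of this diff: it resolves an LweArrayVariant to a concrete device pointer whether the variant holds one pointer per GPU or a single shared pointer. A function-form sketch of the same dispatch (illustrative, not part of the commit):

#include <cstdint>
#include <variant>
#include <vector>

using LweArrayVariant64 = std::variant<std::vector<uint64_t *>, uint64_t *>;

// Same dispatch the macro performs, written as a function for clarity.
inline uint64_t *resolve(const LweArrayVariant64 &variant, size_t index) {
  if (std::holds_alternative<std::vector<uint64_t *>>(variant))
    return std::get<std::vector<uint64_t *>>(variant)[index]; // one pointer per GPU
  return std::get<uint64_t *>(variant); // single array shared by all GPUs
}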
@@ -567,5 +567,27 @@ void cuda_integer_radix_scalar_mul_high_kb_64(
void cleanup_cuda_integer_radix_scalar_mul_high_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr_void);
uint64_t scratch_cuda_apply_noise_squashing_kb(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t input_glwe_dimension,
    uint32_t input_polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_radix_blocks, uint32_t num_original_blocks,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    bool allocate_gpu_memory, bool allocate_ms_array);

void cuda_apply_noise_squashing_kb(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *output_radix_lwe,
    CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
    void *const *ksks,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
    void *const *bsks);

void cleanup_cuda_apply_noise_squashing_kb(void *const *streams,
                                           uint32_t const *gpu_indexes,
                                           uint32_t gpu_count,
                                           int8_t **mem_ptr_void);
} // extern C
#endif // CUDA_INTEGER_H
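These three entry points follow the backend's usual scratch/apply/cleanup pattern. A sketch of the call sequence a binding layer might follow; every variable here is a placeholder, not something defined by the commit:

// Sketch only: assumes streams/gpu_indexes/keys were set up elsewhere.
int8_t *mem = nullptr;
uint64_t footprint_bytes = scratch_cuda_apply_noise_squashing_kb(
    streams, gpu_indexes, gpu_count, &mem,
    /*lwe_dimension=*/lwe_dim, /*glwe_dimension=*/glwe_dim,
    /*polynomial_size=*/N, input_glwe_dim, input_N,
    ks_level, ks_base_log, pbs_level, pbs_base_log, grouping_factor,
    num_radix_blocks, num_original_blocks, message_modulus, carry_modulus,
    pbs_type, /*allocate_gpu_memory=*/true, /*allocate_ms_array=*/true);

cuda_apply_noise_squashing_kb(streams, gpu_indexes, gpu_count, &out_ct,
                              &in_ct, mem, ksks, ms_noise_reduction_key,
                              bsks);

cleanup_cuda_apply_noise_squashing_kb(streams, gpu_indexes, gpu_count, &mem);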
@@ -6,6 +6,8 @@
#include "integer/radix_ciphertext.h"
#include "keyswitch/keyswitch.h"
#include "pbs/programmable_bootstrap.cuh"
#include "pbs/programmable_bootstrap_128.cuh"
#include "utils/helper_multi_gpu.cuh"
#include <cmath>
#include <functional>

@@ -528,10 +530,10 @@ template <typename Torus> struct int_radix_lut {
  }

  // Return a pointer to idx-ith degree
  Torus *get_degree(size_t idx) { return &degrees[num_many_lut * idx]; }
  uint64_t *get_degree(size_t idx) { return &degrees[num_many_lut * idx]; }

  // Return a pointer to idx-ith max degree
  Torus *get_max_degree(size_t idx) { return &max_degrees[idx]; }
  uint64_t *get_max_degree(size_t idx) { return &max_degrees[idx]; }

  // Return a pointer to idx-ith lut indexes at gpu_index's global memory
  Torus *get_lut_indexes(uint32_t gpu_index, size_t ind) {
@@ -646,6 +648,206 @@ template <typename Torus> struct int_radix_lut {
    free(max_degrees);
  }
};

template <typename InputTorus> struct int_noise_squashing_lut {

  int_radix_params params;
  uint32_t input_glwe_dimension;
  uint32_t input_polynomial_size;
  uint32_t input_big_lwe_dimension;
  uint32_t num_blocks;
  // Tracks the degree of each LUT and the max degree on CPU
  // The max degree is (message_modulus * carry_modulus - 1) except for many
  // lut, for which it's different
  uint64_t *degrees;
  uint64_t *max_degrees;

  int active_gpu_count;

  // There will be one buffer on each GPU in multi-GPU computations
  // (same for tmp lwe arrays)
  std::vector<int8_t *> pbs_buffer;

  std::vector<__uint128_t *> lut_vec;

  uint32_t *gpu_indexes;
  CudaRadixCiphertextFFI *tmp_lwe_before_ks;

  // All tmp lwe arrays and index arrays for lwe contain the total
  // amount of blocks to be computed on; there is no split between GPUs
  // for the moment
  InputTorus *lwe_indexes_in;

  InputTorus *h_lwe_indexes_in;
  InputTorus *h_lwe_indexes_out;
  InputTorus *lwe_trivial_indexes;

  /// For multi GPU execution we create vectors of pointers for inputs and
  /// outputs
  std::vector<InputTorus *> lwe_array_in_vec;
  std::vector<InputTorus *> lwe_after_ks_vec;
  std::vector<__uint128_t *> lwe_after_pbs_vec;
  std::vector<InputTorus *> lwe_trivial_indexes_vec;

  bool using_trivial_lwe_indexes = true;
  bool gpu_memory_allocated;
  // noise squashing constructor
  int_noise_squashing_lut(cudaStream_t const *streams,
                          uint32_t const *input_gpu_indexes, uint32_t gpu_count,
                          int_radix_params params,
                          uint32_t input_glwe_dimension,
                          uint32_t input_polynomial_size,
                          uint32_t num_radix_blocks,
                          uint32_t original_num_blocks,
                          bool allocate_gpu_memory, uint64_t *size_tracker) {
    this->params = params;
    this->num_blocks = num_radix_blocks;
    gpu_memory_allocated = allocate_gpu_memory;
    // These are the glwe dimension and polynomial size before squashing
    this->input_glwe_dimension = input_glwe_dimension;
    this->input_polynomial_size = input_polynomial_size;
    uint32_t input_big_lwe_dimension =
        input_glwe_dimension * input_polynomial_size;
    this->input_big_lwe_dimension = input_big_lwe_dimension;

    uint32_t lut_buffer_size = (params.glwe_dimension + 1) *
                               params.polynomial_size * sizeof(__uint128_t);

    gpu_indexes = (uint32_t *)malloc(gpu_count * sizeof(uint32_t));
    std::memcpy(gpu_indexes, input_gpu_indexes, gpu_count * sizeof(uint32_t));

    ///////////////
    active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
    for (uint i = 0; i < active_gpu_count; i++) {
      cuda_set_device(i);
      auto num_radix_blocks_on_gpu =
          get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count);
      int8_t *gpu_pbs_buffer;
      uint64_t size = 0;
      execute_scratch_pbs_128(streams[i], gpu_indexes[i], &gpu_pbs_buffer,
                              params.small_lwe_dimension, params.glwe_dimension,
                              params.polynomial_size, params.pbs_level,
                              num_radix_blocks_on_gpu, allocate_gpu_memory,
                              params.allocate_ms_array, &size);
      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
      if (i == 0 && size_tracker != nullptr) {
        *size_tracker += size;
      }
      pbs_buffer.push_back(gpu_pbs_buffer);
    }
    lwe_indexes_in = (InputTorus *)cuda_malloc_with_size_tracking_async(
        num_radix_blocks * sizeof(InputTorus), streams[0], gpu_indexes[0],
        size_tracker, allocate_gpu_memory);
    lwe_trivial_indexes = (InputTorus *)cuda_malloc_with_size_tracking_async(
        num_radix_blocks * sizeof(InputTorus), streams[0], gpu_indexes[0],
        size_tracker, allocate_gpu_memory);
    h_lwe_indexes_in =
        (InputTorus *)malloc(num_radix_blocks * sizeof(InputTorus));
    for (int i = 0; i < num_radix_blocks; i++)
      h_lwe_indexes_in[i] = i;

    cuda_memcpy_with_size_tracking_async_to_gpu(
        lwe_indexes_in, h_lwe_indexes_in, num_radix_blocks * sizeof(InputTorus),
        streams[0], gpu_indexes[0], allocate_gpu_memory);
    cuda_memcpy_with_size_tracking_async_to_gpu(
        lwe_trivial_indexes, h_lwe_indexes_in,
        num_radix_blocks * sizeof(InputTorus), streams[0], gpu_indexes[0],
        allocate_gpu_memory);

    multi_gpu_alloc_lwe_async(streams, gpu_indexes, active_gpu_count,
                              lwe_array_in_vec, num_radix_blocks,
                              params.big_lwe_dimension + 1, size_tracker,
                              allocate_gpu_memory);

    multi_gpu_alloc_lwe_async<InputTorus>(
        streams, gpu_indexes, active_gpu_count, lwe_after_ks_vec,
        num_radix_blocks, params.small_lwe_dimension + 1, size_tracker,
        allocate_gpu_memory);
    multi_gpu_alloc_lwe_async<__uint128_t>(
        streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
        num_radix_blocks, params.big_lwe_dimension + 1, size_tracker,
        allocate_gpu_memory);
    multi_gpu_alloc_array_async<InputTorus>(
        streams, gpu_indexes, active_gpu_count, lwe_trivial_indexes_vec,
        num_radix_blocks, size_tracker, allocate_gpu_memory);
    cuda_synchronize_stream(streams[0], gpu_indexes[0]);

    multi_gpu_copy_array_async(streams, gpu_indexes, active_gpu_count,
                               lwe_trivial_indexes_vec, lwe_trivial_indexes,
                               num_radix_blocks, allocate_gpu_memory);
    if (allocate_gpu_memory) {
      // Allocate LUT
      // LUT is used as a trivial encryption and must be initialized outside
      // this constructor
      for (uint i = 0; i < active_gpu_count; i++) {
        auto lut = (__uint128_t *)cuda_malloc_with_size_tracking_async(
            lut_buffer_size, streams[i], gpu_indexes[i], size_tracker,
            allocate_gpu_memory);
        lut_vec.push_back(lut);
        cuda_synchronize_stream(streams[i], gpu_indexes[i]);
      }
    }
    // Keyswitch
    tmp_lwe_before_ks = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<InputTorus>(
        streams[0], gpu_indexes[0], tmp_lwe_before_ks, original_num_blocks,
        input_big_lwe_dimension, size_tracker, allocate_gpu_memory);

    degrees = (uint64_t *)malloc(sizeof(uint64_t));
    max_degrees = (uint64_t *)malloc(sizeof(uint64_t));

    // LUT for the squashing
    auto f_squash = [](__uint128_t block) -> __uint128_t { return block; };

    // Generate the identity LUT; for now we only use one GPU
    for (uint i = 0; i < active_gpu_count; i++) {
      auto squash_lut = lut_vec[i];
      generate_device_accumulator<__uint128_t>(
          streams[i], gpu_indexes[i], squash_lut, degrees, max_degrees,
          params.glwe_dimension, params.polynomial_size, params.message_modulus,
          params.carry_modulus, f_squash, allocate_gpu_memory);
    }
  }
  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
               uint32_t gpu_count) {
    free(this->gpu_indexes);
    for (uint i = 0; i < active_gpu_count; i++) {
      cuda_drop_with_size_tracking_async(lut_vec[i], streams[i], gpu_indexes[i],
                                         gpu_memory_allocated);
    }
    cuda_drop_with_size_tracking_async(lwe_indexes_in, streams[0],
                                       gpu_indexes[0], gpu_memory_allocated);
    cuda_drop_with_size_tracking_async(lwe_trivial_indexes, streams[0],
                                       gpu_indexes[0], gpu_memory_allocated);
    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
    lut_vec.clear();
    free(h_lwe_indexes_in);

    release_radix_ciphertext_async(streams[0], gpu_indexes[0],
                                   tmp_lwe_before_ks, gpu_memory_allocated);
    for (int i = 0; i < pbs_buffer.size(); i++) {
      cleanup_cuda_programmable_bootstrap_128(streams[i], gpu_indexes[i],
                                              &pbs_buffer[i]);
      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
    }

    multi_gpu_release_async(streams, gpu_indexes, lwe_array_in_vec);
    multi_gpu_release_async(streams, gpu_indexes, lwe_after_ks_vec);
    multi_gpu_release_async(streams, gpu_indexes, lwe_after_pbs_vec);
    multi_gpu_release_async(streams, gpu_indexes, lwe_trivial_indexes_vec);
    for (uint i = 0; i < active_gpu_count; i++)
      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
    lwe_array_in_vec.clear();
    lwe_after_ks_vec.clear();
    lwe_after_pbs_vec.clear();
    lwe_trivial_indexes_vec.clear();

    delete tmp_lwe_before_ks;
    pbs_buffer.clear();
  }
};
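The constructor above sizes one PBS buffer per active GPU, with get_active_gpu_count and get_num_inputs_on_gpu deciding the split. A plausible model of that split, assuming a plain remainder-spreading distribution (the real helpers live in helper_multi_gpu.cuh and may differ in detail):

#include <algorithm>

// Hypothetical re-implementation, for illustration only.
int active_gpu_count_model(int num_inputs, int gpu_count) {
  return std::min(num_inputs, gpu_count); // never use more GPUs than inputs
}

int num_inputs_on_gpu_model(int total, int gpu_index, int gpu_count) {
  // The first `total % gpu_count` GPUs each take one extra input.
  int base = total / gpu_count;
  return base + (gpu_index < total % gpu_count ? 1 : 0);
}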

template <typename Torus> struct int_bit_extract_luts_buffer {
  int_radix_params params;
  int_radix_lut<Torus> *lut;
@@ -0,0 +1,13 @@
#ifndef CUDA_BOOTSTRAP_128_H
#define CUDA_BOOTSTRAP_128_H

#include "pbs_enums.h"
#include <stdint.h>

uint64_t scratch_cuda_programmable_bootstrap_128_vector_64(
    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t level_count, uint32_t input_lwe_ciphertext_count,
    bool allocate_gpu_memory, bool allocate_ms_array);

#endif // CUDA_BOOTSTRAP_128_H
@@ -240,14 +240,12 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
  }
};

template <PBS_TYPE pbs_type> struct pbs_buffer_128;

template <> struct pbs_buffer_128<PBS_TYPE::CLASSICAL> {
template <typename InputTorus, PBS_TYPE pbs_type> struct pbs_buffer_128 {
  int8_t *d_mem;

  __uint128_t *global_accumulator;
  double *global_join_buffer;
  __uint128_t *temp_lwe_array_in;
  InputTorus *temp_lwe_array_in;
  uint64_t *trivial_indexes;

  PBS_VARIANT pbs_variant;

@@ -265,11 +263,9 @@ template <> struct pbs_buffer_128<PBS_TYPE::CLASSICAL> {
    this->pbs_variant = pbs_variant;
    this->uses_noise_reduction = allocate_ms_array;
    if (allocate_ms_array) {
      this->temp_lwe_array_in =
          (__uint128_t *)cuda_malloc_with_size_tracking_async(
              (lwe_dimension + 1) * input_lwe_ciphertext_count *
                  sizeof(__uint128_t),
              stream, gpu_index, size_tracker, allocate_ms_array);
      this->temp_lwe_array_in = (InputTorus *)cuda_malloc_async(
          (lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(InputTorus),
          stream, gpu_index);
      this->trivial_indexes = (uint64_t *)cuda_malloc_with_size_tracking_async(
          input_lwe_ciphertext_count * sizeof(uint64_t), stream, gpu_index,
          size_tracker, allocate_ms_array);
@@ -525,6 +521,10 @@ bool has_support_to_cuda_programmable_bootstrap_tbc(uint32_t num_samples,
                                                    uint32_t level_count,
                                                    uint32_t max_shared_memory);

bool has_support_to_cuda_programmable_bootstrap_128_cg(
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t num_samples, uint32_t max_shared_memory);

#ifdef __CUDACC__
__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
                                         int glwe_dimension,

@@ -100,7 +100,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
    void const *lut_vector, void const *lwe_array_in,
    void const *bootstrapping_key,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
    void *ms_noise_reduction_ptr, int8_t *buffer, uint32_t lwe_dimension,
    void const *ms_noise_reduction_ptr, int8_t *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples);
@@ -178,10 +178,12 @@ __device__ __forceinline__ double measure_modulus_switch_noise(

// Each thread processes two elements of the lwe array
template <typename Torus>
__global__ void improve_noise_modulus_switch(
    Torus *array_out, const Torus *array_in, const uint64_t *indexes,
    const Torus *zeros, int lwe_size, int num_zeros, double input_variance,
    double r_sigma, double bound, uint32_t log_modulus) {
__global__ void __launch_bounds__(512)
    improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
                                 const uint64_t *indexes, const Torus *zeros,
                                 int lwe_size, int num_zeros,
                                 double input_variance, double r_sigma,
                                 double bound, uint32_t log_modulus) {

  // First we will assume size is less than the number of threads per block
  // I should switch this to dynamic shared memory
@@ -386,3 +386,69 @@ void reverseArray(uint64_t arr[], size_t n) {
    end--;
  }
}

uint64_t scratch_cuda_apply_noise_squashing_mem(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int_radix_params params, int_noise_squashing_lut<uint64_t> **mem_ptr,
    uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t num_radix_blocks, uint32_t original_num_blocks,
    bool allocate_gpu_memory) {

  uint64_t size_tracker = 0;
  *mem_ptr = new int_noise_squashing_lut<uint64_t>(
      (cudaStream_t *)streams, gpu_indexes, gpu_count, params, glwe_dimension,
      polynomial_size, num_radix_blocks, original_num_blocks,
      allocate_gpu_memory, &size_tracker);
  return size_tracker;
}

uint64_t scratch_cuda_apply_noise_squashing_kb(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t input_glwe_dimension,
    uint32_t input_polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_radix_blocks, uint32_t original_num_blocks,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    bool allocate_gpu_memory, bool allocate_ms_array) {
  PUSH_RANGE("scratch noise squashing")
  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          glwe_dimension * polynomial_size, lwe_dimension,
                          ks_level, ks_base_log, pbs_level, pbs_base_log,
                          grouping_factor, message_modulus, carry_modulus,
                          allocate_ms_array);

  uint64_t size_tracker = scratch_cuda_apply_noise_squashing_mem(
      streams, gpu_indexes, gpu_count, params,
      (int_noise_squashing_lut<uint64_t> **)mem_ptr, input_glwe_dimension,
      input_polynomial_size, num_radix_blocks, original_num_blocks,
      allocate_gpu_memory);
  // Close the NVTX range before returning, so the range is popped on this path
  POP_RANGE()
  return size_tracker;
}

void cuda_apply_noise_squashing_kb(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *output_radix_lwe,
    CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
    void *const *ksks,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
    void *const *bsks) {

  PUSH_RANGE("apply noise squashing")
  integer_radix_apply_noise_squashing_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count, output_radix_lwe,
      input_radix_lwe, (int_noise_squashing_lut<uint64_t> *)mem_ptr, bsks,
      (uint64_t **)ksks, ms_noise_reduction_key);
  POP_RANGE()
}

void cleanup_cuda_apply_noise_squashing_kb(void *const *streams,
                                           uint32_t const *gpu_indexes,
                                           uint32_t gpu_count,
                                           int8_t **mem_ptr_void) {
  PUSH_RANGE("cleanup noise squashing")
  int_noise_squashing_lut<uint64_t> *mem_ptr =
      (int_noise_squashing_lut<uint64_t> *)(*mem_ptr_void);
  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
  POP_RANGE()
}
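The scratch function reports, through its return value, the bytes the squashing buffers occupy. If allocate_gpu_memory = false behaves as a size-only dry run, as the cuda_malloc_with_size_tracking_async calls above suggest, a caller could probe the footprint before committing memory; a sketch under that assumption, with placeholder variables:

// Dry run: ask for the footprint without allocating (assumed semantics).
int8_t *probe = nullptr;
uint64_t needed = scratch_cuda_apply_noise_squashing_kb(
    streams, gpu_indexes, gpu_count, &probe, lwe_dim, glwe_dim, N,
    input_glwe_dim, input_N, ks_level, ks_base_log, pbs_level, pbs_base_log,
    grouping_factor, num_blocks, num_original_blocks, msg_mod, carry_mod,
    pbs_type, /*allocate_gpu_memory=*/false, /*allocate_ms_array=*/true);
cleanup_cuda_apply_noise_squashing_kb(streams, gpu_indexes, gpu_count, &probe);

// Only allocate for real if the device has room (check is illustrative).
if (needed <= free_device_bytes) { /* call scratch again with true */ }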
@@ -9,6 +9,7 @@
#include "linear_algebra.h"
#include "linearalgebra/addition.cuh"
#include "linearalgebra/negation.cuh"
#include "pbs/pbs_128_utilities.h"
#include "pbs/programmable_bootstrap.h"
#include "polynomial/functions.cuh"
#include "utils/helper.cuh"

@@ -866,7 +867,7 @@ uint64_t generate_lookup_table_with_encoding(
  memset(acc, 0, glwe_dimension * polynomial_size * sizeof(Torus));

  auto body = &acc[glwe_dimension * polynomial_size];
  uint64_t degree = 0;
  Torus degree = 0;

  // This accumulator extracts the carry bits
  for (int i = 0; i < input_modulus_sup; i++) {

@@ -886,7 +887,7 @@ uint64_t generate_lookup_table_with_encoding(
  }

  rotate_left<Torus>(body, half_box_size, polynomial_size);
  return degree;
  return (uint64_t)degree;
}

template <typename Torus>
@@ -2200,4 +2201,110 @@ void host_single_borrow_propagate(
  }
}

/// num_radix_blocks corresponds to the number of blocks on which to apply the
/// LUT. In scalar bitops we use a number of blocks that may be lower than or
/// equal to the input and output numbers of blocks.
template <typename InputTorus>
__host__ void integer_radix_apply_noise_squashing_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_in,
    int_noise_squashing_lut<InputTorus> *lut, void *const *bsks,
    InputTorus *const *ksks,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

  PUSH_RANGE("apply noise squashing")
  auto params = lut->params;
  auto pbs_type = params.pbs_type;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto small_lwe_dimension = params.small_lwe_dimension;
  auto ks_level = params.ks_level;
  auto ks_base_log = params.ks_base_log;
  auto pbs_level = params.pbs_level;
  auto pbs_base_log = params.pbs_base_log;
  auto glwe_dimension = params.glwe_dimension;
  auto polynomial_size = params.polynomial_size;
  auto grouping_factor = params.grouping_factor;

  if (lwe_array_out->num_radix_blocks !=
      (lwe_array_in->num_radix_blocks + 1) / 2)
    PANIC("Cuda error: the number of output radix blocks should be "
          "half (rounded up) the number of input radix blocks")

  /// For multi GPU execution we create vectors of pointers for inputs and
  /// outputs
  auto lwe_array_pbs_in = lut->tmp_lwe_before_ks;
  std::vector<InputTorus *> lwe_array_in_vec = lut->lwe_array_in_vec;
  std::vector<InputTorus *> lwe_after_ks_vec = lut->lwe_after_ks_vec;
  std::vector<__uint128_t *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
  std::vector<InputTorus *> lwe_trivial_indexes_vec =
      lut->lwe_trivial_indexes_vec;

  // We know carry is empty so we can pack two blocks in one
  pack_blocks<InputTorus>(streams[0], gpu_indexes[0], lwe_array_pbs_in,
                          lwe_array_in, lwe_array_in->num_radix_blocks,
                          params.message_modulus);

  // Since the radix ciphertexts are packed, we have to use the num_radix_blocks
  // from the output ct
  auto active_gpu_count =
      get_active_gpu_count(lwe_array_out->num_radix_blocks, gpu_count);
  if (active_gpu_count == 1) {
    execute_keyswitch_async<InputTorus>(
        streams, gpu_indexes, 1, lwe_after_ks_vec[0],
        lwe_trivial_indexes_vec[0], (InputTorus *)lwe_array_pbs_in->ptr,
        lut->lwe_indexes_in, ksks, lut->input_big_lwe_dimension,
        small_lwe_dimension, ks_base_log, ks_level,
        lwe_array_out->num_radix_blocks);

    /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
    /// dimension to a big LWE dimension
    execute_pbs_128_async<__uint128_t>(
        streams, gpu_indexes, 1, (__uint128_t *)lwe_array_out->ptr,
        lut->lut_vec, lwe_after_ks_vec[0], bsks, ms_noise_reduction_key,
        lut->pbs_buffer, small_lwe_dimension, glwe_dimension, polynomial_size,
        pbs_base_log, pbs_level, lwe_array_out->num_radix_blocks);
  } else {
    /// Make sure all data that should be on GPU 0 is indeed there
    cuda_synchronize_stream(streams[0], gpu_indexes[0]);

    /// With multiple GPUs we push to the vectors on each GPU then when we
    /// gather data to GPU 0 we can copy back to the original indexing
    multi_gpu_scatter_lwe_async<InputTorus>(
        streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
        (InputTorus *)lwe_array_pbs_in->ptr, lut->h_lwe_indexes_in,
        lut->using_trivial_lwe_indexes, lwe_array_out->num_radix_blocks,
        lut->input_big_lwe_dimension + 1);

    execute_keyswitch_async<InputTorus>(
        streams, gpu_indexes, active_gpu_count, lwe_after_ks_vec,
        lwe_trivial_indexes_vec, lwe_array_in_vec, lwe_trivial_indexes_vec,
        ksks, lut->input_big_lwe_dimension, small_lwe_dimension, ks_base_log,
        ks_level, lwe_array_out->num_radix_blocks);

    execute_pbs_128_async<__uint128_t>(
        streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec, lut->lut_vec,
        lwe_after_ks_vec, bsks, ms_noise_reduction_key, lut->pbs_buffer,
        small_lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
        pbs_level, lwe_array_out->num_radix_blocks);

    /// Copy data back to GPU 0 and release vecs
    multi_gpu_gather_lwe_async<__uint128_t>(
        streams, gpu_indexes, active_gpu_count,
        (__uint128_t *)lwe_array_out->ptr, lwe_after_pbs_vec,
        (__uint128_t *)lut->h_lwe_indexes_out, lut->using_trivial_lwe_indexes,
        lwe_array_out->num_radix_blocks, big_lwe_dimension + 1);

    /// Synchronize all GPUs
    for (uint i = 0; i < active_gpu_count; i++) {
      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
    }
  }
  for (uint i = 0; i < lut->num_blocks; i++) {
    lwe_array_out->degrees[i] = lut->degrees[0];
    lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
  }
  POP_RANGE()
}

#endif // TFHE_RS_INTERNAL_INTEGER_CUH
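Since pairs of blocks are packed before the keyswitch, n input blocks produce ceil(n / 2) outputs, which is exactly the (n + 1) / 2 guard above. A standalone check (not from the commit):

#include <cassert>

// ceil(n / 2) via integer arithmetic, as used in the block-count guard.
constexpr unsigned packed_blocks(unsigned num_input_blocks) {
  return (num_input_blocks + 1) / 2;
}

static_assert(packed_blocks(4) == 2, "4 packed blocks -> 2 outputs");
static_assert(packed_blocks(5) == 3, "odd counts keep the last block alone");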
@@ -0,0 +1,46 @@
#ifndef CUDA_PROGRAMMABLE_BOOTSTRAP_128_CUH
#define CUDA_PROGRAMMABLE_BOOTSTRAP_128_CUH
#include "pbs/pbs_128_utilities.h"

static void
execute_scratch_pbs_128(void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
                        uint32_t lwe_dimension, uint32_t glwe_dimension,
                        uint32_t polynomial_size, uint32_t level_count,
                        uint32_t input_lwe_ciphertext_count,
                        bool allocate_gpu_memory, bool allocate_ms_array,
                        uint64_t *size_tracker_on_gpu) {
  // The squash noise function receives as input 64-bit integers
  *size_tracker_on_gpu = scratch_cuda_programmable_bootstrap_128_vector_64(
      stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
      polynomial_size, level_count, input_lwe_ciphertext_count,
      allocate_gpu_memory, allocate_ms_array);
}
template <typename Torus>
static void execute_pbs_128_async(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, const LweArrayVariant<__uint128_t> &lwe_array_out,
    const std::vector<Torus *> lut_vector,
    const LweArrayVariant<uint64_t> &lwe_array_in,
    void *const *bootstrapping_keys,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
    std::vector<int8_t *> pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples) {

  for (uint32_t i = 0; i < gpu_count; i++) {
    int num_inputs_on_gpu = get_num_inputs_on_gpu(num_samples, i, gpu_count);

    Torus *current_lwe_array_out = GET_VARIANT_ELEMENT(lwe_array_out, i);
    uint64_t *current_lwe_array_in = GET_VARIANT_ELEMENT_64BIT(lwe_array_in, i);
    void *zeros = nullptr;
    if (ms_noise_reduction_key != nullptr)
      zeros = ms_noise_reduction_key->ptr[i];

    cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
        streams[i], gpu_indexes[i], current_lwe_array_out, lut_vector[i],
        current_lwe_array_in, bootstrapping_keys[i], ms_noise_reduction_key,
        zeros, pbs_buffer[i], lwe_dimension, glwe_dimension, polynomial_size,
        base_log, level_count, num_inputs_on_gpu);
  }
}
#endif
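Note how the loop accepts either one device array or one array per GPU through LweArrayVariant, hence the two GET_VARIANT_ELEMENT* lookups. A sketch of the two call shapes, with placeholder pointers:

// Single-GPU case: the variant holds one raw pointer.
LweArrayVariant<uint64_t> in_single = raw_input_ptr;

// Multi-GPU case: the variant holds one pointer per active GPU.
std::vector<uint64_t *> per_gpu_inputs = {ptr_gpu0, ptr_gpu1};
LweArrayVariant<uint64_t> in_multi = per_gpu_inputs;

// Either form can be handed to execute_pbs_128_async; the loop picks the
// right element for GPU i via the GET_VARIANT_ELEMENT* macros.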
@@ -665,7 +665,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
  if (ms_noise_reduction_key->num_zeros != 0) {
    uint32_t log_modulus = log2(polynomial_size) + 1;
    host_improve_noise_modulus_switch<uint64_t>(
        static_cast<cudaStream_t>(stream), gpu_index,
        static_cast<const cudaStream_t>(stream), gpu_index,
        buffer->temp_lwe_array_in,
        static_cast<uint64_t const *>(lwe_array_in),
        static_cast<uint64_t const *>(lwe_input_indexes),
@@ -8,124 +8,67 @@ bool has_support_to_cuda_programmable_bootstrap_128_cg(
      max_shared_memory);
}

/*
 * This scratch function allocates the necessary amount of data on the GPU for
 * the PBS on 128-bit inputs, into `buffer`. It also configures SM options on
 * the GPU in case FULLSM or PARTIALSM mode is going to be used.
 */
uint64_t scratch_cuda_programmable_bootstrap_128_vector_64(
    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t level_count, uint32_t input_lwe_ciphertext_count,
    bool allocate_gpu_memory, bool allocate_ms_array) {

  return scratch_cuda_programmable_bootstrap_128_vector<uint64_t>(
      stream, gpu_index,
      (pbs_buffer_128<uint64_t, PBS_TYPE::CLASSICAL> **)pbs_buffer,
      lwe_dimension, glwe_dimension, polynomial_size, level_count,
      input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
}

uint64_t scratch_cuda_programmable_bootstrap_128(
    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t level_count, uint32_t input_lwe_ciphertext_count,
    bool allocate_gpu_memory, bool allocate_ms_array) {

  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
  auto buffer = (pbs_buffer_128<CLASSICAL> **)pbs_buffer;

  if (has_support_to_cuda_programmable_bootstrap_128_cg(
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory)) {
    switch (polynomial_size) {
    case 256:
      return scratch_programmable_bootstrap_cg_128<AmortizedDegree<256>>(
          static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
    case 512:
      return scratch_programmable_bootstrap_cg_128<AmortizedDegree<512>>(
          static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
    case 1024:
      return scratch_programmable_bootstrap_cg_128<AmortizedDegree<1024>>(
          static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
    case 2048:
      return scratch_programmable_bootstrap_cg_128<AmortizedDegree<2048>>(
          static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
    case 4096:
      return scratch_programmable_bootstrap_cg_128<AmortizedDegree<4096>>(
          static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
    default:
      PANIC("Cuda error (classical PBS128): unsupported polynomial size. "
            "Supported N's are powers of two"
            " in the interval [256..4096].")
    }
  } else {
    switch (polynomial_size) {
    case 256:
      return scratch_programmable_bootstrap_128<AmortizedDegree<256>>(
          static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
    case 512:
      return scratch_programmable_bootstrap_128<AmortizedDegree<512>>(
          static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
    case 1024:
      return scratch_programmable_bootstrap_128<AmortizedDegree<1024>>(
          static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
    case 2048:
      return scratch_programmable_bootstrap_128<AmortizedDegree<2048>>(
          static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
    case 4096:
      return scratch_programmable_bootstrap_128<AmortizedDegree<4096>>(
          static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
    default:
      PANIC("Cuda error (classical PBS): unsupported polynomial size. "
            "Supported N's are powers of two"
            " in the interval [256..4096].")
    }
  }
  return scratch_cuda_programmable_bootstrap_128_vector_64(
      stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
      polynomial_size, level_count, input_lwe_ciphertext_count,
      allocate_gpu_memory, allocate_ms_array);
}
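The removed body above dispatched on polynomial_size, one case per supported degree; the replacement funnels everything through the _vector_64 entry point, which performs the same dispatch for 64-bit inputs. For readers unfamiliar with the pattern, a hypothetical factoring of such a runtime-to-compile-time dispatch (the Fn wrapper is invented for illustration; AmortizedDegree and PANIC are the codebase's own):

#include <utility>

// Illustrative only: maps a runtime polynomial size onto the compile-time
// AmortizedDegree<N> instantiations the kernels require.
template <template <class> class Fn, typename... Args>
uint64_t dispatch_on_degree(uint32_t polynomial_size, Args &&...args) {
  switch (polynomial_size) {
  case 256:  return Fn<AmortizedDegree<256>>::run(std::forward<Args>(args)...);
  case 512:  return Fn<AmortizedDegree<512>>::run(std::forward<Args>(args)...);
  case 1024: return Fn<AmortizedDegree<1024>>::run(std::forward<Args>(args)...);
  case 2048: return Fn<AmortizedDegree<2048>>::run(std::forward<Args>(args)...);
  case 4096: return Fn<AmortizedDegree<4096>>::run(std::forward<Args>(args)...);
  default:
    PANIC("unsupported polynomial size: powers of two in [256..4096] only")
  }
}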
template <typename Torus>
template <typename InputTorus>
void executor_cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
    void *stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus const *lut_vector, Torus *lwe_array_in,
    double const *bootstrapping_key, pbs_buffer_128<CLASSICAL> *buffer,
    void *stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
    __uint128_t const *lut_vector, InputTorus *lwe_array_in,
    double const *bootstrapping_key,
    pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> *buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {

  switch (polynomial_size) {
  case 256:
    host_programmable_bootstrap_128<AmortizedDegree<256>>(
    host_programmable_bootstrap_128<InputTorus, AmortizedDegree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
        lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
    break;
  case 512:
    host_programmable_bootstrap_128<AmortizedDegree<512>>(
    host_programmable_bootstrap_128<InputTorus, AmortizedDegree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
        lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
    break;
  case 1024:
    host_programmable_bootstrap_128<AmortizedDegree<1024>>(
    host_programmable_bootstrap_128<InputTorus, AmortizedDegree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
        lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
    break;
  case 2048:
    host_programmable_bootstrap_128<AmortizedDegree<2048>>(
    host_programmable_bootstrap_128<InputTorus, AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
        lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
    break;
  case 4096:
    host_programmable_bootstrap_128<AmortizedDegree<4096>>(
    host_programmable_bootstrap_128<InputTorus, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
        lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
@@ -137,41 +80,42 @@ void executor_cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
  }
}

template <typename Torus>
template <typename InputTorus>
void executor_cuda_programmable_bootstrap_cg_lwe_ciphertext_vector_128(
    void *stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus const *lut_vector, Torus *lwe_array_in,
    double const *bootstrapping_key, pbs_buffer_128<CLASSICAL> *buffer,
    void *stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
    __uint128_t const *lut_vector, InputTorus *lwe_array_in,
    double const *bootstrapping_key,
    pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> *buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {

  switch (polynomial_size) {
  case 256:
    host_programmable_bootstrap_cg_128<AmortizedDegree<256>>(
    host_programmable_bootstrap_cg_128<InputTorus, AmortizedDegree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
        lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
    break;
  case 512:
    host_programmable_bootstrap_cg_128<AmortizedDegree<512>>(
    host_programmable_bootstrap_cg_128<InputTorus, AmortizedDegree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
        lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
    break;
  case 1024:
    host_programmable_bootstrap_cg_128<AmortizedDegree<1024>>(
    host_programmable_bootstrap_cg_128<InputTorus, AmortizedDegree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
        lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
    break;
  case 2048:
    host_programmable_bootstrap_cg_128<AmortizedDegree<2048>>(
    host_programmable_bootstrap_cg_128<InputTorus, AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
        lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
    break;
  case 4096:
    host_programmable_bootstrap_cg_128<AmortizedDegree<4096>>(
    host_programmable_bootstrap_cg_128<InputTorus, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
        lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
@@ -183,6 +127,57 @@ void executor_cuda_programmable_bootstrap_cg_lwe_ciphertext_vector_128(
  }
}

template <typename InputTorus>
void host_programmable_bootstrap_lwe_ciphertext_vector_128(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    __uint128_t const *lut_vector, void const *lwe_array_in,
    void const *bootstrapping_key,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
    void const *ms_noise_reduction_ptr,
    pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> *buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
  if (base_log > 64)
    PANIC("Cuda error (classical PBS): base log should be <= 64")

  // If the parameters contain a noise reduction key, apply it
  if (ms_noise_reduction_key->num_zeros != 0) {
    uint32_t log_modulus = log2(polynomial_size) + 1;
    host_improve_noise_modulus_switch<InputTorus>(
        static_cast<cudaStream_t>(stream), gpu_index,
        static_cast<InputTorus *>(buffer->temp_lwe_array_in),
        static_cast<InputTorus const *>(lwe_array_in),
        static_cast<uint64_t const *>(buffer->trivial_indexes),
        static_cast<const InputTorus *>(ms_noise_reduction_ptr),
        lwe_dimension + 1, num_samples, ms_noise_reduction_key->num_zeros,
        ms_noise_reduction_key->ms_input_variance,
        ms_noise_reduction_key->ms_r_sigma, ms_noise_reduction_key->ms_bound,
        log_modulus);
  } else {
    buffer->temp_lwe_array_in =
        const_cast<InputTorus *>(static_cast<const InputTorus *>(lwe_array_in));
  }
  switch (buffer->pbs_variant) {
  case DEFAULT:
    executor_cuda_programmable_bootstrap_lwe_ciphertext_vector_128<InputTorus>(
        stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
        lut_vector, static_cast<InputTorus *>(buffer->temp_lwe_array_in),
        static_cast<const double *>(bootstrapping_key), buffer, lwe_dimension,
        glwe_dimension, polynomial_size, base_log, level_count, num_samples);
    break;
  case CG:
    executor_cuda_programmable_bootstrap_cg_lwe_ciphertext_vector_128<
        InputTorus>(
        stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
        lut_vector, static_cast<InputTorus *>(buffer->temp_lwe_array_in),
        static_cast<const double *>(bootstrapping_key), buffer, lwe_dimension,
        glwe_dimension, polynomial_size, base_log, level_count, num_samples);
    break;
  default:
    PANIC("Cuda error (PBS): unknown pbs variant.")
  }
}
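A design point worth flagging in the new host function: when no noise-reduction zeros are present, temp_lwe_array_in is re-pointed at the caller's input rather than copied. The pattern in isolation (a sketch with invented names; a real release path must avoid freeing the aliased pointer):

// Sketch of the aliasing pattern used above. `owned` tracks whether the
// staging buffer points at memory it allocated itself.
template <typename T> struct staging {
  T *ptr = nullptr;
  bool owned = false;
};

template <typename T>
void stage_input(staging<T> &s, const T *input, bool needs_preprocessing) {
  if (needs_preprocessing) {
    // preprocess into s.ptr (allocated elsewhere); s.owned stays true
  } else {
    s.ptr = const_cast<T *>(input); // zero-copy alias into caller memory
    s.owned = false;
  }
}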
/* Perform bootstrapping on a batch of input u128 LWE ciphertexts, storing the
 * result in the same index for each ciphertext.
 *
@@ -237,57 +232,22 @@ void executor_cuda_programmable_bootstrap_cg_lwe_ciphertext_vector_128(
 */
void cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void *streams, uint32_t gpu_index, void *lwe_array_out,
    void const *lut_vector, void const *lwe_array_in,
    void const *bootstrapping_key,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
    void *ms_noise_reduction_ptr, int8_t *mem_ptr, uint32_t lwe_dimension,
    void const *ms_noise_reduction_ptr, int8_t *mem_ptr, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples) {
  if (base_log > 64)
    PANIC("Cuda error (classical PBS): base log should be <= 64")
  pbs_buffer_128<uint64_t, PBS_TYPE::CLASSICAL> *buffer =
      (pbs_buffer_128<uint64_t, PBS_TYPE::CLASSICAL> *)mem_ptr;

  pbs_buffer_128<CLASSICAL> *buffer = (pbs_buffer_128<CLASSICAL> *)mem_ptr;

  // If the parameters contain a noise reduction key, apply it
  if (ms_noise_reduction_key->num_zeros != 0) {
    uint32_t log_modulus = log2(polynomial_size) + 1;
    host_improve_noise_modulus_switch<__uint128_t>(
        static_cast<cudaStream_t>(stream), gpu_index,
        static_cast<__uint128_t *>(buffer->temp_lwe_array_in),
        static_cast<__uint128_t const *>(lwe_array_in),
        static_cast<uint64_t const *>(buffer->trivial_indexes),
        static_cast<const __uint128_t *>(ms_noise_reduction_ptr),
        lwe_dimension + 1, num_samples, ms_noise_reduction_key->num_zeros,
        ms_noise_reduction_key->ms_input_variance,
        ms_noise_reduction_key->ms_r_sigma, ms_noise_reduction_key->ms_bound,
        log_modulus);
  } else {
    buffer->temp_lwe_array_in = const_cast<__uint128_t *>(
        static_cast<const __uint128_t *>(lwe_array_in));
  }

  switch (buffer->pbs_variant) {
  case DEFAULT:
    executor_cuda_programmable_bootstrap_lwe_ciphertext_vector_128<__uint128_t>(
        stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
        static_cast<const __uint128_t *>(lut_vector),
        static_cast<__uint128_t *>(buffer->temp_lwe_array_in),
        static_cast<const double *>(bootstrapping_key), buffer, lwe_dimension,
        glwe_dimension, polynomial_size, base_log, level_count, num_samples);
    break;
  case CG:
    executor_cuda_programmable_bootstrap_cg_lwe_ciphertext_vector_128<
        __uint128_t>(
        stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
        static_cast<const __uint128_t *>(lut_vector),
        static_cast<__uint128_t *>(buffer->temp_lwe_array_in),
        static_cast<const double *>(bootstrapping_key), buffer, lwe_dimension,
        glwe_dimension, polynomial_size, base_log, level_count, num_samples);
    break;
  default:
    PANIC("Cuda error (PBS): unknown pbs variant.")
  }
  host_programmable_bootstrap_lwe_ciphertext_vector_128<uint64_t>(
      streams, gpu_index, lwe_array_out,
      static_cast<const __uint128_t *>(lut_vector), lwe_array_in,
      bootstrapping_key, ms_noise_reduction_key, ms_noise_reduction_ptr, buffer,
      lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count,
      num_samples);
}

/*
@@ -296,6 +256,6 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
 */
void cleanup_cuda_programmable_bootstrap_128(void *stream, uint32_t gpu_index,
                                             int8_t **buffer) {
  auto x = (pbs_buffer_128<CLASSICAL> *)(*buffer);
  auto x = (pbs_buffer_128<__uint128_t, PBS_TYPE::CLASSICAL> *)(*buffer);
  x->release(static_cast<cudaStream_t>(stream), gpu_index);
}
@@ -74,16 +74,17 @@ __device__ void mul_ggsw_glwe_in_fourier_domain_128(
  __syncthreads();
}

template <typename Torus, class params, sharedMemDegree SMD, bool first_iter>
template <typename InputTorus, class params, sharedMemDegree SMD,
          bool first_iter>
__global__ void __launch_bounds__(params::degree / params::opt)
    device_programmable_bootstrap_step_one_128(
        const Torus *__restrict__ lut_vector,
        const Torus *__restrict__ lwe_array_in,
        const double *__restrict__ bootstrapping_key, Torus *global_accumulator,
        double *global_join_buffer, uint32_t lwe_iteration,
        uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
        uint32_t level_count, int8_t *device_mem,
        uint64_t device_memory_size_per_block) {
        const __uint128_t *__restrict__ lut_vector,
        const InputTorus *__restrict__ lwe_array_in,
        const double *__restrict__ bootstrapping_key,
        __uint128_t *global_accumulator, double *global_join_buffer,
        uint32_t lwe_iteration, uint32_t lwe_dimension,
        uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
        int8_t *device_mem, uint64_t device_memory_size_per_block) {

  // We use shared memory for the polynomials that are used often during the
  // bootstrap, since shared memory is kept in L1 cache and accessing it is

@@ -100,22 +101,22 @@ __global__ void __launch_bounds__(params::degree / params::opt)
    selected_memory = &device_mem[block_index * device_memory_size_per_block];
  }

  Torus *accumulator = (Torus *)selected_memory;
  __uint128_t *accumulator = (__uint128_t *)selected_memory;
  double *accumulator_fft =
      (double *)accumulator +
      (ptrdiff_t)(sizeof(Torus) * polynomial_size / sizeof(double));
      (ptrdiff_t)(sizeof(__uint128_t) * polynomial_size / sizeof(double));

  if constexpr (SMD == PARTIALSM)
    accumulator_fft = (double *)sharedmem;

  // The third dimension of the block is used to determine on which ciphertext
  // this block is operating, in the case of batch bootstraps
  const Torus *block_lwe_array_in =
  const InputTorus *block_lwe_array_in =
      &lwe_array_in[blockIdx.x * (lwe_dimension + 1)];

  const Torus *block_lut_vector = lut_vector;
  const __uint128_t *block_lut_vector = lut_vector;

  Torus *global_slice =
  __uint128_t *global_slice =
      global_accumulator +
      (blockIdx.y + blockIdx.x * (glwe_dimension + 1)) * params::degree;

@@ -127,12 +128,12 @@ __global__ void __launch_bounds__(params::degree / params::opt)
  if constexpr (first_iter) {
    // First iteration
    // Put "b" in [0, 2N[
    Torus b_hat = 0;
    modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
                   params::log2_degree + 1);
    InputTorus b_hat = 0;
    modulus_switch<InputTorus>(block_lwe_array_in[lwe_dimension], b_hat,
                               params::log2_degree + 1);
    // The y-dimension is used to select the element of the GLWE this block will
    // compute
    divide_by_monomial_negacyclic_inplace<Torus, params::opt,
    divide_by_monomial_negacyclic_inplace<__uint128_t, params::opt,
                                          params::degree / params::opt>(
        accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
        false);

@@ -146,20 +147,21 @@ __global__ void __launch_bounds__(params::degree / params::opt)
  }

  // Put "a" in [0, 2N[
  Torus a_hat = 0;
  modulus_switch(block_lwe_array_in[lwe_iteration], a_hat,
                 params::log2_degree + 1); // 2 * params::log2_degree + 1);
  InputTorus a_hat = 0;
  modulus_switch<InputTorus>(block_lwe_array_in[lwe_iteration], a_hat,
                             params::log2_degree +
                                 1); // 2 * params::log2_degree + 1);

  __syncthreads();

  // Perform ACC * (X^â - 1)
  multiply_by_monomial_negacyclic_and_sub_polynomial<
      Torus, params::opt, params::degree / params::opt>(global_slice,
                                                        accumulator, a_hat);
      __uint128_t, params::opt, params::degree / params::opt>(
      global_slice, accumulator, a_hat);

  // Perform a rounding to increase the accuracy of the
  // bootstrapped ciphertext
  init_decomposer_state_inplace<Torus, params::opt,
  init_decomposer_state_inplace<__uint128_t, params::opt,
                                params::degree / params::opt>(
      accumulator, base_log, level_count);
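Throughout these kernels, modulus_switch rescales a torus coefficient from 2^w (w = 64 for the InputTorus here) down to 2N with rounding; the + 1 in params::log2_degree + 1 is the factor two in 2N. A standalone model for 64-bit inputs (my own rendering; the backend's helper may differ in rounding details):

#include <cstdint>

// Rescale a 64-bit torus element onto [0, 2N], N = 2^log2_degree:
// keep the top (log_modulus) bits, rounding via the bit just below them.
inline void modulus_switch_model(uint64_t input, uint64_t &output,
                                 uint32_t log_modulus /* = log2(2N) */) {
  output = input >> (64 - log_modulus - 1); // keep one extra bit for rounding
  output = (output + 1) >> 1;               // round to nearest
}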
@@ -168,7 +170,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
// Decompose the accumulator. Each block gets one level of the
|
||||
// decomposition, for the mask and the body (so block 0 will have the
|
||||
// accumulator decomposed at level 0, 1 at 1, etc.)
|
||||
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
|
||||
GadgetMatrix<__uint128_t, params> gadget_acc(base_log, level_count,
|
||||
accumulator);
|
||||
gadget_acc.decompose_and_compress_level_128(accumulator_fft, blockIdx.z);
|
||||
|
||||
// We are using the same memory space for accumulator_fft and
|
||||
@@ -314,10 +317,10 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
*
|
||||
* Each y-block computes one element of the lwe_array_out.
|
||||
*/
|
||||
template <typename Torus, class params, sharedMemDegree SMD>
|
||||
template <typename InputTorus, class params, sharedMemDegree SMD>
|
||||
__global__ void device_programmable_bootstrap_cg_128(
|
||||
Torus *lwe_array_out, const Torus *__restrict__ lut_vector,
|
||||
const Torus *__restrict__ lwe_array_in,
|
||||
__uint128_t *lwe_array_out, const __uint128_t *__restrict__ lut_vector,
|
||||
const InputTorus *__restrict__ lwe_array_in,
|
||||
const double *__restrict__ bootstrapping_key, double *join_buffer,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, int8_t *device_mem,
|
||||
@@ -342,23 +345,22 @@ __global__ void device_programmable_bootstrap_cg_128(
|
||||
|
||||
// We always compute the pointer with most restrictive alignment to avoid
|
||||
// alignment issues
|
||||
Torus *accumulator = (Torus *)selected_memory;
|
||||
Torus *accumulator_rotated =
|
||||
(Torus *)accumulator + (ptrdiff_t)(polynomial_size);
|
||||
__uint128_t *accumulator = (__uint128_t *)selected_memory;
|
||||
__uint128_t *accumulator_rotated =
|
||||
(__uint128_t *)accumulator + (ptrdiff_t)(polynomial_size);
|
||||
double *accumulator_fft =
|
||||
(double *)(accumulator_rotated) +
|
||||
(ptrdiff_t)(polynomial_size * sizeof(Torus) / sizeof(double));
|
||||
(ptrdiff_t)(polynomial_size * sizeof(__uint128_t) / sizeof(double));
|
||||
|
||||
if constexpr (SMD == PARTIALSM)
|
||||
accumulator_fft = (double *)sharedmem;
|
||||
|
||||
// The third dimension of the block is used to determine on which ciphertext
|
||||
// this block is operating, in the case of batch bootstraps
|
||||
const Torus *block_lwe_array_in =
|
||||
const InputTorus *block_lwe_array_in =
|
||||
&lwe_array_in[blockIdx.x * (lwe_dimension + 1)];
|
||||
|
||||
const Torus *block_lut_vector =
|
||||
&lut_vector[blockIdx.x * params::degree * (glwe_dimension + 1)];
|
||||
const __uint128_t *block_lut_vector = lut_vector;
|
||||
|
||||
double *block_join_buffer =
|
||||
&join_buffer[blockIdx.x * level_count * (glwe_dimension + 1) *
|
||||
@@ -368,11 +370,11 @@ __global__ void device_programmable_bootstrap_cg_128(
|
||||
// rotated array is not in use anymore by the time we perform the fft
|
||||
|
||||
// Put "b" in [0, 2N[
|
||||
Torus b_hat = 0;
|
||||
modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
|
||||
params::log2_degree + 1);
|
||||
InputTorus b_hat = 0;
|
||||
modulus_switch<InputTorus>(block_lwe_array_in[lwe_dimension], b_hat,
|
||||
params::log2_degree + 1);
|
||||
|
||||
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
|
||||
divide_by_monomial_negacyclic_inplace<__uint128_t, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
|
||||
false);
|
||||
@@ -381,17 +383,18 @@ __global__ void device_programmable_bootstrap_cg_128(
|
||||
__syncthreads();
|
||||
|
||||
// Put "a" in [0, 2N[
|
||||
Torus a_hat = 0;
|
||||
modulus_switch(block_lwe_array_in[i], a_hat, params::log2_degree + 1);
|
||||
InputTorus a_hat = 0;
|
||||
modulus_switch<InputTorus>(block_lwe_array_in[i], a_hat,
|
||||
params::log2_degree + 1);
|
||||
|
||||
// Perform ACC * (X^ä - 1)
multiply_by_monomial_negacyclic_and_sub_polynomial<
Torus, params::opt, params::degree / params::opt>(
__uint128_t, params::opt, params::degree / params::opt>(
accumulator, accumulator_rotated, a_hat);

// Perform a rounding to increase the accuracy of the
// bootstrapped ciphertext
init_decomposer_state_inplace<Torus, params::opt,
init_decomposer_state_inplace<__uint128_t, params::opt,
params::degree / params::opt>(
accumulator_rotated, base_log, level_count);

@@ -400,8 +403,8 @@ __global__ void device_programmable_bootstrap_cg_128(
// Decompose the accumulator. Each block gets one level of the
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
accumulator_rotated);
GadgetMatrix<__uint128_t, params> gadget_acc(base_log, level_count,
accumulator_rotated);
gadget_acc.decompose_and_compress_level_128(accumulator_fft, blockIdx.z);

auto acc_fft_re_hi = accumulator_fft + 0 * params::degree / 2;
@@ -420,8 +423,9 @@ __global__ void device_programmable_bootstrap_cg_128(
acc_fft_re_hi, acc_fft_re_lo, acc_fft_im_hi, acc_fft_im_lo);
__syncthreads();

add_to_torus_128<Torus, params>(acc_fft_re_hi, acc_fft_re_lo, acc_fft_im_hi,
acc_fft_im_lo, accumulator);
add_to_torus_128<__uint128_t, params>(acc_fft_re_hi, acc_fft_re_lo,
acc_fft_im_hi, acc_fft_im_lo,
accumulator);
}

auto block_lwe_array_out =
@@ -433,17 +437,20 @@ __global__ void device_programmable_bootstrap_cg_128(
// Perform a sample extract. At this point, all blocks have the result,
// but we do the computation at block 0 to avoid waiting for extra blocks,
// in case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
sample_extract_mask<__uint128_t, params>(block_lwe_array_out,
accumulator);

} else if (blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
sample_extract_body<__uint128_t, params>(block_lwe_array_out, accumulator,
0);
}
}
}

template <typename params>
template <typename InputTorus, typename params>
__host__ uint64_t scratch_programmable_bootstrap_cg_128(
cudaStream_t stream, uint32_t gpu_index, pbs_buffer_128<CLASSICAL> **buffer,
cudaStream_t stream, uint32_t gpu_index,
pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> **buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
bool allocate_gpu_memory, bool allocate_ms_array) {
@@ -457,33 +464,34 @@ __host__ uint64_t scratch_programmable_bootstrap_cg_128(
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_cg_128<__uint128_t, params, PARTIALSM>,
device_programmable_bootstrap_cg_128<InputTorus, params, PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_cg_128<__uint128_t, params, PARTIALSM>,
device_programmable_bootstrap_cg_128<InputTorus, params, PARTIALSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory >= partial_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_cg_128<__uint128_t, params, FULLSM>,
device_programmable_bootstrap_cg_128<InputTorus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_cg_128<__uint128_t, params, FULLSM>,
device_programmable_bootstrap_cg_128<InputTorus, params, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}

uint64_t size_tracker = 0;
*buffer = new pbs_buffer_128<CLASSICAL>(
*buffer = new pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL>(
stream, gpu_index, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, PBS_VARIANT::CG,
allocate_gpu_memory, allocate_ms_array, &size_tracker);
return size_tracker;
}

template <typename params>
template <typename InputTorus, typename params>
__host__ uint64_t scratch_programmable_bootstrap_128(
cudaStream_t stream, uint32_t gpu_index, pbs_buffer_128<CLASSICAL> **buffer,
cudaStream_t stream, uint32_t gpu_index,
pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> **buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
bool allocate_gpu_memory, bool allocate_ms_array) {
@@ -504,37 +512,37 @@ __host__ uint64_t scratch_programmable_bootstrap_128(
// Configure step one
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_one) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_step_one_128<__uint128_t, params,
device_programmable_bootstrap_step_one_128<InputTorus, params,
PARTIALSM, true>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_step_one_128<__uint128_t, params,
device_programmable_bootstrap_step_one_128<InputTorus, params,
PARTIALSM, true>,
cudaFuncCachePreferShared);
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_step_one_128<__uint128_t, params,
device_programmable_bootstrap_step_one_128<InputTorus, params,
PARTIALSM, false>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_step_one_128<__uint128_t, params,
device_programmable_bootstrap_step_one_128<InputTorus, params,
PARTIALSM, false>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory >= partial_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_step_one_128<__uint128_t, params, FULLSM,
device_programmable_bootstrap_step_one_128<InputTorus, params, FULLSM,
true>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_one));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_step_one_128<__uint128_t, params, FULLSM,
device_programmable_bootstrap_step_one_128<InputTorus, params, FULLSM,
true>,
cudaFuncCachePreferShared);
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_step_one_128<__uint128_t, params, FULLSM,
device_programmable_bootstrap_step_one_128<InputTorus, params, FULLSM,
false>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_one));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_step_one_128<__uint128_t, params, FULLSM,
device_programmable_bootstrap_step_one_128<InputTorus, params, FULLSM,
false>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
@@ -580,17 +588,122 @@ __host__ uint64_t scratch_programmable_bootstrap_128(
}

uint64_t size_tracker = 0;
*buffer = new pbs_buffer_128<CLASSICAL>(
*buffer = new pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL>(
stream, gpu_index, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, PBS_VARIANT::DEFAULT,
allocate_gpu_memory, allocate_ms_array, &size_tracker);
return size_tracker;
}

template <class params, bool first_iter>
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the PBS on 128-bit inputs, into `buffer`. It also configures SM options on
* the GPU in case FULLSM or PARTIALSM mode is going to be used.
*/
template <typename InputTorus>
uint64_t scratch_cuda_programmable_bootstrap_128_vector(
void *stream, uint32_t gpu_index,
pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
bool allocate_gpu_memory, bool allocate_ms_array) {

auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
auto buffer = (pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> **)pbs_buffer;

if (has_support_to_cuda_programmable_bootstrap_128_cg(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory)) {
switch (polynomial_size) {
case 256:
return scratch_programmable_bootstrap_cg_128<InputTorus,
AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
break;
case 512:
return scratch_programmable_bootstrap_cg_128<InputTorus,
AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
break;
case 1024:
return scratch_programmable_bootstrap_cg_128<InputTorus,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
break;
case 2048:
return scratch_programmable_bootstrap_cg_128<InputTorus,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
break;
case 4096:
return scratch_programmable_bootstrap_cg_128<InputTorus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
break;
default:
PANIC("Cuda error (classical PBS128): unsupported polynomial size. "
"Supported N's are powers of two"
" in the interval [256..4096].")
}
} else {
switch (polynomial_size) {
case 256:
return scratch_programmable_bootstrap_128<InputTorus,
AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
break;
case 512:
return scratch_programmable_bootstrap_128<InputTorus,
AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
break;
case 1024:
return scratch_programmable_bootstrap_128<InputTorus,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
break;
case 2048:
return scratch_programmable_bootstrap_128<InputTorus,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
break;
case 4096:
return scratch_programmable_bootstrap_128<InputTorus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
break;
default:
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
"Supported N's are powers of two"
" in the interval [256..4096].")
}
}
}
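// Illustrative call, not part of this diff (the `stream` handle is assumed
// to be a valid cudaStream_t cast to void*; the dimension values are the
// squashing parameters used by the tests elsewhere in this commit):
//
//   pbs_buffer_128<uint64_t, PBS_TYPE::CLASSICAL> *buffer = nullptr;
//   uint64_t size_on_gpu_0 =
//       scratch_cuda_programmable_bootstrap_128_vector<uint64_t>(
//           stream, /*gpu_index=*/0, &buffer, /*lwe_dimension=*/879,
//           /*glwe_dimension=*/2, /*polynomial_size=*/2048,
//           /*level_count=*/3, /*input_lwe_ciphertext_count=*/1,
//           /*allocate_gpu_memory=*/true, /*allocate_ms_array=*/true);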

template <typename InputTorus, class params, bool first_iter>
__host__ void execute_step_one_128(
cudaStream_t stream, uint32_t gpu_index, __uint128_t const *lut_vector,
__uint128_t *lwe_array_in, double const *bootstrapping_key,
InputTorus *lwe_array_in, double const *bootstrapping_key,
__uint128_t *global_accumulator, double *global_join_buffer,
uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
@@ -603,21 +716,21 @@ __host__ void execute_step_one_128(
dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1, level_count);

if (max_shared_memory < partial_sm) {
device_programmable_bootstrap_step_one_128<__uint128_t, params, NOSM,
device_programmable_bootstrap_step_one_128<InputTorus, params, NOSM,
first_iter>
<<<grid, thds, 0, stream>>>(
lut_vector, lwe_array_in, bootstrapping_key, global_accumulator,
global_join_buffer, lwe_iteration, lwe_dimension, polynomial_size,
base_log, level_count, d_mem, full_dm);
} else if (max_shared_memory < full_sm) {
device_programmable_bootstrap_step_one_128<__uint128_t, params, PARTIALSM,
device_programmable_bootstrap_step_one_128<InputTorus, params, PARTIALSM,
first_iter>
<<<grid, thds, partial_sm, stream>>>(
lut_vector, lwe_array_in, bootstrapping_key, global_accumulator,
global_join_buffer, lwe_iteration, lwe_dimension, polynomial_size,
base_log, level_count, d_mem, partial_dm);
} else {
device_programmable_bootstrap_step_one_128<__uint128_t, params, FULLSM,
device_programmable_bootstrap_step_one_128<InputTorus, params, FULLSM,
first_iter>
<<<grid, thds, full_sm, stream>>>(
lut_vector, lwe_array_in, bootstrapping_key, global_accumulator,
@@ -670,11 +783,12 @@ __host__ void execute_step_two_128(
/*
* Host wrapper to the programmable bootstrap 128
*/
template <class params>
template <typename InputTorus, class params>
__host__ void host_programmable_bootstrap_128(
cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
__uint128_t const *lut_vector, __uint128_t *lwe_array_in,
double const *bootstrapping_key, pbs_buffer_128<CLASSICAL> *pbs_buffer,
__uint128_t const *lut_vector, InputTorus *lwe_array_in,
double const *bootstrapping_key,
pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> *pbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count,
uint32_t input_lwe_ciphertext_count) {
@@ -704,14 +818,14 @@ __host__ void host_programmable_bootstrap_128(

for (int i = 0; i < lwe_dimension; i++) {
if (i == 0) {
execute_step_one_128<params, true>(
execute_step_one_128<InputTorus, params, true>(
stream, gpu_index, lut_vector, lwe_array_in, bootstrapping_key,
global_accumulator, global_join_buffer, input_lwe_ciphertext_count,
lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count,
d_mem, i, partial_sm, partial_dm_step_one, full_sm_step_one,
full_dm_step_one);
} else {
execute_step_one_128<params, false>(
execute_step_one_128<InputTorus, params, false>(
stream, gpu_index, lut_vector, lwe_array_in, bootstrapping_key,
global_accumulator, global_join_buffer, input_lwe_ciphertext_count,
lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count,
@@ -736,11 +850,12 @@ __host__ void host_programmable_bootstrap_128(
}
}

template <class params>
template <typename InputTorus, class params>
__host__ void host_programmable_bootstrap_cg_128(
cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
__uint128_t const *lut_vector, __uint128_t const *lwe_array_in,
double const *bootstrapping_key, pbs_buffer_128<CLASSICAL> *buffer,
__uint128_t const *lut_vector, InputTorus const *lwe_array_in,
double const *bootstrapping_key,
pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> *buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count,
uint32_t input_lwe_ciphertext_count) {
@@ -783,20 +898,20 @@ __host__ void host_programmable_bootstrap_cg_128(
if (max_shared_memory < partial_sm) {
kernel_args[10] = &full_dm;
check_cuda_error(cudaLaunchCooperativeKernel(
(void *)device_programmable_bootstrap_cg_128<__uint128_t, params, NOSM>,
(void *)device_programmable_bootstrap_cg_128<InputTorus, params, NOSM>,
grid, thds, (void **)kernel_args, 0, stream));
} else if (max_shared_memory < full_sm) {
kernel_args[10] = &partial_dm;
check_cuda_error(cudaLaunchCooperativeKernel(
(void *)device_programmable_bootstrap_cg_128<__uint128_t, params,
PARTIALSM>,
(void *)
device_programmable_bootstrap_cg_128<InputTorus, params, PARTIALSM>,
grid, thds, (void **)kernel_args, partial_sm, stream));
} else {
int no_dm = 0;
kernel_args[10] = &no_dm;
check_cuda_error(cudaLaunchCooperativeKernel(
(void *)
device_programmable_bootstrap_cg_128<__uint128_t, params, FULLSM>,
device_programmable_bootstrap_cg_128<InputTorus, params, FULLSM>,
grid, thds, (void **)kernel_args, full_sm, stream));
}

@@ -62,6 +62,12 @@ void multi_gpu_alloc_lwe_async(cudaStream_t const *streams,
}
}

template void multi_gpu_alloc_lwe_async<__uint128_t>(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, std::vector<__uint128_t *> &dest, uint32_t num_inputs,
uint32_t lwe_size, uint64_t *size_tracker_on_gpu_0,
bool allocate_gpu_memory);
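// (Explicit instantiation: the templated multi-GPU helpers are defined in
// this translation unit, so the __uint128_t variants must be emitted here
// for the 128-bit noise squashing path to link against them.)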

/// Allocates the input/output vector for all devices
/// Also initializes the related indexing, setting it to the trivial index
@@ -219,5 +225,9 @@ void multi_gpu_release_async(cudaStream_t const *streams,
for (uint i = 0; i < vec.size(); i++)
cuda_drop_async(vec[i], streams[i], gpu_indexes[i]);
}
template void
multi_gpu_release_async<__uint128_t>(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
std::vector<__uint128_t *> &vec);

#endif

@@ -1316,6 +1316,14 @@ unsafe extern "C" {
mem_ptr_void: *mut *mut i8,
);
}
unsafe extern "C" {
pub fn extend_radix_with_trivial_zero_blocks_msb_64(
output: *mut CudaRadixCiphertextFFI,
input: *const CudaRadixCiphertextFFI,
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
);
}
unsafe extern "C" {
pub fn trim_radix_blocks_lsb_64(
output: *mut CudaRadixCiphertextFFI,
@@ -1325,11 +1333,49 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn extend_radix_with_trivial_zero_blocks_msb_64(
output: *mut CudaRadixCiphertextFFI,
input: *const CudaRadixCiphertextFFI,
pub fn scratch_cuda_apply_noise_squashing_kb(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
mem_ptr: *mut *mut i8,
lwe_dimension: u32,
glwe_dimension: u32,
polynomial_size: u32,
input_glwe_dimension: u32,
input_polynomial_size: u32,
ks_level: u32,
ks_base_log: u32,
pbs_level: u32,
pbs_base_log: u32,
grouping_factor: u32,
num_radix_blocks: u32,
num_original_blocks: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: PBS_TYPE,
allocate_gpu_memory: bool,
allocate_ms_array: bool,
) -> u64;
}
unsafe extern "C" {
pub fn cuda_apply_noise_squashing_kb(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
output_radix_lwe: *mut CudaRadixCiphertextFFI,
input_radix_lwe: *const CudaRadixCiphertextFFI,
mem_ptr: *mut i8,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
bsks: *const *mut ffi::c_void,
);
}
unsafe extern "C" {
pub fn cleanup_cuda_apply_noise_squashing_kb(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
mem_ptr_void: *mut *mut i8,
);
}
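// The three bindings above follow the backend's scratch/apply/cleanup
// lifecycle. A minimal sketch of the intended call order (argument lists
// abridged; see `noise_squashing_async` later in this commit for the real
// call):
//
// let mut mem_ptr: *mut i8 = std::ptr::null_mut();
// unsafe {
//     scratch_cuda_apply_noise_squashing_kb(streams, gpu_indexes, gpu_count,
//         &mut mem_ptr, /* dimensions, moduli, PBS_TYPE, flags */);
//     cuda_apply_noise_squashing_kb(streams, gpu_indexes, gpu_count,
//         &mut output, &input, mem_ptr, ksks, ms_noise_reduction_key, bsks);
//     cleanup_cuda_apply_noise_squashing_kb(streams, gpu_indexes, gpu_count,
//         &mut mem_ptr);
// }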
unsafe extern "C" {
@@ -1927,7 +1973,7 @@ unsafe extern "C" {
lwe_array_in: *const ffi::c_void,
bootstrapping_key: *const ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
ms_noise_reduction_ptr: *mut ffi::c_void,
ms_noise_reduction_ptr: *const ffi::c_void,
buffer: *mut i8,
lwe_dimension: u32,
glwe_dimension: u32,

@@ -177,7 +177,10 @@ mod cuda {
};
use tfhe::core_crypto::prelude::*;
use tfhe::shortint::engine::ShortintEngine;
use tfhe::shortint::parameters::ModulusSwitchNoiseReductionParams;
use tfhe::shortint::parameters::{
NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
};
use tfhe::shortint::server_key::ModulusSwitchNoiseReductionKey;

fn cuda_pbs_128(c: &mut Criterion) {
@@ -188,26 +191,12 @@ mod cuda {
.measurement_time(std::time::Duration::from_secs(30));

type Scalar = u128;
let input_params = PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
let squash_params = NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;

let lwe_dimension = LweDimension(879);
let glwe_dimension = GlweDimension(2);
let polynomial_size = PolynomialSize(2048);
let lwe_noise_distribution = DynamicDistribution::new_t_uniform(46);
let lwe_noise_distribution_u128: DynamicDistribution<u128> =
DynamicDistribution::new_t_uniform(46);
let glwe_noise_distribution = DynamicDistribution::new_t_uniform(30);
let pbs_base_log = DecompositionBaseLog(24);
let pbs_level = DecompositionLevelCount(3);
let ciphertext_modulus = CiphertextModulus::new_native();
let lwe_noise_distribution_u64 = DynamicDistribution::new_t_uniform(46);
let ct_modulus_u64: CiphertextModulus<u64> = CiphertextModulus::new_native();

let modulus_switch_noise_reduction_params = ModulusSwitchNoiseReductionParams {
modulus_switch_zeros_count: LweCiphertextCount(1449),
ms_bound: NoiseEstimationMeasureBound(288230376151711744f64),
ms_r_sigma_factor: RSigmaFactor(13.179852282053789f64),
ms_input_variance: Variance(2.63039184094559E-7f64),
};

let params_name = "PARAMS_SWITCH_SQUASH";

let mut boxed_seeder = new_seeder();
@@ -220,20 +209,11 @@ mod cuda {
EncryptionRandomGenerator::<DefaultRandomGenerator>::new(seeder.seed(), seeder);

let input_lwe_secret_key =
LweSecretKey::generate_new_binary(lwe_dimension, &mut secret_generator);

let input_lwe_secret_key_u128 = LweSecretKey::from_container(
input_lwe_secret_key
.as_ref()
.iter()
.copied()
.map(|x| x as u128)
.collect::<Vec<_>>(),
);
LweSecretKey::generate_new_binary(input_params.lwe_dimension, &mut secret_generator);

let output_glwe_secret_key = GlweSecretKey::<Vec<Scalar>>::generate_new_binary(
glwe_dimension,
polynomial_size,
squash_params.glwe_dimension,
squash_params.polynomial_size,
&mut secret_generator,
);

@@ -241,29 +221,32 @@ mod cuda {

let bsk = LweBootstrapKey::new(
Scalar::ZERO,
glwe_dimension.to_glwe_size(),
polynomial_size,
pbs_base_log,
pbs_level,
lwe_dimension,
ciphertext_modulus,
squash_params.glwe_dimension.to_glwe_size(),
squash_params.polynomial_size,
squash_params.decomp_base_log,
squash_params.decomp_level_count,
LweDimension(input_params.lwe_dimension.0),
squash_params.ciphertext_modulus,
);

let mut engine = ShortintEngine::new();

let modulus_switch_noise_reduction_key = Some(ModulusSwitchNoiseReductionKey::new(
modulus_switch_noise_reduction_params,
&input_lwe_secret_key,
&mut engine,
CiphertextModulus::new_native(),
lwe_noise_distribution,
));

let modulus_switch_noise_reduction_key = squash_params
.modulus_switch_noise_reduction_params
.map(|modulus_switch_noise_reduction_params| {
ModulusSwitchNoiseReductionKey::new(
modulus_switch_noise_reduction_params,
&input_lwe_secret_key,
&mut engine,
input_params.ciphertext_modulus,
input_params.lwe_noise_distribution,
)
});
let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new().bootstrap_key(bsk).build();

let message_modulus: Scalar = 1 << 4;
let input_message: Scalar = 3;
let delta: Scalar = (1 << (Scalar::BITS - 1)) / message_modulus;
let message_modulus: u64 = 1 << 4;
let input_message: u64 = 3;
let delta: u64 = (1 << (u64::BITS - 1)) / message_modulus;
let plaintext = Plaintext(input_message * delta);

let bench_id;
@@ -277,12 +260,12 @@ mod cuda {
&streams,
);

let lwe_ciphertext_in: LweCiphertextOwned<Scalar> =
let lwe_ciphertext_in: LweCiphertextOwned<u64> =
allocate_and_encrypt_new_lwe_ciphertext(
&input_lwe_secret_key_u128,
&input_lwe_secret_key,
plaintext,
lwe_noise_distribution_u128,
ciphertext_modulus,
lwe_noise_distribution_u64,
ct_modulus_u64,
&mut encryption_generator,
);
let lwe_ciphertext_in_gpu =
@@ -290,9 +273,9 @@ mod cuda {

let accumulator: GlweCiphertextOwned<Scalar> = GlweCiphertextOwned::new(
Scalar::ONE,
glwe_dimension.to_glwe_size(),
polynomial_size,
ciphertext_modulus,
squash_params.glwe_dimension.to_glwe_size(),
squash_params.polynomial_size,
squash_params.ciphertext_modulus,
);
let accumulator_gpu =
CudaGlweCiphertextList::from_glwe_ciphertext(&accumulator, &streams);
@@ -300,7 +283,7 @@ mod cuda {
let out_pbs_ct = LweCiphertext::new(
Scalar::ZERO,
output_lwe_secret_key.lwe_dimension().to_lwe_size(),
ciphertext_modulus,
squash_params.ciphertext_modulus,
);
let mut out_pbs_ct_gpu =
CudaLweCiphertextList::from_lwe_ciphertext(&out_pbs_ct, &streams);
@@ -337,22 +320,22 @@ mod cuda {
let local_streams = cuda_local_streams_core();

let plaintext_list =
PlaintextList::new(Scalar::ZERO, PlaintextCount(elements_per_stream));
PlaintextList::new(u64::ZERO, PlaintextCount(elements_per_stream));

let input_cts = (0..gpu_count)
.map(|i| {
let mut input_ct_list = LweCiphertextList::new(
Scalar::ZERO,
u64::ZERO,
input_lwe_secret_key.lwe_dimension().to_lwe_size(),
LweCiphertextCount(elements_per_stream),
ciphertext_modulus,
ct_modulus_u64,
);

encrypt_lwe_ciphertext_list(
&input_lwe_secret_key_u128,
&input_lwe_secret_key,
&mut input_ct_list,
&plaintext_list,
lwe_noise_distribution_u128,
lwe_noise_distribution_u64,
&mut encryption_generator,
);

@@ -367,9 +350,9 @@ mod cuda {
.map(|i| {
let accumulator = GlweCiphertextOwned::new(
Scalar::ONE,
glwe_dimension.to_glwe_size(),
polynomial_size,
ciphertext_modulus,
squash_params.glwe_dimension.to_glwe_size(),
squash_params.polynomial_size,
squash_params.ciphertext_modulus,
);
CudaGlweCiphertextList::from_glwe_ciphertext(
&accumulator,
@@ -385,7 +368,7 @@ mod cuda {
Scalar::ZERO,
output_lwe_secret_key.lwe_dimension().to_lwe_size(),
LweCiphertextCount(elements_per_stream),
ciphertext_modulus,
squash_params.ciphertext_modulus,
);
CudaLweCiphertextList::from_lwe_ciphertext_list(
&output_ct_list,
@@ -428,14 +411,14 @@ mod cuda {
};

let params_record = CryptoParametersRecord {
lwe_dimension: Some(lwe_dimension),
glwe_dimension: Some(glwe_dimension),
polynomial_size: Some(polynomial_size),
lwe_noise_distribution: Some(lwe_noise_distribution),
glwe_noise_distribution: Some(glwe_noise_distribution),
pbs_base_log: Some(pbs_base_log),
pbs_level: Some(pbs_level),
ciphertext_modulus: Some(ct_modulus_u64),
lwe_dimension: Some(input_params.lwe_dimension),
glwe_dimension: Some(squash_params.glwe_dimension),
polynomial_size: Some(squash_params.polynomial_size),
lwe_noise_distribution: Some(lwe_noise_distribution_u64),
glwe_noise_distribution: Some(input_params.glwe_noise_distribution),
pbs_base_log: Some(squash_params.decomp_base_log),
pbs_level: Some(squash_params.decomp_level_count),
ciphertext_modulus: Some(input_params.ciphertext_modulus),
..Default::default()
};

@@ -3,8 +3,6 @@ use super::misc::check_encrypted_content_respects_mod;
use crate::core_crypto::algorithms::misc::divide_round;
use crate::core_crypto::keycache::KeyCacheAccess;
use crate::core_crypto::prelude::*;
#[cfg(feature = "gpu")]
use crate::shortint::parameters::ModulusSwitchNoiseReductionParams;
pub(crate) use params::*;
use std::fmt::Debug;

@@ -431,25 +429,6 @@ pub const FFT_WOPBS_N2048_PARAMS: FftWopPbsTestParams<u64> = FftWopPbsTestParams
ciphertext_modulus: CiphertextModulus::new_native(),
};

#[cfg(feature = "gpu")]
pub const NOISESQUASHING128_U128_GPU_PARAMS: NoiseSquashingTestParams<u128> =
NoiseSquashingTestParams {
lwe_dimension: LweDimension(879),
glwe_dimension: GlweDimension(2),
polynomial_size: PolynomialSize(2048),
lwe_noise_distribution: DynamicDistribution::new_t_uniform(46),
glwe_noise_distribution: DynamicDistribution::new_t_uniform(30),
pbs_base_log: DecompositionBaseLog(24),
pbs_level: DecompositionLevelCount(3),
modulus_switch_noise_reduction_params: Some(ModulusSwitchNoiseReductionParams {
modulus_switch_zeros_count: LweCiphertextCount(1449),
ms_bound: NoiseEstimationMeasureBound(288230376151711744f64),
ms_r_sigma_factor: RSigmaFactor(13.179852282053789f64),
ms_input_variance: Variance(2.63039184094559E-7f64),
}),
ciphertext_modulus: CiphertextModulus::new_native(),
};

pub fn get_encoding_with_padding<Scalar: UnsignedInteger>(
ciphertext_modulus: CiphertextModulus<Scalar>,
) -> Scalar {

@@ -156,7 +156,7 @@ pub unsafe fn cuda_programmable_bootstrap_lwe_ciphertext_async<Scalar>(
/// be dropped until streams is synchronised
#[allow(clippy::too_many_arguments)]
pub unsafe fn cuda_programmable_bootstrap_128_lwe_ciphertext_async<Scalar>(
input: &CudaLweCiphertextList<Scalar>,
input: &CudaLweCiphertextList<u64>,
output: &mut CudaLweCiphertextList<Scalar>,
accumulator: &CudaGlweCiphertextList<Scalar>,
num_samples: LweCiphertextCount,
@@ -219,15 +219,7 @@ pub unsafe fn cuda_programmable_bootstrap_128_lwe_ciphertext_async<Scalar>(
);

assert_eq!(
input.ciphertext_modulus(),
output.ciphertext_modulus(),
"Mismatched CiphertextModulus between input ({:?}) and output ({:?})",
input.ciphertext_modulus(),
output.ciphertext_modulus(),
);

assert_eq!(
input.ciphertext_modulus(),
accumulator.ciphertext_modulus(),
"Mismatched CiphertextModulus between input ({:?}) and accumulator ({:?})",
input.ciphertext_modulus(),
@@ -323,7 +315,7 @@ pub fn cuda_programmable_bootstrap_lwe_ciphertext<Scalar>(
/// index** in `output`.
#[allow(clippy::too_many_arguments)]
pub fn cuda_programmable_bootstrap_128_lwe_ciphertext<Scalar>(
input: &CudaLweCiphertextList<Scalar>,
input: &CudaLweCiphertextList<u64>,
output: &mut CudaLweCiphertextList<Scalar>,
accumulator: &CudaGlweCiphertextList<Scalar>,
num_samples: LweCiphertextCount,

@@ -1,5 +1,8 @@
pub(crate) use crate::core_crypto::algorithms::test::gen_keys_or_get_from_cache_if_enabled;
use crate::shortint::parameters::DynamicDistribution;
use crate::shortint::prelude::{DecompositionBaseLog, LweDimension};

use crate::core_crypto::algorithms::par_allocate_and_generate_new_lwe_bootstrap_key;
use crate::core_crypto::algorithms::test::{FftBootstrapKeys, TestResources};
use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
use crate::core_crypto::gpu::lwe_bootstrap_key::CudaLweBootstrapKey;
@@ -7,16 +10,32 @@ use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
use crate::core_crypto::gpu::vec::GpuIndex;
use crate::core_crypto::gpu::{cuda_programmable_bootstrap_128_lwe_ciphertext, CudaStreams};

use crate::core_crypto::keycache::KeyCacheAccess;
use crate::core_crypto::prelude::test::{
NoiseSquashingTestParams, NOISESQUASHING128_U128_GPU_PARAMS,
use crate::core_crypto::prelude::test::NoiseSquashingTestParams;
use crate::core_crypto::prelude::{
allocate_and_encrypt_new_lwe_ciphertext, decrypt_lwe_ciphertext,
generate_programmable_bootstrap_glwe_lut, CastFrom, CastInto, DecompositionLevelCount,
GlweCiphertextOwned, GlweSecretKey, LweCiphertextCount, LweCiphertextOwned, LweSecretKey,
Plaintext, SignedDecomposer, UnsignedTorus,
};
use crate::core_crypto::prelude::*;
use crate::shortint::engine::ShortintEngine;
use crate::shortint::parameters::{
NoiseSquashingParameters, NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
};
use crate::shortint::server_key::ModulusSwitchNoiseReductionKey;
use crate::shortint::ClassicPBSParameters;
use serde::de::DeserializeOwned;
use serde::Serialize;

pub fn generate_keys<
Scalar: UnsignedTorus + Sync + Send + CastFrom<usize> + CastInto<usize> + Serialize + DeserializeOwned,
Scalar: UnsignedTorus
+ Sync
+ Send
+ CastFrom<usize>
+ CastFrom<u64>
+ CastInto<usize>
+ Serialize
+ DeserializeOwned,
>(
params: NoiseSquashingTestParams<Scalar>,
rsc: &mut TestResources,
@@ -52,62 +71,93 @@ pub fn generate_keys<
}
}

pub fn execute_bootstrap_u128<Scalar>(params: NoiseSquashingTestParams<Scalar>)
where
Scalar: Numeric
+ UnsignedTorus
+ CastFrom<usize>
+ CastInto<usize>
+ Send
+ Sync
+ Serialize
+ DeserializeOwned,
NoiseSquashingTestParams<Scalar>: KeyCacheAccess<Keys = FftBootstrapKeys<Scalar>>,
{
let lwe_noise_distribution = params.glwe_noise_distribution;
let glwe_dimension = params.glwe_dimension;
let polynomial_size = params.polynomial_size;
let ciphertext_modulus = params.ciphertext_modulus;
pub fn execute_bootstrap_u128(
squash_params: NoiseSquashingParameters,
input_params: ClassicPBSParameters,
) {
let glwe_dimension = squash_params.glwe_dimension;
let polynomial_size = squash_params.polynomial_size;
let ciphertext_modulus = squash_params.ciphertext_modulus;

let mut rsc = TestResources::new();

let mut keys_gen = |params| generate_keys(params, &mut rsc);
let keys = gen_keys_or_get_from_cache_if_enabled(params, &mut keys_gen);
let noise_squashing_test_params: NoiseSquashingTestParams<u128> = NoiseSquashingTestParams {
lwe_dimension: LweDimension(input_params.lwe_dimension.0),
glwe_dimension: squash_params.glwe_dimension,
polynomial_size: squash_params.polynomial_size,
lwe_noise_distribution: DynamicDistribution::new_t_uniform(46),
glwe_noise_distribution: squash_params.glwe_noise_distribution,
pbs_base_log: squash_params.decomp_base_log,
pbs_level: squash_params.decomp_level_count,
modulus_switch_noise_reduction_params: squash_params.modulus_switch_noise_reduction_params,
ciphertext_modulus: squash_params.ciphertext_modulus,
};

let mut keys_gen = |_params| generate_keys(noise_squashing_test_params, &mut rsc);
let keys = gen_keys_or_get_from_cache_if_enabled(noise_squashing_test_params, &mut keys_gen);
let (std_bootstrapping_key, small_lwe_sk, big_lwe_sk) =
(keys.bsk, keys.small_lwe_sk, keys.big_lwe_sk);
let output_lwe_dimension = big_lwe_sk.lwe_dimension();

let input_lwe_secret_key = LweSecretKey::from_container(
small_lwe_sk
.into_container()
.iter()
.copied()
.map(|x| x as u64)
.collect::<Vec<_>>(),
);

let mut engine = ShortintEngine::new();

let modulus_switch_noise_reduction_key = squash_params
.modulus_switch_noise_reduction_params
.map(|modulus_switch_noise_reduction_params| {
ModulusSwitchNoiseReductionKey::new(
modulus_switch_noise_reduction_params,
&input_lwe_secret_key,
&mut engine,
input_params.ciphertext_modulus,
input_params.lwe_noise_distribution,
)
});
let gpu_index = 0;
let stream = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
let d_bsk = CudaLweBootstrapKey::from_lwe_bootstrap_key(&std_bootstrapping_key, None, &stream);
let d_bsk = CudaLweBootstrapKey::from_lwe_bootstrap_key(
&std_bootstrapping_key,
modulus_switch_noise_reduction_key.as_ref(),
&stream,
);

// Our 4 bits message space
let message_modulus: Scalar = Scalar::ONE << 4;

let message_modulus: u64 = 1 << 4;
// Our input message
let input_message: Scalar = 3usize.cast_into();
let input_message: u64 = 3usize.cast_into();

// Delta used to encode 4 bits of message + a bit of padding on Scalar
let delta: Scalar = (Scalar::ONE << (Scalar::BITS - 1)) / message_modulus;

let delta: u64 = (1 << (u64::BITS - 1)) / message_modulus;
let delta_u128: u128 = (1 << (u128::BITS - 1)) / message_modulus as u128;

// Apply our encoding
let plaintext = Plaintext(input_message * delta);

// Allocate a new LweCiphertext and encrypt our plaintext
let lwe_ciphertext_in: LweCiphertextOwned<Scalar> = allocate_and_encrypt_new_lwe_ciphertext(
&small_lwe_sk,
let lwe_ciphertext_in: LweCiphertextOwned<u64> = allocate_and_encrypt_new_lwe_ciphertext(
&input_lwe_secret_key,
plaintext,
lwe_noise_distribution,
ciphertext_modulus,
input_params.lwe_noise_distribution,
input_params.ciphertext_modulus,
&mut rsc.encryption_random_generator,
);

let f = |x: Scalar| x;
let accumulator: GlweCiphertextOwned<Scalar> = generate_programmable_bootstrap_glwe_lut(
let f = |x: u128| x;
let accumulator: GlweCiphertextOwned<u128> = generate_programmable_bootstrap_glwe_lut(
polynomial_size,
glwe_dimension.to_glwe_size(),
message_modulus.cast_into(),
ciphertext_modulus,
delta,
delta_u128,
f,
);

@@ -136,8 +186,7 @@ where
let pbs_ct = d_out_pbs_ct.into_lwe_ciphertext(&stream);

// Decrypt the PBS result
let pbs_plaintext: Plaintext<Scalar> = decrypt_lwe_ciphertext(&big_lwe_sk, &pbs_ct);

let pbs_plaintext: Plaintext<u128> = decrypt_lwe_ciphertext(&big_lwe_sk, &pbs_ct);
// Create a SignedDecomposer to perform the rounding of the decrypted plaintext
// We pass a DecompositionBaseLog of 5 and a DecompositionLevelCount of 1 indicating we want
// to round the 5 MSB, 1 bit of padding plus our 4 bits of message
@@ -145,12 +194,15 @@ where
SignedDecomposer::new(DecompositionBaseLog(5), DecompositionLevelCount(1));

// Round and remove our encoding
let pbs_result: Scalar = signed_decomposer.closest_representable(pbs_plaintext.0) / delta;
let pbs_result: u128 = signed_decomposer.closest_representable(pbs_plaintext.0) / delta_u128;

assert_eq!(f(input_message), pbs_result);
assert_eq!(f(input_message as u128), pbs_result);
}

#[test]
fn test_bootstrap_u128_with_squashing() {
execute_bootstrap_u128::<u128>(NOISESQUASHING128_U128_GPU_PARAMS);
execute_bootstrap_u128(
NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
);
}

@@ -211,7 +211,7 @@ pub unsafe fn programmable_bootstrap_128_async<T: UnsignedInteger>(
streams: &CudaStreams,
lwe_array_out: &mut CudaVec<T>,
test_vector: &CudaVec<T>,
lwe_array_in: &CudaVec<T>,
lwe_array_in: &CudaVec<u64>,
bootstrapping_key: &CudaVec<f64>,
lwe_dimension: LweDimension,
glwe_dimension: GlweDimension,

@@ -4,11 +4,14 @@ use crate::backward_compatibility::booleans::{
};
use crate::high_level_api::details::MaybeCloned;
use crate::high_level_api::errors::UninitializedNoiseSquashing;
use crate::high_level_api::global_state;
use crate::high_level_api::global_state::with_internal_keys;
use crate::high_level_api::global_state::{self, with_internal_keys};
use crate::high_level_api::keys::InternalServerKey;
use crate::high_level_api::traits::{FheDecrypt, SquashNoise};
use crate::integer::ciphertext::SquashedNoiseBooleanBlock;
#[cfg(feature = "gpu")]
use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
#[cfg(feature = "gpu")]
use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext;
use crate::named::Named;
use crate::{ClientKey, Device, Tag};
use serde::{Deserializer, Serializer};
@@ -177,9 +180,40 @@ impl SquashNoise for FheBool {
})
}
#[cfg(feature = "gpu")]
InternalServerKey::Cuda(_) => Err(crate::error!(
"Cuda devices do not support noise squashing yet"
)),
InternalServerKey::Cuda(cuda_key) => {
let streams = &cuda_key.streams;

let noise_squashing_key = cuda_key
.key
.noise_squashing_key
.as_ref()
.ok_or(UninitializedNoiseSquashing)?;

let cuda_block = CudaBooleanBlock(match self.ciphertext.on_gpu(streams) {
MaybeCloned::Borrowed(gpu_ct) => {
unsafe {
// SAFETY
// The gpu_ct is a ref, meaning it belongs to the thing
// that is being iterated on, so it will stay alive for the
// whole function
gpu_ct.duplicate_async(streams)
}
}
MaybeCloned::Cloned(gpu_ct) => gpu_ct,
});
let cuda_squashed_block = noise_squashing_key.squash_boolean_block_noise(
cuda_key.pbs_key(),
&cuda_block,
streams,
)?;
let cpu_squashed_block =
cuda_squashed_block.to_squashed_noise_boolean_block(streams);

Ok(SquashedNoiseFheBool {
inner: InnerSquashedNoiseBoolean::Cpu(cpu_squashed_block),
tag: cuda_key.tag.clone(),
})
}
#[cfg(feature = "hpu")]
InternalServerKey::Hpu(_device) => {
Err(crate::error!("Hpu devices do not support noise squashing"))

@@ -4,8 +4,7 @@ use crate::backward_compatibility::integers::{
};
use crate::high_level_api::details::MaybeCloned;
use crate::high_level_api::errors::UninitializedNoiseSquashing;
use crate::high_level_api::global_state;
use crate::high_level_api::global_state::with_internal_keys;
use crate::high_level_api::global_state::{self, with_internal_keys};
use crate::high_level_api::keys::InternalServerKey;
use crate::high_level_api::traits::{FheDecrypt, SquashNoise};
use crate::integer::block_decomposition::{RecomposableFrom, SignExtendable};
@@ -196,9 +195,27 @@ impl<Id: FheIntId> SquashNoise for FheInt<Id> {
})
}
#[cfg(feature = "gpu")]
InternalServerKey::Cuda(_) => Err(crate::error!(
"Cuda devices do not support noise squashing yet"
)),
InternalServerKey::Cuda(cuda_key) => {
let streams = &cuda_key.streams;
let noise_squashing_key = cuda_key
.key
.noise_squashing_key
.as_ref()
.ok_or(UninitializedNoiseSquashing)?;

let cuda_squashed_ct = noise_squashing_key.squash_signed_radix_ciphertext_noise(
cuda_key.pbs_key(),
&self.ciphertext.on_gpu(streams),
streams,
)?;

let cpu_squashed_ct =
cuda_squashed_ct.to_squashed_noise_signed_radix_ciphertext(streams);
Ok(SquashedNoiseFheInt {
inner: InnerSquashedNoiseSignedRadixCiphertext::Cpu(cpu_squashed_ct),
tag: cuda_key.tag.clone(),
})
}
#[cfg(feature = "hpu")]
InternalServerKey::Hpu(_device) => {
Err(crate::error!("Hpu devices do not support noise squashing"))

@@ -5,8 +5,7 @@ use crate::backward_compatibility::integers::{
use crate::core_crypto::commons::numeric::UnsignedNumeric;
use crate::high_level_api::details::MaybeCloned;
use crate::high_level_api::errors::UninitializedNoiseSquashing;
use crate::high_level_api::global_state;
use crate::high_level_api::global_state::with_internal_keys;
use crate::high_level_api::global_state::{self, with_internal_keys};
use crate::high_level_api::keys::InternalServerKey;
use crate::high_level_api::traits::{FheDecrypt, SquashNoise};
use crate::integer::block_decomposition::RecomposableFrom;
@@ -191,9 +190,26 @@ impl<Id: FheUintId> SquashNoise for FheUint<Id> {
})
}
#[cfg(feature = "gpu")]
InternalServerKey::Cuda(_) => Err(crate::error!(
"Cuda devices do not support noise squashing yet"
)),
InternalServerKey::Cuda(cuda_key) => {
let streams = &cuda_key.streams;

let noise_squashing_key = cuda_key
.key
.noise_squashing_key
.as_ref()
.ok_or(UninitializedNoiseSquashing)?;

let cuda_squashed_ct = noise_squashing_key.squash_radix_ciphertext_noise(
cuda_key.pbs_key(),
&self.ciphertext.on_gpu(streams).ciphertext,
streams,
)?;
let squashed_ct = cuda_squashed_ct.to_squashed_noise_radix_ciphertext(streams);
Ok(SquashedNoiseFheUint {
inner: InnerSquashedNoiseRadixCiphertext::Cpu(squashed_ct),
tag: cuda_key.tag.clone(),
})
}
#[cfg(feature = "hpu")]
InternalServerKey::Hpu(_device) => {
Err(crate::error!("Hpu devices do not support noise squashing"))

@@ -322,6 +322,8 @@ pub struct IntegerCudaServerKey {
Option<crate::integer::gpu::list_compression::server_keys::CudaCompressionKey>,
pub(crate) decompression_key:
Option<crate::integer::gpu::list_compression::server_keys::CudaDecompressionKey>,
pub(crate) noise_squashing_key:
Option<crate::integer::gpu::noise_squashing::keys::CudaNoiseSquashingKey>,
}

#[derive(Clone, serde::Serialize, serde::Deserialize, Versionize)]

@@ -341,6 +341,13 @@ impl CompressedServerKey {
}
None => None,
};
let noise_squashing_key: Option<
crate::integer::gpu::noise_squashing::keys::CudaNoiseSquashingKey,
> = self
.integer_key
.noise_squashing_key
.as_ref()
.map(|noise_squashing_key| noise_squashing_key.decompress_to_cuda(&streams));
synchronize_devices(streams.len() as u32);
CudaServerKey {
key: Arc::new(IntegerCudaServerKey {
@@ -348,6 +355,7 @@ impl CompressedServerKey {
cpk_key_switching_key_material,
compression_key,
decompression_key,
noise_squashing_key,
}),
tag: self.tag.clone(),
streams,

@@ -8,6 +8,7 @@ use crate::shortint::parameters::{
NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
};

use rand::prelude::*;

#[test]
@@ -29,6 +30,74 @@ fn test_noise_squashing() {

let squashed = bitand.squash_noise().unwrap();

let recovered: U256 = squashed.decrypt(&cks);

assert_eq!(clear, recovered);

// Native unsigned
let clear: u32 = rng.gen();
let enc = FheUint32::encrypt(clear, &cks);
let bitand = &enc & &enc;

let squashed = bitand.squash_noise().unwrap();

let recovered: u32 = squashed.decrypt(&cks);

assert_eq!(clear, recovered);

// Non native signed with proper input range
let clear: i16 = rng.gen_range(-1 << 9..1 << 9);
let enc = FheInt10::encrypt(clear, &cks);
let bitand = &enc & &enc;

let squashed = bitand.squash_noise().unwrap();

let recovered: i16 = squashed.decrypt(&cks);
assert_eq!(clear, recovered);

// Native signed
let clear: i8 = rng.gen();
let enc = FheInt8::encrypt(clear, &cks);
let bitand = &enc & &enc;

let squashed = bitand.squash_noise().unwrap();

let recovered: i8 = squashed.decrypt(&cks);
assert_eq!(clear, recovered);

// Booleans
for clear in [false, true] {
let enc = FheBool::encrypt(clear, &cks);
let bitand = &enc & &enc;

let squashed = bitand.squash_noise().unwrap();

let recovered: bool = squashed.decrypt(&cks);
assert_eq!(clear, recovered);
}
}
#[cfg(feature = "gpu")]
#[test]
fn test_gpu_noise_squashing() {
let config = ConfigBuilder::with_custom_parameters(
crate::shortint::parameters::PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
)
.enable_noise_squashing(NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128)
.build();
let cks = crate::ClientKey::generate(config);
let sks = cks.generate_compressed_server_key();

set_server_key(sks.decompress_to_gpu());

let mut rng = thread_rng();

// Non native type for clear
let clear: U256 = rng.gen();
let enc = FheUint256::encrypt(clear, &cks);
let bitand = &enc & &enc;

let squashed = bitand.squash_noise().unwrap();

let recovered: U256 = squashed.decrypt(&cks);
assert_eq!(clear, recovered);

@@ -2,6 +2,7 @@ pub mod boolean_value;
pub mod compact_list;
pub mod compressed_ciphertext_list;
pub mod info;
pub mod squashed_noise;

use crate::core_crypto::gpu::lwe_bootstrap_key::{
prepare_cuda_ms_noise_reduction_key_ffi, CudaModulusSwitchNoiseReductionKey,

124
tfhe/src/integer/gpu/ciphertext/squashed_noise.rs
Normal file
@@ -0,0 +1,124 @@
use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
use crate::core_crypto::gpu::CudaStreams;
use crate::core_crypto::prelude::{LweCiphertextCount, LweCiphertextOwned, LweSize};
use crate::integer::ciphertext::{
SquashedNoiseBooleanBlock, SquashedNoiseRadixCiphertext, SquashedNoiseSignedRadixCiphertext,
};
use crate::integer::gpu::ciphertext::info::{CudaBlockInfo, CudaRadixCiphertextInfo};
use crate::shortint::ciphertext::{Degree, NoiseLevel, SquashedNoiseCiphertext};
use crate::shortint::parameters::CoreCiphertextModulus;
use crate::shortint::{AtomicPatternKind, CarryModulus, MessageModulus, PBSOrder};

pub struct CudaSquashedNoiseRadixCiphertext {
pub packed_d_blocks: CudaLweCiphertextList<u128>,
pub info: CudaRadixCiphertextInfo,
pub original_block_count: usize,
}

pub struct CudaSquashedNoiseUnsignedRadixCiphertext {
pub ciphertext: CudaSquashedNoiseRadixCiphertext,
}

pub struct CudaSquashedNoiseSignedRadixCiphertext {
pub ciphertext: CudaSquashedNoiseRadixCiphertext,
}

pub struct CudaSquashedNoiseBooleanBlock {
pub ciphertext: CudaSquashedNoiseRadixCiphertext,
}

impl CudaSquashedNoiseRadixCiphertext {
pub(crate) fn new_zero(
lwe_size: LweSize,
lwe_ciphertext_count: LweCiphertextCount,
ciphertext_modulus: CoreCiphertextModulus<u128>,
message_modulus: MessageModulus,
carry_modulus: CarryModulus,
original_block_count: usize,
streams: &CudaStreams,
) -> Self {
let mut blocks_info = Vec::with_capacity(lwe_ciphertext_count.0);

for _ in 0..lwe_ciphertext_count.0 {
blocks_info.push(CudaBlockInfo {
degree: Degree::new(0u64),
message_modulus,
carry_modulus,
atomic_pattern: AtomicPatternKind::Standard(PBSOrder::KeyswitchBootstrap),
noise_level: NoiseLevel::ZERO,
});
}
Self {
packed_d_blocks: CudaLweCiphertextList::<u128>::new(
lwe_size.to_lwe_dimension(),
lwe_ciphertext_count,
ciphertext_modulus,
streams,
),
info: CudaRadixCiphertextInfo {
blocks: blocks_info,
},
original_block_count,
}
}

pub(crate) fn to_squashed_noise_radix_ciphertext(
&self,
streams: &CudaStreams,
) -> SquashedNoiseRadixCiphertext {
let num_blocks = self.packed_d_blocks.lwe_ciphertext_count().0;
let lwe_size = self.packed_d_blocks.lwe_dimension().to_lwe_size();
let ct_modulus = self.packed_d_blocks.ciphertext_modulus();

let lwe_ct_list_cpu = self.packed_d_blocks.to_lwe_ciphertext_list(streams);
let mut packed_blocks = Vec::<SquashedNoiseCiphertext>::with_capacity(num_blocks);
lwe_ct_list_cpu
.as_ref()
.chunks(lwe_size.0)
.enumerate()
.for_each(|(i, block)| {
let block = LweCiphertextOwned::from_container(block.to_vec(), ct_modulus);
let info = self.info.blocks[i];
packed_blocks.push(SquashedNoiseCiphertext::new(
block,
info.degree,
info.message_modulus,
info.carry_modulus,
));
});
SquashedNoiseRadixCiphertext {
packed_blocks,
original_block_count: self.original_block_count,
}
}
}

impl CudaSquashedNoiseSignedRadixCiphertext {
pub fn to_squashed_noise_signed_radix_ciphertext(
&self,
streams: &CudaStreams,
) -> SquashedNoiseSignedRadixCiphertext {
SquashedNoiseSignedRadixCiphertext {
packed_blocks: self
.ciphertext
.to_squashed_noise_radix_ciphertext(streams)
.packed_blocks,
original_block_count: self.ciphertext.original_block_count,
}
}
}

impl CudaSquashedNoiseBooleanBlock {
pub fn to_squashed_noise_boolean_block(
&self,
streams: &CudaStreams,
) -> SquashedNoiseBooleanBlock {
SquashedNoiseBooleanBlock {
ciphertext: self
.ciphertext
.to_squashed_noise_radix_ciphertext(streams)
.packed_blocks[0]
.clone(),
}
}
}
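// Typical conversion back to the CPU types from `crate::integer::ciphertext`
// (sketch; `cuda_squashed` is a hypothetical value produced by the CUDA noise
// squashing key, as in the hlapi glue code earlier in this commit):
//
// let cpu_radix: SquashedNoiseRadixCiphertext =
//     cuda_squashed.to_squashed_noise_radix_ciphertext(&streams);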
|
||||
@@ -2,6 +2,7 @@ pub mod ciphertext;
|
||||
pub mod client_key;
|
||||
pub mod key_switching_key;
|
||||
pub mod list_compression;
|
||||
pub mod noise_squashing;
|
||||
pub mod server_key;
|
||||
#[cfg(feature = "zk-pok")]
|
||||
pub mod zk;
|
||||
@@ -5800,3 +5801,126 @@ pub unsafe fn extend_radix_with_trivial_zero_blocks_msb_async(

    update_noise_degree(output, &cuda_ffi_output);
}

#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
///   is required
pub unsafe fn noise_squashing_async<T: UnsignedInteger, B: Numeric>(
    streams: &CudaStreams,
    output: &mut CudaSliceMut<T>,
    output_degrees: &mut Vec<u64>,
    output_noise_levels: &mut Vec<u64>,
    input: &CudaSlice<u64>,
    bootstrapping_key: &CudaVec<f64>,
    keyswitch_key: &CudaVec<u64>,
    lwe_dimension: LweDimension,
    glwe_dimension: GlweDimension,
    polynomial_size: PolynomialSize,
    input_glwe_dimension: GlweDimension,
    input_polynomial_size: PolynomialSize,
    ks_level: DecompositionLevelCount,
    ks_base_log: DecompositionBaseLog,
    pbs_level: DecompositionLevelCount,
    pbs_base_log: DecompositionBaseLog,
    num_blocks: u32,
    original_num_blocks: u32,
    message_modulus: MessageModulus,
    carry_modulus: CarryModulus,
    pbs_type: PBSType,
    grouping_factor: LweBskGroupingFactor,
    noise_reduction_key: Option<&CudaModulusSwitchNoiseReductionKey>,
    ct_modulus: f64,
) {
    assert_eq!(
        streams.gpu_indexes[0],
        input.gpu_index(0),
        "GPU error: first stream is on GPU {}, first input pointer is on GPU {}",
        streams.gpu_indexes[0].get(),
        input.gpu_index(0).get(),
    );
    assert_eq!(
        streams.gpu_indexes[0],
        output.gpu_index(0),
        "GPU error: first stream is on GPU {}, first output pointer is on GPU {}",
        streams.gpu_indexes[0].get(),
        output.gpu_index(0).get(),
    );
    assert_eq!(
        streams.gpu_indexes[0],
        bootstrapping_key.gpu_index(0),
        "GPU error: first stream is on GPU {}, first bsk pointer is on GPU {}",
        streams.gpu_indexes[0].get(),
        bootstrapping_key.gpu_index(0).get(),
    );
    assert_eq!(
        streams.gpu_indexes[0],
        keyswitch_key.gpu_index(0),
        "GPU error: first stream is on GPU {}, first ksk pointer is on GPU {}",
        streams.gpu_indexes[0].get(),
        keyswitch_key.gpu_index(0).get(),
    );
    let ms_noise_reduction_key_ffi =
        prepare_cuda_ms_noise_reduction_key_ffi(noise_reduction_key, ct_modulus);

    let allocate_ms_noise_array = noise_reduction_key.is_some();
    let mut mem_ptr: *mut i8 = std::ptr::null_mut();
    let mut cuda_ffi_output = prepare_cuda_radix_ffi_from_slice_mut(
        output,
        output_degrees,
        output_noise_levels,
        num_blocks,
        (glwe_dimension.0 * polynomial_size.0) as u32,
    );
    let cuda_ffi_input = prepare_cuda_radix_ffi_from_slice(
        input,
        output_degrees,
        output_noise_levels,
        original_num_blocks,
        (input_glwe_dimension.0 * input_polynomial_size.0) as u32,
    );

    scratch_cuda_apply_noise_squashing_kb(
        streams.ptr.as_ptr(),
        streams.gpu_indexes_ptr(),
        streams.len() as u32,
        std::ptr::addr_of_mut!(mem_ptr),
        lwe_dimension.0 as u32,
        glwe_dimension.0 as u32,
        polynomial_size.0 as u32,
        input_glwe_dimension.0 as u32,
        input_polynomial_size.0 as u32,
        ks_level.0 as u32,
        ks_base_log.0 as u32,
        pbs_level.0 as u32,
        pbs_base_log.0 as u32,
        grouping_factor.0 as u32,
        num_blocks,
        original_num_blocks,
        message_modulus.0 as u32,
        carry_modulus.0 as u32,
        pbs_type as u32,
        true,
        allocate_ms_noise_array,
    );

    cuda_apply_noise_squashing_kb(
        streams.ptr.as_ptr(),
        streams.gpu_indexes_ptr(),
        streams.len() as u32,
        &raw mut cuda_ffi_output,
        &raw const cuda_ffi_input,
        mem_ptr,
        keyswitch_key.ptr.as_ptr(),
        &raw const ms_noise_reduction_key_ffi,
        bootstrapping_key.ptr.as_ptr(),
    );

    cleanup_cuda_apply_noise_squashing_kb(
        streams.ptr.as_ptr(),
        streams.gpu_indexes_ptr(),
        streams.len() as u32,
        std::ptr::addr_of_mut!(mem_ptr),
    );
}
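
As the safety note above requires, the caller must synchronize the streams once results are needed. A sketch of that lifecycle contract only, with the long argument list deliberately elided (`apply_noise_squashing_async` further down in this diff is the real call site):

// Sketch of the calling contract only; arguments elided for brevity.
unsafe {
    noise_squashing_async::<u128, f64>(&streams, /* output slices, keys, parameters... */);
}
// Mandatory before reading the outputs or dropping any input buffer.
streams.synchronize();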

tfhe/src/integer/gpu/noise_squashing/keys.rs (new file)
@@ -0,0 +1,140 @@
use crate::core_crypto::gpu::lwe_bootstrap_key::CudaLweBootstrapKey;
use crate::core_crypto::gpu::CudaStreams;

use crate::core_crypto::prelude::LweCiphertextCount;
use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
use crate::integer::gpu::ciphertext::info::{CudaBlockInfo, CudaRadixCiphertextInfo};
use crate::integer::gpu::ciphertext::squashed_noise::{
    CudaSquashedNoiseBooleanBlock, CudaSquashedNoiseRadixCiphertext,
    CudaSquashedNoiseSignedRadixCiphertext,
};
use crate::integer::gpu::ciphertext::{CudaRadixCiphertext, CudaSignedRadixCiphertext};
use crate::integer::gpu::CudaServerKey;
use crate::shortint::parameters::CoreCiphertextModulus;

use crate::shortint::{CarryModulus, MessageModulus};

pub struct CudaNoiseSquashingKey {
    pub bootstrapping_key: CudaLweBootstrapKey,
    pub message_modulus: MessageModulus,
    pub carry_modulus: CarryModulus,
    pub output_ciphertext_modulus: CoreCiphertextModulus<u128>,
}

impl CudaNoiseSquashingKey {
    pub fn checked_squash_ciphertext_noise(
        &self,
        ciphertext: &CudaRadixCiphertext,
        src_server_key: &CudaServerKey,
        streams: &CudaStreams,
    ) -> crate::Result<CudaSquashedNoiseRadixCiphertext> {
        for block in ciphertext.info.blocks.iter() {
            if src_server_key
                .max_noise_level
                .validate(block.noise_level)
                .is_err()
            {
                return Err(crate::error!(
"squash_ciphertext_noise requires the input Ciphertext to have at most {:?} noise \
|
||||
got {:?}.",
|
||||
src_server_key.max_noise_level,
|
||||
block.noise_level
                ));
            }
            if block.message_modulus != self.message_modulus {
                return Err(crate::error!(
                    "Mismatched MessageModulus between Ciphertext {:?} and NoiseSquashingKey {:?}.",
                    block.message_modulus,
                    self.message_modulus,
                ));
            }
            if block.carry_modulus != self.carry_modulus {
                return Err(crate::error!(
                    "Mismatched CarryModulus between Ciphertext {:?} and NoiseSquashingKey {:?}.",
                    block.carry_modulus,
                    self.carry_modulus,
                ));
            }
        }

        Ok(self.unchecked_squash_ciphertext_noise(ciphertext, src_server_key, streams))
    }

    pub fn unchecked_squash_ciphertext_noise(
        &self,
        ciphertext: &CudaRadixCiphertext,
        src_server_key: &CudaServerKey,
        streams: &CudaStreams,
    ) -> CudaSquashedNoiseRadixCiphertext {
        let original_block_count = ciphertext.d_blocks.lwe_ciphertext_count().0;
        let packed_size = ciphertext.d_blocks.lwe_ciphertext_count().0.div_ceil(2);
        let mut squashed_output = CudaSquashedNoiseRadixCiphertext::new_zero(
            self.bootstrapping_key.output_lwe_dimension().to_lwe_size(),
            LweCiphertextCount(packed_size),
            self.output_ciphertext_modulus,
            self.message_modulus,
            self.carry_modulus,
            original_block_count,
            streams,
        );

        src_server_key.apply_noise_squashing(&mut squashed_output, ciphertext, self, streams);

        let mut new_block_info = Vec::<CudaBlockInfo>::with_capacity(packed_size);
        for block in squashed_output.info.blocks.iter() {
            new_block_info.push(CudaBlockInfo {
                degree: block.degree,
                message_modulus: block.message_modulus,
                carry_modulus: block.carry_modulus,
                atomic_pattern: block.atomic_pattern,
                noise_level: block.noise_level,
            });
        }
        CudaSquashedNoiseRadixCiphertext {
            packed_d_blocks: squashed_output.packed_d_blocks,
            info: CudaRadixCiphertextInfo {
                blocks: new_block_info,
            },
            original_block_count,
        }
    }

    pub fn squash_radix_ciphertext_noise(
        &self,
        src_server_key: &CudaServerKey,
        ciphertext: &CudaRadixCiphertext,
        streams: &CudaStreams,
    ) -> crate::Result<CudaSquashedNoiseRadixCiphertext> {
        self.checked_squash_ciphertext_noise(ciphertext, src_server_key, streams)
    }

    pub fn squash_signed_radix_ciphertext_noise(
        &self,
        src_server_key: &CudaServerKey,
        ciphertext: &CudaSignedRadixCiphertext,
        streams: &CudaStreams,
    ) -> crate::Result<CudaSquashedNoiseSignedRadixCiphertext> {
        let squashed_output =
            self.checked_squash_ciphertext_noise(&ciphertext.ciphertext, src_server_key, streams)?;
        Ok(CudaSquashedNoiseSignedRadixCiphertext {
            ciphertext: squashed_output,
        })
    }

    pub fn squash_boolean_block_noise(
        &self,
        src_server_key: &CudaServerKey,
        ciphertext: &CudaBooleanBlock,
        streams: &CudaStreams,
    ) -> crate::Result<CudaSquashedNoiseBooleanBlock> {
        let squashed_output = self.checked_squash_ciphertext_noise(
            &ciphertext.as_ref().ciphertext,
            src_server_key,
            streams,
        )?;
        Ok(CudaSquashedNoiseBooleanBlock {
            ciphertext: squashed_output,
        })
    }
}
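
A minimal usage sketch of the entry points above (hedged: `cuda_nsk`, `cuda_sks`, `d_ct`, and `streams` are assumed bindings, and the `?` line runs inside a function returning `crate::Result`; only the method names come from this diff):

// Sketch only: all bindings here are assumed.
// Checked path: validates noise level and moduli before squashing.
let squashed = cuda_nsk.squash_radix_ciphertext_noise(&cuda_sks, &d_ct, &streams)?;

// Unchecked path: skips the validation; the caller upholds the invariants.
// Note the argument order differs from the checked wrapper above.
let squashed_unchecked =
    cuda_nsk.unchecked_squash_ciphertext_noise(&d_ct, &cuda_sks, &streams);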

tfhe/src/integer/gpu/noise_squashing/mod.rs (new file)
@@ -0,0 +1,2 @@
pub mod keys;
pub mod noise_squashing_keys;

tfhe/src/integer/gpu/noise_squashing/noise_squashing_keys.rs (new file)
@@ -0,0 +1,32 @@
use super::keys::CudaNoiseSquashingKey;
use crate::core_crypto::gpu::lwe_bootstrap_key::CudaLweBootstrapKey;
use crate::core_crypto::gpu::CudaStreams;
use crate::integer::noise_squashing::CompressedNoiseSquashingKey;

impl CompressedNoiseSquashingKey {
    pub fn decompress_to_cuda(&self, streams: &CudaStreams) -> CudaNoiseSquashingKey {
        let std_bsk = self
            .key
            .bootstrapping_key()
            .as_view()
            .par_decompress_into_lwe_bootstrap_key();

        let ms_noise_reduction_key = self
            .key
            .modulus_switch_noise_reduction_key()
            .as_ref()
            .map(|key| key.decompress());
        let bootstrapping_key = CudaLweBootstrapKey::from_lwe_bootstrap_key(
            &std_bsk,
            ms_noise_reduction_key.as_ref(),
            streams,
        );

        CudaNoiseSquashingKey {
            bootstrapping_key,
            message_modulus: self.key.message_modulus(),
            carry_modulus: self.key.carry_modulus(),
            output_ciphertext_modulus: self.key.output_ciphertext_modulus(),
        }
    }
}
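
End to end, the decompression above slots in as follows; a minimal sketch (hedged: `cks: ClientKey` and `streams` are assumed bindings, and the constructor's parameter argument, whose signature is truncated later in this diff, is deliberately elided rather than invented):

// Sketch only: `cks` and `streams` are assumed bindings.
let compressed_key = cks.new_compressed_noise_squashing_key(/* noise squashing parameters */);
let cuda_nsk = compressed_key.decompress_to_cuda(&streams);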

@@ -8,17 +8,19 @@ use crate::core_crypto::prelude::{
use crate::integer::block_decomposition::{BlockDecomposer, DecomposableInto};
use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
use crate::integer::gpu::ciphertext::info::{CudaBlockInfo, CudaRadixCiphertextInfo};
use crate::integer::gpu::ciphertext::squashed_noise::CudaSquashedNoiseRadixCiphertext;
use crate::integer::gpu::ciphertext::{
    CudaIntegerRadixCiphertext, CudaRadixCiphertext, CudaSignedRadixCiphertext,
    CudaUnsignedRadixCiphertext,
};
use crate::integer::gpu::noise_squashing::keys::CudaNoiseSquashingKey;
use crate::integer::gpu::server_key::CudaBootstrappingKey;
use crate::integer::gpu::{
    add_and_propagate_single_carry_assign_async, apply_bivariate_lut_kb_async,
    apply_many_univariate_lut_kb_async, apply_univariate_lut_kb_async,
    compute_prefix_sum_hillis_steele_async, extend_radix_with_trivial_zero_blocks_msb_async,
    full_propagate_assign_async, propagate_single_carry_assign_async, trim_radix_blocks_lsb_async,
    CudaServerKey, PBSType,
    full_propagate_assign_async, noise_squashing_async, propagate_single_carry_assign_async,
    trim_radix_blocks_lsb_async, CudaServerKey, PBSType,
};
use crate::integer::server_key::radix_parallel::OutputFlag;
use crate::shortint::ciphertext::{Degree, NoiseLevel};
@@ -1766,4 +1768,86 @@ impl CudaServerKey {
            * size_of::<u64>()
            * ct.as_ref().d_blocks.lwe_dimension().0) as u64
    }

    pub(crate) fn apply_noise_squashing(
        &self,
        output: &mut CudaSquashedNoiseRadixCiphertext,
        input: &CudaRadixCiphertext,
        squashing_key: &CudaNoiseSquashingKey,
        streams: &CudaStreams,
    ) {
        unsafe {
            self.apply_noise_squashing_async(output, input, squashing_key, streams);
        }
        streams.synchronize();
    }

    /// Applies noise squashing to the blocks of `input`, packing them pairwise into the 128-bit
    /// blocks of `output`
    ///
    /// # Safety
    ///
    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
    ///   not be dropped until streams is synchronized
    pub(crate) unsafe fn apply_noise_squashing_async(
        &self,
        output: &mut CudaSquashedNoiseRadixCiphertext,
        input: &CudaRadixCiphertext,
        squashing_key: &CudaNoiseSquashingKey,
        streams: &CudaStreams,
    ) {
        let num_output_blocks = output.packed_d_blocks.lwe_ciphertext_count().0;

        let mut output_degrees = vec![0_u64; num_output_blocks];
        let mut output_noise_levels = vec![0_u64; num_output_blocks];
        let input_slice = input.d_blocks.0.d_vec.as_slice(.., 0).unwrap();
        let mut output_slice = output.packed_d_blocks.0.d_vec.as_mut_slice(.., 0).unwrap();
        let ct_modulus = input.d_blocks.ciphertext_modulus().raw_modulus_float();
        let d_bsk = &squashing_key.bootstrapping_key;
        let (input_glwe_dimension, input_polynomial_size) = match &self.bootstrapping_key {
            CudaBootstrappingKey::Classic(d_bsk) => {
                (d_bsk.glwe_dimension(), d_bsk.polynomial_size())
            }
            CudaBootstrappingKey::MultiBit(d_multibit_bsk) => (
                d_multibit_bsk.glwe_dimension(),
                d_multibit_bsk.polynomial_size(),
            ),
        };
        unsafe {
            noise_squashing_async::<u128, f64>(
                streams,
                &mut output_slice,
                &mut output_degrees,
                &mut output_noise_levels,
                &input_slice,
                &d_bsk.d_vec,
                &self.key_switching_key.d_vec,
                self.key_switching_key
                    .output_key_lwe_size()
                    .to_lwe_dimension(),
                d_bsk.glwe_dimension,
                d_bsk.polynomial_size,
                input_glwe_dimension,
                input_polynomial_size,
                self.key_switching_key.decomposition_level_count(),
                self.key_switching_key.decomposition_base_log(),
                d_bsk.decomp_level_count,
                d_bsk.decomp_base_log,
                num_output_blocks as u32,
                input.d_blocks.lwe_ciphertext_count().0 as u32,
                self.message_modulus,
                self.carry_modulus,
                PBSType::Classical,
                LweBskGroupingFactor(0),
                d_bsk.d_ms_noise_reduction_key.as_ref(),
                ct_modulus,
            );
        }

        for (i, info) in output.info.blocks.iter_mut().enumerate() {
            info.degree = Degree(output_degrees[i]);
            info.noise_level = NoiseLevel(output_noise_levels[i]);
        }
    }
}

@@ -17,6 +17,20 @@ pub struct SquashedNoiseCiphertext {
}

impl SquashedNoiseCiphertext {
    #[allow(dead_code)]
    pub(crate) fn new(
        ct: LweCiphertextOwned<u128>,
        degree: Degree,
        message_modulus: MessageModulus,
        carry_modulus: CarryModulus,
    ) -> Self {
        Self {
            ct,
            degree,
            message_modulus,
            carry_modulus,
        }
    }

    pub(crate) fn new_zero(
        lwe_size: LweSize,
        ciphertext_modulus: CoreCiphertextModulus<u128>,

@@ -25,6 +25,17 @@ pub struct CompressedNoiseSquashingKey {
    output_ciphertext_modulus: CoreCiphertextModulus<u128>,
}

impl CompressedNoiseSquashingKey {
    pub fn bootstrapping_key(&self) -> &SeededLweBootstrapKeyOwned<u128> {
        &self.bootstrapping_key
    }

    pub fn modulus_switch_noise_reduction_key(
        &self,
    ) -> Option<&CompressedModulusSwitchNoiseReductionKey<u64>> {
        self.modulus_switch_noise_reduction_key.as_ref()
    }
}
impl ClientKey {
    pub fn new_compressed_noise_squashing_key(
        &self,