feat(gpu): Implement 128-bit classic PBS

Beka Barbakadze
2025-03-05 16:48:48 +04:00
committed by bbarbakadze
parent 8dadb626f2
commit 459969e9d2
17 changed files with 1611 additions and 66 deletions

View File

@@ -9,20 +9,26 @@
template <typename Torus>
uint64_t get_buffer_size_full_sm_programmable_bootstrap_step_one(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size + // accumulator_rotated
sizeof(double2) * polynomial_size / 2; // accumulator fft
size_t double_count = (sizeof(Torus) == 16) ? 2 : 1;
return sizeof(Torus) * polynomial_size + // accumulator_rotated
sizeof(double) * 2 * double_count * polynomial_size /
2; // accumulator fft
}
template <typename Torus>
uint64_t get_buffer_size_full_sm_programmable_bootstrap_step_two(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size + // accumulator
sizeof(double2) * polynomial_size / 2; // accumulator fft
size_t double_count = (sizeof(Torus) == 16) ? 2 : 1;
return sizeof(Torus) * polynomial_size + // accumulator
sizeof(double) * 2 * double_count * polynomial_size /
2; // accumulator fft
}
template <typename Torus>
uint64_t
get_buffer_size_partial_sm_programmable_bootstrap(uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // accumulator fft
size_t double_count = (sizeof(Torus) == 16) ? 2 : 1;
return sizeof(double) * 2 * double_count * polynomial_size /
2; // accumulator fft
}
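For a sense of scale, here is a sketch of what these formulas give for step one, assuming Torus = __uint128_t and polynomial_size = 2048 (illustration only, not part of the diff):
// Sketch only: step-one shared-memory requirement for N = 2048, Torus = __uint128_t.
constexpr uint32_t N = 2048;
constexpr uint64_t acc_rotated = sizeof(__uint128_t) * N;    // 32768 bytes
constexpr uint64_t acc_fft = sizeof(double) * 2 * 2 * N / 2; // 32768 bytes (4 doubles per complex coefficient)
constexpr uint64_t full_sm_step_one = acc_rotated + acc_fft; // 65536 bytes (64 KiB)
// This exceeds the 48 KiB default per-block shared-memory limit, which is why the
// scratch code below opts in to larger dynamic shared memory and falls back to
// PARTIALSM / NOSM when the device cannot provide it.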
template <typename Torus>
@@ -215,6 +221,155 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
}
};
template <PBS_TYPE pbs_type> struct pbs_buffer_128;
template <> struct pbs_buffer_128<PBS_TYPE::CLASSICAL> {
int8_t *d_mem;
__uint128_t *global_accumulator;
double *global_join_buffer;
PBS_VARIANT pbs_variant;
pbs_buffer_128(cudaStream_t stream, uint32_t gpu_index,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
PBS_VARIANT pbs_variant, bool allocate_gpu_memory) {
cuda_set_device(gpu_index);
this->pbs_variant = pbs_variant;
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
size_t global_join_buffer_size = (glwe_dimension + 1) * level_count *
input_lwe_ciphertext_count *
polynomial_size / 2 * sizeof(double) * 4;
if (allocate_gpu_memory) {
switch (pbs_variant) {
case PBS_VARIANT::DEFAULT: {
uint64_t full_sm_step_one =
get_buffer_size_full_sm_programmable_bootstrap_step_one<
__uint128_t>(polynomial_size);
uint64_t full_sm_step_two =
get_buffer_size_full_sm_programmable_bootstrap_step_two<
__uint128_t>(polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_programmable_bootstrap<__uint128_t>(
polynomial_size);
uint64_t partial_dm_step_one = full_sm_step_one - partial_sm;
uint64_t partial_dm_step_two = full_sm_step_two - partial_sm;
uint64_t full_dm = full_sm_step_one;
uint64_t device_mem = 0;
if (max_shared_memory < partial_sm) {
device_mem = full_dm * input_lwe_ciphertext_count * level_count *
(glwe_dimension + 1);
} else if (max_shared_memory < full_sm_step_two) {
device_mem =
(partial_dm_step_two + partial_dm_step_one * level_count) *
input_lwe_ciphertext_count * (glwe_dimension + 1);
} else if (max_shared_memory < full_sm_step_one) {
device_mem = partial_dm_step_one * input_lwe_ciphertext_count *
level_count * (glwe_dimension + 1);
}
// Otherwise, both kernels run all in shared memory
d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);
global_join_buffer = (double *)cuda_malloc_async(
global_join_buffer_size, stream, gpu_index);
global_accumulator = (__uint128_t *)cuda_malloc_async(
(glwe_dimension + 1) * input_lwe_ciphertext_count *
polynomial_size * sizeof(__uint128_t),
stream, gpu_index);
} break;
case PBS_VARIANT::CG: {
uint64_t full_sm =
get_buffer_size_full_sm_programmable_bootstrap_cg<__uint128_t>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_programmable_bootstrap_cg<__uint128_t>(
polynomial_size);
uint64_t partial_dm = full_sm - partial_sm;
uint64_t full_dm = full_sm;
uint64_t device_mem = 0;
if (max_shared_memory < partial_sm) {
device_mem = full_dm * input_lwe_ciphertext_count * level_count *
(glwe_dimension + 1);
} else if (max_shared_memory < full_sm) {
device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
(glwe_dimension + 1);
}
// Otherwise, both kernels run all in shared memory
d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);
global_join_buffer = (double *)cuda_malloc_async(
global_join_buffer_size, stream, gpu_index);
} break;
#if CUDA_ARCH >= 900
case PBS_VARIANT::TBC: {
bool supports_dsm =
supports_distributed_shared_memory_on_classic_programmable_bootstrap<
__uint128_t>(polynomial_size, max_shared_memory);
uint64_t full_sm =
get_buffer_size_full_sm_programmable_bootstrap_tbc<__uint128_t>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_programmable_bootstrap_tbc<__uint128_t>(
polynomial_size);
uint64_t minimum_sm_tbc = 0;
if (supports_dsm)
minimum_sm_tbc =
get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<
__uint128_t>(polynomial_size);
uint64_t partial_dm = full_sm - partial_sm;
uint64_t full_dm = full_sm;
uint64_t device_mem = 0;
// There is a minimum amount of memory we need to run the TBC PBS, which
// is minimum_sm_tbc. We know that minimum_sm_tbc bytes are available
// because otherwise the previous check would have redirected
// computation to some other variant. If over that we don't have more
// partial_sm bytes, TBC PBS will run on NOSM. If we have partial_sm but
// not full_sm bytes, it will run on PARTIALSM. Otherwise, FULLSM.
//
// NOSM mode actually requires minimum_sm_tbc shared memory bytes.
if (max_shared_memory < partial_sm + minimum_sm_tbc) {
device_mem = full_dm * input_lwe_ciphertext_count * level_count *
(glwe_dimension + 1);
} else if (max_shared_memory < full_sm + minimum_sm_tbc) {
device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
(glwe_dimension + 1);
}
// Otherwise, both kernels run all in shared memory
d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);
global_join_buffer = (double *)cuda_malloc_async(
global_join_buffer_size, stream, gpu_index);
} break;
#endif
default:
PANIC("Cuda error (PBS): unsupported implementation variant.")
}
}
}
void release(cudaStream_t stream, uint32_t gpu_index) {
cuda_drop_async(d_mem, stream, gpu_index);
cuda_drop_async(global_join_buffer, stream, gpu_index);
if (pbs_variant == DEFAULT)
cuda_drop_async(global_accumulator, stream, gpu_index);
}
};
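A minimal lifetime sketch for this buffer (illustrative only; the stream, GPU index and num_samples are placeholders, with the GLWE parameters borrowed from the test parameters added later in this commit):
pbs_buffer_128<PBS_TYPE::CLASSICAL> *buf = new pbs_buffer_128<PBS_TYPE::CLASSICAL>(
    stream, gpu_index, /*glwe_dimension=*/2, /*polynomial_size=*/2048,
    /*level_count=*/3, /*input_lwe_ciphertext_count=*/num_samples,
    PBS_VARIANT::DEFAULT, /*allocate_gpu_memory=*/true);
// ... launch the 128-bit PBS kernels using buf->d_mem, buf->global_accumulator
// and buf->global_join_buffer ...
buf->release(stream, gpu_index); // frees d_mem, global_join_buffer and, for DEFAULT,
                                 // global_accumulator
delete buf;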
template <typename Torus>
uint64_t get_buffer_size_programmable_bootstrap_cg(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,

View File

@@ -20,6 +20,11 @@ void cuda_convert_lwe_programmable_bootstrap_key_64(
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size);
void cuda_convert_lwe_programmable_bootstrap_key_128(
void *stream, uint32_t gpu_index, void *dest, void const *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size);
void scratch_cuda_programmable_bootstrap_amortized_32(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size,
@@ -62,6 +67,11 @@ void scratch_cuda_programmable_bootstrap_64(
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
void scratch_cuda_programmable_bootstrap_128(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
@@ -80,7 +90,19 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride);
void cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride);
void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
int8_t **pbs_buffer);
void cleanup_cuda_programmable_bootstrap_128(void *stream, uint32_t gpu_index,
int8_t **pbs_buffer);
}
#endif // CUDA_BOOTSTRAP_H

View File

@@ -3,6 +3,7 @@
#include "crypto/torus.cuh"
#include "device.h"
#include "fft128/f128.cuh"
#include <cstdint>
/**
@@ -42,6 +43,13 @@ public:
}
}
__device__ void decompose_and_compress_next_128(double *result) {
for (int j = 0; j < num_poly; j++) {
auto result_slice = result + j * params::degree / 2 * 4;
decompose_and_compress_next_polynomial_128(result_slice, j);
}
}
// Decomposes a single polynomial
__device__ void decompose_and_compress_next_polynomial(double2 *result,
int j) {
@@ -75,10 +83,58 @@ public:
synchronize_threads_in_block();
}
// Decomposes a single polynomial
__device__ void decompose_and_compress_next_polynomial_128(double *result,
int j) {
uint32_t tid = threadIdx.x;
auto state_slice = &state[j * params::degree];
for (int i = 0; i < params::opt / 2; i++) {
auto input1 = &state_slice[tid];
auto input2 = &state_slice[tid + params::degree / 2];
T res_re = *input1 & mask_mod_b;
T res_im = *input2 & mask_mod_b;
*input1 >>= base_log; // Update state
*input2 >>= base_log; // Update state
T carry_re = ((res_re - 1ll) | *input1) & res_re;
T carry_im = ((res_im - 1ll) | *input2) & res_im;
carry_re >>= (base_log - 1);
carry_im >>= (base_log - 1);
*input1 += carry_re; // Update state
*input2 += carry_im; // Update state
res_re -= carry_re << base_log;
res_im -= carry_im << base_log;
auto out_re = u128_to_signed_to_f128(res_re);
auto out_im = u128_to_signed_to_f128(res_im);
auto out_re_hi = result + 0 * params::degree / 2;
auto out_re_lo = result + 1 * params::degree / 2;
auto out_im_hi = result + 2 * params::degree / 2;
auto out_im_lo = result + 3 * params::degree / 2;
out_re_hi[tid] = out_re.hi;
out_re_lo[tid] = out_re.lo;
out_im_hi[tid] = out_im.hi;
out_im_lo[tid] = out_im.lo;
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
}
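The carry logic above produces signed digits in [-B/2, B/2] rather than [0, B); a standalone scalar walk-through of one step (assuming base_log = 3, so B = 8 and mask_mod_b = 7):
#include <cassert>
#include <cstdint>
int main() {
  const uint32_t base_log = 3;
  uint64_t state = 0b101110, mask_mod_b = (1ull << base_log) - 1;
  uint64_t res = state & mask_mod_b;             // raw digit: 6
  state >>= base_log;                            // remaining state: 0b101
  uint64_t carry = ((res - 1ull) | state) & res; // bit (base_log - 1) set iff we round up
  carry >>= (base_log - 1);                      // carry = 1 since 6 >= B / 2
  state += carry;                                // propagate the rounding carry
  int64_t digit = (int64_t)res - (int64_t)(carry << base_log); // 6 - 8 = -2
  assert(digit == -2 && state == 0b110);
  return 0;
}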
__device__ void decompose_and_compress_level(double2 *result, int level) {
for (int i = 0; i < level_count - level; i++)
decompose_and_compress_next(result);
}
__device__ void decompose_and_compress_level_128(double *result, int level) {
for (int i = 0; i < level_count - level; i++)
decompose_and_compress_next_128(result);
}
};
template <typename Torus>

View File

@@ -21,7 +21,7 @@ struct alignas(16) f128 {
#else
double s = a + b;
return f128(s, b - (s - a));
#endif;
#endif
}
// Two-sum
@@ -270,7 +270,7 @@ __host__ __device__ inline double bits_to_double(uint64_t bits) {
return d;
}
__host__ __device__ double u128_to_f64(__uint128_t x) {
__host__ __device__ inline double u128_to_f64(__uint128_t x) {
const __uint128_t ONE = 1;
const double A = ONE << 52;
const double B = ONE << 104;
@@ -322,7 +322,7 @@ __host__ __device__ double u128_to_f64(__uint128_t x) {
}
}
__host__ __device__ __uint128_t f64_to_u128(const double f) {
__host__ __device__ inline __uint128_t f64_to_u128(const double f) {
const __uint128_t ONE = 1;
const uint64_t f_bits = double_to_bits(f);
if (f_bits < 1023ull << 52) {
@@ -338,7 +338,7 @@ __host__ __device__ __uint128_t f64_to_u128(const double f) {
}
}
__host__ __device__ __uint128_t f64_to_i128(const double f) {
__host__ __device__ inline __uint128_t f64_to_i128(const double f) {
// Get raw bits of the double
const uint64_t f_bits = double_to_bits(f);
@@ -366,14 +366,14 @@ __host__ __device__ __uint128_t f64_to_i128(const double f) {
return (f_bits >> 63) ? -result : result;
}
__host__ __device__ double i128_to_f64(__int128_t const x) {
__host__ __device__ inline double i128_to_f64(__int128_t const x) {
uint64_t sign = static_cast<uint64_t>(x >> 64) & (1ULL << 63);
__uint128_t abs =
(x < 0) ? static_cast<__uint128_t>(-x) : static_cast<__uint128_t>(x);
return bits_to_double(double_to_bits(u128_to_f64(abs)) | sign);
}
__host__ __device__ f128 u128_to_signed_to_f128(__uint128_t x) {
__host__ __device__ inline f128 u128_to_signed_to_f128(__uint128_t x) {
const double first_approx = i128_to_f64(x);
const uint64_t sign_bit = double_to_bits(first_approx) & (1ull << 63);
const __uint128_t first_approx_roundtrip =
@@ -387,7 +387,7 @@ __host__ __device__ f128 u128_to_signed_to_f128(__uint128_t x) {
return f128(first_approx, correction);
}
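The f128 returned here is a double-double: the value is the unevaluated sum hi + lo, so a 128-bit torus element keeps roughly 2 x 53 bits of precision through the FFT instead of the ~53 bits a single double would carry. A tiny host-side illustration using the names from this file (the input value is an arbitrary placeholder):
__uint128_t x = ((__uint128_t)0x0123456789abcdefULL << 64) | 0xfedcba9876543210ULL;
f128 split = u128_to_signed_to_f128(x); // split.hi holds the leading ~53 bits,
                                        // split.lo the next ~53 bits of correction
double rough = u128_to_f64(x);          // single-double approximation, for comparison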
__host__ __device__ __uint128_t u128_from_torus_f128(const f128 &a) {
__host__ __device__ inline __uint128_t u128_from_torus_f128(const f128 &a) {
auto x = f128::sub_estimate(a, f128::f128_floor(a));
const double normalization = 340282366920938500000000000000000000000.;
#ifdef __CUDA_ARCH__

View File

@@ -272,6 +272,9 @@ batch_convert_u128_to_f128_as_integer(double *out_re_hi, double *out_re_lo,
}
// params is expected to be full degree not half degree
// converts standard input into complex f128 values, each represented by 4 doubles,
// with the following pattern: [re_hi_0, re_hi_1, ..., re_hi_n, re_lo_0, re_lo_1,
// ..., re_lo_n, im_hi_0, im_hi_1, ..., im_hi_n, im_lo_0, im_lo_1, ..., im_lo_n]
template <class params>
__global__ void
batch_convert_u128_to_f128_as_torus(double *out_re_hi, double *out_re_lo,
@@ -287,6 +290,29 @@ batch_convert_u128_to_f128_as_torus(double *out_re_hi, double *out_re_lo,
&in[blockIdx.x * params::degree + params::degree / 2]);
}
// params is expected to be full degree not half degree
// converts standard input into complex f128 values, each represented by 4 doubles,
// stored in one contiguous chunk per polynomial, each chunk laid out as:
// [re_hi_0, ..., re_hi_n, re_lo_0, ..., re_lo_n, im_hi_0, ..., im_hi_n,
//  im_lo_0, ..., im_lo_n]
template <class params>
__global__ void
batch_convert_u128_to_f128_strided_as_torus(double *d_out,
const __uint128_t *d_in) {
constexpr size_t chunk_size = params::degree / 2 * 4;
double *chunk = &d_out[blockIdx.x * chunk_size];
double *out_re_hi = &chunk[0 * params::degree / 2];
double *out_re_lo = &chunk[1 * params::degree / 2];
double *out_im_hi = &chunk[2 * params::degree / 2];
double *out_im_lo = &chunk[3 * params::degree / 2];
convert_u128_to_f128_as_torus<params>(
out_re_hi, out_re_lo, out_im_hi, out_im_lo,
&d_in[blockIdx.x * params::degree],
&d_in[blockIdx.x * params::degree + params::degree / 2]);
}
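A launch sketch for this kernel, following the configuration used by the host code later in this commit (one block per polynomial, params::degree / params::opt threads per block; AmortizedDegree<2048> and the pointer names are example choices):
// Sketch only:
using P = AmortizedDegree<2048>;
batch_convert_u128_to_f128_strided_as_torus<P>
    <<<num_polynomials, P::degree / P::opt, 0, stream>>>(d_out, d_in);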
// params is expected to be full degree not half degree
template <class params>
__global__ void batch_convert_f128_to_u128_as_torus(__uint128_t *out,
@@ -309,7 +335,7 @@ __global__ void
batch_NSMFFT_128(double *in_re_hi, double *in_re_lo, double *in_im_hi,
double *in_im_lo, double *out_re_hi, double *out_re_lo,
double *out_im_hi, double *out_im_lo, double *buffer) {
extern __shared__ double sharedMemoryFFT[];
extern __shared__ double sharedMemoryFFT128[];
double *re_hi, *re_lo, *im_hi, *im_lo;
if (SMD == NOSM) {
@@ -322,10 +348,10 @@ batch_NSMFFT_128(double *in_re_hi, double *in_re_lo, double *in_im_hi,
im_lo =
&buffer[blockIdx.x * params::degree / 2 * 4 + params::degree / 2 * 3];
} else {
re_hi = &sharedMemoryFFT[params::degree / 2 * 0];
re_lo = &sharedMemoryFFT[params::degree / 2 * 1];
im_hi = &sharedMemoryFFT[params::degree / 2 * 2];
im_lo = &sharedMemoryFFT[params::degree / 2 * 3];
re_hi = &sharedMemoryFFT128[params::degree / 2 * 0];
re_lo = &sharedMemoryFFT128[params::degree / 2 * 1];
im_hi = &sharedMemoryFFT128[params::degree / 2 * 2];
im_lo = &sharedMemoryFFT128[params::degree / 2 * 3];
}
Index tid = threadIdx.x;
@@ -356,6 +382,70 @@ batch_NSMFFT_128(double *in_re_hi, double *in_re_lo, double *in_im_hi,
}
}
template <class params, sharedMemDegree SMD>
__global__ void batch_NSMFFT_strided_128(double *d_in, double *d_out,
double *buffer) {
extern __shared__ double sharedMemoryFFT128[];
double *re_hi, *re_lo, *im_hi, *im_lo;
if (SMD == NOSM) {
re_hi =
&buffer[blockIdx.x * params::degree / 2 * 4 + params::degree / 2 * 0];
re_lo =
&buffer[blockIdx.x * params::degree / 2 * 4 + params::degree / 2 * 1];
im_hi =
&buffer[blockIdx.x * params::degree / 2 * 4 + params::degree / 2 * 2];
im_lo =
&buffer[blockIdx.x * params::degree / 2 * 4 + params::degree / 2 * 3];
} else {
re_hi = &sharedMemoryFFT128[params::degree / 2 * 0];
re_lo = &sharedMemoryFFT128[params::degree / 2 * 1];
im_hi = &sharedMemoryFFT128[params::degree / 2 * 2];
im_lo = &sharedMemoryFFT128[params::degree / 2 * 3];
}
constexpr size_t chunk_size = params::degree / 2 * 4;
double *chunk = &d_in[blockIdx.x * chunk_size];
double *tmp_re_hi = &chunk[0 * params::degree / 2];
double *tmp_re_lo = &chunk[1 * params::degree / 2];
double *tmp_im_hi = &chunk[2 * params::degree / 2];
double *tmp_im_lo = &chunk[3 * params::degree / 2];
Index tid = threadIdx.x;
#pragma unroll
for (Index i = 0; i < params::opt / 2; ++i) {
re_hi[tid] = tmp_re_hi[tid];
re_lo[tid] = tmp_re_lo[tid];
im_hi[tid] = tmp_im_hi[tid];
im_lo[tid] = tmp_im_lo[tid];
tid += params::degree / params::opt;
}
__syncthreads();
if constexpr (params::fft_direction == 1) {
negacyclic_backward_fft_f128<HalfDegree<params>>(re_hi, re_lo, im_hi,
im_lo);
} else {
negacyclic_forward_fft_f128<HalfDegree<params>>(re_hi, re_lo, im_hi, im_lo);
}
__syncthreads();
chunk = &d_out[blockIdx.x * chunk_size];
tmp_re_hi = &chunk[0 * params::degree / 2];
tmp_re_lo = &chunk[1 * params::degree / 2];
tmp_im_hi = &chunk[2 * params::degree / 2];
tmp_im_lo = &chunk[3 * params::degree / 2];
tid = threadIdx.x;
#pragma unroll
for (Index i = 0; i < params::opt / 2; ++i) {
tmp_re_hi[tid] = re_hi[tid];
tmp_re_lo[tid] = re_lo[tid];
tmp_im_hi[tid] = im_hi[tid];
tmp_im_lo[tid] = im_lo[tid];
tid += params::degree / params::opt;
}
}
template <class params>
__host__ void host_fourier_transform_forward_as_integer_f128(
cudaStream_t stream, uint32_t gpu_index, double *re0, double *re1,

View File

@@ -22,6 +22,18 @@ void cuda_convert_lwe_programmable_bootstrap_key_64(
(const int64_t *)src, polynomial_size, total_polynomials);
}
void cuda_convert_lwe_programmable_bootstrap_key_128(
void *stream, uint32_t gpu_index, void *dest, void const *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size) {
uint32_t total_polynomials =
input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * level_count;
cuda_convert_lwe_programmable_bootstrap_key_u128(
static_cast<cudaStream_t>(stream), gpu_index, (double *)dest,
(const __uint128_t *)src, polynomial_size, total_polynomials);
}
void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
void *stream, uint32_t gpu_index, void *dest, void const *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,

View File

@@ -3,6 +3,8 @@
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft128/fft128.cuh"
#include "pbs/programmable_bootstrap.h"
#include "pbs/programmable_bootstrap_multibit.h"
#include "polynomial/parameters.cuh"
@@ -16,6 +18,13 @@ __device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
level_count;
}
__device__ inline int get_start_ith_ggsw_128(int i, uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count) {
return i * polynomial_size / 2 * 4 * (glwe_dimension + 1) *
(glwe_dimension + 1) * level_count;
}
////////////////////////////////////////////////
template <typename T>
__device__ const T *get_ith_mask_kth_block(const T *ptr, int i, int k,
@@ -39,6 +48,31 @@ __device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
(glwe_dimension + 1) * (glwe_dimension + 1) +
k * polynomial_size / 2 * (glwe_dimension + 1)];
}
template <typename T>
__device__ const T *
get_ith_mask_kth_block_128(const T *ptr, int i, int k, int level,
uint32_t polynomial_size, int glwe_dimension,
uint32_t level_count) {
return &ptr[get_start_ith_ggsw_128(i, polynomial_size, glwe_dimension,
level_count) +
(level_count - level - 1) * polynomial_size / 2 * 4 *
(glwe_dimension + 1) * (glwe_dimension + 1) +
k * polynomial_size / 2 * 4 * (glwe_dimension + 1)];
}
template <typename T>
__device__ T *get_ith_mask_kth_block_128(T *ptr, int i, int k, int level,
uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count) {
return &ptr[get_start_ith_ggsw_128(i, polynomial_size, glwe_dimension,
level_count) +
(level_count - level - 1) * polynomial_size / 2 * 4 *
(glwe_dimension + 1) * (glwe_dimension + 1) +
k * polynomial_size / 2 * 4 * (glwe_dimension + 1)];
}
template <typename T>
__device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level,
uint32_t polynomial_size,
@@ -251,4 +285,93 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
cudaFreeHost(h_bsk);
}
template <class params>
void convert_u128_to_f128_and_forward_fft_128(cudaStream_t stream,
uint32_t gpu_index, double *d_bsk,
__uint128_t const *d_standard,
uint32_t number_of_samples) {
cuda_set_device(gpu_index);
size_t required_shared_memory_size = sizeof(double) * params::degree / 2 * 4;
int grid_size = number_of_samples;
int block_size = params::degree / params::opt;
bool full_sm =
(required_shared_memory_size <= cuda_get_max_shared_memory(gpu_index));
size_t buffer_size =
full_sm ? 0 : (size_t)number_of_samples * params::degree / 2 * 4;
size_t shared_memory_size = full_sm ? required_shared_memory_size : 0;
double *buffer = (double *)cuda_malloc_async(buffer_size, stream, gpu_index);
// configure shared memory for batch fft kernel
if (full_sm) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT_strided_128<FFTDegree<params, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_NSMFFT_strided_128<FFTDegree<params, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
}
// convert u128 into 4 x double
batch_convert_u128_to_f128_strided_as_torus<params>
<<<grid_size, block_size, 0, stream>>>(d_bsk, d_standard);
// call negacyclic 128 bit forward fft.
if (full_sm) {
batch_NSMFFT_strided_128<FFTDegree<params, ForwardFFT>, FULLSM>
<<<grid_size, block_size, shared_memory_size, stream>>>(d_bsk, d_bsk,
buffer);
} else {
batch_NSMFFT_strided_128<FFTDegree<params, ForwardFFT>, NOSM>
<<<grid_size, block_size, shared_memory_size, stream>>>(d_bsk, d_bsk,
buffer);
}
cuda_drop_async(buffer, stream, gpu_index);
}
inline void cuda_convert_lwe_programmable_bootstrap_key_u128(
cudaStream_t stream, uint32_t gpu_index, double *dest,
__uint128_t const *src, uint32_t polynomial_size,
uint32_t total_polynomials) {
// Here the buffer size is sizeof(double) times the number of polynomials times
// 4 times polynomial_size / 2: each polynomial is represented in the complex
// domain by 4 double arrays of size polynomial_size / 2, on which the FFT runs
size_t buffer_size =
total_polynomials * polynomial_size / 2 * sizeof(double) * 4;
__uint128_t *d_standard =
(__uint128_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
cuda_memcpy_async_to_gpu(d_standard, src, buffer_size, stream, gpu_index);
switch (polynomial_size) {
case 256:
convert_u128_to_f128_and_forward_fft_128<AmortizedDegree<256>>(
stream, gpu_index, dest, d_standard, total_polynomials);
break;
case 512:
convert_u128_to_f128_and_forward_fft_128<AmortizedDegree<512>>(
stream, gpu_index, dest, d_standard, total_polynomials);
break;
case 1024:
convert_u128_to_f128_and_forward_fft_128<AmortizedDegree<1024>>(
stream, gpu_index, dest, d_standard, total_polynomials);
break;
case 2048:
convert_u128_to_f128_and_forward_fft_128<AmortizedDegree<2048>>(
stream, gpu_index, dest, d_standard, total_polynomials);
break;
case 4096:
convert_u128_to_f128_and_forward_fft_128<AmortizedDegree<4096>>(
stream, gpu_index, dest, d_standard, total_polynomials);
break;
default:
PANIC("Cuda error (convert BSK): unsupported polynomial size. Supported "
"N's are powers of two in the interval [256..4096].")
}
cuda_drop_async(d_standard, stream, gpu_index);
}
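A worked size example for this staging path (assuming the FFT128_U128_GPU_PARAMS test parameters added later in this commit: n = 879, k = 2, l = 3, N = 2048):
// total_polynomials = 879 * (2 + 1) * (2 + 1) * 3        = 23733
// bytes per polynomial = 2048 / 2 * sizeof(double) * 4   = 32768
// buffer_size = 23733 * 32768 bytes                      ~ 742 MiB
// Note that 32768 also equals 2048 * sizeof(__uint128_t), so d_standard and the
// Fourier-domain destination occupy the same number of bytes per polynomial.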
#endif // CNCRT_BSK_H

View File

@@ -0,0 +1,205 @@
#include "programmable_bootstrap_classic_128.cuh"
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the PBS on 128-bit inputs, into `buffer`. It also configures SM options on
* the GPU in case FULLSM or PARTIALSM mode is going to be used.
*/
void scratch_cuda_programmable_bootstrap_128(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
auto buffer = (pbs_buffer_128<CLASSICAL> **)pbs_buffer;
switch (polynomial_size) {
case 256:
scratch_programmable_bootstrap_128<AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
break;
case 512:
scratch_programmable_bootstrap_128<AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
break;
case 1024:
scratch_programmable_bootstrap_128<AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
break;
case 2048:
scratch_programmable_bootstrap_128<AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
break;
case 4096:
scratch_programmable_bootstrap_128<AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
break;
default:
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
"Supported N's are powers of two"
" in the interval [256..4096].")
}
}
template <typename Torus>
void executor_cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, double const *bootstrapping_key,
pbs_buffer_128<CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride) {
switch (polynomial_size) {
case 256:
host_programmable_bootstrap_128<AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_many_lut, lut_stride);
break;
case 512:
host_programmable_bootstrap_128<AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_many_lut, lut_stride);
break;
case 1024:
host_programmable_bootstrap_128<AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_many_lut, lut_stride);
break;
case 2048:
host_programmable_bootstrap_128<AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_many_lut, lut_stride);
break;
case 4096:
host_programmable_bootstrap_128<AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_many_lut, lut_stride);
break;
default:
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
"Supported N's are powers of two"
" in the interval [256..4096].")
}
}
/* Perform bootstrapping on a batch of input u128 LWE ciphertexts.
*
* - `stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - lwe_array_out: output batch of num_samples bootstrapped ciphertexts c =
* (a0,..an-1,b) where n is the LWE dimension
* - lut_vector: should hold as many luts of size polynomial_size
* as there are input ciphertexts, but actually holds
* num_luts vectors to reduce memory usage
* - lut_vector_indexes: stores the index corresponding to
* which lut to use for each sample in
* lut_vector
* - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n
* mask values + 1 body value
* - bootstrapping_key: GGSW encryption of the LWE secret key sk1
* under secret key sk2
* bsk = Z + sk1 H
* where H is the gadget matrix and Z is a matrix (k+1).l
* containing GLWE encryptions of 0 under sk2.
* bsk is thus a tensor of size (k+1)^2.l.N.n
* where l is the number of decomposition levels and
* k is the GLWE dimension, N is the polynomial size for
* GLWE. The polynomial size for GLWE and the lut
* are the same because they have to be in the same ring
* to be multiplied.
* - lwe_dimension: size of the Torus vector used to encrypt the input
* LWE ciphertexts - referred to as n above (~ 600)
* - glwe_dimension: size of the polynomial vector used to encrypt the LUT
* GLWE ciphertexts - referred to as k above. Only the value 1 is supported for
* this parameter.
* - polynomial_size: size of the test polynomial (lut) and size of the
* GLWE polynomial (~1024)
* - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
* - level_count: number of decomposition levels in the gadget matrix (~4)
* - num_samples: number of encrypted input messages
*
* This function calls a wrapper to a device kernel that performs the
* bootstrapping:
* - the kernel is templatized based on integer discretization and
* polynomial degree
* - num_samples * level_count * (glwe_dimension + 1) blocks of threads are
* launched, where each thread is going to handle one or more polynomial
* coefficients at each stage, for a given level of decomposition, either for
* the LUT mask or its body:
* - perform the blind rotation
* - round the result
* - get the decomposition for the current level
* - switch to the FFT domain
* - multiply with the bootstrapping key
* - come back to the coefficients representation
* - between each stage a synchronization of the threads is necessary (some
* synchronizations happen at the block level, some happen between blocks, using
* cooperative groups).
* - in case the device has enough shared memory, temporary arrays used for
* the different stages (accumulators) are stored into the shared memory
* - the accumulators serve to combine the results for all decomposition
* levels
* - the constant memory (64K) is used for storing the roots of identity
* values for the FFT
*/
void cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride) {
if (base_log > 64)
PANIC("Cuda error (classical PBS): base log should be <= 64")
pbs_buffer_128<CLASSICAL> *buffer = (pbs_buffer_128<CLASSICAL> *)mem_ptr;
executor_cuda_programmable_bootstrap_lwe_ciphertext_vector_128<__uint128_t>(
stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
static_cast<const __uint128_t *>(lwe_output_indexes),
static_cast<const __uint128_t *>(lut_vector),
static_cast<const __uint128_t *>(lut_vector_indexes),
static_cast<const __uint128_t *>(lwe_array_in),
static_cast<const __uint128_t *>(lwe_input_indexes),
static_cast<const double *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_many_lut, lut_stride);
}
/*
* This cleanup function frees the data on GPU for the 128-bit PBS buffer.
*/
void cleanup_cuda_programmable_bootstrap_128(void *stream, uint32_t gpu_index,
int8_t **buffer) {
auto x = (pbs_buffer_128<CLASSICAL> *)(*buffer);
x->release(static_cast<cudaStream_t>(stream), gpu_index);
}
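Taken together, the new C entry points declared in programmable_bootstrap.h are expected to be called in this order from the host. A minimal sketch (device pointers, index arrays and the stream are assumed to be prepared by the caller; all names other than the four API functions are placeholders):
int8_t *pbs_mem = nullptr;
cuda_convert_lwe_programmable_bootstrap_key_128(
    stream, gpu_index, d_fourier_bsk, d_std_bsk, lwe_dimension, glwe_dimension,
    level_count, polynomial_size);
scratch_cuda_programmable_bootstrap_128(
    stream, gpu_index, &pbs_mem, glwe_dimension, polynomial_size, level_count,
    num_samples, /*allocate_gpu_memory=*/true);
cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
    stream, gpu_index, d_lwe_out, d_out_indexes, d_lut, d_lut_indexes,
    d_lwe_in, d_in_indexes, d_fourier_bsk, pbs_mem, lwe_dimension,
    glwe_dimension, polynomial_size, base_log, level_count, num_samples,
    /*num_many_lut=*/1, /*lut_stride=*/0);
cleanup_cuda_programmable_bootstrap_128(stream, gpu_index, &pbs_mem);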

View File

@@ -0,0 +1,488 @@
#ifndef CUDA_PBS_CUH_128
#define CUDA_PBS_CUH_128
#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#endif
#include "crypto/gadget.cuh"
#include "crypto/torus.cuh"
#include "device.h"
#include "fft128/fft128.cuh"
#include "pbs/bootstrapping_key.cuh"
#include "pbs/pbs_utilities.h"
#include "pbs/programmable_bootstrap.h"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial_math.cuh"
#include "types/complex/operations.cuh"
template <typename Torus, class params, sharedMemDegree SMD>
__global__ void __launch_bounds__(params::degree / params::opt)
device_programmable_bootstrap_step_one_128(
const Torus *__restrict__ lut_vector,
const Torus *__restrict__ lut_vector_indexes,
const Torus *__restrict__ lwe_array_in,
const Torus *__restrict__ lwe_input_indexes,
const double *__restrict__ bootstrapping_key, Torus *global_accumulator,
double *global_join_buffer, uint32_t lwe_iteration,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *device_mem,
uint64_t device_memory_size_per_block) {
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
// much faster than global memory
extern __shared__ int8_t sharedmem[];
int8_t *selected_memory;
uint32_t glwe_dimension = gridDim.y - 1;
if constexpr (SMD == FULLSM) {
selected_memory = sharedmem;
} else {
int block_index = blockIdx.z + blockIdx.y * gridDim.z +
blockIdx.x * gridDim.z * gridDim.y;
selected_memory = &device_mem[block_index * device_memory_size_per_block];
}
Torus *accumulator = (Torus *)selected_memory;
double *accumulator_fft =
(double *)accumulator +
(ptrdiff_t)(sizeof(Torus) * polynomial_size / sizeof(double));
if constexpr (SMD == PARTIALSM)
accumulator_fft = (double *)sharedmem;
// The third dimension of the block is used to determine on which ciphertext
// this block is operating, in the case of batch bootstraps
const Torus *block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
const Torus *block_lut_vector =
&lut_vector[lut_vector_indexes[blockIdx.x] * params::degree *
(glwe_dimension + 1)];
Torus *global_slice =
global_accumulator +
(blockIdx.y + blockIdx.x * (glwe_dimension + 1)) * params::degree;
double *global_fft_slice =
global_join_buffer + (blockIdx.y + blockIdx.z * (glwe_dimension + 1) +
blockIdx.x * level_count * (glwe_dimension + 1)) *
(polynomial_size / 2) * 4;
if (lwe_iteration == 0) {
// First iteration
// Put "b" in [0, 2N[
Torus b_hat = 0;
modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
params::log2_degree + 1);
// The y-dimension is used to select the element of the GLWE this block will
// compute
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
false);
// Persist
int tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
global_slice[tid] = accumulator[tid];
tid += params::degree / params::opt;
}
}
// Put "a" in [0, 2N[
Torus a_hat = 0;
modulus_switch(block_lwe_array_in[lwe_iteration], a_hat,
params::log2_degree + 1); // 2 * params::log2_degree + 1);
synchronize_threads_in_block();
// Perform ACC * (X^{a_hat} - 1)
multiply_by_monomial_negacyclic_and_sub_polynomial<
Torus, params::opt, params::degree / params::opt>(global_slice,
accumulator, a_hat);
// Perform a rounding to increase the accuracy of the
// bootstrapped ciphertext
init_decomposer_state_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, base_log, level_count);
synchronize_threads_in_block();
// Decompose the accumulator. Each block gets one level of the
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
gadget_acc.decompose_and_compress_level_128(accumulator_fft, blockIdx.z);
// We are using the same memory space for accumulator_fft and
// accumulator_rotated, so we need to synchronize here to make sure they
// don't modify the same memory space at the same time
// Switch to the FFT space
auto acc_fft_re_hi = accumulator_fft + 0 * params::degree / 2;
auto acc_fft_re_lo = accumulator_fft + 1 * params::degree / 2;
auto acc_fft_im_hi = accumulator_fft + 2 * params::degree / 2;
auto acc_fft_im_lo = accumulator_fft + 3 * params::degree / 2;
auto global_fft_re_hi = global_fft_slice + 0 * params::degree / 2;
auto global_fft_re_lo = global_fft_slice + 1 * params::degree / 2;
auto global_fft_im_hi = global_fft_slice + 2 * params::degree / 2;
auto global_fft_im_lo = global_fft_slice + 3 * params::degree / 2;
negacyclic_forward_fft_f128<HalfDegree<params>>(acc_fft_re_hi, acc_fft_re_lo,
acc_fft_im_hi, acc_fft_im_lo);
int tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
global_fft_re_hi[tid] = acc_fft_re_hi[tid];
global_fft_re_lo[tid] = acc_fft_re_lo[tid];
global_fft_im_hi[tid] = acc_fft_im_hi[tid];
global_fft_im_lo[tid] = acc_fft_im_lo[tid];
tid += params::degree / params::opt;
}
}
template <typename Torus, class params, sharedMemDegree SMD>
__global__ void __launch_bounds__(params::degree / params::opt)
device_programmable_bootstrap_step_two_128(
Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
const Torus *__restrict__ lut_vector,
const Torus *__restrict__ lut_vector_indexes,
const double *__restrict__ bootstrapping_key, Torus *global_accumulator,
double *global_join_buffer, uint32_t lwe_iteration,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *device_mem,
uint64_t device_memory_size_per_block, uint32_t num_many_lut,
uint32_t lut_stride) {
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
// much faster than global memory
extern __shared__ int8_t sharedmem[];
int8_t *selected_memory;
uint32_t glwe_dimension = gridDim.y - 1;
if constexpr (SMD == FULLSM) {
selected_memory = sharedmem;
} else {
int block_index = blockIdx.x + blockIdx.y * gridDim.x +
blockIdx.z * gridDim.x * gridDim.y;
selected_memory = &device_mem[block_index * device_memory_size_per_block];
}
// We always compute the pointer with most restrictive alignment to avoid
// alignment issues
Torus *accumulator = (Torus *)selected_memory;
double *accumulator_fft =
(double *)accumulator +
(ptrdiff_t)(sizeof(Torus) * params::degree / sizeof(double));
if constexpr (SMD == PARTIALSM)
accumulator_fft = (double *)sharedmem;
for (int level = 0; level < level_count; level++) {
double *global_fft_slice =
global_join_buffer + (level + blockIdx.x * level_count) *
(glwe_dimension + 1) * (params::degree / 2) *
4;
for (int j = 0; j < (glwe_dimension + 1); j++) {
double *fft = global_fft_slice + j * params::degree / 2 * 4;
// Get the bootstrapping key piece necessary for the multiplication
// It is already in the Fourier domain
auto bsk_slice = get_ith_mask_kth_block_128(
bootstrapping_key, lwe_iteration, j, level, polynomial_size,
glwe_dimension, level_count);
auto bsk_poly = bsk_slice + blockIdx.y * params::degree / 2 * 4;
polynomial_product_accumulate_in_fourier_domain_128<params>(
accumulator_fft, fft, bsk_poly, !level && !j);
}
}
Torus *global_slice =
global_accumulator +
(blockIdx.y + blockIdx.x * (glwe_dimension + 1)) * params::degree;
// Load the persisted accumulator
int tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
accumulator[tid] = global_slice[tid];
tid += params::degree / params::opt;
}
// Perform the inverse FFT on the result of the GGSW x GLWE and add to the
// accumulator
auto acc_fft_re_hi = accumulator_fft + 0 * params::degree / 2;
auto acc_fft_re_lo = accumulator_fft + 1 * params::degree / 2;
auto acc_fft_im_hi = accumulator_fft + 2 * params::degree / 2;
auto acc_fft_im_lo = accumulator_fft + 3 * params::degree / 2;
negacyclic_backward_fft_f128<HalfDegree<params>>(
acc_fft_re_hi, acc_fft_re_lo, acc_fft_im_hi, acc_fft_im_lo);
add_to_torus_128<Torus, params>(acc_fft_re_hi, acc_fft_re_lo, acc_fft_im_hi,
acc_fft_im_lo, accumulator);
if (lwe_iteration + 1 == lwe_dimension) {
// Last iteration
auto block_lwe_array_out =
&lwe_array_out[lwe_output_indexes[blockIdx.x] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
if (blockIdx.y < glwe_dimension) {
// Perform a sample extract. At this point, all blocks have the result,
// but we do the computation at block 0 to avoid waiting for extra blocks,
// in case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
if (num_many_lut > 1) {
for (int i = 1; i < num_many_lut; i++) {
auto next_lwe_array_out =
lwe_array_out +
(i * gridDim.x * (glwe_dimension * polynomial_size + 1));
auto next_block_lwe_array_out =
&next_lwe_array_out[lwe_output_indexes[blockIdx.x] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
sample_extract_mask<Torus, params>(next_block_lwe_array_out,
accumulator, 1, i * lut_stride);
}
}
} else if (blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
if (num_many_lut > 1) {
for (int i = 1; i < num_many_lut; i++) {
auto next_lwe_array_out =
lwe_array_out +
(i * gridDim.x * (glwe_dimension * polynomial_size + 1));
auto next_block_lwe_array_out =
&next_lwe_array_out[lwe_output_indexes[blockIdx.x] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
sample_extract_body<Torus, params>(next_block_lwe_array_out,
accumulator, 0, i * lut_stride);
}
}
}
} else {
// Persist the updated accumulator
tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
global_slice[tid] = accumulator[tid];
tid += params::degree / params::opt;
}
}
}
template <typename params>
__host__ void scratch_programmable_bootstrap_128(
cudaStream_t stream, uint32_t gpu_index, pbs_buffer_128<CLASSICAL> **buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
cuda_set_device(gpu_index);
uint64_t full_sm_step_one =
get_buffer_size_full_sm_programmable_bootstrap_step_one<__uint128_t>(
polynomial_size);
uint64_t full_sm_step_two =
get_buffer_size_full_sm_programmable_bootstrap_step_two<__uint128_t>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_programmable_bootstrap<__uint128_t>(
polynomial_size);
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
// Configure step one
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_one) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_step_one_128<__uint128_t, params,
PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_step_one_128<__uint128_t, params,
PARTIALSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory >= partial_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_step_one_128<__uint128_t, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_one));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_step_one_128<__uint128_t, params, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
// Configure step two
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_two) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_step_two_128<__uint128_t, params,
PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_step_two_128<__uint128_t, params,
PARTIALSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory >= partial_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_step_two_128<__uint128_t, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_two));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_step_two_128<__uint128_t, params, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
*buffer = new pbs_buffer_128<CLASSICAL>(
stream, gpu_index, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, PBS_VARIANT::DEFAULT, allocate_gpu_memory);
}
template <class params>
__host__ void execute_step_one_128(
cudaStream_t stream, uint32_t gpu_index, __uint128_t const *lut_vector,
__uint128_t const *lut_vector_indexes, __uint128_t const *lwe_array_in,
__uint128_t const *lwe_input_indexes, double const *bootstrapping_key,
__uint128_t *global_accumulator, double *global_join_buffer,
uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm,
uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm) {
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
cuda_set_device(gpu_index);
int thds = polynomial_size / params::opt;
dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1, level_count);
if (max_shared_memory < partial_sm) {
device_programmable_bootstrap_step_one_128<__uint128_t, params, NOSM>
<<<grid, thds, 0, stream>>>(
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, global_accumulator, global_join_buffer,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, full_dm);
} else if (max_shared_memory < full_sm) {
device_programmable_bootstrap_step_one_128<__uint128_t, params, PARTIALSM>
<<<grid, thds, partial_sm, stream>>>(
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, global_accumulator, global_join_buffer,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, partial_dm);
} else {
device_programmable_bootstrap_step_one_128<__uint128_t, params, FULLSM>
<<<grid, thds, full_sm, stream>>>(
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, global_accumulator, global_join_buffer,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, 0);
}
check_cuda_error(cudaGetLastError());
}
template <class params>
__host__ void execute_step_two_128(
cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
__uint128_t const *lwe_output_indexes, __uint128_t const *lut_vector,
__uint128_t const *lut_vector_indexes, double const *bootstrapping_key,
__uint128_t *global_accumulator, double *global_join_buffer,
uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm,
uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm,
uint32_t num_many_lut, uint32_t lut_stride) {
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
cuda_set_device(gpu_index);
int thds = polynomial_size / params::opt;
dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1);
if (max_shared_memory < partial_sm) {
device_programmable_bootstrap_step_two_128<__uint128_t, params, NOSM>
<<<grid, thds, 0, stream>>>(
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
bootstrapping_key, global_accumulator, global_join_buffer,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, full_dm, num_many_lut, lut_stride);
} else if (max_shared_memory < full_sm) {
device_programmable_bootstrap_step_two_128<__uint128_t, params, PARTIALSM>
<<<grid, thds, partial_sm, stream>>>(
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
bootstrapping_key, global_accumulator, global_join_buffer,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, partial_dm, num_many_lut, lut_stride);
} else {
device_programmable_bootstrap_step_two_128<__uint128_t, params, FULLSM>
<<<grid, thds, full_sm, stream>>>(
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
bootstrapping_key, global_accumulator, global_join_buffer,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, 0, num_many_lut, lut_stride);
}
check_cuda_error(cudaGetLastError());
}
/*
* Host wrapper to the 128-bit programmable bootstrap
*/
template <class params>
__host__ void host_programmable_bootstrap_128(
cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
__uint128_t const *lwe_output_indexes, __uint128_t const *lut_vector,
__uint128_t const *lut_vector_indexes, __uint128_t const *lwe_array_in,
__uint128_t const *lwe_input_indexes, double const *bootstrapping_key,
pbs_buffer_128<CLASSICAL> *pbs_buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
uint32_t num_many_lut, uint32_t lut_stride) {
cuda_set_device(gpu_index);
// With SM each block corresponds to either the mask or body, no need to
// duplicate data for each
uint64_t full_sm_step_one =
get_buffer_size_full_sm_programmable_bootstrap_step_one<__uint128_t>(
polynomial_size);
uint64_t full_sm_step_two =
get_buffer_size_full_sm_programmable_bootstrap_step_two<__uint128_t>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_programmable_bootstrap<__uint128_t>(
polynomial_size);
uint64_t partial_dm_step_one = full_sm_step_one - partial_sm;
uint64_t partial_dm_step_two = full_sm_step_two - partial_sm;
uint64_t full_dm_step_one = full_sm_step_one;
uint64_t full_dm_step_two = full_sm_step_two;
__uint128_t *global_accumulator = pbs_buffer->global_accumulator;
double *global_join_buffer = pbs_buffer->global_join_buffer;
int8_t *d_mem = pbs_buffer->d_mem;
for (int i = 0; i < lwe_dimension; i++) {
execute_step_one_128<params>(
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, global_accumulator,
global_join_buffer, input_lwe_ciphertext_count, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, d_mem, i,
partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one);
execute_step_two_128<params>(
stream, gpu_index, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, bootstrapping_key, global_accumulator,
global_join_buffer, input_lwe_ciphertext_count, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, d_mem, i,
partial_sm, partial_dm_step_two, full_sm_step_two, full_dm_step_two,
num_many_lut, lut_stride);
}
}
#endif // CUDA_PBS_CUH_128

View File

@@ -3,6 +3,7 @@
#include "crypto/torus.cuh"
#include "device.h"
#include "fft128/f128.cuh"
#include "parameters.cuh"
// Return A if C == 0 and B if C == 1
@@ -172,6 +173,35 @@ __device__ void add_to_torus(double2 *m_values, Torus *result,
}
}
/**
* In case of classical PBS, this method should accumulate the result.
* In case of multi-bit PBS, it should overwrite.
*/
template <typename Torus, class params>
__device__ void add_to_torus_128(double *re_hi, double *re_lo, double *im_hi,
double *im_lo, Torus *result,
bool overwrite_result = false) {
constexpr double normalization = 1. / (params::degree / 2);
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
f128 f128_real(re_hi[tid] * normalization, re_lo[tid] * normalization);
f128 f128_imag(im_hi[tid] * normalization, im_lo[tid] * normalization);
Torus torus_real = u128_from_torus_f128(f128_real);
Torus torus_imag = u128_from_torus_f128(f128_imag);
if (overwrite_result) {
result[tid] = torus_real;
result[tid + params::degree / 2] = torus_imag;
} else {
result[tid] += torus_real;
result[tid + params::degree / 2] += torus_imag;
}
tid = tid + params::degree / params::opt;
}
}
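The 1. / (params::degree / 2) factor undoes the scaling of the size-N/2 negacyclic FFT; because it is a power of two, applying it to each limb of the double-double separately introduces no rounding error before the result is mapped back onto the 128-bit torus:
// Example: for params::degree = 2048, normalization = 1. / 1024 == 0x1p-10.
// Scaling re_hi/re_lo (and im_hi/im_lo) by a power of two is exact, and
// u128_from_torus_f128 then reduces the value modulo 1 and rounds it to a u128.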
// Extracts the body of the nth-LWE in a GLWE.
template <typename Torus, class params>
__device__ void sample_extract_body(Torus *lwe_array_out, Torus const *glwe,

View File

@@ -56,6 +56,64 @@ __device__ void polynomial_product_accumulate_in_fourier_domain(
}
}
// Computes result += first * second
// If init_accumulator is set, assumes that result was not initialized and does
// that with the outcome of first * second
template <class params>
__device__ void polynomial_product_accumulate_in_fourier_domain_128(
double *result, double *first, const double *second,
bool init_accumulator = false) {
int tid = threadIdx.x;
if (init_accumulator) {
for (int i = 0; i < params::opt / 2; i++) {
f128 a_re(first[tid + 0 * params::degree / 2],
first[tid + 1 * params::degree / 2]);
f128 a_im(first[tid + 2 * params::degree / 2],
first[tid + 3 * params::degree / 2]);
f128 b_re(second[tid + 0 * params::degree / 2],
second[tid + 1 * params::degree / 2]);
f128 b_im(second[tid + 2 * params::degree / 2],
second[tid + 3 * params::degree / 2]);
f128 c_re, c_im;
f128::cplx_f128_mul_assign(c_re, c_im, a_re, a_im, b_re, b_im);
result[tid + 0 * params::degree / 2] = c_re.hi;
result[tid + 1 * params::degree / 2] = c_re.lo;
result[tid + 2 * params::degree / 2] = c_im.hi;
result[tid + 3 * params::degree / 2] = c_im.lo;
tid += params::degree / params::opt;
}
} else {
for (int i = 0; i < params::opt / 2; i++) {
f128 a_re(first[tid + 0 * params::degree / 2],
first[tid + 1 * params::degree / 2]);
f128 a_im(first[tid + 2 * params::degree / 2],
first[tid + 3 * params::degree / 2]);
f128 b_re(second[tid + 0 * params::degree / 2],
second[tid + 1 * params::degree / 2]);
f128 b_im(second[tid + 2 * params::degree / 2],
second[tid + 3 * params::degree / 2]);
f128 res_re(result[tid + 0 * params::degree / 2],
result[tid + 1 * params::degree / 2]);
f128 res_im(result[tid + 2 * params::degree / 2],
result[tid + 3 * params::degree / 2]);
f128 c_re, c_im;
f128::cplx_f128_mul_assign(c_re, c_im, a_re, a_im, b_re, b_im);
f128::cplx_f128_add_assign(res_re, res_im, res_re, res_im, c_re, c_im);
result[tid + 0 * params::degree / 2] = res_re.hi;
result[tid + 1 * params::degree / 2] = res_re.lo;
result[tid + 2 * params::degree / 2] = res_im.hi;
result[tid + 3 * params::degree / 2] = res_im.lo;
tid += params::degree / params::opt;
}
}
}
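Each Fourier coefficient is a complex number whose real and imaginary parts are double-doubles; the helper above multiplies and accumulates them with the usual complex-product formula carried out in f128 arithmetic. A scalar sketch (coefficient values are arbitrary placeholders; the helpers are those used above):
f128 a_re(1.0, 0.0), a_im(0.5, 0.0);  // one accumulator coefficient
f128 b_re(2.0, 0.0), b_im(-1.0, 0.0); // one bootstrapping-key coefficient
f128 c_re, c_im;
// (a_re + i*a_im) * (b_re + i*b_im)
//   = (a_re*b_re - a_im*b_im) + i*(a_re*b_im + a_im*b_re)
f128::cplx_f128_mul_assign(c_re, c_im, a_re, a_im, b_re, b_im);
f128 res_re(0.0, 0.0), res_im(0.0, 0.0);
f128::cplx_f128_add_assign(res_re, res_im, res_re, res_im, c_re, c_im); // result += c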
// Computes result += x
// If init_accumulator is set, assumes that result was not initialized and
// overwrites it with x

View File

@@ -1416,6 +1416,18 @@ unsafe extern "C" {
polynomial_size: u32,
);
}
unsafe extern "C" {
pub fn cuda_convert_lwe_programmable_bootstrap_key_128(
stream: *mut ffi::c_void,
gpu_index: u32,
dest: *mut ffi::c_void,
src: *const ffi::c_void,
input_lwe_dim: u32,
glwe_dim: u32,
level_count: u32,
polynomial_size: u32,
);
}
unsafe extern "C" {
pub fn scratch_cuda_programmable_bootstrap_amortized_32(
stream: *mut ffi::c_void,
@@ -1509,6 +1521,18 @@ unsafe extern "C" {
allocate_gpu_memory: bool,
);
}
unsafe extern "C" {
pub fn scratch_cuda_programmable_bootstrap_128(
stream: *mut ffi::c_void,
gpu_index: u32,
buffer: *mut *mut i8,
glwe_dimension: u32,
polynomial_size: u32,
level_count: u32,
input_lwe_ciphertext_count: u32,
allocate_gpu_memory: bool,
);
}
unsafe extern "C" {
pub fn cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
stream: *mut ffi::c_void,
@@ -1553,6 +1577,28 @@ unsafe extern "C" {
lut_stride: u32,
);
}
unsafe extern "C" {
pub fn cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
stream: *mut ffi::c_void,
gpu_index: u32,
lwe_array_out: *mut ffi::c_void,
lwe_output_indexes: *const ffi::c_void,
lut_vector: *const ffi::c_void,
lut_vector_indexes: *const ffi::c_void,
lwe_array_in: *const ffi::c_void,
lwe_input_indexes: *const ffi::c_void,
bootstrapping_key: *const ffi::c_void,
buffer: *mut i8,
lwe_dimension: u32,
glwe_dimension: u32,
polynomial_size: u32,
base_log: u32,
level_count: u32,
num_samples: u32,
num_many_lut: u32,
lut_stride: u32,
);
}
unsafe extern "C" {
pub fn cleanup_cuda_programmable_bootstrap(
stream: *mut ffi::c_void,
@@ -1560,6 +1606,13 @@ unsafe extern "C" {
pbs_buffer: *mut *mut i8,
);
}
unsafe extern "C" {
pub fn cleanup_cuda_programmable_bootstrap_128(
stream: *mut ffi::c_void,
gpu_index: u32,
pbs_buffer: *mut *mut i8,
);
}
unsafe extern "C" {
pub fn has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
glwe_dimension: u32,

@@ -348,6 +348,18 @@ pub const FFT128_U128_PARAMS: FftTestParams<u128> = FftTestParams {
ciphertext_modulus: CiphertextModulus::<u128>::new_native(),
};
#[cfg(feature = "gpu")]
pub const FFT128_U128_GPU_PARAMS: FftTestParams<u128> = FftTestParams {
lwe_dimension: LweDimension(879),
glwe_dimension: GlweDimension(2),
polynomial_size: PolynomialSize(2048),
lwe_noise_distribution: DynamicDistribution::new_t_uniform(46),
glwe_noise_distribution: DynamicDistribution::new_t_uniform(30),
pbs_base_log: DecompositionBaseLog(32),
pbs_level: DecompositionLevelCount(3),
ciphertext_modulus: CiphertextModulus::new_native(),
};
pub const FFT_WOPBS_PARAMS: FftWopPbsTestParams<u64> = FftWopPbsTestParams {
lwe_dimension: LweDimension(481),
glwe_dimension: GlweDimension(1),

@@ -0,0 +1,180 @@
pub(crate) use crate::core_crypto::algorithms::test::gen_keys_or_get_from_cache_if_enabled;
use crate::core_crypto::algorithms::test::{
FftBootstrapKeys, FftTestParams, TestResources, FFT128_U128_GPU_PARAMS,
};
use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
use crate::core_crypto::gpu::lwe_bootstrap_key::CudaLweBootstrapKey;
use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
use crate::core_crypto::gpu::vec::{CudaVec, GpuIndex};
use crate::core_crypto::gpu::{cuda_programmable_bootstrap_lwe_ciphertext, CudaStreams};
use crate::core_crypto::keycache::KeyCacheAccess;
use crate::core_crypto::prelude::*;
use itertools::Itertools;
use serde::de::DeserializeOwned;
use serde::Serialize;
pub fn generate_keys<
Scalar: UnsignedTorus + Sync + Send + CastFrom<usize> + CastInto<usize> + Serialize + DeserializeOwned,
>(
params: FftTestParams<Scalar>,
rsc: &mut TestResources,
) -> FftBootstrapKeys<Scalar> {
// Generate an LweSecretKey with binary coefficients
let small_lwe_sk =
LweSecretKey::generate_new_binary(params.lwe_dimension, &mut rsc.secret_random_generator);
// Generate a GlweSecretKey with binary coefficients
let glwe_sk = GlweSecretKey::generate_new_binary(
params.glwe_dimension,
params.polynomial_size,
&mut rsc.secret_random_generator,
);
// Create a copy of the GlweSecretKey re-interpreted as an LweSecretKey
let big_lwe_sk = glwe_sk.clone().into_lwe_secret_key();
let bsk = par_allocate_and_generate_new_lwe_bootstrap_key(
&small_lwe_sk,
&glwe_sk,
params.pbs_base_log,
params.pbs_level,
params.glwe_noise_distribution,
params.ciphertext_modulus,
&mut rsc.encryption_random_generator,
);
FftBootstrapKeys {
small_lwe_sk,
big_lwe_sk,
bsk,
}
}
pub fn execute_bootstrap_u128<Scalar>(params: FftTestParams<Scalar>)
where
Scalar: Numeric
+ UnsignedTorus
+ CastFrom<usize>
+ CastInto<usize>
+ Send
+ Sync
+ Serialize
+ DeserializeOwned,
FftTestParams<Scalar>: KeyCacheAccess<Keys = FftBootstrapKeys<Scalar>>,
{
let lwe_noise_distribution = params.lwe_noise_distribution;
let glwe_dimension = params.glwe_dimension;
let polynomial_size = params.polynomial_size;
let ciphertext_modulus = params.ciphertext_modulus;
let mut rsc = TestResources::new();
let mut keys_gen = |params| generate_keys(params, &mut rsc);
let keys = gen_keys_or_get_from_cache_if_enabled(params, &mut keys_gen);
let (std_bootstrapping_key, small_lwe_sk, big_lwe_sk) =
(keys.bsk, keys.small_lwe_sk, keys.big_lwe_sk);
let output_lwe_dimension = big_lwe_sk.lwe_dimension();
let gpu_index = 0;
let stream = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
let d_bsk = CudaLweBootstrapKey::from_lwe_bootstrap_key(&std_bootstrapping_key, &stream);
// Our 4 bits message space
let message_modulus: Scalar = Scalar::ONE << 4;
// Our input message
let input_message: Scalar = 3usize.cast_into();
let number_of_messages = 1;
// Delta used to encode 4 bits of message + a bit of padding on Scalar
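// (e.g. for Scalar = u128: delta = 2^127 / 2^4 = 2^123, so the padding bit and
// the 4 message bits occupy the 5 most significant bits of the plaintext)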
let delta: Scalar = (Scalar::ONE << (Scalar::BITS - 1)) / message_modulus;
// Apply our encoding
let plaintext = Plaintext(input_message * delta);
// Allocate a new LweCiphertext and encrypt our plaintext
let lwe_ciphertext_in: LweCiphertextOwned<Scalar> = allocate_and_encrypt_new_lwe_ciphertext(
&small_lwe_sk,
plaintext,
lwe_noise_distribution,
ciphertext_modulus,
&mut rsc.encryption_random_generator,
);
let f = |x: Scalar| x;
let accumulator: GlweCiphertextOwned<Scalar> = generate_programmable_bootstrap_glwe_lut(
polynomial_size,
glwe_dimension.to_glwe_size(),
message_modulus.cast_into(),
ciphertext_modulus,
delta,
f,
);
let d_lwe_ciphertext_in =
CudaLweCiphertextList::from_lwe_ciphertext(&lwe_ciphertext_in, &stream);
let mut d_out_pbs_ct = CudaLweCiphertextList::new(
output_lwe_dimension,
LweCiphertextCount(1),
ciphertext_modulus,
&stream,
);
let d_accumulator = CudaGlweCiphertextList::from_glwe_ciphertext(&accumulator, &stream);
let mut test_vector_indexes: Vec<Scalar> = vec![Scalar::ZERO; number_of_messages];
for (i, ind) in test_vector_indexes.iter_mut().enumerate() {
*ind = <usize as CastInto<Scalar>>::cast_into(i);
}
let mut d_test_vector_indexes =
unsafe { CudaVec::<Scalar>::new_async(number_of_messages, &stream, 0) };
unsafe { d_test_vector_indexes.copy_from_cpu_async(&test_vector_indexes, &stream, 0) };
let num_blocks = d_lwe_ciphertext_in.0.lwe_ciphertext_count.0;
let lwe_indexes_usize: Vec<usize> = (0..num_blocks).collect_vec();
let lwe_indexes = lwe_indexes_usize
.iter()
.map(|&x| <usize as CastInto<Scalar>>::cast_into(x))
.collect_vec();
let mut d_output_indexes = unsafe { CudaVec::<Scalar>::new_async(num_blocks, &stream, 0) };
let mut d_input_indexes = unsafe { CudaVec::<Scalar>::new_async(num_blocks, &stream, 0) };
unsafe {
d_input_indexes.copy_from_cpu_async(&lwe_indexes, &stream, 0);
d_output_indexes.copy_from_cpu_async(&lwe_indexes, &stream, 0);
}
cuda_programmable_bootstrap_lwe_ciphertext(
&d_lwe_ciphertext_in,
&mut d_out_pbs_ct,
&d_accumulator,
&d_test_vector_indexes,
&d_output_indexes,
&d_input_indexes,
LweCiphertextCount(num_blocks),
&d_bsk,
&stream,
);
let pbs_ct = d_out_pbs_ct.into_lwe_ciphertext(&stream);
// Decrypt the PBS result
let pbs_plaintext: Plaintext<Scalar> = decrypt_lwe_ciphertext(&big_lwe_sk, &pbs_ct);
// Create a SignedDecomposer to perform the rounding of the decrypted plaintext
// We pass a DecompositionBaseLog of 5 and a DecompositionLevelCount of 1 indicating we want
// to round the 5 MSB, 1 bit of padding plus our 4 bits of message
let signed_decomposer =
SignedDecomposer::new(DecompositionBaseLog(5), DecompositionLevelCount(1));
// Round and remove our encoding
let pbs_result: Scalar = signed_decomposer.closest_representable(pbs_plaintext.0) / delta;
assert_eq!(f(input_message), pbs_result);
}
#[test]
fn test_bootstrap_u128() {
execute_bootstrap_u128::<u128>(FFT128_U128_GPU_PARAMS);
}

@@ -7,6 +7,7 @@ mod lwe_linear_algebra;
mod lwe_multi_bit_programmable_bootstrapping;
mod lwe_packing_keyswitch;
mod lwe_programmable_bootstrapping;
mod lwe_programmable_bootstrapping_128;
mod noise_distribution;
pub struct CudaPackingKeySwitchKeys<Scalar: UnsignedInteger> {

@@ -37,6 +37,11 @@ impl CudaLweBootstrapKey {
let decomp_level_count = bsk.decomposition_level_count();
let decomp_base_log = bsk.decomposition_base_log();
let glwe_dimension = bsk.glwe_size().to_glwe_dimension();
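// 128-bit keys are kept in the Fourier domain as double-double values (hi/lo
// f64 limbs for both the real and imaginary parts), so their f64 backing store
// is twice as large as for 64-bit keys.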
let double_count = if size_of::<InputBskCont::Element>() == 16 {
2
} else {
1
};
// Allocate memory
let mut d_vec = CudaVec::<f64>::new_multi_gpu(
@@ -45,10 +50,11 @@ impl CudaLweBootstrapKey {
glwe_dimension.to_glwe_size(),
polynomial_size,
decomp_level_count,
),
) * double_count,
streams,
);
// Copy to the GPU
unsafe {
convert_lwe_programmable_bootstrap_key_async(
streams,
@@ -59,7 +65,7 @@ impl CudaLweBootstrapKey {
decomp_level_count,
polynomial_size,
);
}
};
streams.synchronize();
Self {
d_vec,

@@ -121,41 +121,84 @@ pub unsafe fn programmable_bootstrap_async<T: UnsignedInteger>(
let num_many_lut = 1u32;
let lut_stride = 0u32;
let mut pbs_buffer: *mut i8 = std::ptr::null_mut();
scratch_cuda_programmable_bootstrap_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
level.0 as u32,
num_samples,
true,
);
cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
lwe_array_out.as_mut_c_ptr(0),
lwe_out_indexes.as_c_ptr(0),
test_vector.as_c_ptr(0),
test_vector_indexes.as_c_ptr(0),
lwe_array_in.as_c_ptr(0),
lwe_in_indexes.as_c_ptr(0),
bootstrapping_key.as_c_ptr(0),
pbs_buffer,
lwe_dimension.0 as u32,
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
base_log.0 as u32,
level.0 as u32,
num_samples,
num_many_lut,
lut_stride,
);
cleanup_cuda_programmable_bootstrap(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
);
if size_of::<T>() == 16 {
scratch_cuda_programmable_bootstrap_128(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
level.0 as u32,
num_samples,
true,
);
cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
streams.ptr[0],
streams.gpu_indexes[0].get(),
lwe_array_out.as_mut_c_ptr(0),
lwe_out_indexes.as_c_ptr(0),
test_vector.as_c_ptr(0),
test_vector_indexes.as_c_ptr(0),
lwe_array_in.as_c_ptr(0),
lwe_in_indexes.as_c_ptr(0),
bootstrapping_key.as_c_ptr(0),
pbs_buffer,
lwe_dimension.0 as u32,
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
base_log.0 as u32,
level.0 as u32,
num_samples,
num_many_lut,
lut_stride,
);
cleanup_cuda_programmable_bootstrap_128(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
);
} else {
scratch_cuda_programmable_bootstrap_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
level.0 as u32,
num_samples,
true,
);
cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
lwe_array_out.as_mut_c_ptr(0),
lwe_out_indexes.as_c_ptr(0),
test_vector.as_c_ptr(0),
test_vector_indexes.as_c_ptr(0),
lwe_array_in.as_c_ptr(0),
lwe_in_indexes.as_c_ptr(0),
bootstrapping_key.as_c_ptr(0),
pbs_buffer,
lwe_dimension.0 as u32,
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
base_log.0 as u32,
level.0 as u32,
num_samples,
num_many_lut,
lut_stride,
);
cleanup_cuda_programmable_bootstrap(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
);
}
}
/// Programmable multi-bit bootstrap on a vector of LWE ciphertexts
@@ -341,19 +384,30 @@ pub unsafe fn convert_lwe_programmable_bootstrap_key_async<T: UnsignedInteger>(
l_gadget: DecompositionLevelCount,
polynomial_size: PolynomialSize,
) {
let size = std::mem::size_of_val(src);
for (i, &stream_ptr) in streams.ptr.iter().enumerate() {
assert_eq!(dest.len() * std::mem::size_of::<T>(), size);
cuda_convert_lwe_programmable_bootstrap_key_64(
stream_ptr,
streams.gpu_indexes[i].get(),
dest.as_mut_c_ptr(i as u32),
src.as_ptr().cast(),
input_lwe_dim.0 as u32,
glwe_dim.0 as u32,
l_gadget.0 as u32,
polynomial_size.0 as u32,
);
if size_of::<T>() == 16 {
cuda_convert_lwe_programmable_bootstrap_key_128(
stream_ptr,
streams.gpu_indexes[i].get(),
dest.as_mut_c_ptr(i as u32),
src.as_ptr().cast(),
input_lwe_dim.0 as u32,
glwe_dim.0 as u32,
l_gadget.0 as u32,
polynomial_size.0 as u32,
);
} else {
cuda_convert_lwe_programmable_bootstrap_key_64(
stream_ptr,
streams.gpu_indexes[i].get(),
dest.as_mut_c_ptr(i as u32),
src.as_ptr().cast(),
input_lwe_dim.0 as u32,
glwe_dim.0 as u32,
l_gadget.0 as u32,
polynomial_size.0 as u32,
);
}
}
}