Compare commits

...

7 Commits

Author           SHA1        Message                                      Date
Beka Barbakadze  aa3f83d016  all tests are passing                        2025-03-10 18:49:37 +04:00
Beka Barbakadze  320310a6b7  fix some bugs                                2025-03-10 15:59:48 +04:00
Beka Barbakadze  f704a38814  fix fmt                                      2025-03-07 18:34:41 +04:00
Beka Barbakadze  1659c07c89  fix test                                     2025-03-07 18:31:07 +04:00
Beka Barbakadze  6cbe56283e  feat(gpu): add classic default 128 bit pbs   2025-03-07 17:37:13 +04:00
Beka Barbakadze  e90ec935a1  change parameters                            2025-03-05 17:37:51 +04:00
Beka Barbakadze  7f3ac17cee  feat(gpu): Implement 128 bit classic pbs     2025-03-05 16:48:48 +04:00
20 changed files with 1984 additions and 76 deletions

View File

@@ -9,20 +9,29 @@
template <typename Torus>
uint64_t get_buffer_size_full_sm_programmable_bootstrap_step_one(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size + // accumulator_rotated
sizeof(double2) * polynomial_size / 2; // accumulator fft
size_t scalar_size = sizeof(Torus);
size_t split_count = (scalar_size == 16) ? 2 : 1;
return scalar_size * polynomial_size + // accumulator_rotated
sizeof(double) * 2 * split_count * polynomial_size /
2; // accumulator fft
}
template <typename Torus>
uint64_t get_buffer_size_full_sm_programmable_bootstrap_step_two(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size + // accumulator
sizeof(double2) * polynomial_size / 2; // accumulator fft
size_t scalar_size = sizeof(Torus);
size_t split_count = (scalar_size == 16) ? 2 : 1;
return scalar_size * polynomial_size + // accumulator
sizeof(double) * 2 * split_count * polynomial_size /
2; // accumulator fft
}
template <typename Torus>
uint64_t
get_buffer_size_partial_sm_programmable_bootstrap(uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // accumulator fft
size_t scalar_size = sizeof(Torus);
size_t split_count = (scalar_size == 16) ? 2 : 1;
return sizeof(double) * 2 * split_count * polynomial_size /
2; // accumulator fft
}
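Both step buffers now scale the FFT scratch by split_count: for a 128-bit torus every complex coefficient is carried as four doubles (re_hi, re_lo, im_hi, im_lo), twice the footprint of the 64-bit double2 layout. A minimal host-side sketch (not part of the diff) working the step-one formula for an illustrative N = 2048:

// Sketch: step-one shared-memory requirement for __uint128_t, N = 2048.
#include <cstdint>
#include <cstdio>

int main() {
  const size_t scalar_size = sizeof(__uint128_t);          // 16 bytes
  const size_t split_count = (scalar_size == 16) ? 2 : 1;  // 2 for 128-bit
  const uint32_t polynomial_size = 2048;                   // illustrative
  const size_t accumulator_rotated = scalar_size * polynomial_size;      // 32768
  const size_t accumulator_fft =
      sizeof(double) * 2 * split_count * polynomial_size / 2;            // 32768
  std::printf("full SM, step one: %zu bytes\n",
              accumulator_rotated + accumulator_fft);                    // 65536
  return 0;
}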
template <typename Torus>
@@ -215,6 +224,158 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
}
};
template <PBS_TYPE pbs_type> struct pbs_buffer_128;
template <> struct pbs_buffer_128<PBS_TYPE::CLASSICAL> {
int8_t *d_mem;
__uint128_t *global_accumulator;
double *global_join_buffer;
PBS_VARIANT pbs_variant;
pbs_buffer_128(cudaStream_t stream, uint32_t gpu_index,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
PBS_VARIANT pbs_variant, bool allocate_gpu_memory) {
cuda_set_device(gpu_index);
this->pbs_variant = pbs_variant;
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
if (allocate_gpu_memory) {
switch (pbs_variant) {
case PBS_VARIANT::DEFAULT: {
uint64_t full_sm_step_one =
get_buffer_size_full_sm_programmable_bootstrap_step_one<
__uint128_t>(polynomial_size);
uint64_t full_sm_step_two =
get_buffer_size_full_sm_programmable_bootstrap_step_two<
__uint128_t>(polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_programmable_bootstrap<__uint128_t>(
polynomial_size);
uint64_t partial_dm_step_one = full_sm_step_one - partial_sm;
uint64_t partial_dm_step_two = full_sm_step_two - partial_sm;
uint64_t full_dm = full_sm_step_one;
uint64_t device_mem = 0;
if (max_shared_memory < partial_sm) {
device_mem = full_dm * input_lwe_ciphertext_count * level_count *
(glwe_dimension + 1);
} else if (max_shared_memory < full_sm_step_two) {
device_mem =
(partial_dm_step_two + partial_dm_step_one * level_count) *
input_lwe_ciphertext_count * (glwe_dimension + 1);
} else if (max_shared_memory < full_sm_step_one) {
device_mem = partial_dm_step_one * input_lwe_ciphertext_count *
level_count * (glwe_dimension + 1);
}
// Otherwise, both kernels run all in shared memory
d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);
global_join_buffer = (double *)cuda_malloc_async(
(glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
(polynomial_size / 2) * sizeof(double) * 4,
stream, gpu_index);
global_accumulator = (__uint128_t *)cuda_malloc_async(
(glwe_dimension + 1) * input_lwe_ciphertext_count *
polynomial_size * sizeof(__uint128_t),
stream, gpu_index);
} break;
case PBS_VARIANT::CG: {
uint64_t full_sm =
get_buffer_size_full_sm_programmable_bootstrap_cg<__uint128_t>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_programmable_bootstrap_cg<__uint128_t>(
polynomial_size);
uint64_t partial_dm = full_sm - partial_sm;
uint64_t full_dm = full_sm;
uint64_t device_mem = 0;
if (max_shared_memory < partial_sm) {
device_mem = full_dm * input_lwe_ciphertext_count * level_count *
(glwe_dimension + 1);
} else if (max_shared_memory < full_sm) {
device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
(glwe_dimension + 1);
}
// Otherwise, both kernels run all in shared memory
d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);
global_join_buffer = (double *)cuda_malloc_async(
(glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
polynomial_size / 2 * sizeof(double) * 4,
stream, gpu_index);
} break;
#if CUDA_ARCH >= 900
case PBS_VARIANT::TBC: {
bool supports_dsm =
supports_distributed_shared_memory_on_classic_programmable_bootstrap<
__uint128_t>(polynomial_size, max_shared_memory);
uint64_t full_sm =
get_buffer_size_full_sm_programmable_bootstrap_tbc<__uint128_t>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_programmable_bootstrap_tbc<__uint128_t>(
polynomial_size);
uint64_t minimum_sm_tbc = 0;
if (supports_dsm)
minimum_sm_tbc =
get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<
__uint128_t>(polynomial_size);
uint64_t partial_dm = full_sm - partial_sm;
uint64_t full_dm = full_sm;
uint64_t device_mem = 0;
// There is a minimum amount of memory we need to run the TBC PBS, which
// is minimum_sm_tbc. We know that minimum_sm_tbc bytes are available
// because otherwise the previous check would have redirected
// computation to some other variant. If over that we don't have more
// partial_sm bytes, TBC PBS will run on NOSM. If we have partial_sm but
// not full_sm bytes, it will run on PARTIALSM. Otherwise, FULLSM.
//
// NOSM mode actually requires minimum_sm_tbc shared memory bytes.
if (max_shared_memory < partial_sm + minimum_sm_tbc) {
device_mem = full_dm * input_lwe_ciphertext_count * level_count *
(glwe_dimension + 1);
} else if (max_shared_memory < full_sm + minimum_sm_tbc) {
device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
(glwe_dimension + 1);
}
// Otherwise, both kernels run all in shared memory
d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);
global_join_buffer = (double *)cuda_malloc_async(
(glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
polynomial_size / 2 * sizeof(double) * 4,
stream, gpu_index);
} break;
#endif
default:
PANIC("Cuda error (PBS): unsupported implementation variant.")
}
}
}
void release(cudaStream_t stream, uint32_t gpu_index) {
cuda_drop_async(d_mem, stream, gpu_index);
cuda_drop_async(global_join_buffer, stream, gpu_index);
if (pbs_variant == DEFAULT)
cuda_drop_async(global_accumulator, stream, gpu_index);
}
};
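A minimal lifecycle sketch for this buffer (in practice it is created through scratch_cuda_programmable_bootstrap_128); the stream is assumed to exist and the numeric parameters are illustrative only:

// Sketch: direct construction and release of the 128-bit classical PBS buffer.
void pbs_buffer_128_lifecycle(cudaStream_t stream, uint32_t gpu_index) {
  auto *buf = new pbs_buffer_128<PBS_TYPE::CLASSICAL>(
      stream, gpu_index, /*glwe_dimension=*/1, /*polynomial_size=*/2048,
      /*level_count=*/3, /*input_lwe_ciphertext_count=*/1,
      PBS_VARIANT::DEFAULT, /*allocate_gpu_memory=*/true);
  // ... enqueue kernels that use buf->d_mem, buf->global_accumulator and
  // buf->global_join_buffer on the same stream ...
  buf->release(stream, gpu_index);
  delete buf;
}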
template <typename Torus>
uint64_t get_buffer_size_programmable_bootstrap_cg(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,

View File

@@ -20,6 +20,11 @@ void cuda_convert_lwe_programmable_bootstrap_key_64(
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size);
void cuda_convert_lwe_programmable_bootstrap_key_128(
void *stream, uint32_t gpu_index, void *dest, void const *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size);
void scratch_cuda_programmable_bootstrap_amortized_32(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size,
@@ -62,6 +67,11 @@ void scratch_cuda_programmable_bootstrap_64(
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
void scratch_cuda_programmable_bootstrap_128(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
@@ -80,7 +90,19 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride);
void cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride);
void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
int8_t **pbs_buffer);
void cleanup_cuda_programmable_bootstrap_128(void *stream, uint32_t gpu_index,
int8_t **pbs_buffer);
}
#endif // CUDA_BOOTSTRAP_H
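A sketch (not part of the diff) of the expected call order for the new 128-bit entry points, assuming this header is included; the device pointers are prepared by the caller and the scalar arguments are placeholders:

// Sketch: end-to-end use of the 128-bit classical PBS C API.
void bootstrap_128_example(void *stream, uint32_t gpu_index,
                           void *d_fourier_bsk, void const *d_standard_bsk,
                           void *d_lwe_out, void const *d_out_indexes,
                           void const *d_lut, void const *d_lut_indexes,
                           void const *d_lwe_in, void const *d_in_indexes,
                           uint32_t lwe_dimension, uint32_t glwe_dimension,
                           uint32_t polynomial_size, uint32_t base_log,
                           uint32_t level_count, uint32_t num_samples) {
  int8_t *pbs_buffer = nullptr;
  // 1. Convert the standard-domain bootstrapping key to the Fourier domain.
  cuda_convert_lwe_programmable_bootstrap_key_128(
      stream, gpu_index, d_fourier_bsk, d_standard_bsk, lwe_dimension,
      glwe_dimension, level_count, polynomial_size);
  // 2. Allocate scratch space and configure shared memory for the kernels.
  scratch_cuda_programmable_bootstrap_128(
      stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
      level_count, num_samples, /*allocate_gpu_memory=*/true);
  // 3. Bootstrap the batch of 128-bit LWE ciphertexts.
  cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
      stream, gpu_index, d_lwe_out, d_out_indexes, d_lut, d_lut_indexes,
      d_lwe_in, d_in_indexes, d_fourier_bsk, pbs_buffer, lwe_dimension,
      glwe_dimension, polynomial_size, base_log, level_count, num_samples,
      /*num_many_lut=*/1, /*lut_stride=*/0);
  // 4. Release the scratch buffer.
  cleanup_cuda_programmable_bootstrap_128(stream, gpu_index, &pbs_buffer);
}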

View File

@@ -3,6 +3,7 @@
#include "crypto/torus.cuh"
#include "device.h"
#include "fft128/f128.cuh"
#include <cstdint>
/**
@@ -42,6 +43,13 @@ public:
}
}
__device__ void decompose_and_compress_next_128(double *result) {
for (int j = 0; j < num_poly; j++) {
auto result_slice = result + j * params::degree / 2 * 4;
decompose_and_compress_next_polynomial_128(result_slice, j);
}
}
// Decomposes a single polynomial
__device__ void decompose_and_compress_next_polynomial(double2 *result,
int j) {
@@ -75,10 +83,58 @@ public:
synchronize_threads_in_block();
}
// Decomposes a single polynomial
__device__ void decompose_and_compress_next_polynomial_128(double *result,
int j) {
uint32_t tid = threadIdx.x;
auto state_slice = &state[j * params::degree];
for (int i = 0; i < params::opt / 2; i++) {
auto input1 = &state_slice[tid];
auto input2 = &state_slice[tid + params::degree / 2];
T res_re = *input1 & mask_mod_b;
T res_im = *input2 & mask_mod_b;
*input1 >>= base_log; // Update state
*input2 >>= base_log; // Update state
T carry_re = ((res_re - 1ll) | *input1) & res_re;
T carry_im = ((res_im - 1ll) | *input2) & res_im;
carry_re >>= (base_log - 1);
carry_im >>= (base_log - 1);
*input1 += carry_re; // Update state
*input2 += carry_im; // Update state
res_re -= carry_re << base_log;
res_im -= carry_im << base_log;
auto out_re = u128_to_signed_to_f128(res_re);
auto out_im = u128_to_signed_to_f128(res_im);
auto out_re_hi = result + 0LL * params::degree / 2;
auto out_re_lo = result + 1LL * params::degree / 2;
auto out_im_hi = result + 2LL * params::degree / 2;
auto out_im_lo = result + 3LL * params::degree / 2;
out_re_hi[tid] = out_re.hi;
out_re_lo[tid] = out_re.lo;
out_im_hi[tid] = out_im.hi;
out_im_lo[tid] = out_im.lo;
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
}
__device__ void decompose_and_compress_level(double2 *result, int level) {
for (int i = 0; i < level_count - level; i++)
decompose_and_compress_next(result);
}
__device__ void decompose_and_compress_level_128(double *result, int level) {
for (int i = 0; i < level_count - level; i++)
decompose_and_compress_next_128(result);
}
};
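The carry computation in decompose_and_compress_next_polynomial_128 is the usual signed (balanced) digit extraction: bit base_log-1 of ((res - 1) | state) & res is 1 when the raw digit exceeds B/2, and for res == B/2 it falls back to bit base_log-1 of the updated state, so the digit is recentred into [-B/2, B/2] and a +1 is carried into the state. A scalar sketch (not from the diff) of one extraction step on a single coefficient:

// Sketch: one level of signed decomposition on a single 128-bit coefficient.
#include <cstdint>

__uint128_t decompose_one_level_128(__uint128_t &state, uint32_t base_log) {
  const __uint128_t mask_mod_b = ((__uint128_t)1 << base_log) - 1;
  __uint128_t res = state & mask_mod_b; // raw digit in [0, B)
  state >>= base_log;                   // consume it from the state
  __uint128_t carry = ((res - 1) | state) & res; // recentre + tie-break test
  carry >>= (base_log - 1);
  state += carry;           // propagate +1 into the next digit
  res -= carry << base_log; // signed digit in [-B/2, B/2], stored as a Torus
  return res;
}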
template <typename Torus>

View File

@@ -21,7 +21,7 @@ struct alignas(16) f128 {
#else
double s = a + b;
return f128(s, b - (s - a));
#endif;
#endif
}
// Two-sum
@@ -270,7 +270,7 @@ __host__ __device__ inline double bits_to_double(uint64_t bits) {
return d;
}
__host__ __device__ double u128_to_f64(__uint128_t x) {
__host__ __device__ inline double u128_to_f64(__uint128_t x) {
const __uint128_t ONE = 1;
const double A = ONE << 52;
const double B = ONE << 104;
@@ -322,7 +322,7 @@ __host__ __device__ double u128_to_f64(__uint128_t x) {
}
}
__host__ __device__ __uint128_t f64_to_u128(const double f) {
__host__ __device__ inline __uint128_t f64_to_u128(const double f) {
const __uint128_t ONE = 1;
const uint64_t f_bits = double_to_bits(f);
if (f_bits < 1023ull << 52) {
@@ -338,7 +338,7 @@ __host__ __device__ __uint128_t f64_to_u128(const double f) {
}
}
__host__ __device__ __uint128_t f64_to_i128(const double f) {
__host__ __device__ inline __uint128_t f64_to_i128(const double f) {
// Get raw bits of the double
const uint64_t f_bits = double_to_bits(f);
@@ -366,14 +366,14 @@ __host__ __device__ __uint128_t f64_to_i128(const double f) {
return (f_bits >> 63) ? -result : result;
}
__host__ __device__ double i128_to_f64(__int128_t const x) {
__host__ __device__ inline double i128_to_f64(__int128_t const x) {
uint64_t sign = static_cast<uint64_t>(x >> 64) & (1ULL << 63);
__uint128_t abs =
(x < 0) ? static_cast<__uint128_t>(-x) : static_cast<__uint128_t>(x);
return bits_to_double(double_to_bits(u128_to_f64(abs)) | sign);
}
__host__ __device__ f128 u128_to_signed_to_f128(__uint128_t x) {
__host__ __device__ inline f128 u128_to_signed_to_f128(__uint128_t x) {
const double first_approx = i128_to_f64(x);
const uint64_t sign_bit = double_to_bits(first_approx) & (1ull << 63);
const __uint128_t first_approx_roundtrip =
@@ -387,7 +387,7 @@ __host__ __device__ f128 u128_to_signed_to_f128(__uint128_t x) {
return f128(first_approx, correction);
}
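These conversions let a 128-bit torus value travel through the FFT as a double-double (f128) pair: the hi limb is the nearest double and the lo limb captures the rounding error, giving roughly twice the precision of a single double. A small fragment (not from the diff), assuming the definitions in this header:

// Sketch: splitting a 128-bit value into an f128 (hi + lo doubles).
__uint128_t x = ((__uint128_t)0x0123456789abcdefULL << 64) | 0xfedcba9876543210ULL;
f128 split = u128_to_signed_to_f128(x);
// split.hi is i128_to_f64(x) (x read as a signed value); split.lo is the
// difference between x and the round-trip of split.hi, converted to double.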
__host__ __device__ __uint128_t u128_from_torus_f128(const f128 &a) {
__host__ __device__ inline __uint128_t u128_from_torus_f128(const f128 &a) {
auto x = f128::sub_estimate(a, f128::f128_floor(a));
const double normalization = 340282366920938500000000000000000000000.;
#ifdef __CUDA_ARCH__

View File

@@ -287,6 +287,25 @@ batch_convert_u128_to_f128_as_torus(double *out_re_hi, double *out_re_lo,
&in[blockIdx.x * params::degree + params::degree / 2]);
}
// params is expected to be full degree not half degree
template <class params>
__global__ void
batch_convert_u128_to_f128_strided_as_torus(double *d_out,
const __uint128_t *d_in) {
constexpr size_t chunk_size = params::degree / 2 * 4;
double *chunk = &d_out[blockIdx.x * chunk_size];
double *out_re_hi = &chunk[0ULL * params::degree / 2];
double *out_re_lo = &chunk[1ULL * params::degree / 2];
double *out_im_hi = &chunk[2ULL * params::degree / 2];
double *out_im_lo = &chunk[3ULL * params::degree / 2];
convert_u128_to_f128_as_torus<params>(
out_re_hi, out_re_lo, out_im_hi, out_im_lo,
&d_in[blockIdx.x * params::degree],
&d_in[blockIdx.x * params::degree + params::degree / 2]);
}
// params is expected to be full degree not half degree
template <class params>
__global__ void batch_convert_f128_to_u128_as_torus(__uint128_t *out,
@@ -309,7 +328,7 @@ __global__ void
batch_NSMFFT_128(double *in_re_hi, double *in_re_lo, double *in_im_hi,
double *in_im_lo, double *out_re_hi, double *out_re_lo,
double *out_im_hi, double *out_im_lo, double *buffer) {
extern __shared__ double sharedMemoryFFT[];
extern __shared__ double sharedMemoryFFT128[];
double *re_hi, *re_lo, *im_hi, *im_lo;
if (SMD == NOSM) {
@@ -322,10 +341,10 @@ batch_NSMFFT_128(double *in_re_hi, double *in_re_lo, double *in_im_hi,
im_lo =
&buffer[blockIdx.x * params::degree / 2 * 4 + params::degree / 2 * 3];
} else {
re_hi = &sharedMemoryFFT[params::degree / 2 * 0];
re_lo = &sharedMemoryFFT[params::degree / 2 * 1];
im_hi = &sharedMemoryFFT[params::degree / 2 * 2];
im_lo = &sharedMemoryFFT[params::degree / 2 * 3];
re_hi = &sharedMemoryFFT128[params::degree / 2 * 0];
re_lo = &sharedMemoryFFT128[params::degree / 2 * 1];
im_hi = &sharedMemoryFFT128[params::degree / 2 * 2];
im_lo = &sharedMemoryFFT128[params::degree / 2 * 3];
}
Index tid = threadIdx.x;
@@ -356,6 +375,70 @@ batch_NSMFFT_128(double *in_re_hi, double *in_re_lo, double *in_im_hi,
}
}
template <class params, sharedMemDegree SMD>
__global__ void batch_NSMFFT_strided_128(double *d_in, double *d_out,
double *buffer) {
extern __shared__ double sharedMemoryFFT128[];
double *re_hi, *re_lo, *im_hi, *im_lo;
if (SMD == NOSM) {
re_hi =
&buffer[blockIdx.x * params::degree / 2 * 4 + params::degree / 2 * 0];
re_lo =
&buffer[blockIdx.x * params::degree / 2 * 4 + params::degree / 2 * 1];
im_hi =
&buffer[blockIdx.x * params::degree / 2 * 4 + params::degree / 2 * 2];
im_lo =
&buffer[blockIdx.x * params::degree / 2 * 4 + params::degree / 2 * 3];
} else {
re_hi = &sharedMemoryFFT128[params::degree / 2 * 0];
re_lo = &sharedMemoryFFT128[params::degree / 2 * 1];
im_hi = &sharedMemoryFFT128[params::degree / 2 * 2];
im_lo = &sharedMemoryFFT128[params::degree / 2 * 3];
}
constexpr size_t chunk_size = params::degree / 2 * 4;
double *chunk = &d_in[blockIdx.x * chunk_size];
double *tmp_re_hi = &chunk[0ULL * params::degree / 2];
double *tmp_re_lo = &chunk[1ULL * params::degree / 2];
double *tmp_im_hi = &chunk[2ULL * params::degree / 2];
double *tmp_im_lo = &chunk[3ULL * params::degree / 2];
Index tid = threadIdx.x;
#pragma unroll
for (Index i = 0; i < params::opt / 2; ++i) {
re_hi[tid] = tmp_re_hi[tid];
re_lo[tid] = tmp_re_lo[tid];
im_hi[tid] = tmp_im_hi[tid];
im_lo[tid] = tmp_im_lo[tid];
tid += params::degree / params::opt;
}
__syncthreads();
if constexpr (params::fft_direction == 1) {
negacyclic_backward_fft_f128<HalfDegree<params>>(re_hi, re_lo, im_hi,
im_lo);
} else {
negacyclic_forward_fft_f128<HalfDegree<params>>(re_hi, re_lo, im_hi, im_lo);
}
__syncthreads();
chunk = &d_out[blockIdx.x * chunk_size];
tmp_re_hi = &chunk[0ULL * params::degree / 2];
tmp_re_lo = &chunk[1ULL * params::degree / 2];
tmp_im_hi = &chunk[2ULL * params::degree / 2];
tmp_im_lo = &chunk[3ULL * params::degree / 2];
tid = threadIdx.x;
#pragma unroll
for (Index i = 0; i < params::opt / 2; ++i) {
tmp_re_hi[tid] = re_hi[tid];
tmp_re_lo[tid] = re_lo[tid];
tmp_im_hi[tid] = im_hi[tid];
tmp_im_lo[tid] = im_lo[tid];
tid += params::degree / params::opt;
}
}
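Both strided kernels assume the same chunk layout: the polynomial handled by blockIdx.x owns a contiguous run of 2*N doubles split into four quarters of length N/2, in the order re_hi, re_lo, im_hi, im_lo. A small helper sketch (not part of the diff) computing those views:

// Sketch: per-polynomial view of the strided f128 layout used above.
#include <cstddef>

struct f128_poly_view {
  double *re_hi, *re_lo, *im_hi, *im_lo;
};

inline f128_poly_view strided_view(double *base, size_t block, size_t degree) {
  const size_t half = degree / 2;
  double *chunk = base + block * half * 4; // 2 * degree doubles per polynomial
  return {chunk, chunk + half, chunk + 2 * half, chunk + 3 * half};
}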
template <class params>
__host__ void host_fourier_transform_forward_as_integer_f128(
cudaStream_t stream, uint32_t gpu_index, double *re0, double *re1,

View File

@@ -22,6 +22,19 @@ void cuda_convert_lwe_programmable_bootstrap_key_64(
(const int64_t *)src, polynomial_size, total_polynomials);
}
void cuda_convert_lwe_programmable_bootstrap_key_128(
void *stream, uint32_t gpu_index, void *dest, void const *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size) {
printf("bsk transform.cu\n");
uint32_t total_polynomials =
input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * level_count;
cuda_convert_lwe_programmable_bootstrap_key_u128(
static_cast<cudaStream_t>(stream), gpu_index, (double *)dest,
(const __uint128_t *)src, polynomial_size, total_polynomials);
}
void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
void *stream, uint32_t gpu_index, void *dest, void const *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,

View File

@@ -3,6 +3,8 @@
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft128/fft128.cuh"
#include "pbs/programmable_bootstrap.h"
#include "pbs/programmable_bootstrap_multibit.h"
#include "polynomial/parameters.cuh"
@@ -16,6 +18,13 @@ __device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
level_count;
}
__device__ inline int get_start_ith_ggsw_128(int i, uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count) {
return i * polynomial_size / 2 * 4 * (glwe_dimension + 1) *
(glwe_dimension + 1) * level_count;
}
////////////////////////////////////////////////
template <typename T>
__device__ const T *get_ith_mask_kth_block(const T *ptr, int i, int k,
@@ -39,6 +48,31 @@ __device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
(glwe_dimension + 1) * (glwe_dimension + 1) +
k * polynomial_size / 2 * (glwe_dimension + 1)];
}
template <typename T>
__device__ const T *
get_ith_mask_kth_block_128(const T *ptr, int i, int k, int level,
uint32_t polynomial_size, int glwe_dimension,
uint32_t level_count) {
return &ptr[get_start_ith_ggsw_128(i, polynomial_size, glwe_dimension,
level_count) +
(level_count - level - 1) * polynomial_size / 2 * 4 *
(glwe_dimension + 1) * (glwe_dimension + 1) +
k * polynomial_size / 2 * 4 * (glwe_dimension + 1)];
}
template <typename T>
__device__ T *get_ith_mask_kth_block_128(T *ptr, int i, int k, int level,
uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count) {
return &ptr[get_start_ith_ggsw_128(i, polynomial_size, glwe_dimension,
level_count) +
(level_count - level - 1) * polynomial_size / 2 * 4 *
(glwe_dimension + 1) * (glwe_dimension + 1) +
k * polynomial_size / 2 * 4 * (glwe_dimension + 1)];
}
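In the 128-bit Fourier-domain key each polynomial occupies polynomial_size / 2 * 4 doubles, so one GGSW occupies that times (glwe_dimension + 1)^2 * level_count, which is exactly the stride get_start_ith_ggsw_128 returns. A worked sketch with illustrative parameters (not from the diff):

// Sketch: doubles occupied by one GGSW in the 128-bit Fourier-domain BSK.
constexpr uint64_t N = 2048, glwe_dimension = 1, level_count = 3; // illustrative
constexpr uint64_t doubles_per_poly = N / 2 * 4;                  // 4096
constexpr uint64_t doubles_per_ggsw =
    doubles_per_poly * (glwe_dimension + 1) * (glwe_dimension + 1) *
    level_count;                                                  // 49152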
template <typename T>
__device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level,
uint32_t polynomial_size,
@@ -250,5 +284,138 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
cuda_drop_async(buffer, stream, gpu_index);
cudaFreeHost(h_bsk);
}
template <int N> __global__ void dprint_array(double *a) {
if (threadIdx.x == 0 && blockIdx.x == 0) {
for (int i = 0; i < N; i++)
printf("%.30f, ", a[i]);
printf("\n");
}
}
template <class params>
void convert_and_transform_128(cudaStream_t stream, uint32_t gpu_index,
double *d_bsk, __uint128_t const *d_standard,
uint32_t number_of_samples) {
printf("bsk transform\n");
size_t required_shared_memory_size = sizeof(double) * params::degree / 2 * 4;
int grid_size = number_of_samples;
int block_size = params::degree / params::opt;
bool full_sm =
(required_shared_memory_size <= cuda_get_max_shared_memory(gpu_index));
size_t buffer_size =
full_sm ? 0 : (size_t)number_of_samples * params::degree / 2 * 4;
size_t shared_memory_size = full_sm ? required_shared_memory_size : 0;
double *buffer = (double *)cuda_malloc_async(buffer_size, stream, gpu_index);
// configure shared memory for batch fft kernel
if (full_sm) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT_strided_128<FFTDegree<params, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_NSMFFT_strided_128<FFTDegree<params, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
}
// convert u128 into 4 x double
batch_convert_u128_to_f128_strided_as_torus<params>
<<<grid_size, block_size, 0, stream>>>(d_bsk, d_standard);
// call negacyclic 128 bit forward fft.
if (full_sm) {
batch_NSMFFT_strided_128<FFTDegree<params, ForwardFFT>, FULLSM>
<<<grid_size, block_size, shared_memory_size, stream>>>(d_bsk, d_bsk,
buffer);
} else {
batch_NSMFFT_strided_128<FFTDegree<params, ForwardFFT>, NOSM>
<<<grid_size, block_size, shared_memory_size, stream>>>(d_bsk, d_bsk,
buffer);
}
cuda_drop_async(buffer, stream, gpu_index);
for (int i = 0; i < number_of_samples; i++) {
auto chunk = d_bsk + i * params::degree / 2 * 4;
auto re_hi = chunk;
auto re_lo = chunk + params::degree / 2;
auto im_hi = chunk + 2 * params::degree / 2;
auto im_lo = chunk + 3 * params::degree / 2;
cudaDeviceSynchronize();
printf("#re_hi ");
cudaDeviceSynchronize();
dprint_array<params::degree / 2><<<1, 1>>>(re_hi);
cudaDeviceSynchronize();
printf("#re_lo");
cudaDeviceSynchronize();
dprint_array<params::degree / 2><<<1, 1>>>(re_lo);
cudaDeviceSynchronize();
printf("#im_hi");
cudaDeviceSynchronize();
dprint_array<params::degree / 2><<<1, 1>>>(im_hi);
cudaDeviceSynchronize();
printf("#im_lo");
cudaDeviceSynchronize();
dprint_array<params::degree / 2><<<1, 1>>>(im_lo);
cudaDeviceSynchronize();
}
// cudaDeviceSynchronize();
// printf("#cuda\n");
// printf("#re_hi\n");
// dprint_array<params::degree / 2><<<1, 1>>>(d_bsk);
// cudaDeviceSynchronize();
// printf("#re_lo\n");
// dprint_array<params::degree / 2><<<1, 1>>>(&d_bsk[1ULL *
// params::degree/2]); cudaDeviceSynchronize(); printf("#im_hi\n");
// dprint_array<params::degree / 2><<<1, 1>>>(&d_bsk[2ULL *
// params::degree/2]); cudaDeviceSynchronize(); printf("#im_lo\n");
// dprint_array<params::degree / 2><<<1, 1>>>(&d_bsk[3ULL *
// params::degree/2]); cudaDeviceSynchronize();
}
inline void cuda_convert_lwe_programmable_bootstrap_key_u128(
cudaStream_t stream, uint32_t gpu_index, double *dest,
__uint128_t const *src, uint32_t polynomial_size,
uint32_t total_polynomials) {
cuda_set_device(gpu_index);
// The buffer size is sizeof(double) times the number of polynomials times 4:
// each polynomial is represented by 4 double arrays of size
// polynomial_size / 2 in the complex domain to perform the FFT
size_t buffer_size =
total_polynomials * polynomial_size / 2 * sizeof(double) * 4;
__uint128_t *d_standard =
(__uint128_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
cuda_memcpy_async_to_gpu(d_standard, src, buffer_size, stream, gpu_index);
switch (polynomial_size) {
case 256:
convert_and_transform_128<AmortizedDegree<256>>(
stream, gpu_index, dest, d_standard, total_polynomials);
break;
case 512:
convert_and_transform_128<AmortizedDegree<512>>(
stream, gpu_index, dest, d_standard, total_polynomials);
break;
case 1024:
convert_and_transform_128<AmortizedDegree<1024>>(
stream, gpu_index, dest, d_standard, total_polynomials);
break;
case 2048:
convert_and_transform_128<AmortizedDegree<2048>>(
stream, gpu_index, dest, d_standard, total_polynomials);
break;
case 4096:
convert_and_transform_128<AmortizedDegree<4096>>(
stream, gpu_index, dest, d_standard, total_polynomials);
break;
default:
PANIC("Cuda error (convert BSK): unsupported polynomial size. Supported "
"N's are powers of two in the interval [256..4096].")
}
cuda_drop_async(d_standard, stream, gpu_index);
}
#endif // CNCRT_BSK_H
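One detail worth noting in cuda_convert_lwe_programmable_bootstrap_key_u128: the same buffer_size serves both as the byte count of the standard-domain copy and as the Fourier-domain footprint, because a __uint128_t coefficient and its four-double split take the same space per polynomial. A quick check, assuming the usual type sizes:

// Sketch: total_polynomials * N/2 * sizeof(double) * 4   (Fourier layout)
//      == total_polynomials * N   * sizeof(__uint128_t)  (standard layout)
static_assert(sizeof(double) * 4 / 2 == sizeof(__uint128_t),
              "four half-length double arrays match one u128 polynomial");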

View File

@@ -0,0 +1,207 @@
#include "programmable_bootstrap_classic_128.cuh"
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the PBS on 128-bit inputs, into `buffer`. It also configures SM options on
* the GPU in case FULLSM or PARTIALSM mode is going to be used.
*/
void scratch_cuda_programmable_bootstrap_128(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
auto buffer = (pbs_buffer_128<CLASSICAL> **)pbs_buffer;
switch (polynomial_size) {
case 256:
scratch_programmable_bootstrap_128<AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
break;
case 512:
scratch_programmable_bootstrap_128<AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
break;
case 1024:
scratch_programmable_bootstrap_128<AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
break;
case 2048:
scratch_programmable_bootstrap_128<AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
break;
case 4096:
scratch_programmable_bootstrap_128<AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
break;
default:
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
"Supported N's are powers of two"
" in the interval [256..4096].")
}
}
template <typename Torus>
void executor_cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, double const *bootstrapping_key,
pbs_buffer_128<CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride) {
switch (polynomial_size) {
case 256:
host_programmable_bootstrap_128<AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_many_lut, lut_stride);
break;
case 512:
host_programmable_bootstrap_128<AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_many_lut, lut_stride);
break;
case 1024:
host_programmable_bootstrap_128<AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_many_lut, lut_stride);
break;
case 2048:
host_programmable_bootstrap_128<AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_many_lut, lut_stride);
break;
case 4096:
host_programmable_bootstrap_128<AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_many_lut, lut_stride);
break;
default:
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
"Supported N's are powers of two"
" in the interval [256..4096].")
}
}
/* Perform bootstrapping on a batch of input u128 LWE ciphertexts.
*
* - `stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - lwe_array_out: output batch of num_samples bootstrapped ciphertexts c =
* (a0,..an-1,b) where n is the LWE dimension
* - lut_vector: should hold as many luts of size polynomial_size
* as there are input ciphertexts, but actually holds
* num_luts vectors to reduce memory usage
* - lut_vector_indexes: stores the index corresponding to
* which lut to use for each sample in
* lut_vector
* - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n
* mask values + 1 body value
* - bootstrapping_key: GGSW encryption of the LWE secret key sk1
* under secret key sk2
* bsk = Z + sk1 H
* where H is the gadget matrix and Z is a matrix (k+1).l
* containing GLWE encryptions of 0 under sk2.
* bsk is thus a tensor of size (k+1)^2.l.N.n
* where l is the number of decomposition levels and
* k is the GLWE dimension, N is the polynomial size for
* GLWE. The polynomial size for GLWE and the lut
* are the same because they have to be in the same ring
* to be multiplied.
* - lwe_dimension: size of the Torus vector used to encrypt the input
* LWE ciphertexts - referred to as n above (~ 600)
* - glwe_dimension: size of the polynomial vector used to encrypt the LUT
* GLWE ciphertexts - referred to as k above. Only the value 1 is supported for
* this parameter.
* - polynomial_size: size of the test polynomial (lut) and size of the
* GLWE polynomial (~1024)
* - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
* - level_count: number of decomposition levels in the gadget matrix (~4)
* - num_samples: number of encrypted input messages
*
* This function calls a wrapper to a device kernel that performs the
* bootstrapping:
* - the kernel is templatized based on integer discretization and
* polynomial degree
* - num_samples * level_count * (glwe_dimension + 1) blocks of threads are
* launched, where each thread is going to handle one or more polynomial
* coefficients at each stage, for a given level of decomposition, either for
* the LUT mask or its body:
* - perform the blind rotation
* - round the result
* - get the decomposition for the current level
* - switch to the FFT domain
* - multiply with the bootstrapping key
* - come back to the coefficients representation
* - between each stage a synchronization of the threads is necessary (some
* synchronizations happen at the block level, some happen between blocks, using
* cooperative groups).
* - in case the device has enough shared memory, temporary arrays used for
* the different stages (accumulators) are stored into the shared memory
* - the accumulators serve to combine the results for all decomposition
* levels
* - the constant memory (64K) is used for storing the roots of identity
* values for the FFT
*/
void cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride) {
if (base_log > 64)
PANIC("Cuda error (classical PBS): base log should be <= 64")
if ((glwe_dimension + 1) * level_count > 8)
PANIC("Cuda error (multi-bit PBS): (k + 1)*l should be <= 8")
pbs_buffer_128<CLASSICAL> *buffer = (pbs_buffer_128<CLASSICAL> *)mem_ptr;
executor_cuda_programmable_bootstrap_lwe_ciphertext_vector_128<__uint128_t>(
stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
static_cast<const __uint128_t *>(lwe_output_indexes),
static_cast<const __uint128_t *>(lut_vector),
static_cast<const __uint128_t *>(lut_vector_indexes),
static_cast<const __uint128_t *>(lwe_array_in),
static_cast<const __uint128_t *>(lwe_input_indexes),
static_cast<const double *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_many_lut, lut_stride);
}
/*
* This cleanup function frees the data on GPU for the PBS buffer for 128-bit
* inputs.
*/
void cleanup_cuda_programmable_bootstrap_128(void *stream, uint32_t gpu_index,
int8_t **buffer) {
auto x = (pbs_buffer_128<CLASSICAL> *)(*buffer);
x->release(static_cast<cudaStream_t>(stream), gpu_index);
}

View File

@@ -0,0 +1,688 @@
#ifndef CUDA_PBS_CUH_128
#define CUDA_PBS_CUH_128
#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#endif
#include "crypto/gadget.cuh"
#include "crypto/torus.cuh"
#include "device.h"
#include "fft128/fft128.cuh"
#include "pbs/bootstrapping_key.cuh"
#include "pbs/pbs_utilities.h"
#include "pbs/programmable_bootstrap.h"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial_math.cuh"
#include "types/complex/operations.cuh"
__device__ void print_u128(__uint128_t x) {
int8_t digits[40];
int i = 0;
do {
digits[i] = x % 10;
x /= 10;
i++;
} while (x);
for (int j = i - 1; j >= 0; j--) {
printf("%d", digits[j]);
}
}
template <typename Torus, class params, sharedMemDegree SMD>
__global__ void __launch_bounds__(params::degree / params::opt)
device_programmable_bootstrap_step_one_128(
const Torus *__restrict__ lut_vector,
const Torus *__restrict__ lut_vector_indexes,
const Torus *__restrict__ lwe_array_in,
const Torus *__restrict__ lwe_input_indexes,
const double *__restrict__ bootstrapping_key, Torus *global_accumulator,
double *global_join_buffer, uint32_t lwe_iteration,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *device_mem,
uint64_t device_memory_size_per_block) {
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
// much faster than global memory
extern __shared__ int8_t sharedmem[];
int8_t *selected_memory;
uint32_t glwe_dimension = gridDim.y - 1;
if constexpr (SMD == FULLSM) {
selected_memory = sharedmem;
} else {
int block_index = blockIdx.z + blockIdx.y * gridDim.z +
blockIdx.x * gridDim.z * gridDim.y;
selected_memory = &device_mem[block_index * device_memory_size_per_block];
}
Torus *accumulator = (Torus *)selected_memory;
double *accumulator_fft =
(double *)accumulator +
(ptrdiff_t)(sizeof(Torus) * polynomial_size / sizeof(double));
if constexpr (SMD == PARTIALSM)
accumulator_fft = (double *)sharedmem;
// The third dimension of the block is used to determine on which ciphertext
// this block is operating, in the case of batch bootstraps
const Torus *block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
const Torus *block_lut_vector =
&lut_vector[lut_vector_indexes[blockIdx.x] * params::degree *
(glwe_dimension + 1)];
Torus *global_slice =
global_accumulator +
(blockIdx.y + blockIdx.x * (glwe_dimension + 1)) * params::degree;
double *global_fft_slice =
global_join_buffer + (blockIdx.y + blockIdx.z * (glwe_dimension + 1) +
blockIdx.x * level_count * (glwe_dimension + 1)) *
(polynomial_size / 2) * 4;
if (lwe_iteration == 0) {
// First iteration
// Put "b" in [0, 2N[
Torus b_hat = 0;
modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
params::log2_degree + 1);
// The y-dimension is used to select the element of the GLWE this block will
// compute
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
false);
// Persist
int tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
global_slice[tid] = accumulator[tid];
tid += params::degree / params::opt;
}
// debug
__syncthreads();
if (threadIdx.x == 0 && blockIdx.x == 0 && blockIdx.y == 1 &&
blockIdx.z == 0) {
printf("after div: ");
for (int j = 0; j < params::degree; j++) {
print_u128(accumulator[j]);
printf(", ");
}
printf("\n");
printf("b_hat: %d\n", b_hat);
}
__syncthreads();
}
// Put "a" in [0, 2N[
Torus a_hat = 0;
modulus_switch(block_lwe_array_in[lwe_iteration], a_hat,
params::log2_degree + 1); // 2 * params::log2_degree + 1);
synchronize_threads_in_block();
// Perform ACC * (X^a_hat - 1)
multiply_by_monomial_negacyclic_and_sub_polynomial<
Torus, params::opt, params::degree / params::opt>(global_slice,
accumulator, a_hat);
// debug
__syncthreads();
if (threadIdx.x == 0 && blockIdx.x == 0 && blockIdx.y == 1 &&
blockIdx.z == 0 && lwe_iteration == 0) {
printf("after mul: ");
for (int j = 0; j < params::degree; j++) {
print_u128(accumulator[j]);
printf(", ");
}
printf("\n");
printf("a_hat: %d\n", a_hat);
}
__syncthreads();
// Perform a rounding to increase the accuracy of the
// bootstrapped ciphertext
init_decomposer_state_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, base_log, level_count);
synchronize_threads_in_block();
// Decompose the accumulator. Each block gets one level of the
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
gadget_acc.decompose_and_compress_level_128(accumulator_fft, blockIdx.z);
// We are using the same memory space for accumulator_fft and
// accumulator_rotated, so we need to synchronize here to make sure they
// don't modify the same memory space at the same time
// Switch to the FFT space
auto acc_fft_re_hi = accumulator_fft + 0LL * params::degree / 2;
auto acc_fft_re_lo = accumulator_fft + 1LL * params::degree / 2;
auto acc_fft_im_hi = accumulator_fft + 2LL * params::degree / 2;
auto acc_fft_im_lo = accumulator_fft + 3LL * params::degree / 2;
auto global_fft_re_hi = global_fft_slice + 0LL * params::degree / 2;
auto global_fft_re_lo = global_fft_slice + 1LL * params::degree / 2;
auto global_fft_im_hi = global_fft_slice + 2LL * params::degree / 2;
auto global_fft_im_lo = global_fft_slice + 3LL * params::degree / 2;
// debug
__syncthreads();
if (threadIdx.x == 0 && blockIdx.x == 0 && blockIdx.y == 1 &&
blockIdx.z == 0 && lwe_iteration == 0) {
printf("before_fft_re_hi: ");
for (int j = 0; j < params::degree / 2; j++) {
printf("%.5f, ", acc_fft_re_hi[j]);
}
printf("\n");
printf("before_fft_re_lo: ");
for (int j = 0; j < params::degree / 2; j++) {
printf("%.5f, ", acc_fft_re_lo[j]);
}
printf("\n");
printf("before_fft_im_hi: ");
for (int j = 0; j < params::degree / 2; j++) {
printf("%.5f, ", acc_fft_im_hi[j]);
}
printf("\n");
printf("before_fft_im_lo: ");
for (int j = 0; j < params::degree / 2; j++) {
printf("%.5f, ", acc_fft_im_lo[j]);
}
printf("\n");
}
__syncthreads();
negacyclic_forward_fft_f128<HalfDegree<params>>(acc_fft_re_hi, acc_fft_re_lo,
acc_fft_im_hi, acc_fft_im_lo);
// debug
__syncthreads();
if (threadIdx.x == 0 && blockIdx.x == 0 && blockIdx.y == 1 &&
blockIdx.z == 0 && lwe_iteration == 0) {
printf("after_fft_re_hi: ");
for (int j = 0; j < params::degree / 2; j++) {
printf("%.5f, ", acc_fft_re_hi[j]);
}
printf("\n");
printf("after_fft_re_lo: ");
for (int j = 0; j < params::degree / 2; j++) {
printf("%.5f, ", acc_fft_re_lo[j]);
}
printf("\n");
printf("after_fft_im_hi: ");
for (int j = 0; j < params::degree / 2; j++) {
printf("%.5f, ", acc_fft_im_hi[j]);
}
printf("\n");
printf("after_fft_im_lo: ");
for (int j = 0; j < params::degree / 2; j++) {
printf("%.5f, ", acc_fft_im_lo[j]);
}
printf("\n");
}
__syncthreads();
int tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
global_fft_re_hi[tid] = acc_fft_re_hi[tid];
global_fft_re_lo[tid] = acc_fft_re_lo[tid];
global_fft_im_hi[tid] = acc_fft_im_hi[tid];
global_fft_im_lo[tid] = acc_fft_im_lo[tid];
tid += params::degree / params::opt;
}
}
template <typename Torus, class params, sharedMemDegree SMD>
__global__ void __launch_bounds__(params::degree / params::opt)
device_programmable_bootstrap_step_two_128(
Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
const Torus *__restrict__ lut_vector,
const Torus *__restrict__ lut_vector_indexes,
const double *__restrict__ bootstrapping_key, Torus *global_accumulator,
double *global_join_buffer, uint32_t lwe_iteration,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *device_mem,
uint64_t device_memory_size_per_block, uint32_t num_many_lut,
uint32_t lut_stride) {
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
// much faster than global memory
extern __shared__ int8_t sharedmem[];
int8_t *selected_memory;
uint32_t glwe_dimension = gridDim.y - 1;
if constexpr (SMD == FULLSM) {
selected_memory = sharedmem;
} else {
int block_index = blockIdx.x + blockIdx.y * gridDim.x +
blockIdx.z * gridDim.x * gridDim.y;
selected_memory = &device_mem[block_index * device_memory_size_per_block];
}
// We always compute the pointer with most restrictive alignment to avoid
// alignment issues
Torus *accumulator = (Torus *)selected_memory;
double *accumulator_fft =
(double *)accumulator +
(ptrdiff_t)(sizeof(Torus) * params::degree / sizeof(double));
if constexpr (SMD == PARTIALSM)
accumulator_fft = (double *)sharedmem;
for (int level = 0; level < level_count; level++) {
double *global_fft_slice =
global_join_buffer + (level + blockIdx.x * level_count) *
(glwe_dimension + 1) * (params::degree / 2) *
4;
for (int j = 0; j < (glwe_dimension + 1); j++) {
double *fft = global_fft_slice + j * params::degree / 2 * 4;
// Get the bootstrapping key piece necessary for the multiplication
// It is already in the Fourier domain
auto bsk_slice = get_ith_mask_kth_block_128(
bootstrapping_key, lwe_iteration, j, level, polynomial_size,
glwe_dimension, level_count);
auto bsk_poly = bsk_slice + blockIdx.y * params::degree / 2 * 4;
// debug
__syncthreads();
if (threadIdx.x == 0 && blockIdx.x == 0 && blockIdx.y == 1 &&
blockIdx.z == 0 && lwe_iteration == 0) {
printf("fft_re_hi: ");
for (int j = 0; j < params::degree / 2; j++) {
printf("%.5f, ", fft[j]);
}
printf("\n");
printf("fft_re_lo: ");
for (int j = 0; j < params::degree / 2; j++) {
printf("%.5f, ", fft[params::degree / 2 + j]);
}
printf("\n");
printf("fft_im_hi: ");
for (int j = 0; j < params::degree / 2; j++) {
printf("%.5f, ", fft[2 * params::degree / 2 + j]);
}
printf("\n");
printf("fft_im_lo: ");
for (int j = 0; j < params::degree / 2; j++) {
printf("%.5f, ", fft[3 * params::degree / 2 + j]);
}
printf("\n");
}
__syncthreads();
// debug
__syncthreads();
if (threadIdx.x == 0 && blockIdx.x == 0 && blockIdx.y == 1 &&
blockIdx.z == 0 && lwe_iteration == 0) {
printf("bsk_poly_re_hi: ");
for (int j = 0; j < params::degree / 2; j++) {
printf("%.5f, ", bsk_poly[j]);
}
printf("\n");
printf("bsk_poly_re_lo: ");
for (int j = 0; j < params::degree / 2; j++) {
printf("%.5f, ", bsk_poly[params::degree / 2 + j]);
}
printf("\n");
printf("bsk_poly_im_hi: ");
for (int j = 0; j < params::degree / 2; j++) {
printf("%.5f, ", bsk_poly[2 * params::degree / 2 + j]);
}
printf("\n");
printf("bsk_poly_im_lo: ");
for (int j = 0; j < params::degree / 2; j++) {
printf("%.5f, ", bsk_poly[3 * params::degree / 2 + j]);
}
printf("\n");
}
__syncthreads();
polynomial_product_accumulate_in_fourier_domain_128<params>(
accumulator_fft, fft, bsk_poly, !level && !j);
}
}
Torus *global_slice =
global_accumulator +
(blockIdx.y + blockIdx.x * (glwe_dimension + 1)) * params::degree;
// Load the persisted accumulator
int tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
accumulator[tid] = global_slice[tid];
tid += params::degree / params::opt;
}
// Perform the inverse FFT on the result of the GGSW x GLWE and add to the
// accumulator
auto acc_fft_re_hi = accumulator_fft + 0LL * params::degree / 2;
auto acc_fft_re_lo = accumulator_fft + 1LL * params::degree / 2;
auto acc_fft_im_hi = accumulator_fft + 2LL * params::degree / 2;
auto acc_fft_im_lo = accumulator_fft + 3LL * params::degree / 2;
// debug
__syncthreads();
if (threadIdx.x == 0 && blockIdx.x == 0 && blockIdx.y == 1 &&
blockIdx.z == 0 && lwe_iteration == 0) {
printf("before_ifft_acc_fft_re_hi: ");
for (int j = 0; j < params::degree / 2; j++) {
printf("%.5f, ", acc_fft_re_hi[j]);
}
printf("\n");
printf("before_ifft_acc_fft_re_lo: ");
for (int j = 0; j < params::degree / 2; j++) {
printf("%.5f, ", acc_fft_re_lo[j]);
}
printf("\n");
printf("before_ifft_acc_fft_im_hi: ");
for (int j = 0; j < params::degree / 2; j++) {
printf("%.5f, ", acc_fft_im_hi[j]);
}
printf("\n");
printf("before_ifft_acc_fft_im_lo: ");
for (int j = 0; j < params::degree / 2; j++) {
printf("%.5f, ", acc_fft_im_lo[j]);
}
printf("\n");
}
__syncthreads();
negacyclic_backward_fft_f128<HalfDegree<params>>(
acc_fft_re_hi, acc_fft_re_lo, acc_fft_im_hi, acc_fft_im_lo);
add_to_torus_128<Torus, params>(acc_fft_re_hi, acc_fft_re_lo, acc_fft_im_hi,
acc_fft_im_lo, accumulator);
if (lwe_iteration + 1 == lwe_dimension) {
// Last iteration
auto block_lwe_array_out =
&lwe_array_out[lwe_output_indexes[blockIdx.x] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
if (blockIdx.y < glwe_dimension) {
// Perform a sample extract. At this point, all blocks have the result,
// but we do the computation at block 0 to avoid waiting for extra blocks,
// in case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
if (num_many_lut > 1) {
for (int i = 1; i < num_many_lut; i++) {
auto next_lwe_array_out =
lwe_array_out +
(i * gridDim.x * (glwe_dimension * polynomial_size + 1));
auto next_block_lwe_array_out =
&next_lwe_array_out[lwe_output_indexes[blockIdx.x] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
sample_extract_mask<Torus, params>(next_block_lwe_array_out,
accumulator, 1, i * lut_stride);
}
}
} else if (blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
if (num_many_lut > 1) {
for (int i = 1; i < num_many_lut; i++) {
auto next_lwe_array_out =
lwe_array_out +
(i * gridDim.x * (glwe_dimension * polynomial_size + 1));
auto next_block_lwe_array_out =
&next_lwe_array_out[lwe_output_indexes[blockIdx.x] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
sample_extract_body<Torus, params>(next_block_lwe_array_out,
accumulator, 0, i * lut_stride);
}
}
}
} else {
// Persist the updated accumulator
tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
global_slice[tid] = accumulator[tid];
tid += params::degree / params::opt;
}
}
}
template <typename params>
__host__ void scratch_programmable_bootstrap_128(
cudaStream_t stream, uint32_t gpu_index, pbs_buffer_128<CLASSICAL> **buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
uint64_t full_sm_step_one =
get_buffer_size_full_sm_programmable_bootstrap_step_one<__uint128_t>(
polynomial_size);
uint64_t full_sm_step_two =
get_buffer_size_full_sm_programmable_bootstrap_step_two<__uint128_t>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_programmable_bootstrap<__uint128_t>(
polynomial_size);
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
printf("full_sm_step_one: %llu\n", full_sm_step_one);
printf("full_sm_step_two: %llu\n", full_sm_step_two);
printf("partial_sm: %llu\n", partial_sm);
printf("max_shared_memory: %llu\n", max_shared_memory);
// Configure step one
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_one) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_step_one_128<__uint128_t, params,
PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_step_one_128<__uint128_t, params,
PARTIALSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory >= partial_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_step_one_128<__uint128_t, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_one));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_step_one_128<__uint128_t, params, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
// Configure step two
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_two) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_step_two_128<__uint128_t, params,
PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_step_two_128<__uint128_t, params,
PARTIALSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory >= partial_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_step_two_128<__uint128_t, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_two));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_step_two_128<__uint128_t, params, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
*buffer = new pbs_buffer_128<CLASSICAL>(
stream, gpu_index, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, PBS_VARIANT::DEFAULT, allocate_gpu_memory);
}
template <class params>
__host__ void execute_step_one_128(
cudaStream_t stream, uint32_t gpu_index, __uint128_t const *lut_vector,
__uint128_t const *lut_vector_indexes, __uint128_t const *lwe_array_in,
__uint128_t const *lwe_input_indexes, double const *bootstrapping_key,
__uint128_t *global_accumulator, double *global_join_buffer,
uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm,
uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm) {
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
cuda_set_device(gpu_index);
int thds = polynomial_size / params::opt;
dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1, level_count);
if (max_shared_memory < partial_sm) {
printf("step one NOSM\n");
device_programmable_bootstrap_step_one_128<__uint128_t, params, NOSM>
<<<grid, thds, 0, stream>>>(
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, global_accumulator, global_join_buffer,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, full_dm);
} else if (max_shared_memory < full_sm) {
printf("step one PARTIALSM\n");
device_programmable_bootstrap_step_one_128<__uint128_t, params, PARTIALSM>
<<<grid, thds, partial_sm, stream>>>(
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, global_accumulator, global_join_buffer,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, partial_dm);
} else {
// printf("step one FULLSM\n");
device_programmable_bootstrap_step_one_128<__uint128_t, params, FULLSM>
<<<grid, thds, full_sm, stream>>>(
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, global_accumulator, global_join_buffer,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, 0);
}
check_cuda_error(cudaGetLastError());
}
template <class params>
__host__ void execute_step_two_128(
cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
__uint128_t const *lwe_output_indexes, __uint128_t const *lut_vector,
__uint128_t const *lut_vector_indexes, double const *bootstrapping_key,
__uint128_t *global_accumulator, double *global_join_buffer,
uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm,
uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm,
uint32_t num_many_lut, uint32_t lut_stride) {
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
cuda_set_device(gpu_index);
int thds = polynomial_size / params::opt;
dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1);
if (max_shared_memory < partial_sm) {
printf("step two NOSM\n");
device_programmable_bootstrap_step_two_128<__uint128_t, params, NOSM>
<<<grid, thds, 0, stream>>>(
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
bootstrapping_key, global_accumulator, global_join_buffer,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, full_dm, num_many_lut, lut_stride);
} else if (max_shared_memory < full_sm) {
printf("step two PARTIALSM\n");
device_programmable_bootstrap_step_two_128<__uint128_t, params, PARTIALSM>
<<<grid, thds, partial_sm, stream>>>(
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
bootstrapping_key, global_accumulator, global_join_buffer,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, partial_dm, num_many_lut, lut_stride);
} else {
// printf("step two FULLSM\n");
device_programmable_bootstrap_step_two_128<__uint128_t, params, FULLSM>
<<<grid, thds, full_sm, stream>>>(
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
bootstrapping_key, global_accumulator, global_join_buffer,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, 0, num_many_lut, lut_stride);
}
check_cuda_error(cudaGetLastError());
}
/*
* Host wrapper to the programmable bootstrap 128
*/
template <class params>
__host__ void host_programmable_bootstrap_128(
cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
__uint128_t const *lwe_output_indexes, __uint128_t const *lut_vector,
__uint128_t const *lut_vector_indexes, __uint128_t const *lwe_array_in,
__uint128_t const *lwe_input_indexes, double const *bootstrapping_key,
pbs_buffer_128<CLASSICAL> *pbs_buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
uint32_t num_many_lut, uint32_t lut_stride) {
cuda_set_device(gpu_index);
// With SM each block corresponds to either the mask or body, no need to
// duplicate data for each
uint64_t full_sm_step_one =
get_buffer_size_full_sm_programmable_bootstrap_step_one<__uint128_t>(
polynomial_size);
uint64_t full_sm_step_two =
get_buffer_size_full_sm_programmable_bootstrap_step_two<__uint128_t>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_programmable_bootstrap<__uint128_t>(
polynomial_size);
uint64_t partial_dm_step_one = full_sm_step_one - partial_sm;
uint64_t partial_dm_step_two = full_sm_step_two - partial_sm;
uint64_t full_dm_step_one = full_sm_step_one;
uint64_t full_dm_step_two = full_sm_step_two;
__uint128_t *global_accumulator = pbs_buffer->global_accumulator;
double *global_join_buffer = pbs_buffer->global_join_buffer;
int8_t *d_mem = pbs_buffer->d_mem;
for (int i = 0; i < lwe_dimension; i++) {
execute_step_one_128<params>(
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, global_accumulator,
global_join_buffer, input_lwe_ciphertext_count, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, d_mem, i,
partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one);
execute_step_two_128<params>(
stream, gpu_index, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, bootstrapping_key, global_accumulator,
global_join_buffer, input_lwe_ciphertext_count, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, d_mem, i,
partial_sm, partial_dm_step_two, full_sm_step_two, full_dm_step_two,
num_many_lut, lut_stride);
}
}
#endif // TFHE_RS_BACKENDS_TFHE_CUDA_BACKEND_CUDA_SRC_PBS_PROGRAMMABLE_BOOTSTRAP_CLASSIC_128_CUH_

View File

@@ -3,6 +3,7 @@
#include "crypto/torus.cuh"
#include "device.h"
#include "fft128/f128.cuh"
#include "parameters.cuh"
// Return A if C == 0 and B if C == 1
@@ -172,6 +173,35 @@ __device__ void add_to_torus(double2 *m_values, Torus *result,
}
}
/**
* In case of classical PBS, this method should accumulate the result.
* In case of multi-bit PBS, it should overwrite.
*/
template <typename Torus, class params>
__device__ void add_to_torus_128(double *re_hi, double *re_lo, double *im_hi,
double *im_lo, Torus *result,
bool overwrite_result = false) {
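// Each 128-bit torus coefficient is rebuilt from a double-double (hi/lo) pair
// for the real and the imaginary part produced by the 128-bit inverse FFT.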
constexpr double normalization = 1. / (params::degree / 2);
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
f128 f128_real(re_hi[tid], re_lo[tid]);
f128 f128_imag(im_hi[tid], im_lo[tid]);
Torus torus_real = u128_from_torus_f128(f128_real);
Torus torus_imag = u128_from_torus_f128(f128_imag);
if (overwrite_result) {
result[tid] = torus_real;
result[tid + params::degree / 2] = torus_imag;
} else {
result[tid] += torus_real;
result[tid + params::degree / 2] += torus_imag;
}
tid = tid + params::degree / params::opt;
}
}
// Extracts the body of the nth-LWE in a GLWE.
template <typename Torus, class params>
__device__ void sample_extract_body(Torus *lwe_array_out, Torus const *glwe,

View File

@@ -56,6 +56,64 @@ __device__ void polynomial_product_accumulate_in_fourier_domain(
}
}
// Computes result += first * second
// If init_accumulator is set, assumes that result was not initialized and does
// that with the outcome of first * second
template <class params>
__device__ void polynomial_product_accumulate_in_fourier_domain_128(
double *result, double *first, const double *second,
bool init_accumulator = false) {
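// The 128-bit Fourier coefficients are stored as four contiguous blocks of
// degree/2 doubles: re_hi, re_lo, im_hi, im_lo (double-double complex layout).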
int tid = threadIdx.x;
if (init_accumulator) {
for (int i = 0; i < params::opt / 2; i++) {
f128 a_re(first[tid + 0ULL * params::degree / 2],
first[tid + 1ULL * params::degree / 2]);
f128 a_im(first[tid + 2ULL * params::degree / 2],
first[tid + 3ULL * params::degree / 2]);
f128 b_re(second[tid + 0ULL * params::degree / 2],
second[tid + 1ULL * params::degree / 2]);
f128 b_im(second[tid + 2ULL * params::degree / 2],
second[tid + 3ULL * params::degree / 2]);
f128 c_re, c_im;
f128::cplx_f128_mul_assign(c_re, c_im, a_re, a_im, b_re, b_im);
result[tid + 0ULL * params::degree / 2] = c_re.hi;
result[tid + 1ULL * params::degree / 2] = c_re.lo;
result[tid + 2ULL * params::degree / 2] = c_im.hi;
result[tid + 3ULL * params::degree / 2] = c_im.lo;
tid += params::degree / params::opt;
}
} else {
for (int i = 0; i < params::opt / 2; i++) {
f128 a_re(first[tid + 0ULL * params::degree / 2],
first[tid + 1ULL * params::degree / 2]);
f128 a_im(first[tid + 2ULL * params::degree / 2],
first[tid + 3ULL * params::degree / 2]);
f128 b_re(second[tid + 0ULL * params::degree / 2],
second[tid + 1ULL * params::degree / 2]);
f128 b_im(second[tid + 2ULL * params::degree / 2],
second[tid + 3ULL * params::degree / 2]);
f128 res_re(result[tid + 0ULL * params::degree / 2],
result[tid + 1ULL * params::degree / 2]);
f128 res_im(result[tid + 2ULL * params::degree / 2],
result[tid + 3ULL * params::degree / 2]);
f128 c_re, c_im;
f128::cplx_f128_mul_assign(c_re, c_im, a_re, a_im, b_re, b_im);
f128::cplx_f128_add_assign(res_re, res_im, res_re, res_im, c_re, c_im);
result[tid + 0ULL * params::degree / 2] = res_re.hi;
result[tid + 1ULL * params::degree / 2] = res_re.lo;
result[tid + 2ULL * params::degree / 2] = res_im.hi;
result[tid + 3ULL * params::degree / 2] = res_im.lo;
tid += params::degree / params::opt;
}
}
}
// Computes result += x
// If init_accumulator is set, assumes that result was not initialized and
// initializes it with x instead

View File

@@ -1431,6 +1431,18 @@ unsafe extern "C" {
polynomial_size: u32,
);
}
unsafe extern "C" {
pub fn cuda_convert_lwe_programmable_bootstrap_key_128(
stream: *mut ffi::c_void,
gpu_index: u32,
dest: *mut ffi::c_void,
src: *const ffi::c_void,
input_lwe_dim: u32,
glwe_dim: u32,
level_count: u32,
polynomial_size: u32,
);
}
unsafe extern "C" {
pub fn scratch_cuda_programmable_bootstrap_amortized_32(
stream: *mut ffi::c_void,
@@ -1524,6 +1536,18 @@ unsafe extern "C" {
allocate_gpu_memory: bool,
);
}
unsafe extern "C" {
pub fn scratch_cuda_programmable_bootstrap_128(
stream: *mut ffi::c_void,
gpu_index: u32,
buffer: *mut *mut i8,
glwe_dimension: u32,
polynomial_size: u32,
level_count: u32,
input_lwe_ciphertext_count: u32,
allocate_gpu_memory: bool,
);
}
unsafe extern "C" {
pub fn cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
stream: *mut ffi::c_void,
@@ -1568,6 +1592,28 @@ unsafe extern "C" {
lut_stride: u32,
);
}
unsafe extern "C" {
pub fn cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
stream: *mut ffi::c_void,
gpu_index: u32,
lwe_array_out: *mut ffi::c_void,
lwe_output_indexes: *const ffi::c_void,
lut_vector: *const ffi::c_void,
lut_vector_indexes: *const ffi::c_void,
lwe_array_in: *const ffi::c_void,
lwe_input_indexes: *const ffi::c_void,
bootstrapping_key: *const ffi::c_void,
buffer: *mut i8,
lwe_dimension: u32,
glwe_dimension: u32,
polynomial_size: u32,
base_log: u32,
level_count: u32,
num_samples: u32,
num_many_lut: u32,
lut_stride: u32,
);
}
unsafe extern "C" {
pub fn cleanup_cuda_programmable_bootstrap(
stream: *mut ffi::c_void,
@@ -1575,6 +1621,13 @@ unsafe extern "C" {
pbs_buffer: *mut *mut i8,
);
}
unsafe extern "C" {
pub fn cleanup_cuda_programmable_bootstrap_128(
stream: *mut ffi::c_void,
gpu_index: u32,
pbs_buffer: *mut *mut i8,
);
}
unsafe extern "C" {
pub fn has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
glwe_dimension: u32,

View File

@@ -321,9 +321,9 @@ pub const FFT_U64_PARAMS: FftTestParams<u64> = FftTestParams {
};
pub const FFT_U128_PARAMS: FftTestParams<u128> = FftTestParams {
lwe_dimension: LweDimension(742),
lwe_dimension: LweDimension(5),
glwe_dimension: GlweDimension(1),
polynomial_size: PolynomialSize(2048),
polynomial_size: PolynomialSize(256),
lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
0.00000000004998277131225527,
)),
@@ -348,6 +348,16 @@ pub const FFT128_U128_PARAMS: FftTestParams<u128> = FftTestParams {
ciphertext_modulus: CiphertextModulus::<u128>::new_native(),
};
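// Parameters used by the GPU 128-bit PBS test (lwe_programmable_bootstrapping_128).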
pub const FFT128_U128_GPU_PARAMS: FftTestParams<u128> = FftTestParams {
lwe_dimension: LweDimension(879),
glwe_dimension: GlweDimension(2),
polynomial_size: PolynomialSize(2048),
lwe_noise_distribution: DynamicDistribution::new_t_uniform(46),
glwe_noise_distribution: DynamicDistribution::new_t_uniform(30),
pbs_base_log: DecompositionBaseLog(32),
pbs_level: DecompositionLevelCount(3),
ciphertext_modulus: CiphertextModulus::new_native(),
};
pub const FFT_WOPBS_PARAMS: FftWopPbsTestParams<u64> = FftWopPbsTestParams {
lwe_dimension: LweDimension(481),
glwe_dimension: GlweDimension(1),

View File

@@ -256,6 +256,7 @@ where
ContLut: ContainerMut<Element = Scalar>,
ContLwe: Container<Element = Scalar>,
{
println!("rust#4");
fn implementation<Scalar: UnsignedTorus + CastInto<usize>>(
this: Fourier128LweBootstrapKey<&[f64]>,
mut lut: GlweCiphertext<&mut [Scalar]>,
@@ -270,7 +271,7 @@ where
let ciphertext_modulus = lut.ciphertext_modulus();
assert!(ciphertext_modulus.is_compatible_with_native_modulus());
let monomial_degree = pbs_modulus_switch(*lwe_body, lut_poly_size);
println!("monomial_degree: {:?}", monomial_degree);
lut.as_mut_polynomial_list()
.iter_mut()
.for_each(|mut poly| {
@@ -283,6 +284,7 @@ where
// We initialize the ct_0 used for the successive cmuxes
let mut ct0 = lut;
println!("ct0_after_div: {:?}", ct0.get_body());
for (lwe_mask_element, bootstrap_key_ggsw) in
izip!(lwe_mask.iter(), this.into_ggsw_iter())
{
@@ -304,11 +306,15 @@ where
MonomialDegree(pbs_modulus_switch(*lwe_mask_element, lut_poly_size)),
);
}
// println!("ct1_after_mul: {:?}", ct1.get_body());
// println!("MonomialDegree: {:?}", MonomialDegree(pbs_modulus_switch
// (*lwe_mask_element, lut_poly_size)));
// ct1 is re-created each loop it can be moved, ct0 is already a view, but
// as_mut_view is required to keep borrow rules consistent
cmux(&mut ct0, &mut ct1, &bootstrap_key_ggsw, fft, stack);
}
}
if !ciphertext_modulus.is_native_modulus() {
@@ -343,6 +349,8 @@ where
ContLweIn: Container<Element = Scalar>,
ContAcc: Container<Element = Scalar>,
{
println!("rust#2");
fn implementation<Scalar: UnsignedTorus + CastInto<usize>>(
this: Fourier128LweBootstrapKey<&[f64]>,
mut lwe_out: LweCiphertext<&mut [Scalar]>,
@@ -351,15 +359,17 @@ where
fft: Fft128View<'_>,
stack: &mut PodStack,
) {
// We type check dynamically with TypeId
#[allow(clippy::transmute_undefined_repr)]
if TypeId::of::<Scalar>() == TypeId::of::<u128>() {
let mut lwe_out: LweCiphertext<&mut [u128]> = unsafe { transmute(lwe_out) };
let lwe_in: LweCiphertext<&[u128]> = unsafe { transmute(lwe_in) };
let accumulator: GlweCiphertext<&[u128]> = unsafe { transmute(accumulator) };
println!("rust#3");
return this.bootstrap_u128(&mut lwe_out, &lwe_in, &accumulator, fft, stack);
}
let (local_accumulator_data, stack) =
stack.collect_aligned(CACHELINE_ALIGN, accumulator.as_ref().iter().copied());
@@ -446,6 +456,7 @@ where
ContLweIn: Container<Element = Scalar>,
ContAcc: Container<Element = Scalar>,
{
println!("rust#1");
self.bootstrap(lwe_out, lwe_in, accumulator, fft.as_view(), stack);
}

View File

@@ -328,6 +328,10 @@ where
fourier_im1,
coef_poly.as_ref(),
);
println!("re0: {:?}", fourier_re0);
println!("re1: {:?}", fourier_re1);
println!("im0: {:?}", fourier_im0);
println!("im1: {:?}", fourier_im1);
}
}
implementation(self.as_mut_view(), coef_ggsw.as_view(), fft);
@@ -461,6 +465,12 @@ pub fn add_external_product_assign<Scalar, ContOut, ContGgsw, ContGlwe>(
let (fourier_im0, stack) = stack.make_aligned_raw::<f64>(len, align);
let (fourier_im1, _) = stack.make_aligned_raw::<f64>(len, align);
// We perform the forward fft transform for the glwe polynomial
println!("before_fft_fourier_re0: {:?}", fourier_re0);
println!("before_fft_fourier_re1: {:?}", fourier_re1);
println!("before_fft_fourier_im0: {:?}", fourier_im0);
println!("before_fft_fourier_im1: {:?}", fourier_im1);
fft.forward_as_integer(
fourier_re0,
fourier_re1,
@@ -468,6 +478,12 @@ pub fn add_external_product_assign<Scalar, ContOut, ContGgsw, ContGlwe>(
fourier_im1,
glwe_poly.as_ref(),
);
println!("after_fft_fourier_re0: {:?}", fourier_re0);
println!("after_fft_fourier_re1: {:?}", fourier_re1);
println!("after_fft_fourier_im0: {:?}", fourier_im0);
println!("after_fft_fourier_im1: {:?}", fourier_im1);
// Now we loop through the polynomials of the output, and add the
// corresponding product of polynomials.
update_with_fmadd(
@@ -503,6 +519,10 @@ pub fn add_external_product_assign<Scalar, ContOut, ContGgsw, ContGlwe>(
output_fft_buffer_im0.into_chunks(fourier_poly_size),
output_fft_buffer_im1.into_chunks(fourier_poly_size),
) {
println!("before_ifft_fourier_re0: {:?}", fourier_re0);
println!("before_ifft_fourier_re1: {:?}", fourier_re1);
println!("before_ifft_fourier_im0: {:?}", fourier_im0);
println!("before_ifft_fourier_im1: {:?}", fourier_im1);
fft.add_backward_as_torus(
out.as_mut(),
fourier_re0,
@@ -678,6 +698,11 @@ pub fn update_with_fmadd(
ggsw_row.data_im0.into_chunks(fourier_poly_size),
ggsw_row.data_im1.into_chunks(fourier_poly_size),
) {
println!("ggsw_poly_re0: {:?}", ggsw_poly_re0);
println!("ggsw_poly_re1: {:?}", ggsw_poly_re1);
println!("ggsw_poly_im0: {:?}", ggsw_poly_im0);
println!("ggsw_poly_im1: {:?}", ggsw_poly_im1);
struct Impl<'a> {
output_fourier_re0: &'a mut [f64],
output_fourier_re1: &'a mut [f64],
@@ -784,6 +809,7 @@ pub fn cmux<Scalar, ContCt0, ContCt1, ContGgsw>(
for (c1, c0) in izip!(ct1.as_mut(), ct0.as_ref()) {
*c1 = c1.wrapping_sub(*c0);
}
println!("ct1_after_mul_sub: {:?}", ct1.get_body());
add_external_product_assign(&mut ct0, &ggsw, &ct1, fft, stack);
}

View File

@@ -174,6 +174,7 @@ where
fft: Fft128View<'_>,
stack: &mut PodStack,
) {
println!("rust#-1");
let align = CACHELINE_ALIGN;
let ciphertext_modulus = accumulator.ciphertext_modulus();

View File

@@ -0,0 +1,245 @@
pub(crate) use crate::core_crypto::algorithms::test::gen_keys_or_get_from_cache_if_enabled;
use crate::core_crypto::algorithms::test::{
FftBootstrapKeys, FftTestParams, TestResources, FFT_U128_PARAMS, FFT_U32_PARAMS, FFT128_U128_GPU_PARAMS,
};
use crate::core_crypto::fft_impl::common::FourierBootstrapKey;
use crate::core_crypto::fft_impl::fft128::crypto::bootstrap::Fourier128LweBootstrapKeyOwned;
use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
use crate::core_crypto::gpu::lwe_bootstrap_key::CudaLweBootstrapKey;
use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
use crate::core_crypto::gpu::vec::{CudaVec, GpuIndex};
use crate::core_crypto::gpu::{cuda_programmable_bootstrap_lwe_ciphertext, CudaStreams};
use crate::core_crypto::keycache::KeyCacheAccess;
use crate::core_crypto::prelude::*;
use dyn_stack::{GlobalPodBuffer, PodStack};
use itertools::Itertools;
use serde::de::DeserializeOwned;
use serde::Serialize;
pub fn generate_keys<
Scalar: UnsignedTorus + Sync + Send + CastFrom<usize> + CastInto<usize> + Serialize + DeserializeOwned,
>(
params: FftTestParams<Scalar>,
rsc: &mut TestResources,
) -> FftBootstrapKeys<Scalar> {
// Generate an LweSecretKey with binary coefficients
let small_lwe_sk =
LweSecretKey::generate_new_binary(params.lwe_dimension, &mut rsc.secret_random_generator);
// Generate a GlweSecretKey with binary coefficients
let glwe_sk = GlweSecretKey::generate_new_binary(
params.glwe_dimension,
params.polynomial_size,
&mut rsc.secret_random_generator,
);
// Create a copy of the GlweSecretKey re-interpreted as an LweSecretKey
let big_lwe_sk = glwe_sk.clone().into_lwe_secret_key();
let bsk = par_allocate_and_generate_new_lwe_bootstrap_key(
&small_lwe_sk,
&glwe_sk,
params.pbs_base_log,
params.pbs_level,
params.glwe_noise_distribution,
params.ciphertext_modulus,
&mut rsc.encryption_random_generator,
);
FftBootstrapKeys {
small_lwe_sk,
big_lwe_sk,
bsk,
}
}
pub fn execute_bootstrap_u128<Scalar, K>(params: FftTestParams<Scalar>)
where
Scalar: Numeric
+ UnsignedTorus
+ CastFrom<usize>
+ CastInto<usize>
+ Send
+ Sync
+ Serialize
+ DeserializeOwned,
K: FourierBootstrapKey<Scalar>,
FftTestParams<Scalar>: KeyCacheAccess<Keys = FftBootstrapKeys<Scalar>>,
{
let lwe_noise_distribution = params.lwe_noise_distribution;
let glwe_dimension = params.glwe_dimension;
let polynomial_size = params.polynomial_size;
let ciphertext_modulus = params.ciphertext_modulus;
let mut rsc = TestResources::new();
let fft = K::new_fft(polynomial_size);
let mut keys_gen = |params| generate_keys(params, &mut rsc);
let keys = gen_keys_or_get_from_cache_if_enabled(params, &mut keys_gen);
let (std_bootstrapping_key, small_lwe_sk, big_lwe_sk) =
(keys.bsk, keys.small_lwe_sk, keys.big_lwe_sk);
let output_lwe_dimension = big_lwe_sk.lwe_dimension();
// Create the empty bootstrapping key in the Fourier domain
let mut fourier_bsk = K::new(
std_bootstrapping_key.input_lwe_dimension(),
std_bootstrapping_key.polynomial_size(),
std_bootstrapping_key.glwe_size(),
std_bootstrapping_key.decomposition_base_log(),
std_bootstrapping_key.decomposition_level_count(),
);
fourier_bsk.fill_with_forward_fourier(
&std_bootstrapping_key,
&fft,
PodStack::new(&mut GlobalPodBuffer::new(
K::fill_with_forward_fourier_scratch(&fft).unwrap(),
)),
);
let gpu_index = 0;
let stream = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
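// Upload the standard bootstrapping key; from_lwe_bootstrap_key converts it to
// the GPU Fourier representation on the device.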
let d_bsk = CudaLweBootstrapKey::from_lwe_bootstrap_key(&std_bootstrapping_key, &stream);
// Our 4-bit message space
let message_modulus: Scalar = Scalar::ONE << 4;
// Our input message
let input_message: Scalar = 3usize.cast_into();
let number_of_messages = 1;
// Delta used to encode 4 bits of message + a bit of padding on Scalar
let delta: Scalar = (Scalar::ONE << (Scalar::BITS - 1)) / message_modulus;
// Apply our encoding
let plaintext = Plaintext(input_message * delta);
// Allocate a new LweCiphertext and encrypt our plaintext
let lwe_ciphertext_in: LweCiphertextOwned<Scalar> = allocate_and_encrypt_new_lwe_ciphertext(
&small_lwe_sk,
plaintext,
lwe_noise_distribution,
ciphertext_modulus,
&mut rsc.encryption_random_generator,
);
let f = |x: Scalar| x;
let accumulator: GlweCiphertextOwned<Scalar> = generate_programmable_bootstrap_glwe_lut(
polynomial_size,
glwe_dimension.to_glwe_size(),
message_modulus.cast_into(),
ciphertext_modulus,
delta,
f,
);
// Allocate the LweCiphertext to store the result of the PBS
let mut pbs_ct: LweCiphertext<Vec<Scalar>> = LweCiphertext::new(
Scalar::ZERO,
big_lwe_sk.lwe_dimension().to_lwe_size(),
ciphertext_modulus,
);
println!("Computing PBS...");
let d_lwe_ciphertext_in =
CudaLweCiphertextList::from_lwe_ciphertext(&lwe_ciphertext_in, &stream);
let mut d_out_pbs_ct = CudaLweCiphertextList::new(
output_lwe_dimension,
LweCiphertextCount(1),
ciphertext_modulus,
&stream,
);
let d_accumulator = CudaGlweCiphertextList::from_glwe_ciphertext(&accumulator, &stream);
let mut test_vector_indexes: Vec<Scalar> = vec![Scalar::ZERO; number_of_messages];
for (i, ind) in test_vector_indexes.iter_mut().enumerate() {
*ind = <usize as CastInto<Scalar>>::cast_into(i);
}
let mut d_test_vector_indexes =
unsafe { CudaVec::<Scalar>::new_async(number_of_messages, &stream, 0) };
unsafe { d_test_vector_indexes.copy_from_cpu_async(&test_vector_indexes, &stream, 0) };
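// Build identity input/output index mappings: ciphertext i is read from slot i
// and written back to slot i.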
let num_blocks = d_lwe_ciphertext_in.0.lwe_ciphertext_count.0;
let lwe_indexes_usize: Vec<usize> = (0..num_blocks).collect_vec();
let lwe_indexes = lwe_indexes_usize
.iter()
.map(|&x| <usize as CastInto<Scalar>>::cast_into(x))
.collect_vec();
let mut d_output_indexes = unsafe { CudaVec::<Scalar>::new_async(num_blocks, &stream, 0) };
let mut d_input_indexes = unsafe { CudaVec::<Scalar>::new_async(num_blocks, &stream, 0) };
unsafe {
d_input_indexes.copy_from_cpu_async(&lwe_indexes, &stream, 0);
d_output_indexes.copy_from_cpu_async(&lwe_indexes, &stream, 0);
}
cuda_programmable_bootstrap_lwe_ciphertext(
&d_lwe_ciphertext_in,
&mut d_out_pbs_ct,
&d_accumulator,
&d_test_vector_indexes,
&d_output_indexes,
&d_input_indexes,
LweCiphertextCount(num_blocks),
&d_bsk,
&stream,
);
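// Copy the GPU result back to the host for decryption.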
pbs_ct = d_out_pbs_ct.into_lwe_ciphertext(&stream);
// fourier_bsk.bootstrap(
// &mut pbs_ct,
// &lwe_ciphertext_in,
// &accumulator,
// &fft,
// PodStack::new(&mut GlobalPodBuffer::new(
// K::bootstrap_scratch(
// std_bootstrapping_key.glwe_size(),
// std_bootstrapping_key.polynomial_size(),
// &fft,
// )
// .unwrap(),
// )),
// );
// Decrypt the PBS result
let pbs_plaintext: Plaintext<Scalar> = decrypt_lwe_ciphertext(&big_lwe_sk, &pbs_ct);
// Create a SignedDecomposer to perform the rounding of the decrypted plaintext
// We pass a DecompositionBaseLog of 5 and a DecompositionLevelCount of 1 indicating we want
// to round the 5 MSB, 1 bit of padding plus our 4 bits of message
let signed_decomposer =
SignedDecomposer::new(DecompositionBaseLog(5), DecompositionLevelCount(1));
// Round and remove our encoding
let pbs_result: Scalar = signed_decomposer.closest_representable(pbs_plaintext.0) / delta;
println!("Checking result...");
assert_eq!(f(input_message), pbs_result);
}
#[test]
fn test_bootstrap_u128() {
execute_bootstrap_u128::<u128, Fourier128LweBootstrapKeyOwned>(FFT128_U128_GPU_PARAMS);
}

View File

@@ -7,6 +7,7 @@ mod lwe_linear_algebra;
mod lwe_multi_bit_programmable_bootstrapping;
mod lwe_packing_keyswitch;
mod lwe_programmable_bootstrapping;
mod lwe_programmable_bootstrapping_128;
mod noise_distribution;
pub struct CudaPackingKeySwitchKeys<Scalar: UnsignedInteger> {

View File

@@ -32,11 +32,22 @@ impl CudaLweBootstrapKey {
where
InputBskCont::Element: UnsignedInteger,
{
println!("#1");
let input_lwe_dimension = bsk.input_lwe_dimension();
let polynomial_size = bsk.polynomial_size();
let decomp_level_count = bsk.decomposition_level_count();
let decomp_base_log = bsk.decomposition_base_log();
let glwe_dimension = bsk.glwe_size().to_glwe_dimension();
let split_count = if size_of::<InputBskCont::Element>() == 16 {
2
} else {
1
};
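// A u128 key stores each complex Fourier coefficient as double-double (hi/lo)
// pairs, so it needs twice the f64 storage of a 64-bit key.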
println!("split_count: {:?}", split_count);
println!("input_lwe_dimension: {:?}", input_lwe_dimension);
println!("polynomial_size: {:?}", polynomial_size);
println!("decomp_level_count: {:?}", decomp_level_count);
println!("glwe_dimension: {:?}", glwe_dimension);
// Allocate memory
let mut d_vec = CudaVec::<f64>::new_multi_gpu(
@@ -45,10 +56,11 @@ impl CudaLweBootstrapKey {
glwe_dimension.to_glwe_size(),
polynomial_size,
decomp_level_count,
),
) * split_count,
streams,
);
// Copy to the GPU
unsafe {
convert_lwe_programmable_bootstrap_key_async(
streams,
@@ -59,7 +71,7 @@ impl CudaLweBootstrapKey {
decomp_level_count,
polynomial_size,
);
}
};
streams.synchronize();
Self {
d_vec,

View File

@@ -10,6 +10,7 @@ use crate::core_crypto::prelude::{
UnsignedInteger,
};
pub use algorithms::*;
use core::mem;
pub use entities::*;
use std::ffi::c_void;
use tfhe_cuda_backend::bindings::*;
@@ -121,41 +122,84 @@ pub unsafe fn programmable_bootstrap_async<T: UnsignedInteger>(
let num_many_lut = 1u32;
let lut_stride = 0u32;
let mut pbs_buffer: *mut i8 = std::ptr::null_mut();
scratch_cuda_programmable_bootstrap_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
level.0 as u32,
num_samples,
true,
);
cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
lwe_array_out.as_mut_c_ptr(0),
lwe_out_indexes.as_c_ptr(0),
test_vector.as_c_ptr(0),
test_vector_indexes.as_c_ptr(0),
lwe_array_in.as_c_ptr(0),
lwe_in_indexes.as_c_ptr(0),
bootstrapping_key.as_c_ptr(0),
pbs_buffer,
lwe_dimension.0 as u32,
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
base_log.0 as u32,
level.0 as u32,
num_samples,
num_many_lut,
lut_stride,
);
cleanup_cuda_programmable_bootstrap(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
);
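// 128-bit torus elements (u128, 16 bytes) go through the dedicated 128-bit
// scratch/PBS/cleanup entry points; all other widths use the 64-bit path.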
if size_of::<T>() == 16 {
scratch_cuda_programmable_bootstrap_128(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
level.0 as u32,
num_samples,
true,
);
cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
streams.ptr[0],
streams.gpu_indexes[0].get(),
lwe_array_out.as_mut_c_ptr(0),
lwe_out_indexes.as_c_ptr(0),
test_vector.as_c_ptr(0),
test_vector_indexes.as_c_ptr(0),
lwe_array_in.as_c_ptr(0),
lwe_in_indexes.as_c_ptr(0),
bootstrapping_key.as_c_ptr(0),
pbs_buffer,
lwe_dimension.0 as u32,
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
base_log.0 as u32,
level.0 as u32,
num_samples,
num_many_lut,
lut_stride,
);
cleanup_cuda_programmable_bootstrap_128(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
);
} else {
scratch_cuda_programmable_bootstrap_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
level.0 as u32,
num_samples,
true,
);
cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
lwe_array_out.as_mut_c_ptr(0),
lwe_out_indexes.as_c_ptr(0),
test_vector.as_c_ptr(0),
test_vector_indexes.as_c_ptr(0),
lwe_array_in.as_c_ptr(0),
lwe_in_indexes.as_c_ptr(0),
bootstrapping_key.as_c_ptr(0),
pbs_buffer,
lwe_dimension.0 as u32,
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
base_log.0 as u32,
level.0 as u32,
num_samples,
num_many_lut,
lut_stride,
);
cleanup_cuda_programmable_bootstrap(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
);
}
}
/// Programmable multi-bit bootstrap on a vector of LWE ciphertexts
@@ -342,18 +386,38 @@ pub unsafe fn convert_lwe_programmable_bootstrap_key_async<T: UnsignedInteger>(
polynomial_size: PolynomialSize,
) {
let size = std::mem::size_of_val(src);
println!("#1.5");
for (i, &stream_ptr) in streams.ptr.iter().enumerate() {
assert_eq!(dest.len() * std::mem::size_of::<T>(), size);
cuda_convert_lwe_programmable_bootstrap_key_64(
stream_ptr,
streams.gpu_indexes[i].get(),
dest.as_mut_c_ptr(i as u32),
src.as_ptr().cast(),
input_lwe_dim.0 as u32,
glwe_dim.0 as u32,
l_gadget.0 as u32,
polynomial_size.0 as u32,
);
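// Dispatch on the torus width: u128 keys use the dedicated 128-bit conversion
// entry point, everything else the 64-bit one.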
println!("#1.6");
println!("dest.len(): {:?}", dest.len());
println!("std::mem::size_of::<T>() {:?}", std::mem::size_of::<T>());
println!("size: {:?}", size);
if size_of::<T>() == 16 {
println!("#2");
cuda_convert_lwe_programmable_bootstrap_key_128(
stream_ptr,
streams.gpu_indexes[i].get(),
dest.as_mut_c_ptr(i as u32),
src.as_ptr().cast(),
input_lwe_dim.0 as u32,
glwe_dim.0 as u32,
l_gadget.0 as u32,
polynomial_size.0 as u32,
);
} else {
println!("#3");
cuda_convert_lwe_programmable_bootstrap_key_64(
stream_ptr,
streams.gpu_indexes[i].get(),
dest.as_mut_c_ptr(i as u32),
src.as_ptr().cast(),
input_lwe_dim.0 as u32,
glwe_dim.0 as u32,
l_gadget.0 as u32,
polynomial_size.0 as u32,
);
}
}
}