chore(gpu): fix some pke bugs

chore(gpu): reduce the pool size
chore(gpu): add drift to pke gpu tests
2026-01-14 09:08:06 -05:00 · 2025-06-19 12:22:39 +02:00 · 2025-06-17 13:20:58 +02:00 · 2025-06-16 17:36:11 +02:00 · 2025-06-16 15:17:54 +02:00 · 2025-06-16 12:27:27 +02:00
52 changed files with 10753 additions and 125 deletions
--- a/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
+++ b/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
@@ -37,6 +37,17 @@ void cuda_glwe_sample_extract_128(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *glwe_array_in, uint32_t const *nth_array, uint32_t num_nths,
    uint32_t lwe_per_glwe, uint32_t glwe_dimension, uint32_t polynomial_size);
+void cuda_modulus_switch_multi_bit_64(void *stream, uint32_t gpu_index,
+                                      void *lwe_array_out, void *lwe_array_in,
+                                      uint32_t size, uint32_t log_modulus,
+                                      uint32_t degree,
+                                      uint32_t grouping_factor);
+
+void cuda_modulus_switch_multi_bit_128(void *stream, uint32_t gpu_index,
+                                       void *lwe_array_out, void *lwe_array_in,
+                                       uint32_t size, uint32_t log_modulus,
+                                       uint32_t degree,
+                                       uint32_t grouping_factor);
 }

 #endif
--- a/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression.h
@@ -40,6 +40,11 @@ void cleanup_cuda_integer_compress_radix_ciphertext_64(
 void cleanup_cuda_integer_decompress_radix_ciphertext_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr_void);
-}

+void cuda_integer_extract_glwe_64(
+    void *const *streams, uint32_t const *gpu_indexes, void *lwe_array_out,
+    void const *glwe_list, uint32_t const glwe_index,
+    uint32_t const log_modulus, uint32_t const polynomial_size,
+    uint32_t const glwe_dimension, uint32_t const body_count);
+}
 #endif
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
@@ -590,6 +590,9 @@ void cleanup_cuda_apply_noise_squashing_kb(void *const *streams,
                                           uint32_t const *gpu_indexes,
                                           uint32_t gpu_count,
                                           int8_t **mem_ptr_void);
+void cuda_small_scalar_multiplication_integer_64_inplace(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    CudaRadixCiphertextFFI *lwe_array, uint64_t scalar);

 uint64_t scratch_cuda_sub_and_propagate_single_carry_kb_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
--- a/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h
+++ b/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h
@@ -65,6 +65,9 @@ void cuda_add_lwe_ciphertext_vector_plaintext_64(
    void const *lwe_array_in, const uint64_t plaintext_in,
    const uint32_t input_lwe_dimension,
    const uint32_t input_lwe_ciphertext_count);
+void cuda_sub_lwe_ciphertext_vector_plaintext_vector_64(
+    void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
+    void const *plaintext_array_in, const uint32_t input_lwe_dimension,
+    const uint32_t input_lwe_ciphertext_count);
 }
-
 #endif // CUDA_LINALG_H_
--- a/backends/tfhe-cuda-backend/cuda/include/zk/zk.h
+++ b/backends/tfhe-cuda-backend/cuda/include/zk/zk.h
@@ -10,8 +10,8 @@ extern "C" {
 void cuda_lwe_expand_64(void *const stream, uint32_t gpu_index,
                        void *lwe_array_out, const void *lwe_compact_array_in,
                        uint32_t lwe_dimension, uint32_t num_lwe,
-                        const uint32_t *lwe_compact_input_indexes,
-                        const uint32_t *output_body_id_per_compact_list);
+                        const void *lwe_compact_input_indexes,
+                        const void *output_body_id_per_compact_list);

 uint64_t scratch_cuda_expand_without_verification_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu
@@ -1,5 +1,6 @@
 #include "ciphertext.cuh"
 #include "polynomial/parameters.cuh"
+#include "torus.cuh"

 void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *stream,
                                                  uint32_t gpu_index,
@@ -142,3 +143,29 @@ void cuda_glwe_sample_extract_128(
          "N's are powers of two in the interval [256..4096].")
  }
 }
+
+void cuda_modulus_switch_multi_bit_64(void *stream, uint32_t gpu_index,
+                                      void *lwe_array_out, void *lwe_array_in,
+                                      uint32_t size, uint32_t log_modulus,
+                                      uint32_t degree,
+                                      uint32_t grouping_factor) {
+
+  host_modulus_switch_multi_bit<uint64_t>(
+      static_cast<cudaStream_t>(stream), gpu_index,
+      static_cast<uint64_t *>(lwe_array_out),
+      static_cast<uint64_t *>(lwe_array_in), size, log_modulus, degree,
+      grouping_factor);
+}
+
+void cuda_modulus_switch_multi_bit_128(void *stream, uint32_t gpu_index,
+                                       void *lwe_array_out, void *lwe_array_in,
+                                       uint32_t size, uint32_t log_modulus,
+                                       uint32_t degree,
+                                       uint32_t grouping_factor) {
+
+  host_modulus_switch_multi_bit<__uint128_t>(
+      static_cast<cudaStream_t>(stream), gpu_index,
+      static_cast<__uint128_t *>(lwe_array_out),
+      static_cast<__uint128_t *>(lwe_array_in), size, log_modulus, degree,
+      grouping_factor);
+}
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
@@ -295,4 +295,87 @@ __host__ void host_improve_noise_modulus_switch(
  check_cuda_error(cudaGetLastError());
 }

+template <typename Torus, class params>
+__device__ uint32_t calculates_monomial_degree(const Torus *lwe_array_group,
+                                               uint32_t ggsw_idx,
+                                               uint32_t grouping_factor) {
+  Torus x = 0;
+  for (int i = 0; i < grouping_factor; i++) {
+    uint32_t mask_position = grouping_factor - (i + 1);
+    int selection_bit = (ggsw_idx >> mask_position) & 1;
+    x += selection_bit * lwe_array_group[i];
+  }
+
+  return modulus_switch(x, params::log2_degree + 1);
+}
+
+template <typename Torus, class params>
+__global__ void
+modulus_switch_multi_bit(Torus *array_out, const Torus *array_in, int size,
+                         uint32_t log_modulus, uint32_t grouping_factor) {
+  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  if (tid < size) {
+    int num_monomials = 1 << grouping_factor;
+    int input_offset = tid * grouping_factor;
+    int output_offset = tid * (num_monomials - 1); // First monomial is skipped
+    for (int ggsw_idx = 1; ggsw_idx < num_monomials; ggsw_idx++) {
+      array_out[ggsw_idx - 1 + output_offset] =
+          calculates_monomial_degree<Torus, params>(&array_in[input_offset],
+                                                    ggsw_idx, grouping_factor);
+    }
+  }
+}
+
+template <typename Torus>
+__host__ void host_modulus_switch_multi_bit(
+    cudaStream_t stream, uint32_t gpu_index, Torus *array_out, Torus *array_in,
+    int size, uint32_t log_modulus, uint32_t degree, uint32_t grouping_factor) {
+  cudaSetDevice(gpu_index);
+  int multibit_size = size / grouping_factor;
+  int num_threads = 0, num_blocks = 0;
+  getNumBlocksAndThreads(multibit_size, 1024, num_blocks, num_threads);
+  switch (degree) {
+  case 256:
+    modulus_switch_multi_bit<Torus, Degree<256>>
+        <<<num_blocks, num_threads, 0, stream>>>(
+            array_out, array_in, multibit_size, log_modulus, grouping_factor);
+    break;
+  case 512:
+    modulus_switch_multi_bit<Torus, Degree<512>>
+        <<<num_blocks, num_threads, 0, stream>>>(
+            array_out, array_in, multibit_size, log_modulus, grouping_factor);
+    break;
+  case 1024:
+    modulus_switch_multi_bit<Torus, Degree<1024>>
+        <<<num_blocks, num_threads, 0, stream>>>(
+            array_out, array_in, multibit_size, log_modulus, grouping_factor);
+    break;
+  case 2048:
+    modulus_switch_multi_bit<Torus, Degree<2048>>
+        <<<num_blocks, num_threads, 0, stream>>>(
+            array_out, array_in, multibit_size, log_modulus, grouping_factor);
+    break;
+  case 4096:
+    modulus_switch_multi_bit<Torus, Degree<4096>>
+        <<<num_blocks, num_threads, 0, stream>>>(
+            array_out, array_in, multibit_size, log_modulus, grouping_factor);
+    break;
+  case 8192:
+    modulus_switch_multi_bit<Torus, Degree<8192>>
+        <<<num_blocks, num_threads, 0, stream>>>(
+            array_out, array_in, multibit_size, log_modulus, grouping_factor);
+    break;
+  case 16384:
+    modulus_switch_multi_bit<Torus, Degree<16384>>
+        <<<num_blocks, num_threads, 0, stream>>>(
+            array_out, array_in, multibit_size, log_modulus, grouping_factor);
+    break;
+  default:
+    PANIC("Cuda error: unsupported polynomial size. Supported "
+          "N's are powers of two in the interval [256..16384].")
+  };
+
+  check_cuda_error(cudaGetLastError());
+}
+
 #endif // CNCRT_TORUS_H
--- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu
@@ -89,3 +89,16 @@ void cleanup_cuda_integer_decompress_radix_ciphertext_64(
      (int_decompression<uint64_t> *)(*mem_ptr_void);
  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
 }
+
+void cuda_integer_extract_glwe_64(
+    void *const *streams, uint32_t const *gpu_indexes, void *lwe_array_out,
+    void const *glwe_list, uint32_t const glwe_index,
+    uint32_t const log_modulus, uint32_t const polynomial_size,
+    uint32_t const glwe_dimension, uint32_t const body_count) {
+  auto casted_streams = (cudaStream_t *)(streams);
+
+  host_extract_mem_alloc_free<uint64_t>(
+      casted_streams[0], gpu_indexes[0], static_cast<uint64_t *>(lwe_array_out),
+      static_cast<const uint64_t *>(glwe_list), glwe_index, log_modulus,
+      polynomial_size, glwe_dimension, body_count);
+}
--- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
@@ -231,6 +231,84 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
  check_cuda_error(cudaGetLastError());
 }

+/// Extracts the glwe_index-nth GLWE ciphertext
+/// This function does not require memory allocation
+template <typename Torus>
+__host__ void host_extract_mem_alloc_free(
+    cudaStream_t stream, uint32_t gpu_index, Torus *glwe_array_out,
+    Torus const *array_in, const uint32_t glwe_index,
+    const uint32_t log_modulus, const uint32_t polynomial_size,
+    const uint32_t glwe_dimension, const uint32_t body_count_in) {
+  if (array_in == glwe_array_out)
+    PANIC("Cuda error: Input and output must be different");
+
+  cuda_set_device(gpu_index);
+
+  auto glwe_ciphertext_size = (glwe_dimension + 1) * polynomial_size;
+  auto num_glwes = (body_count_in + polynomial_size - 1) / polynomial_size;
+
+  // Compressed length of the compressed GLWE we want to extract
+  uint32_t body_count = 0;
+  if (body_count_in % polynomial_size == 0)
+    body_count = polynomial_size;
+  else if (glwe_index == num_glwes - 1)
+    body_count = body_count_in % polynomial_size;
+  else
+    body_count = polynomial_size;
+
+  auto initial_out_len = glwe_dimension * polynomial_size + body_count;
+  // Calculates how many bits this particular GLWE shall use
+  auto number_bits_to_unpack = initial_out_len * log_modulus;
+  auto nbits = sizeof(Torus) * 8;
+  auto input_len = (number_bits_to_unpack + nbits - 1) / nbits;
+
+  // Calculates how many bits a full-packed GLWE shall use
+  number_bits_to_unpack = glwe_ciphertext_size * log_modulus;
+  auto len = (number_bits_to_unpack + nbits - 1) / nbits;
+  // Uses that length to set the input pointer
+  auto chunk_array_in = array_in + glwe_index * len;
+  // Ensure the tail of the GLWE is zeroed
+  if (initial_out_len < glwe_ciphertext_size) {
+    auto zeroed_slice = glwe_array_out + initial_out_len;
+    cuda_memset_async(glwe_array_out, 0,
+                      (glwe_ciphertext_size - initial_out_len) * sizeof(Torus),
+                      stream, gpu_index);
+  }
+
+  int num_blocks = 0, num_threads = 0;
+  getNumBlocksAndThreads(initial_out_len, 128, num_blocks, num_threads);
+  dim3 grid(num_blocks);
+  dim3 threads(num_threads);
+  extract<Torus><<<grid, threads, 0, stream>>>(glwe_array_out, chunk_array_in,
+                                               log_modulus, initial_out_len);
+
+  // uint32_t body_count = std::min(body_count_in, polynomial_size);
+  // auto initial_out_len = glwe_dimension * polynomial_size + body_count;
+
+  // auto compressed_glwe_accumulator_size =
+  //     (glwe_dimension + 1) * polynomial_size;
+
+  // auto number_bits_to_unpack = compressed_glwe_accumulator_size *
+  // log_modulus; auto nbits = sizeof(Torus) * 8;
+  // // number_bits_to_unpack.div_ceil(Scalar::BITS)
+  // auto input_len = (number_bits_to_unpack + nbits - 1) / nbits;
+
+  // // We assure the tail of the glwe is zeroed
+  // auto zeroed_slice = glwe_array_out + initial_out_len;
+  // cuda_memset_async(zeroed_slice, 0,
+  //                   (polynomial_size - body_count) * sizeof(Torus),
+  //                   streams[0], gpu_indexes[0]);
+  // int num_blocks = 0, num_threads = 0;
+  // getNumBlocksAndThreads(initial_out_len, 128, num_blocks, num_threads);
+  // dim3 grid(num_blocks);
+  // dim3 threads(num_threads);
+  // extract<Torus><<<grid, threads, 0, streams[0]>>>(glwe_array_out, array_in,
+  //                                                  glwe_index, log_modulus,
+  //                                                  input_len,
+  //                                                  initial_out_len);
+  check_cuda_error(cudaGetLastError());
+}
+
 template <typename Torus>
 __host__ void host_integer_decompress(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cu
@@ -139,3 +139,12 @@ void cleanup_cuda_integer_radix_scalar_mul_high_kb_64(

  mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);
 }
+
+void cuda_small_scalar_multiplication_integer_64_inplace(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    CudaRadixCiphertextFFI *lwe_array, uint64_t scalar) {
+
+  host_integer_small_scalar_mul_radix<uint64_t>((cudaStream_t *)(streams),
+                                                gpu_indexes, gpu_count,
+                                                lwe_array, lwe_array, scalar);
+}
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu
@@ -147,3 +147,44 @@ void cuda_add_lwe_ciphertext_vector_plaintext_64(
      static_cast<const uint64_t *>(lwe_array_in), plaintext_in,
      input_lwe_dimension, input_lwe_ciphertext_count);
 }
+
+/*
+ * Perform the subtraction of a u64 input LWE ciphertext vector with a u64 input
+ * plaintext vector.
+ * - `stream` is a void pointer to the Cuda stream to be used in the kernel
+ * launch
+ * - `gpu_index` is the index of the GPU to be used in the kernel launch
+ * - `lwe_array_out` is an array of size
+ * `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have
+ * been allocated on the GPU before calling this function, and that will hold
+ * the result of the computation.
+ * - `lwe_array_in` is the LWE ciphertext vector used as input, it should have
+ * been allocated and initialized before calling this function. It has the same
+ * size as the output array.
+ * - `plaintext_array_in` is the plaintext vector used as input, it should have
+ * been allocated and initialized before calling this function. It should be of
+ * size `input_lwe_ciphertext_count`.
+ * - `input_lwe_dimension` is the number of mask elements in the input and
+ * output LWE ciphertext vectors
+ * - `input_lwe_ciphertext_count` is the number of ciphertexts contained in the
+ * input LWE ciphertext vector, as well as in the output. It is also the number
+ * of plaintexts in the input plaintext vector.
+ *
+ * Each plaintext of the input plaintext vector is subtracted to the body of the
+ * corresponding LWE ciphertext in the LWE ciphertext vector. The result of the
+ * operation is stored in the output LWE ciphertext vector. The two input
+ * vectors are unchanged. This function is a wrapper to a device function that
+ * performs the operation on the GPU.
+ */
+void cuda_sub_lwe_ciphertext_vector_plaintext_vector_64(
+    void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
+    void const *plaintext_array_in, uint32_t input_lwe_dimension,
+    uint32_t input_lwe_ciphertext_count) {
+
+  host_subtraction_plaintext<uint64_t>(
+      static_cast<cudaStream_t>(stream), gpu_index,
+      static_cast<uint64_t *>(lwe_array_out),
+      static_cast<uint64_t *>(lwe_array_in),
+      static_cast<const uint64_t *>(plaintext_array_in), input_lwe_dimension,
+      input_lwe_ciphertext_count);
+}
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh
@@ -271,7 +271,8 @@ __host__ void host_subtraction(cudaStream_t stream, uint32_t gpu_index,
 }

 template <typename T>
-__global__ void radix_body_subtraction_inplace(T *lwe_ct, T *plaintext_input,
+__global__ void radix_body_subtraction_inplace(T *lwe_ct,
+                                               const T *plaintext_input,
                                               uint32_t input_lwe_dimension,
                                               uint32_t num_entries) {

@@ -288,7 +289,7 @@ __global__ void radix_body_subtraction_inplace(T *lwe_ct, T *plaintext_input,
 template <typename T>
 __host__ void host_subtraction_plaintext(cudaStream_t stream,
                                         uint32_t gpu_index, T *output,
-                                         T *lwe_input, T *plaintext_input,
+                                         T *lwe_input, const T *plaintext_input,
                                         uint32_t input_lwe_dimension,
                                         uint32_t input_lwe_ciphertext_count) {

--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic_128.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic_128.cu
@@ -141,7 +141,7 @@ void host_programmable_bootstrap_lwe_ciphertext_vector_128(
    PANIC("Cuda error (classical PBS): base log should be <= 64")

  // If the parameters contain noise reduction key, then apply it
-  if (ms_noise_reduction_key->num_zeros != 0) {
+  if (buffer->uses_noise_reduction) {
    uint32_t log_modulus = log2(polynomial_size) + 1;
    host_improve_noise_modulus_switch<InputTorus>(
        static_cast<cudaStream_t>(stream), gpu_index,
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
@@ -18,20 +18,6 @@
 #include "types/complex/operations.cuh"
 #include <vector>

-template <typename Torus, class params>
-__device__ uint32_t calculates_monomial_degree(const Torus *lwe_array_group,
-                                               uint32_t ggsw_idx,
-                                               uint32_t grouping_factor) {
-  Torus x = 0;
-  for (int i = 0; i < grouping_factor; i++) {
-    uint32_t mask_position = grouping_factor - (i + 1);
-    int selection_bit = (ggsw_idx >> mask_position) & 1;
-    x += selection_bit * lwe_array_group[i];
-  }
-
-  return modulus_switch(x, params::log2_degree + 1);
-}
-
 __device__ __forceinline__ int
 get_start_ith_ggsw_offset(uint32_t polynomial_size, int glwe_dimension,
                          uint32_t level_count) {
--- a/backends/tfhe-cuda-backend/cuda/src/zk/expand.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/zk/expand.cu
@@ -3,8 +3,8 @@
 void cuda_lwe_expand_64(void *const stream, uint32_t gpu_index,
                        void *lwe_array_out, const void *lwe_compact_array_in,
                        uint32_t lwe_dimension, uint32_t num_lwe,
-                        const uint32_t *lwe_compact_input_indexes,
-                        const uint32_t *output_body_id_per_compact_list) {
+                        const void *lwe_compact_input_indexes,
+                        const void *output_body_id_per_compact_list) {

  switch (lwe_dimension) {
  case 256:
@@ -12,49 +12,56 @@ void cuda_lwe_expand_64(void *const stream, uint32_t gpu_index,
        static_cast<cudaStream_t>(stream), gpu_index,
        static_cast<uint64_t *>(lwe_array_out),
        static_cast<const uint64_t *>(lwe_compact_array_in), num_lwe,
-        lwe_compact_input_indexes, output_body_id_per_compact_list);
+        static_cast<const uint32_t *>(lwe_compact_input_indexes),
+        static_cast<const uint32_t *>(output_body_id_per_compact_list));
    break;
  case 512:
    host_lwe_expand<uint64_t, AmortizedDegree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index,
        static_cast<uint64_t *>(lwe_array_out),
        static_cast<const uint64_t *>(lwe_compact_array_in), num_lwe,
-        lwe_compact_input_indexes, output_body_id_per_compact_list);
+        static_cast<const uint32_t *>(lwe_compact_input_indexes),
+        static_cast<const uint32_t *>(output_body_id_per_compact_list));
    break;
  case 1024:
    host_lwe_expand<uint64_t, AmortizedDegree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index,
        static_cast<uint64_t *>(lwe_array_out),
        static_cast<const uint64_t *>(lwe_compact_array_in), num_lwe,
-        lwe_compact_input_indexes, output_body_id_per_compact_list);
+        static_cast<const uint32_t *>(lwe_compact_input_indexes),
+        static_cast<const uint32_t *>(output_body_id_per_compact_list));
    break;
  case 2048:
    host_lwe_expand<uint64_t, AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index,
        static_cast<uint64_t *>(lwe_array_out),
        static_cast<const uint64_t *>(lwe_compact_array_in), num_lwe,
-        lwe_compact_input_indexes, output_body_id_per_compact_list);
+        static_cast<const uint32_t *>(lwe_compact_input_indexes),
+        static_cast<const uint32_t *>(output_body_id_per_compact_list));
    break;
  case 4096:
    host_lwe_expand<uint64_t, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index,
        static_cast<uint64_t *>(lwe_array_out),
        static_cast<const uint64_t *>(lwe_compact_array_in), num_lwe,
-        lwe_compact_input_indexes, output_body_id_per_compact_list);
+        static_cast<const uint32_t *>(lwe_compact_input_indexes),
+        static_cast<const uint32_t *>(output_body_id_per_compact_list));
    break;
  case 8192:
    host_lwe_expand<uint64_t, AmortizedDegree<8192>>(
        static_cast<cudaStream_t>(stream), gpu_index,
        static_cast<uint64_t *>(lwe_array_out),
        static_cast<const uint64_t *>(lwe_compact_array_in), num_lwe,
-        lwe_compact_input_indexes, output_body_id_per_compact_list);
+        static_cast<const uint32_t *>(lwe_compact_input_indexes),
+        static_cast<const uint32_t *>(output_body_id_per_compact_list));
    break;
  case 16384:
    host_lwe_expand<uint64_t, AmortizedDegree<16384>>(
        static_cast<cudaStream_t>(stream), gpu_index,
        static_cast<uint64_t *>(lwe_array_out),
        static_cast<const uint64_t *>(lwe_compact_array_in), num_lwe,
-        lwe_compact_input_indexes, output_body_id_per_compact_list);
+        static_cast<const uint32_t *>(lwe_compact_input_indexes),
+        static_cast<const uint32_t *>(output_body_id_per_compact_list));
    break;
  default:
    PANIC("CUDA error: lwe_dimension not supported."
--- a/backends/tfhe-cuda-backend/cuda/src/zk/zk.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/zk/zk.cuh
@@ -34,7 +34,8 @@ __host__ void host_expand_without_verification(
  if (sizeof(Torus) == 8) {
    cuda_lwe_expand_64(streams[0], gpu_indexes[0], expanded_lwes,
                       lwe_flattened_compact_array_in, lwe_dimension, num_lwes,
-                       d_lwe_compact_input_indexes, d_body_id_per_compact_list);
+                       (void *)d_lwe_compact_input_indexes,
+                       (void *)d_body_id_per_compact_list);

  } else
    PANIC("Cuda error: expand is only supported on 64 bits")
--- a/backends/tfhe-cuda-backend/src/bindings.rs
+++ b/backends/tfhe-cuda-backend/src/bindings.rs
@@ -74,6 +74,30 @@ unsafe extern "C" {
        polynomial_size: u32,
    );
 }
+unsafe extern "C" {
+    pub fn cuda_modulus_switch_multi_bit_64(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        lwe_array_out: *mut ffi::c_void,
+        lwe_array_in: *mut ffi::c_void,
+        size: u32,
+        log_modulus: u32,
+        degree: u32,
+        grouping_factor: u32,
+    );
+}
+unsafe extern "C" {
+    pub fn cuda_modulus_switch_multi_bit_128(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        lwe_array_out: *mut ffi::c_void,
+        lwe_array_in: *mut ffi::c_void,
+        size: u32,
+        log_modulus: u32,
+        degree: u32,
+        grouping_factor: u32,
+    );
+}
 pub const PBS_TYPE_MULTI_BIT: PBS_TYPE = 0;
 pub const PBS_TYPE_CLASSICAL: PBS_TYPE = 1;
 pub type PBS_TYPE = ffi::c_uint;
@@ -189,6 +213,19 @@ unsafe extern "C" {
        mem_ptr_void: *mut *mut i8,
    );
 }
+unsafe extern "C" {
+    pub fn cuda_integer_extract_glwe_64(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        lwe_array_out: *mut ffi::c_void,
+        glwe_list: *const ffi::c_void,
+        glwe_index: u32,
+        log_modulus: u32,
+        polynomial_size: u32,
+        glwe_dimension: u32,
+        body_count: u32,
+    );
+}
 pub const SHIFT_OR_ROTATE_TYPE_LEFT_SHIFT: SHIFT_OR_ROTATE_TYPE = 0;
 pub const SHIFT_OR_ROTATE_TYPE_RIGHT_SHIFT: SHIFT_OR_ROTATE_TYPE = 1;
 pub const SHIFT_OR_ROTATE_TYPE_LEFT_ROTATE: SHIFT_OR_ROTATE_TYPE = 2;
@@ -1425,6 +1462,15 @@ unsafe extern "C" {
        mem_ptr_void: *mut *mut i8,
    );
 }
+unsafe extern "C" {
+    pub fn cuda_small_scalar_multiplication_integer_64_inplace(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        lwe_array: *mut CudaRadixCiphertextFFI,
+        scalar: u64,
+    );
+}
 unsafe extern "C" {
    pub fn scratch_cuda_sub_and_propagate_single_carry_kb_64_inplace(
        streams: *const *mut ffi::c_void,
@@ -1485,8 +1531,8 @@ unsafe extern "C" {
        lwe_compact_array_in: *const ffi::c_void,
        lwe_dimension: u32,
        num_lwe: u32,
-        lwe_compact_input_indexes: *const u32,
-        output_body_id_per_compact_list: *const u32,
+        lwe_compact_input_indexes: *const ffi::c_void,
+        output_body_id_per_compact_list: *const ffi::c_void,
    );
 }
 unsafe extern "C" {
@@ -1770,6 +1816,17 @@ unsafe extern "C" {
        input_lwe_ciphertext_count: u32,
    );
 }
+unsafe extern "C" {
+    pub fn cuda_sub_lwe_ciphertext_vector_plaintext_vector_64(
+        stream: *mut ffi::c_void,
+        gpu_index: u32,
+        lwe_array_out: *mut ffi::c_void,
+        lwe_array_in: *mut ffi::c_void,
+        plaintext_array_in: *const ffi::c_void,
+        input_lwe_dimension: u32,
+        input_lwe_ciphertext_count: u32,
+    );
+}
 unsafe extern "C" {
    pub fn cuda_fourier_transform_forward_as_torus_f128_async(
        stream: *mut ffi::c_void,
--- a/tfhe-benchmark/benches/high_level_api/erc20.rs
+++ b/tfhe-benchmark/benches/high_level_api/erc20.rs
@@ -489,6 +489,7 @@ fn hpu_bench_transfer_throughput<FheType, F>(

 #[cfg(not(any(feature = "gpu", feature = "hpu")))]
 fn main() {
+    #[cfg(feature = "pbs-stats")]
    use crate::pbs_stats::print_transfer_pbs_counts;
    let params = benchmark::params_aliases::BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;

@@ -607,10 +608,11 @@ fn main() {
    c.final_summary();
 }

-#[cfg(feature = "gpu")]
 fn main() {
+    #[cfg(feature = "pbs-stats")]
    use crate::pbs_stats::print_transfer_pbs_counts;
    let params = benchmark::params_aliases::BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
+    let params = benchmark::params_aliases::BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;

    let config = tfhe::ConfigBuilder::with_custom_parameters(params).build();
    let cks = ClientKey::generate(config);
--- a/tfhe/docs/fhe-computation/advanced-features/zk-pok.md
+++ b/tfhe/docs/fhe-computation/advanced-features/zk-pok.md
@@ -16,6 +16,9 @@ Note that you need to use dedicated parameters for the compact public key encryp

 The following example shows how a client can encrypt and prove a ciphertext, and how a server can verify and compute the ciphertext:

+Note that you need to use dedicated parameters for the compact public key encryption. This helps to reduce the size of encrypted data and speed up the zero-knowledge proof computation.
+
+The following example shows how to generate a CRS and proofs for ZKV1. Compared to the previous example, only the parameters are changed:
 ```rust
 use rand::prelude::*;
 use tfhe::prelude::*;
--- a/tfhe/js_on_wasm_tests/test-hlapi-signed.js
+++ b/tfhe/js_on_wasm_tests/test-hlapi-signed.js
@@ -569,15 +569,15 @@ test("hlapi_compact_ciphertext_list_with_proof", (t) => {
    ShortintCompactPublicKeyEncryptionParametersName.PARAM_PKE_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
  );

-  let config = TfheConfigBuilder.default()
-    .use_custom_parameters(block_params)
-    .use_dedicated_compact_public_key_parameters(publicKeyParams)
-    .build();
+    let config = TfheConfigBuilder.default()
+        .use_custom_parameters(block_params)
+				.use_dedicated_compact_public_key_parameters(publicKeyParams)
+        .build();

  let clientKey = TfheClientKey.generate(config);
  let publicKey = TfheCompactPublicKey.new(clientKey);

-  let crs = CompactPkeCrs.from_config(config, 2 + 32 + 1 + 256);
+    let crs = CompactPkeCrs.from_config(config, 2 + 32 + 1 + 256);

  const compress = false; // We don't compress as it's too slow on wasm
  let serialized_pke_crs = crs.serialize(compress);
--- a/tfhe/src/core_crypto/commons/mod.rs
+++ b/tfhe/src/core_crypto/commons/mod.rs
@@ -131,6 +131,73 @@ pub mod test_tools {
        }
    }

+    pub fn mean_confidence_interval(
+        sample_count: f64,
+        measured_mean: f64,
+        measured_std_dev: StandardDev,
+        probability_to_be_in_the_interval: f64,
+    ) -> MeanConfidenceInterval {
+        let standard_score = core::f64::consts::SQRT_2
+            * statrs::function::erf::erfc_inv(1.0 - probability_to_be_in_the_interval);
+        let interval_delta = standard_score * measured_std_dev.0 / f64::sqrt(sample_count);
+
+        let lower_bound = measured_mean - interval_delta;
+        let upper_bound = measured_mean + interval_delta;
+
+        assert!(lower_bound <= upper_bound);
+
+        MeanConfidenceInterval {
+            lower_bound,
+            upper_bound,
+        }
+    }
+
+    /// Return a MeanConfidenceInterval when you cannot evaluate the standard deviation of a random
+    /// variable
+    pub fn clopper_pearson_exact_confidence_interval(
+        sample_count: f64,
+        measured_fails: f64,
+        confidence_level: f64,
+    ) -> MeanConfidenceInterval {
+        // import scipy.stats as stats
+        // import math
+        //
+        // # Parameters
+        // n_tests = 10000  # Number of trials
+        // p_fail = 2**-5.5  # Theoretical probability of failure
+        // alpha = 0.01  # Significance level (1 - confidence level)
+        //
+        // # /!\ to be replaced by the experimental number of failures
+        // observed_failures = round(n_tests * p_fail)
+        //
+        // # Clopper-Pearson Exact Confidence Interval
+        // lower_bound = stats.beta.ppf(alpha / 2, observed_failures, n_tests - observed_failures +
+        // 1) upper_bound = stats.beta.ppf(1 - alpha / 2, observed_failures + 1, n_tests -
+        // observed_failures)
+        //
+        // print("Observed number of failures (k):", observed_failures)
+        // print(f"Confidence Interval ({(1-alpha)*100}%): [2^{round(math.log2(lower_bound),3)},
+        // 2^{round(math.log2(upper_bound),3)}]")
+
+        let alpha = 1.0 - confidence_level;
+        let beta_distribution_lower_bound =
+            statrs::distribution::Beta::new(measured_fails, sample_count - measured_fails + 1.0)
+                .unwrap();
+        let beta_distribution_upper_bound =
+            statrs::distribution::Beta::new(measured_fails + 1.0, sample_count - measured_fails)
+                .unwrap();
+
+        let lower_bound = beta_distribution_lower_bound.inverse_cdf(alpha / 2.0);
+        let upper_bound = beta_distribution_upper_bound.inverse_cdf(1.0 - alpha / 2.0);
+
+        assert!(lower_bound <= upper_bound);
+
+        MeanConfidenceInterval {
+            lower_bound,
+            upper_bound,
+        }
+    }
+
    pub fn variance(samples: &[f64]) -> Variance {
        let sample_count = samples.len();

@@ -202,6 +269,32 @@ pub mod test_tools {
        }
    }

+    pub fn variance_confidence_interval(
+        sample_count: f64,
+        measured_variance: Variance,
+        probability_to_be_in_the_interval: f64,
+    ) -> VarianceConfidenceInterval {
+        assert!(probability_to_be_in_the_interval >= 0.0);
+        assert!(probability_to_be_in_the_interval <= 1.0);
+        let alpha = 1.0 - probability_to_be_in_the_interval;
+        let degrees_of_freedom = sample_count - 1.0;
+        let chi2 = ChiSquared::new(degrees_of_freedom).unwrap();
+        let chi2_lower = chi2.inverse_cdf(alpha / 2.0);
+        let chi2_upper = chi2.inverse_cdf(1.0 - alpha / 2.0);
+
+        // Lower bound is divided by Chi_right^2 so by chi2_upper, upper bound divided by Chi_left^2
+        // so chi2_lower
+        let lower_bound = Variance(degrees_of_freedom * measured_variance.0 / chi2_upper);
+        let upper_bound = Variance(degrees_of_freedom * measured_variance.0 / chi2_lower);
+
+        assert!(lower_bound <= upper_bound);
+
+        VarianceConfidenceInterval {
+            lower_bound,
+            upper_bound,
+        }
+    }
+
    pub fn new_random_generator() -> RandomGenerator<DefaultRandomGenerator> {
        RandomGenerator::new(random_seed())
    }
--- a/tfhe/src/core_crypto/commons/noise_formulas/generalized_modulus_switch.rs
+++ b/tfhe/src/core_crypto/commons/noise_formulas/generalized_modulus_switch.rs
@@ -0,0 +1,33 @@
+// This file was autogenerated, do not modify by hand.
+use crate::core_crypto::commons::dispersion::Variance;
+use crate::core_crypto::commons::parameters::*;
+
+/// This formula is only valid when going from a larger to a smaller modulus
+/// This formula is based on a heuristic, so may not always be valid
+#[allow(dead_code)]
+pub(crate) fn generalized_modulus_switch_additive_variance(
+    input_lwe_dimension: LweDimension,
+    modulus: f64,
+    new_modulus: f64,
+) -> Variance {
+    Variance(generalized_modulus_switch_additive_variance_impl(
+        input_lwe_dimension.0 as f64,
+        modulus,
+        new_modulus,
+    ))
+}
+
+/// This formula is only valid when going from a larger to a smaller modulus
+/// This formula is based on a heuristic, so may not always be valid
+#[allow(dead_code)]
+pub(crate) fn generalized_modulus_switch_additive_variance_impl(
+    input_lwe_dimension: f64,
+    modulus: f64,
+    new_modulus: f64,
+) -> f64 {
+    (1_f64 / 2.0)
+        * input_lwe_dimension
+        * (0.0208333333333333 * modulus.powf(-2.0) + 0.0416666666666667 * new_modulus.powf(-2.0))
+        + (1_f64 / 12.0) * modulus.powf(-2.0)
+        + (1_f64 / 24.0) * new_modulus.powf(-2.0)
+}
--- a/tfhe/src/core_crypto/commons/noise_formulas/lwe_keyswitch.rs
+++ b/tfhe/src/core_crypto/commons/noise_formulas/lwe_keyswitch.rs
@@ -33,12 +33,60 @@ pub fn keyswitch_additive_variance_132_bits_security_gaussian_impl(
    decomposition_level_count: f64,
    modulus: f64,
 ) -> f64 {
-    (1_f64 / 3.0)
-        * decomposition_level_count
+    decomposition_level_count
        * input_lwe_dimension
-        * ((5.31469187675068 - 0.0497829131652661 * output_lwe_dimension).exp2()
-            + 16.0 * modulus.powf(-2.0))
-        * ((1_f64 / 4.0) * decomposition_base.powf(2.0) + 0.5)
+        * ((4.0 - 2.88539008177793 * modulus.ln()).exp2()
+            + (5.31469187675068 - 0.0497829131652661 * output_lwe_dimension).exp2())
+        * ((1_f64 / 12.0) * decomposition_base.powf(2.0) + 0.166666666666667)
+        + input_lwe_dimension
+            * (0.0208333333333333 * modulus.powf(-2.0)
+                + 0.0416666666666667 * decomposition_base.powf(-2.0 * decomposition_level_count))
+}
+
+/// This formula is only valid if the proper noise distributions are used and
+/// if the keys used are encrypted using secure noise given by the
+/// [`minimal_glwe_variance`](`super::secure_noise`)
+/// and [`minimal_lwe_variance`](`super::secure_noise`) family of functions.
+pub fn keyswitch_additive_variance_132_bits_security_tuniform(
+    input_lwe_dimension: LweDimension,
+    output_lwe_dimension: LweDimension,
+    decomposition_base_log: DecompositionBaseLog,
+    decomposition_level_count: DecompositionLevelCount,
+    modulus: f64,
+) -> Variance {
+    Variance(keyswitch_additive_variance_132_bits_security_tuniform_impl(
+        input_lwe_dimension.0 as f64,
+        output_lwe_dimension.0 as f64,
+        2.0f64.powi(decomposition_base_log.0 as i32),
+        decomposition_level_count.0 as f64,
+        modulus,
+    ))
+}
+
+/// This formula is only valid if the proper noise distributions are used and
+/// if the keys used are encrypted using secure noise given by the
+/// [`minimal_glwe_variance`](`super::secure_noise`)
+/// and [`minimal_lwe_variance`](`super::secure_noise`) family of functions.
+pub fn keyswitch_additive_variance_132_bits_security_tuniform_impl(
+    input_lwe_dimension: f64,
+    output_lwe_dimension: f64,
+    decomposition_base: f64,
+    decomposition_level_count: f64,
+    modulus: f64,
+) -> f64 {
+    decomposition_level_count
+        * input_lwe_dimension
+        * ((4.0 - 2.88539008177793 * modulus.ln()).exp2()
+            + (1_f64 / 3.0)
+                * modulus.powf(-2.0)
+                * ((2.0
+                    * (-0.025167785 * output_lwe_dimension
+                        + 1.44269504088896 * modulus.ln()
+                        + 4.10067100000001)
+                        .ceil())
+                .exp2()
+                    + 0.5))
+        * ((1_f64 / 12.0) * decomposition_base.powf(2.0) + 0.166666666666667)
        + input_lwe_dimension
            * (0.0208333333333333 * modulus.powf(-2.0)
                + 0.0416666666666667 * decomposition_base.powf(-2.0 * decomposition_level_count))
--- a/tfhe/src/core_crypto/commons/noise_formulas/lwe_packing_keyswitch.rs
+++ b/tfhe/src/core_crypto/commons/noise_formulas/lwe_packing_keyswitch.rs
@@ -0,0 +1,115 @@
+// This file was autogenerated, do not modify by hand.
+use crate::core_crypto::commons::dispersion::Variance;
+use crate::core_crypto::commons::parameters::*;
+
+/// This formula is only valid if the proper noise distributions are used and
+/// if the keys used are encrypted using secure noise given by the
+/// [`minimal_glwe_variance`](`super::secure_noise`)
+/// and [`minimal_lwe_variance`](`super::secure_noise`) family of functions.
+pub fn packing_keyswitch_additive_variance_132_bits_security_gaussian(
+    input_lwe_dimension: LweDimension,
+    output_glwe_dimension: GlweDimension,
+    output_polynomial_size: PolynomialSize,
+    decomposition_base_log: DecompositionBaseLog,
+    decomposition_level_count: DecompositionLevelCount,
+    lwe_to_pack: f64,
+    modulus: f64,
+) -> Variance {
+    Variance(
+        packing_keyswitch_additive_variance_132_bits_security_gaussian_impl(
+            input_lwe_dimension.0 as f64,
+            output_glwe_dimension.0 as f64,
+            output_polynomial_size.0 as f64,
+            2.0f64.powi(decomposition_base_log.0 as i32),
+            decomposition_level_count.0 as f64,
+            lwe_to_pack,
+            modulus,
+        ),
+    )
+}
+
+/// This formula is only valid if the proper noise distributions are used and
+/// if the keys used are encrypted using secure noise given by the
+/// [`minimal_glwe_variance`](`super::secure_noise`)
+/// and [`minimal_lwe_variance`](`super::secure_noise`) family of functions.
+pub fn packing_keyswitch_additive_variance_132_bits_security_gaussian_impl(
+    input_lwe_dimension: f64,
+    output_glwe_dimension: f64,
+    output_polynomial_size: f64,
+    decomposition_base: f64,
+    decomposition_level_count: f64,
+    lwe_to_pack: f64,
+    modulus: f64,
+) -> f64 {
+    decomposition_level_count
+        * input_lwe_dimension
+        * lwe_to_pack
+        * ((4.0 - 2.88539008177793 * modulus.ln()).exp2()
+            + (-0.0497829131652661 * output_glwe_dimension * output_polynomial_size
+                + 5.31469187675068)
+                .exp2())
+        * ((1_f64 / 12.0) * decomposition_base.powf(2.0) + 0.166666666666667)
+        + 0.5
+            * input_lwe_dimension
+            * ((1_f64 / 6.0) * modulus.powf(-2.0)
+                + (1_f64 / 12.0) * decomposition_base.powf(-2.0 * decomposition_level_count))
+}
+
+/// This formula is only valid if the proper noise distributions are used and
+/// if the keys used are encrypted using secure noise given by the
+/// [`minimal_glwe_variance`](`super::secure_noise`)
+/// and [`minimal_lwe_variance`](`super::secure_noise`) family of functions.
+pub fn packing_keyswitch_additive_variance_132_bits_security_tuniform(
+    input_lwe_dimension: LweDimension,
+    output_glwe_dimension: GlweDimension,
+    output_polynomial_size: PolynomialSize,
+    decomposition_base_log: DecompositionBaseLog,
+    decomposition_level_count: DecompositionLevelCount,
+    lwe_to_pack: f64,
+    modulus: f64,
+) -> Variance {
+    Variance(
+        packing_keyswitch_additive_variance_132_bits_security_tuniform_impl(
+            input_lwe_dimension.0 as f64,
+            output_glwe_dimension.0 as f64,
+            output_polynomial_size.0 as f64,
+            2.0f64.powi(decomposition_base_log.0 as i32),
+            decomposition_level_count.0 as f64,
+            lwe_to_pack,
+            modulus,
+        ),
+    )
+}
+
+/// This formula is only valid if the proper noise distributions are used and
+/// if the keys used are encrypted using secure noise given by the
+/// [`minimal_glwe_variance`](`super::secure_noise`)
+/// and [`minimal_lwe_variance`](`super::secure_noise`) family of functions.
+pub fn packing_keyswitch_additive_variance_132_bits_security_tuniform_impl(
+    input_lwe_dimension: f64,
+    output_glwe_dimension: f64,
+    output_polynomial_size: f64,
+    decomposition_base: f64,
+    decomposition_level_count: f64,
+    lwe_to_pack: f64,
+    modulus: f64,
+) -> f64 {
+    decomposition_level_count
+        * input_lwe_dimension
+        * lwe_to_pack
+        * ((4.0 - 2.88539008177793 * modulus.ln()).exp2()
+            + (1_f64 / 3.0)
+                * modulus.powf(-2.0)
+                * ((2.0
+                    * (-0.025167785 * output_glwe_dimension * output_polynomial_size
+                        + 1.44269504088896 * modulus.ln()
+                        + 4.10067100000001)
+                        .ceil())
+                .exp2()
+                    + 0.5))
+        * ((1_f64 / 12.0) * decomposition_base.powf(2.0) + 0.166666666666667)
+        + 0.5
+            * input_lwe_dimension
+            * ((1_f64 / 6.0) * modulus.powf(-2.0)
+                + (1_f64 / 12.0) * decomposition_base.powf(-2.0 * decomposition_level_count))
+}
--- a/tfhe/src/core_crypto/commons/noise_formulas/lwe_programmable_bootstrap.rs
+++ b/tfhe/src/core_crypto/commons/noise_formulas/lwe_programmable_bootstrap.rs
@@ -28,6 +28,7 @@ pub fn pbs_variance_132_bits_security_gaussian(
 /// if the keys used are encrypted using secure noise given by the
 /// [`minimal_glwe_variance`](`super::secure_noise`)
 /// and [`minimal_lwe_variance`](`super::secure_noise`) family of functions.
+#[allow(clippy::suspicious_operation_groupings)]
 pub fn pbs_variance_132_bits_security_gaussian_impl(
    input_lwe_dimension: f64,
    output_glwe_dimension: f64,
@@ -42,14 +43,78 @@ pub fn pbs_variance_132_bits_security_gaussian_impl(
            * decomposition_level_count
            * output_polynomial_size.powf(2.0)
            * (output_glwe_dimension + 1.0)
-            + (1_f64 / 3.0)
-                * decomposition_level_count
+            + decomposition_level_count
                * output_polynomial_size
-                * ((-0.0497829131652661 * output_glwe_dimension * output_polynomial_size
-                    + 5.31469187675068)
-                    .exp2()
-                    + 16.0 * modulus.powf(-2.0))
-                * ((1_f64 / 4.0) * decomposition_base.powf(2.0) + 0.5)
+                * ((4.0 - 2.88539008177793 * modulus.ln()).exp2()
+                    + (-0.0497829131652661 * output_glwe_dimension * output_polynomial_size
+                        + 5.31469187675068)
+                        .exp2())
+                * ((1_f64 / 12.0) * decomposition_base.powf(2.0) + 0.166666666666667)
+                * (output_glwe_dimension + 1.0)
+            + (1_f64 / 12.0) * modulus.powf(-2.0)
+            + (1_f64 / 2.0)
+                * output_glwe_dimension
+                * output_polynomial_size
+                * (0.0208333333333333 * modulus.powf(-2.0)
+                    + 0.0416666666666667
+                        * decomposition_base.powf(-2.0 * decomposition_level_count))
+            + (1_f64 / 24.0) * decomposition_base.powf(-2.0 * decomposition_level_count))
+}
+
+/// This formula is only valid if the proper noise distributions are used and
+/// if the keys used are encrypted using secure noise given by the
+/// [`minimal_glwe_variance`](`super::secure_noise`)
+/// and [`minimal_lwe_variance`](`super::secure_noise`) family of functions.
+pub fn pbs_variance_132_bits_security_tuniform(
+    input_lwe_dimension: LweDimension,
+    output_glwe_dimension: GlweDimension,
+    output_polynomial_size: PolynomialSize,
+    decomposition_base_log: DecompositionBaseLog,
+    decomposition_level_count: DecompositionLevelCount,
+    modulus: f64,
+) -> Variance {
+    Variance(pbs_variance_132_bits_security_tuniform_impl(
+        input_lwe_dimension.0 as f64,
+        output_glwe_dimension.0 as f64,
+        output_polynomial_size.0 as f64,
+        2.0f64.powi(decomposition_base_log.0 as i32),
+        decomposition_level_count.0 as f64,
+        modulus,
+    ))
+}
+
+/// This formula is only valid if the proper noise distributions are used and
+/// if the keys used are encrypted using secure noise given by the
+/// [`minimal_glwe_variance`](`super::secure_noise`)
+/// and [`minimal_lwe_variance`](`super::secure_noise`) family of functions.
+#[allow(clippy::suspicious_operation_groupings)]
+pub fn pbs_variance_132_bits_security_tuniform_impl(
+    input_lwe_dimension: f64,
+    output_glwe_dimension: f64,
+    output_polynomial_size: f64,
+    decomposition_base: f64,
+    decomposition_level_count: f64,
+    modulus: f64,
+) -> f64 {
+    input_lwe_dimension
+        * (2.06537277069845e-33
+            * decomposition_base.powf(2.0)
+            * decomposition_level_count
+            * output_polynomial_size.powf(2.0)
+            * (output_glwe_dimension + 1.0)
+            + decomposition_level_count
+                * output_polynomial_size
+                * ((4.0 - 2.88539008177793 * modulus.ln()).exp2()
+                    + (1_f64 / 3.0)
+                        * modulus.powf(-2.0)
+                        * ((2.0
+                            * (-0.025167785 * output_glwe_dimension * output_polynomial_size
+                                + 1.44269504088896 * modulus.ln()
+                                + 4.10067100000001)
+                                .ceil())
+                        .exp2()
+                            + 0.5))
+                * ((1_f64 / 12.0) * decomposition_base.powf(2.0) + 0.166666666666667)
                * (output_glwe_dimension + 1.0)
            + (1_f64 / 12.0) * modulus.powf(-2.0)
            + (1_f64 / 2.0)
--- a/tfhe/src/core_crypto/commons/noise_formulas/lwe_programmable_bootstrap_128.rs
+++ b/tfhe/src/core_crypto/commons/noise_formulas/lwe_programmable_bootstrap_128.rs
@@ -0,0 +1,133 @@
+// This file was autogenerated, do not modify by hand.
+use crate::core_crypto::commons::dispersion::Variance;
+use crate::core_crypto::commons::parameters::*;
+
+/// This formula is only valid if the proper noise distributions are used and
+/// if the keys used are encrypted using secure noise given by the
+/// [`minimal_glwe_variance`](`super::secure_noise`)
+/// and [`minimal_lwe_variance`](`super::secure_noise`) family of functions.
+pub fn pbs_128_variance_132_bits_security_gaussian(
+    input_lwe_dimension: LweDimension,
+    output_glwe_dimension: GlweDimension,
+    output_polynomial_size: PolynomialSize,
+    decomposition_base_log: DecompositionBaseLog,
+    decomposition_level_count: DecompositionLevelCount,
+    mantissa_size: f64,
+    modulus: f64,
+) -> Variance {
+    Variance(pbs_128_variance_132_bits_security_gaussian_impl(
+        input_lwe_dimension.0 as f64,
+        output_glwe_dimension.0 as f64,
+        output_polynomial_size.0 as f64,
+        2.0f64.powi(decomposition_base_log.0 as i32),
+        decomposition_level_count.0 as f64,
+        mantissa_size,
+        modulus,
+    ))
+}
+
+/// This formula is only valid if the proper noise distributions are used and
+/// if the keys used are encrypted using secure noise given by the
+/// [`minimal_glwe_variance`](`super::secure_noise`)
+/// and [`minimal_lwe_variance`](`super::secure_noise`) family of functions.
+pub fn pbs_128_variance_132_bits_security_gaussian_impl(
+    input_lwe_dimension: f64,
+    output_glwe_dimension: f64,
+    output_polynomial_size: f64,
+    decomposition_base: f64,
+    decomposition_level_count: f64,
+    mantissa_size: f64,
+    modulus: f64,
+) -> f64 {
+    input_lwe_dimension
+        * (decomposition_level_count
+            * output_polynomial_size
+            * ((4.0 - 2.88539008177793 * modulus.ln()).exp2()
+                + (-0.0497829131652661 * output_glwe_dimension * output_polynomial_size
+                    + 5.31469187675068)
+                    .exp2())
+            * ((1_f64 / 12.0) * decomposition_base.powf(2.0) + 0.166666666666667)
+            * (output_glwe_dimension + 1.0)
+            + (1_f64 / 12.0) * modulus.powf(-2.0)
+            + (1_f64 / 2.0)
+                * output_glwe_dimension
+                * output_polynomial_size
+                * (0.0208333333333333 * modulus.powf(-2.0)
+                    + 0.0416666666666667
+                        * decomposition_base.powf(-2.0 * decomposition_level_count))
+            + (1_f64 / 24.0) * decomposition_base.powf(-2.0 * decomposition_level_count)
+            + 0.16756294607814
+                * (-2.0 * mantissa_size).exp2()
+                * decomposition_base.powf(2.0)
+                * decomposition_level_count
+                * output_polynomial_size.powf(2.0)
+                * (output_glwe_dimension + 1.0))
+}
+
+/// This formula is only valid if the proper noise distributions are used and
+/// if the keys used are encrypted using secure noise given by the
+/// [`minimal_glwe_variance`](`super::secure_noise`)
+/// and [`minimal_lwe_variance`](`super::secure_noise`) family of functions.
+pub fn pbs_128_variance_132_bits_security_tuniform(
+    input_lwe_dimension: LweDimension,
+    output_glwe_dimension: GlweDimension,
+    output_polynomial_size: PolynomialSize,
+    decomposition_base_log: DecompositionBaseLog,
+    decomposition_level_count: DecompositionLevelCount,
+    mantissa_size: f64,
+    modulus: f64,
+) -> Variance {
+    Variance(pbs_128_variance_132_bits_security_tuniform_impl(
+        input_lwe_dimension.0 as f64,
+        output_glwe_dimension.0 as f64,
+        output_polynomial_size.0 as f64,
+        2.0f64.powi(decomposition_base_log.0 as i32),
+        decomposition_level_count.0 as f64,
+        mantissa_size,
+        modulus,
+    ))
+}
+
+/// This formula is only valid if the proper noise distributions are used and
+/// if the keys used are encrypted using secure noise given by the
+/// [`minimal_glwe_variance`](`super::secure_noise`)
+/// and [`minimal_lwe_variance`](`super::secure_noise`) family of functions.
+pub fn pbs_128_variance_132_bits_security_tuniform_impl(
+    input_lwe_dimension: f64,
+    output_glwe_dimension: f64,
+    output_polynomial_size: f64,
+    decomposition_base: f64,
+    decomposition_level_count: f64,
+    mantissa_size: f64,
+    modulus: f64,
+) -> f64 {
+    input_lwe_dimension
+        * (decomposition_level_count
+            * output_polynomial_size
+            * ((4.0 - 2.88539008177793 * modulus.ln()).exp2()
+                + (1_f64 / 3.0)
+                    * modulus.powf(-2.0)
+                    * ((2.0
+                        * (-0.025167785 * output_glwe_dimension * output_polynomial_size
+                            + 1.44269504088896 * modulus.ln()
+                            + 4.10067100000001)
+                            .ceil())
+                    .exp2()
+                        + 0.5))
+            * ((1_f64 / 12.0) * decomposition_base.powf(2.0) + 0.166666666666667)
+            * (output_glwe_dimension + 1.0)
+            + (1_f64 / 12.0) * modulus.powf(-2.0)
+            + (1_f64 / 2.0)
+                * output_glwe_dimension
+                * output_polynomial_size
+                * (0.0208333333333333 * modulus.powf(-2.0)
+                    + 0.0416666666666667
+                        * decomposition_base.powf(-2.0 * decomposition_level_count))
+            + (1_f64 / 24.0) * decomposition_base.powf(-2.0 * decomposition_level_count)
+            + 0.16756294607814
+                * (-2.0 * mantissa_size).exp2()
+                * decomposition_base.powf(2.0)
+                * decomposition_level_count
+                * output_polynomial_size.powf(2.0)
+                * (output_glwe_dimension + 1.0))
+}
--- a/tfhe/src/core_crypto/commons/noise_formulas/mod.rs
+++ b/tfhe/src/core_crypto/commons/noise_formulas/mod.rs
@@ -1,5 +1,9 @@
 // This file was autogenerated, do not modify by hand.
+pub mod generalized_modulus_switch;
 pub mod lwe_keyswitch;
 pub mod lwe_multi_bit_programmable_bootstrap;
+pub mod lwe_packing_keyswitch;
 pub mod lwe_programmable_bootstrap;
+pub mod lwe_programmable_bootstrap_128;
+pub mod modulus_switch;
 pub mod secure_noise;
--- a/tfhe/src/core_crypto/commons/noise_formulas/modulus_switch.rs
+++ b/tfhe/src/core_crypto/commons/noise_formulas/modulus_switch.rs
@@ -0,0 +1,47 @@
+// This file was autogenerated, do not modify by hand.
+use crate::core_crypto::commons::dispersion::Variance;
+use crate::core_crypto::commons::parameters::*;
+
+/// This formula is only valid when going from a larger to a smaller modulus
+pub fn modulus_switch_additive_variance(
+    input_lwe_dimension: LweDimension,
+    modulus: f64,
+    new_modulus: f64,
+) -> Variance {
+    Variance(modulus_switch_additive_variance_impl(
+        input_lwe_dimension.0 as f64,
+        modulus,
+        new_modulus,
+    ))
+}
+
+/// This formula is only valid when going from a larger to a smaller modulus
+pub fn modulus_switch_additive_variance_impl(
+    input_lwe_dimension: f64,
+    modulus: f64,
+    new_modulus: f64,
+) -> f64 {
+    input_lwe_dimension
+        * (0.0208333333333333 * modulus.powf(-2.0) + 0.0416666666666667 * new_modulus.powf(-2.0))
+        + (1_f64 / 6.0) * modulus.powf(-2.0)
+        + (1_f64 / 12.0) * new_modulus.powf(-2.0)
+}
+
+/// This formula is only valid when going from a larger to a smaller modulus
+pub fn modulus_switch_multi_bit_additive_variance(
+    input_lwe_dimension: LweDimension,
+    modulus: f64,
+    new_modulus: f64,
+    grouping_factor: f64,
+) -> Variance {
+    let multi_bit_factor = (2_f64 / grouping_factor) * (2_f64.powf(grouping_factor) - 1_f64)
+        / 2_f64.powf(grouping_factor);
+    Variance(
+        multi_bit_factor
+            * modulus_switch_additive_variance_impl(
+                input_lwe_dimension.0 as f64,
+                modulus,
+                new_modulus,
+            ),
+    )
+}
--- a/tfhe/src/core_crypto/commons/noise_formulas/secure_noise.rs
+++ b/tfhe/src/core_crypto/commons/noise_formulas/secure_noise.rs
@@ -25,7 +25,64 @@ pub fn minimal_variance_for_132_bits_security_gaussian_impl(
    lwe_dimension: f64,
    modulus: f64,
 ) -> f64 {
-    (5.31469187675068 - 0.0497829131652661 * lwe_dimension).exp2() + 16.0 * modulus.powf(-2.0)
+    (4.0 - 2.88539008177793 * modulus.ln()).exp2()
+        + (5.31469187675068 - 0.0497829131652661 * lwe_dimension).exp2()
+}
+
+// pub fn minimal_glwe_variance_for_132_bits_security_tuniform(
+//     glwe_dimension: GlweDimension,
+//     polynomial_size: PolynomialSize,
+//     modulus: f64,
+// ) -> Variance {
+//     let lwe_dimension = glwe_dimension.to_equivalent_lwe_dimension(polynomial_size);
+//     minimal_lwe_variance_for_132_bits_security_tuniform(lwe_dimension, modulus)
+// }
+
+// pub fn minimal_lwe_variance_for_132_bits_security_tuniform(
+//     lwe_dimension: LweDimension,
+//     modulus: f64,
+// ) -> Variance {
+//     Variance(minimal_variance_for_132_bits_security_tuniform_impl(
+//         lwe_dimension.0 as f64,
+//         modulus,
+//     ))
+// }
+
+pub fn minimal_variance_for_132_bits_security_tuniform_impl(
+    lwe_dimension: f64,
+    modulus: f64,
+) -> f64 {
+    (4.0 - 2.88539008177793 * modulus.ln()).exp2()
+        + (1_f64 / 3.0)
+            * modulus.powf(-2.0)
+            * ((2.0
+                * (-0.025167785 * lwe_dimension
+                    + 1.44269504088896 * modulus.ln()
+                    + 4.10067100000001)
+                    .ceil())
+            .exp2()
+                + 0.5)
+}
+
+pub fn minimal_glwe_bound_for_132_bits_security_tuniform(
+    glwe_dimension: GlweDimension,
+    polynomial_size: PolynomialSize,
+    modulus: f64,
+) -> u32 {
+    let lwe_dimension = glwe_dimension.to_equivalent_lwe_dimension(polynomial_size);
+    minimal_lwe_bound_for_132_bits_security_tuniform(lwe_dimension, modulus)
+}
+
+pub fn minimal_lwe_bound_for_132_bits_security_tuniform(
+    lwe_dimension: LweDimension,
+    modulus: f64,
+) -> u32 {
+    minimal_bound_for_132_bits_security_tuniform_impl(lwe_dimension.0 as f64, modulus)
+}
+
+pub fn minimal_bound_for_132_bits_security_tuniform_impl(lwe_dimension: f64, modulus: f64) -> u32 {
+    ((-0.025167785 * lwe_dimension + 1.44269504088896 * modulus.ln() + 4.10067100000001).ceil())
+        as u32
 }

 pub fn minimal_glwe_variance_for_132_bits_security_tuniform(
@@ -47,18 +104,18 @@ pub fn minimal_lwe_variance_for_132_bits_security_tuniform(
    ))
 }

-pub fn minimal_variance_for_132_bits_security_tuniform_impl(
-    lwe_dimension: f64,
-    modulus: f64,
-) -> f64 {
-    (4.0 - 2.88539008177793 * modulus.ln()).exp2()
-        + (1_f64 / 3.0)
-            * modulus.powf(-2.0)
-            * ((2.0
-                * (-0.025167785 * lwe_dimension
-                    + std::f64::consts::LOG2_E * modulus.ln()
-                    + 4.10067100000001)
-                    .ceil())
-            .exp2()
-                + 0.5)
-}
+// pub fn minimal_variance_for_132_bits_security_tuniform_impl(
+//     lwe_dimension: f64,
+//     modulus: f64,
+// ) -> f64 {
+//     (4.0 - 2.88539008177793 * modulus.ln()).exp2()
+//         + (1_f64 / 3.0)
+//             * modulus.powf(-2.0)
+//             * ((2.0
+//                 * (-0.025167785 * lwe_dimension
+//                     + std::f64::consts::LOG2_E * modulus.ln()
+//                     + 4.10067100000001)
+//                     .ceil())
+//             .exp2()
+//                 + 0.5)
+// }
--- a/tfhe/src/core_crypto/gpu/algorithms/lwe_keyswitch.rs
+++ b/tfhe/src/core_crypto/gpu/algorithms/lwe_keyswitch.rs
@@ -1,7 +1,10 @@
 use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
 use crate::core_crypto::gpu::lwe_keyswitch_key::CudaLweKeyswitchKey;
 use crate::core_crypto::gpu::vec::CudaVec;
-use crate::core_crypto::gpu::{keyswitch_async, CudaStreams};
+use crate::core_crypto::gpu::{
+    cuda_modulus_switch_ciphertext_async, cuda_modulus_switch_multi_bit_ciphertext_async,
+    cuda_modulus_switch_multi_bit_ciphertext_u128_async, keyswitch_async, CudaStreams,
+};
 use crate::core_crypto::prelude::UnsignedInteger;

 /// # Safety
@@ -107,3 +110,65 @@ pub fn cuda_keyswitch_lwe_ciphertext<Scalar>(
    }
    streams.synchronize();
 }
+
+pub fn cuda_modulus_switch_ciphertext<Scalar>(
+    output_lwe_ciphertext: &mut CudaLweCiphertextList<Scalar>,
+    log_modulus: u32,
+    streams: &CudaStreams,
+) where
+    Scalar: UnsignedInteger,
+{
+    unsafe {
+        cuda_modulus_switch_ciphertext_async(
+            streams,
+            &mut output_lwe_ciphertext.0.d_vec,
+            log_modulus,
+        );
+    }
+    streams.synchronize();
+}
+
+pub fn cuda_modulus_switch_multi_bit_ciphertext<Scalar>(
+    lwe_array_out: &mut CudaVec<Scalar>,
+    input_lwe_ciphertext: &mut CudaLweCiphertextList<Scalar>,
+    log_modulus: u32,
+    polynomial_size: u32,
+    grouping_factor: u32,
+    streams: &CudaStreams,
+) where
+    Scalar: UnsignedInteger,
+{
+    unsafe {
+        cuda_modulus_switch_multi_bit_ciphertext_async(
+            streams,
+            lwe_array_out,
+            &mut input_lwe_ciphertext.0.d_vec,
+            log_modulus,
+            polynomial_size,
+            grouping_factor,
+        );
+    }
+    streams.synchronize();
+}
+pub fn cuda_modulus_switch_multi_bit_ciphertext_u128<Scalar>(
+    lwe_array_out: &mut CudaVec<Scalar>,
+    input_lwe_ciphertext: &mut CudaLweCiphertextList<Scalar>,
+    log_modulus: u32,
+    polynomial_size: u32,
+    grouping_factor: u32,
+    streams: &CudaStreams,
+) where
+    Scalar: UnsignedInteger,
+{
+    unsafe {
+        cuda_modulus_switch_multi_bit_ciphertext_u128_async(
+            streams,
+            lwe_array_out,
+            &mut input_lwe_ciphertext.0.d_vec,
+            log_modulus,
+            polynomial_size,
+            grouping_factor,
+        );
+    }
+    streams.synchronize();
+}
--- a/tfhe/src/core_crypto/gpu/algorithms/lwe_linear_algebra.rs
+++ b/tfhe/src/core_crypto/gpu/algorithms/lwe_linear_algebra.rs
@@ -5,7 +5,8 @@ use crate::core_crypto::gpu::{
    add_lwe_ciphertext_vector_plaintext_vector_assign_async,
    add_lwe_ciphertext_vector_plaintext_vector_async, mult_lwe_ciphertext_vector_cleartext_vector,
    mult_lwe_ciphertext_vector_cleartext_vector_assign_async,
-    negate_lwe_ciphertext_vector_assign_async, negate_lwe_ciphertext_vector_async, CudaStreams,
+    negate_lwe_ciphertext_vector_assign_async, negate_lwe_ciphertext_vector_async,
+    sub_lwe_ciphertext_vector_plaintext_vector_assign_async, CudaStreams,
 };
 use crate::core_crypto::prelude::UnsignedInteger;

@@ -222,6 +223,29 @@ pub unsafe fn cuda_lwe_ciphertext_plaintext_add_assign_async<Scalar>(
    );
 }

+/// # Safety
+///
+/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must not
+///   be dropped until stream is synchronised
+pub unsafe fn cuda_lwe_ciphertext_plaintext_sub_assign_async<Scalar>(
+    lhs: &mut CudaLweCiphertextList<Scalar>,
+    rhs: &CudaVec<Scalar>,
+    stream: &CudaStreams,
+) where
+    Scalar: UnsignedInteger,
+{
+    let num_samples = lhs.lwe_ciphertext_count().0 as u32;
+    let lwe_dimension = &lhs.lwe_dimension();
+
+    sub_lwe_ciphertext_vector_plaintext_vector_assign_async(
+        stream,
+        &mut lhs.0.d_vec,
+        rhs,
+        *lwe_dimension,
+        num_samples,
+    );
+}
+
 /// # Safety
 ///
 /// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must not
@@ -417,6 +441,19 @@ pub fn cuda_lwe_ciphertext_plaintext_add_assign<Scalar>(
    streams.synchronize();
 }

+pub fn cuda_lwe_ciphertext_plaintext_sub_assign<Scalar>(
+    lhs: &mut CudaLweCiphertextList<Scalar>,
+    rhs: &CudaVec<Scalar>,
+    stream: &CudaStreams,
+) where
+    Scalar: UnsignedInteger,
+{
+    unsafe {
+        cuda_lwe_ciphertext_plaintext_sub_assign_async(lhs, rhs, stream);
+    }
+    stream.synchronize();
+}
+
 pub fn cuda_lwe_ciphertext_negate<Scalar>(
    output: &mut CudaLweCiphertextList<Scalar>,
    input: &CudaLweCiphertextList<Scalar>,
--- a/tfhe/src/core_crypto/gpu/algorithms/lwe_programmable_bootstrapping.rs
+++ b/tfhe/src/core_crypto/gpu/algorithms/lwe_programmable_bootstrapping.rs
@@ -150,6 +150,149 @@ pub unsafe fn cuda_programmable_bootstrap_lwe_ciphertext_async<Scalar>(
    );
 }

+/// # Safety
+///
+/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must not
+///   be dropped until streams is synchronised
+#[allow(clippy::too_many_arguments)]
+pub unsafe fn cuda_programmable_bootstrap_lwe_ciphertext_no_ms_noise_reduction_async<Scalar>(
+    input: &CudaLweCiphertextList<Scalar>,
+    output: &mut CudaLweCiphertextList<Scalar>,
+    accumulator: &CudaGlweCiphertextList<Scalar>,
+    lut_indexes: &CudaVec<Scalar>,
+    output_indexes: &CudaVec<Scalar>,
+    input_indexes: &CudaVec<Scalar>,
+    num_samples: LweCiphertextCount,
+    bsk: &CudaLweBootstrapKey,
+    streams: &CudaStreams,
+) where
+    // CastInto required for PBS modulus switch which returns a usize
+    Scalar: UnsignedTorus + CastInto<usize>,
+{
+    assert_eq!(
+        input.lwe_dimension(),
+        bsk.input_lwe_dimension(),
+        "Mismatched input LweDimension. LweCiphertext input LweDimension {:?}. \
+        FourierLweMultiBitBootstrapKey input LweDimension {:?}.",
+        input.lwe_dimension(),
+        bsk.input_lwe_dimension(),
+    );
+
+    assert_eq!(
+        output.lwe_dimension(),
+        bsk.output_lwe_dimension(),
+        "Mismatched output LweDimension. LweCiphertext output LweDimension {:?}. \
+        FourierLweMultiBitBootstrapKey output LweDimension {:?}.",
+        output.lwe_dimension(),
+        bsk.output_lwe_dimension(),
+    );
+
+    assert_eq!(
+        accumulator.glwe_dimension(),
+        bsk.glwe_dimension(),
+        "Mismatched GlweSize. Accumulator GlweSize {:?}. \
+        FourierLweMultiBitBootstrapKey GlweSize {:?}.",
+        accumulator.glwe_dimension(),
+        bsk.glwe_dimension(),
+    );
+
+    assert_eq!(
+        accumulator.polynomial_size(),
+        bsk.polynomial_size(),
+        "Mismatched PolynomialSize. Accumulator PolynomialSize {:?}. \
+        FourierLweMultiBitBootstrapKey PolynomialSize {:?}.",
+        accumulator.polynomial_size(),
+        bsk.polynomial_size(),
+    );
+
+    assert_eq!(
+        input.ciphertext_modulus(),
+        output.ciphertext_modulus(),
+        "Mismatched CiphertextModulus between input ({:?}) and output ({:?})",
+        input.ciphertext_modulus(),
+        output.ciphertext_modulus(),
+    );
+
+    assert_eq!(
+        input.ciphertext_modulus(),
+        accumulator.ciphertext_modulus(),
+        "Mismatched CiphertextModulus between input ({:?}) and accumulator ({:?})",
+        input.ciphertext_modulus(),
+        accumulator.ciphertext_modulus(),
+    );
+    assert_eq!(
+        streams.gpu_indexes[0],
+        bsk.d_vec.gpu_index(0),
+        "GPU error: first stream is on GPU {}, first bsk pointer is on GPU {}",
+        streams.gpu_indexes[0].get(),
+        bsk.d_vec.gpu_index(0).get(),
+    );
+    assert_eq!(
+        streams.gpu_indexes[0],
+        input.0.d_vec.gpu_index(0),
+        "GPU error: first stream is on GPU {}, first input pointer is on GPU {}",
+        streams.gpu_indexes[0].get(),
+        input.0.d_vec.gpu_index(0).get(),
+    );
+    assert_eq!(
+        streams.gpu_indexes[0],
+        output.0.d_vec.gpu_index(0),
+        "GPU error: first stream is on GPU {}, first output pointer is on GPU {}",
+        streams.gpu_indexes[0].get(),
+        output.0.d_vec.gpu_index(0).get(),
+    );
+    assert_eq!(
+        streams.gpu_indexes[0],
+        accumulator.0.d_vec.gpu_index(0),
+        "GPU error: first stream is on GPU {}, first accumulator pointer is on GPU {}",
+        streams.gpu_indexes[0].get(),
+        accumulator.0.d_vec.gpu_index(0).get(),
+    );
+    assert_eq!(
+        streams.gpu_indexes[0],
+        input_indexes.gpu_index(0),
+        "GPU error: first stream is on GPU {}, first input indexes pointer is on GPU {}",
+        streams.gpu_indexes[0].get(),
+        input_indexes.gpu_index(0).get(),
+    );
+    assert_eq!(
+        streams.gpu_indexes[0],
+        output_indexes.gpu_index(0),
+        "GPU error: first stream is on GPU {}, first output indexes pointer is on GPU {}",
+        streams.gpu_indexes[0].get(),
+        output_indexes.gpu_index(0).get(),
+    );
+    assert_eq!(
+        streams.gpu_indexes[0],
+        lut_indexes.gpu_index(0),
+        "GPU error: first stream is on GPU {}, first lut indexes pointer is on GPU {}",
+        streams.gpu_indexes[0].get(),
+        lut_indexes.gpu_index(0).get(),
+    );
+
+    let lwe_dimension = input.lwe_dimension();
+    let ct_modulus = input.ciphertext_modulus().raw_modulus_float();
+
+    programmable_bootstrap_async(
+        streams,
+        &mut output.0.d_vec,
+        output_indexes,
+        &accumulator.0.d_vec,
+        lut_indexes,
+        &input.0.d_vec,
+        input_indexes,
+        &bsk.d_vec,
+        lwe_dimension,
+        bsk.glwe_dimension(),
+        bsk.polynomial_size(),
+        bsk.decomp_base_log(),
+        bsk.decomp_level_count(),
+        num_samples.0 as u32,
+        None,
+        ct_modulus,
+    );
+}
+
 /// # Safety
 ///
 /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must not
@@ -302,6 +445,36 @@ pub fn cuda_programmable_bootstrap_lwe_ciphertext<Scalar>(
    streams.synchronize();
 }

+#[allow(clippy::too_many_arguments)]
+pub fn cuda_programmable_bootstrap_lwe_ciphertext_no_ms_noise_reduction<Scalar>(
+    input: &CudaLweCiphertextList<Scalar>,
+    output: &mut CudaLweCiphertextList<Scalar>,
+    accumulator: &CudaGlweCiphertextList<Scalar>,
+    lut_indexes: &CudaVec<Scalar>,
+    output_indexes: &CudaVec<Scalar>,
+    input_indexes: &CudaVec<Scalar>,
+    num_samples: LweCiphertextCount,
+    bsk: &CudaLweBootstrapKey,
+    streams: &CudaStreams,
+) where
+    Scalar: UnsignedTorus + CastInto<usize>,
+{
+    unsafe {
+        cuda_programmable_bootstrap_lwe_ciphertext_no_ms_noise_reduction_async(
+            input,
+            output,
+            accumulator,
+            lut_indexes,
+            output_indexes,
+            input_indexes,
+            num_samples,
+            bsk,
+            streams,
+        );
+    }
+    streams.synchronize();
+}
+
 /// Performs a programmable bootstrap (PBS) on a list of 128-bit LWE ciphertexts,
 /// storing the result back into the provided `output` list at matching indices.
 ///
--- a/tfhe/src/core_crypto/gpu/entities/lwe_keyswitch_key.rs
+++ b/tfhe/src/core_crypto/gpu/entities/lwe_keyswitch_key.rs
@@ -3,7 +3,7 @@
 use crate::core_crypto::gpu::vec::CudaVec;
 use crate::core_crypto::gpu::{
    convert_lwe_keyswitch_key_async, CiphertextModulus, CudaStreams, DecompositionBaseLog,
-    DecompositionLevelCount,
+    DecompositionLevelCount, LweDimension,
 };
 use crate::core_crypto::prelude::{
    lwe_keyswitch_key_input_key_element_encrypted_size, LweKeyswitchKeyOwned, LweSize,
@@ -69,4 +69,14 @@ impl<T: UnsignedInteger> CudaLweKeyswitchKey<T> {
    pub(crate) fn decomposition_level_count(&self) -> DecompositionLevelCount {
        self.decomp_level_count
    }
+
+    pub(crate) fn input_key_lwe_dimension(&self) -> LweDimension {
+        self.input_key_lwe_size().to_lwe_dimension()
+    }
+    pub(crate) fn output_key_lwe_dimension(&self) -> LweDimension {
+        self.output_key_lwe_size().to_lwe_dimension()
+    }
+    pub(crate) fn ciphertext_modulus(&self) -> CiphertextModulus<T> {
+        self.ciphertext_modulus
+    }
 }
--- a/tfhe/src/core_crypto/gpu/mod.rs
+++ b/tfhe/src/core_crypto/gpu/mod.rs
@@ -8,9 +8,9 @@ use crate::core_crypto::gpu::lwe_bootstrap_key::{
 };
 use crate::core_crypto::gpu::vec::{CudaVec, GpuIndex};
 use crate::core_crypto::prelude::{
-    CiphertextModulus, DecompositionBaseLog, DecompositionLevelCount, GlweCiphertextCount,
-    GlweDimension, LweBskGroupingFactor, LweCiphertextCount, LweDimension, PolynomialSize,
-    UnsignedInteger,
+    CiphertextModulus, DecompositionBaseLog, DecompositionLevelCount, DispersionParameter,
+    GlweCiphertextCount, GlweDimension, LweBskGroupingFactor, LweCiphertextCount, LweDimension,
+    PolynomialSize, UnsignedInteger,
 };
 pub use algorithms::*;
 pub use entities::*;
@@ -697,6 +697,91 @@ pub unsafe fn cuda_modulus_switch_ciphertext_async<T: UnsignedInteger>(
    );
 }

+#[allow(clippy::too_many_arguments)]
+pub fn cuda_improve_noise_modulus_switch_ciphertext<T: UnsignedInteger>(
+    lwe_array_out: &mut CudaVec<T>,
+    lwe_array_in: &CudaVec<T>,
+    lwe_in_indexes: &CudaVec<T>,
+    lwe_dimension: LweDimension,
+    num_samples: u32,
+    log_modulus: u32,
+    modulus: f64,
+    noise_reduction_key: &CudaModulusSwitchNoiseReductionKey,
+    streams: &CudaStreams,
+) {
+    unsafe {
+        cuda_improve_noise_modulus_switch_64(
+            streams.ptr[0],
+            streams.gpu_indexes[0].get(),
+            lwe_array_out.as_mut_c_ptr(0),
+            lwe_array_in.as_c_ptr(0),
+            lwe_in_indexes.as_c_ptr(0),
+            noise_reduction_key.modulus_switch_zeros.as_c_ptr(0),
+            lwe_dimension.to_lwe_size().0 as u32,
+            num_samples,
+            noise_reduction_key.num_zeros,
+            noise_reduction_key
+                .ms_input_variance
+                .get_modular_variance(modulus)
+                .value,
+            noise_reduction_key.ms_r_sigma_factor.0,
+            noise_reduction_key.ms_bound.0,
+            log_modulus,
+        );
+    }
+    streams.synchronize_one(0);
+}
+
+/// # Safety
+///
+/// [CudaStreams::synchronize] __must__ be called as soon as synchronization is
+/// required
+#[allow(clippy::too_many_arguments)]
+pub unsafe fn cuda_modulus_switch_multi_bit_ciphertext_async<T: UnsignedInteger>(
+    streams: &CudaStreams,
+    lwe_array_out: &mut CudaVec<T>,
+    lwe_array_in: &mut CudaVec<T>,
+    log_modulus: u32,
+    polynomial_size: u32,
+    grouping_factor: u32,
+) {
+    cuda_modulus_switch_multi_bit_64(
+        streams.ptr[0],
+        streams.gpu_indexes[0].get(),
+        lwe_array_out.as_mut_c_ptr(0),
+        lwe_array_in.as_mut_c_ptr(0),
+        lwe_array_in.len() as u32,
+        log_modulus,
+        polynomial_size,
+        grouping_factor,
+    );
+}
+
+/// # Safety
+///
+/// [CudaStreams::synchronize] __must__ be called as soon as synchronization is
+/// required
+#[allow(clippy::too_many_arguments)]
+pub unsafe fn cuda_modulus_switch_multi_bit_ciphertext_u128_async<T: UnsignedInteger>(
+    streams: &CudaStreams,
+    lwe_array_out: &mut CudaVec<T>,
+    lwe_array_in: &mut CudaVec<T>,
+    log_modulus: u32,
+    polynomial_size: u32,
+    grouping_factor: u32,
+) {
+    cuda_modulus_switch_multi_bit_128(
+        streams.ptr[0],
+        streams.gpu_indexes[0].get(),
+        lwe_array_out.as_mut_c_ptr(0),
+        lwe_array_in.as_mut_c_ptr(0),
+        lwe_array_in.len() as u32,
+        log_modulus,
+        polynomial_size,
+        grouping_factor,
+    );
+}
+
 /// Addition of a vector of LWE ciphertexts
 ///
 /// # Safety
@@ -866,6 +951,30 @@ pub unsafe fn add_lwe_ciphertext_vector_plaintext_vector_assign_async<T: Unsigne
    );
 }

+/// Assigned subtraction of a vector of LWE ciphertexts with a vector of plaintexts
+///
+/// # Safety
+///
+/// [CudaStreams::synchronize] __must__ be called as soon as synchronization is
+/// required
+pub unsafe fn sub_lwe_ciphertext_vector_plaintext_vector_assign_async<T: UnsignedInteger>(
+    streams: &CudaStreams,
+    lwe_array_out: &mut CudaVec<T>,
+    plaintext_in: &CudaVec<T>,
+    lwe_dimension: LweDimension,
+    num_samples: u32,
+) {
+    cuda_sub_lwe_ciphertext_vector_plaintext_vector_64(
+        streams.ptr[0],
+        streams.gpu_indexes[0].get(),
+        lwe_array_out.as_mut_c_ptr(0),
+        lwe_array_out.as_mut_c_ptr(0),
+        plaintext_in.as_c_ptr(0),
+        lwe_dimension.0 as u32,
+        num_samples,
+    );
+}
+
 /// Negation of a vector of LWE ciphertexts
 ///
 /// # Safety
--- a/tfhe/src/high_level_api/tests/noise_distribution/encryption_noise.rs
+++ b/tfhe/src/high_level_api/tests/noise_distribution/encryption_noise.rs
@@ -1,3 +1,16 @@
+// use shortint::parameters::compact_public_key_only::p_fail_2_minus_64::ks_pbs::{
+//     V0_11_PARAM_PKE_TO_BIG_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64_ZKV1,
+//     V0_11_PARAM_PKE_TO_SMALL_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64_ZKV1,
+// };
+// use shortint::parameters::key_switching::p_fail_2_minus_64::ks_pbs::{
+//     V0_11_PARAM_KEYSWITCH_PKE_TO_BIG_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64_ZKV1,
+//     V0_11_PARAM_KEYSWITCH_PKE_TO_SMALL_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64_ZKV1,
+// };
+use crate::shortint::parameters::{
+    CompactPublicKeyEncryptionParameters, ShortintKeySwitchingParameters,
+};
+use crate::shortint::ClassicPBSParameters;
+
 use crate::core_crypto::algorithms::lwe_encryption::decrypt_lwe_ciphertext;
 use crate::core_crypto::algorithms::test::noise_distribution::lwe_encryption_noise::lwe_compact_public_key_encryption_expected_variance;
 use crate::core_crypto::commons::dispersion::DispersionParameter;
@@ -11,10 +24,7 @@ use crate::shortint::parameters::test_params::{
    TEST_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
    TEST_PARAM_PKE_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
 };
-use crate::shortint::parameters::{
-    AtomicPatternParameters, ClassicPBSParameters, CompactPublicKeyEncryptionParameters,
-    ShortintKeySwitchingParameters,
-};
+use crate::shortint::parameters::AtomicPatternParameters;
 use crate::{ClientKey, CompactCiphertextList, CompactPublicKey, ConfigBuilder, FheUint2};

 use rayon::prelude::*;
--- a/tfhe/src/integer/gpu/ciphertext/compact_list.rs
+++ b/tfhe/src/integer/gpu/ciphertext/compact_list.rs
@@ -197,7 +197,7 @@ impl CudaCompactCiphertextListExpander {
 // lwe_dimension and lwe_ciphertext_count at runtime, these and other attributes are pre-computed
 // and stored directly in the CudaVec structure.
 pub struct CudaFlattenedVecCompactCiphertextList {
-    d_flattened_vec: CudaVec<u64>,
+    pub(crate) d_flattened_vec: CudaVec<u64>,
    num_lwe_per_compact_list: Vec<u32>,
    pub(crate) data_info: Vec<DataKind>,
    is_boolean: Vec<bool>,
--- a/tfhe/src/integer/gpu/ciphertext/info.rs
+++ b/tfhe/src/integer/gpu/ciphertext/info.rs
@@ -1,6 +1,6 @@
 use crate::shortint::ciphertext::{Degree, NoiseLevel};
 use crate::shortint::parameters::AtomicPatternKind;
-use crate::shortint::{CarryModulus, MessageModulus};
+use crate::shortint::{CarryModulus, MaxNoiseLevel, MessageModulus};

 #[derive(Clone, Copy)]
 pub struct CudaBlockInfo {
@@ -15,6 +15,16 @@ impl CudaBlockInfo {
    pub fn carry_is_empty(&self) -> bool {
        self.degree.get() < self.message_modulus.0
    }
+
+    #[cfg_attr(any(feature = "noise-asserts", test), track_caller)]
+    pub fn set_noise_level(&mut self, noise_level: NoiseLevel, max_noise_level: MaxNoiseLevel) {
+        if cfg!(feature = "noise-asserts") || cfg!(test) {
+            max_noise_level.validate(noise_level).unwrap()
+        } else {
+            let _ = max_noise_level;
+        }
+        self.noise_level = noise_level;
+    }
 }

 #[derive(Clone)]
--- a/tfhe/src/integer/gpu/client_key/radix.rs
+++ b/tfhe/src/integer/gpu/client_key/radix.rs
@@ -28,6 +28,12 @@ impl RadixClientKey {
            panic!("Only the standard atomic pattern is supported on GPU")
        };

+        // assert_eq!(
+        //     cks_params.encryption_key_choice,
+        //     EncryptionKeyChoice::Big,
+        //     "Compression is only compatible with ciphertext in post PBS dimension"
+        // );
+
        assert_eq!(
            self.parameters().encryption_key_choice(),
            EncryptionKeyChoice::Big,
--- a/tfhe/src/integer/gpu/list_compression/server_keys.rs
+++ b/tfhe/src/integer/gpu/list_compression/server_keys.rs
@@ -1,4 +1,5 @@
 use crate::core_crypto::gpu::entities::lwe_packing_keyswitch_key::CudaLwePackingKeyswitchKey;
+use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
 use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
 use crate::core_crypto::gpu::vec::CudaVec;
 use crate::core_crypto::gpu::CudaStreams;
@@ -13,7 +14,7 @@ use crate::integer::gpu::ciphertext::CudaRadixCiphertext;
 use crate::integer::gpu::server_key::CudaBootstrappingKey;
 use crate::integer::gpu::{
    compress_integer_radix_async, cuda_memcpy_async_gpu_to_gpu, decompress_integer_radix_async,
-    get_compression_size_on_gpu, get_decompression_size_on_gpu,
+    extract_glwe_async, get_compression_size_on_gpu, get_decompression_size_on_gpu,
 };
 use crate::shortint::ciphertext::{Degree, NoiseLevel};
 use crate::shortint::parameters::AtomicPatternKind;
@@ -96,6 +97,37 @@ impl Clone for CudaPackedGlweCiphertextList {
    }
 }

+impl CudaPackedGlweCiphertextList {
+    pub fn extract_glwe(
+        &self,
+        glwe_index: usize,
+        streams: &CudaStreams,
+    ) -> CudaGlweCiphertextList<u64> {
+        let mut output_cuda_glwe_list = CudaGlweCiphertextList::new(
+            self.glwe_dimension,
+            self.polynomial_size,
+            GlweCiphertextCount(1),
+            self.ciphertext_modulus,
+            streams,
+        );
+
+        unsafe {
+            extract_glwe_async(
+                streams,
+                &mut output_cuda_glwe_list.0.d_vec,
+                &self.data,
+                glwe_index as u32,
+                self.storage_log_modulus.0 as u32,
+                self.glwe_dimension,
+                self.polynomial_size,
+                self.bodies_count as u32,
+            );
+        }
+        streams.synchronize();
+        output_cuda_glwe_list
+    }
+}
+
 impl CudaCompressionKey {
    pub fn from_compression_key(compression_key: &CompressionKey, streams: &CudaStreams) -> Self {
        Self {
@@ -258,13 +290,13 @@ impl CudaDecompressionKey {
        end_block_index: usize,
        streams: &CudaStreams,
    ) -> Result<CudaRadixCiphertext, crate::Error> {
-        if self.message_modulus.0 != self.carry_modulus.0 {
-            return Err(crate::Error::new(format!(
-                "Tried to unpack values from a list where message modulus \
-                ({:?}) is != carry modulus ({:?}), this is not supported.",
-                self.message_modulus, self.carry_modulus,
-            )));
-        }
+        // if self.message_modulus.0 != self.carry_modulus.0 {
+        //     return Err(crate::Error::new(format!(
+        //         "Tried to unpack values from a list where message modulus \
+        //         ({:?}) is != carry modulus ({:?}), this is not supported.",
+        //         self.message_modulus, self.carry_modulus,
+        //     )));
+        // }

        if end_block_index >= packed_list.bodies_count {
            return Err(crate::Error::new(format!(
--- a/tfhe/src/integer/gpu/mod.rs
+++ b/tfhe/src/integer/gpu/mod.rs
@@ -454,6 +454,43 @@ pub fn get_scalar_mul_integer_radix_kb_size_on_gpu(
    size_tracker
 }

+#[allow(clippy::too_many_arguments)]
+/// # Safety
+///
+/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
+///   is required
+pub unsafe fn unchecked_small_scalar_mul_integer_async(
+    streams: &CudaStreams,
+    lwe_array: &mut CudaRadixCiphertext,
+    small_scalar: u64,
+) {
+    assert_eq!(
+        streams.gpu_indexes[0],
+        lwe_array.d_blocks.0.d_vec.gpu_index(0),
+        "GPU error: all data should reside on the same GPU."
+    );
+    let mut lwe_array_degrees = lwe_array.info.blocks.iter().map(|b| b.degree.0).collect();
+    let mut lwe_array_noise_levels = lwe_array
+        .info
+        .blocks
+        .iter()
+        .map(|b| b.noise_level.0)
+        .collect();
+    let mut cuda_ffi_lwe_array = prepare_cuda_radix_ffi(
+        lwe_array,
+        &mut lwe_array_degrees,
+        &mut lwe_array_noise_levels,
+    );
+
+    cuda_small_scalar_multiplication_integer_64_inplace(
+        streams.ptr.as_ptr(),
+        streams.gpu_indexes_ptr(),
+        streams.len() as u32,
+        &mut cuda_ffi_lwe_array,
+        small_scalar,
+    );
+}
+
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
@@ -727,6 +764,39 @@ pub fn get_decompression_size_on_gpu(
 ///
 /// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
 ///   is required
+pub unsafe fn extract_glwe_async<T: UnsignedInteger>(
+    streams: &CudaStreams,
+    lwe_array_out: &mut CudaVec<T>,
+    glwe_list: &CudaVec<T>,
+    glwe_index: u32,
+    storage_log_modulus: u32,
+    compression_glwe_dimension: GlweDimension,
+    compression_polynomial_size: PolynomialSize,
+    bodies_count: u32,
+) {
+    assert_eq!(
+        streams.gpu_indexes[0],
+        lwe_array_out.gpu_index(0),
+        "GPU error: all data should reside on the same GPU."
+    );
+    assert_eq!(
+        streams.gpu_indexes[0],
+        glwe_list.gpu_index(0),
+        "GPU error: all data should reside on the same GPU."
+    );
+    cuda_integer_extract_glwe_64(
+        streams.ptr.as_ptr(),
+        streams.gpu_indexes_ptr(),
+        lwe_array_out.as_mut_c_ptr(0),
+        glwe_list.as_c_ptr(0),
+        glwe_index,
+        storage_log_modulus,
+        compression_polynomial_size.0 as u32,
+        compression_glwe_dimension.0 as u32,
+        bodies_count,
+    );
+}
+
 pub unsafe fn unchecked_add_integer_radix_assign_async(
    streams: &CudaStreams,
    radix_lwe_left: &mut CudaRadixCiphertext,
--- a/tfhe/src/integer/gpu/server_key/radix/mod.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/mod.rs
@@ -60,6 +60,8 @@ mod vector_find;
 #[cfg(test)]
 mod tests_long_run;
 #[cfg(test)]
+mod tests_noise_distribution;
+#[cfg(test)]
 mod tests_signed;
 #[cfg(test)]
 mod tests_unsigned;
@@ -186,9 +188,12 @@ impl CudaServerKey {
            PBSOrder::BootstrapKeyswitch => self.key_switching_key.output_key_lwe_size(),
        };

+        let mut bits_per_block = self.message_modulus.0.ilog2();
+        if bits_per_block == 0 {
+            bits_per_block = 1;
+        }
        let decomposer =
-            BlockDecomposer::with_block_count(scalar, self.message_modulus.0.ilog2(), num_blocks)
-                .iter_as::<u64>();
+            BlockDecomposer::with_block_count(scalar, bits_per_block, num_blocks).iter_as::<u64>();
        let mut cpu_lwe_list = LweCiphertextList::new(
            0,
            lwe_size,
--- a/tfhe/src/integer/gpu/server_key/radix/scalar_mul.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/scalar_mul.rs
@@ -113,6 +113,10 @@ impl CudaServerKey {
            return;
        }

+        if decomposed_scalar.is_empty() {
+            return;
+        }
+
        match &self.bootstrapping_key {
            CudaBootstrappingKey::Classic(d_bsk) => {
                unchecked_scalar_mul_integer_radix_kb_async(
--- a/tfhe/src/integer/gpu/server_key/radix/tests_noise_distribution/atomic_pattern.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/tests_noise_distribution/atomic_pattern.rs
--- a/tfhe/src/integer/gpu/server_key/radix/tests_noise_distribution/mod.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/tests_noise_distribution/mod.rs
@@ -0,0 +1 @@
+pub mod atomic_pattern;
--- a/tfhe/src/shortint/list_compression/compression.rs
+++ b/tfhe/src/shortint/list_compression/compression.rs
@@ -133,13 +133,13 @@ impl DecompressionKey {
        packed: &CompressedCiphertextList,
        index: usize,
    ) -> Result<Ciphertext, crate::Error> {
-        if packed.message_modulus.0 != packed.carry_modulus.0 {
-            return Err(crate::Error::new(format!(
-                "Tried to unpack values from a list where message modulus \
-                ({:?}) is != carry modulus ({:?}), this is not supported.",
-                packed.message_modulus, packed.carry_modulus,
-            )));
-        }
+        // if packed.message_modulus.0 != packed.carry_modulus.0 {
+        //     return Err(crate::Error::new(format!(
+        //         "Tried to unpack values from a list where message modulus \
+        //         ({:?}) is != carry modulus ({:?}), this is not supported.",
+        //         packed.message_modulus, packed.carry_modulus,
+        //     )));
+        // }

        if index >= packed.count.0 {
            return Err(crate::Error::new(format!(
--- a/tfhe/src/shortint/list_compression/private_key.rs
+++ b/tfhe/src/shortint/list_compression/private_key.rs
@@ -30,14 +30,13 @@ impl ClientKey {
            EncryptionKeyChoice::Big,
            "Compression is only compatible with ciphertext in post PBS dimension"
        );
+        let mut engine = ShortintEngine::new();

-        let post_packing_ks_key = ShortintEngine::with_thread_local_mut(|engine| {
-            allocate_and_generate_new_binary_glwe_secret_key(
-                params.packing_ks_glwe_dimension,
-                params.packing_ks_polynomial_size,
-                &mut engine.secret_generator,
-            )
-        });
+        let post_packing_ks_key = allocate_and_generate_new_binary_glwe_secret_key(
+            params.packing_ks_glwe_dimension,
+            params.packing_ks_polynomial_size,
+            &mut engine.secret_generator,
+        );

        CompressionPrivateKeys {
            post_packing_ks_key,
--- a/tfhe/src/shortint/parameters/compact_public_key_only.rs
+++ b/tfhe/src/shortint/parameters/compact_public_key_only.rs
@@ -8,6 +8,7 @@ use crate::shortint::parameters::{
    ShortintParameterSet, SupportedCompactPkeZkScheme,
 };
 use crate::shortint::{KeySwitchingKeyView, PaddingBit, ShortintEncoding};
+
 use crate::Error;
 use serde::{Deserialize, Serialize};
 use tfhe_versionable::Versionize;
@@ -111,6 +112,9 @@ impl CompactPublicKeyEncryptionParameters {
            padding_bit: PaddingBit::Yes,
        }
    }
+    pub fn set_carry_modulus(&mut self, new_carry_modulus: CarryModulus) {
+        self.carry_modulus = new_carry_modulus;
+    }
 }

 impl TryFrom<ShortintParameterSet> for CompactPublicKeyEncryptionParameters {
--- a/tfhe/src/shortint/parameters/mod.rs
+++ b/tfhe/src/shortint/parameters/mod.rs
@@ -313,6 +313,33 @@ impl PBSParameters {
            Self::MultiBitPBS(params) => params.grouping_factor,
        }
    }
+    pub const fn log2_p_fail(&self) -> f64 {
+        match self {
+            Self::PBS(params) => params.log2_p_fail,
+            Self::MultiBitPBS(params) => params.log2_p_fail,
+        }
+    }
+
+    pub fn set_log2_p_fail(&mut self, new_p_fail: f64) {
+        match self {
+            Self::PBS(params) => params.log2_p_fail = new_p_fail,
+            Self::MultiBitPBS(params) => params.log2_p_fail = new_p_fail,
+        }
+    }
+
+    pub fn set_message_modulus(&mut self, new_message_modulus: MessageModulus) {
+        match self {
+            Self::PBS(params) => params.message_modulus = new_message_modulus,
+            Self::MultiBitPBS(params) => params.message_modulus = new_message_modulus,
+        }
+    }
+
+    pub fn set_carry_modulus(&mut self, new_carry_modulus: CarryModulus) {
+        match self {
+            Self::PBS(params) => params.carry_modulus = new_carry_modulus,
+            Self::MultiBitPBS(params) => params.carry_modulus = new_carry_modulus,
+        }
+    }

    pub const fn is_pbs(&self) -> bool {
        matches!(self, Self::PBS(_))
--- a/tfhe/src/shortint/public_key/compact.rs
+++ b/tfhe/src/shortint/public_key/compact.rs
@@ -314,38 +314,36 @@ impl CompactPublicKey {

        let encryption_noise_distribution = self.parameters.encryption_noise_distribution;

+        let mut engine = ShortintEngine::new();
+
        // No parallelism allowed
        #[cfg(all(feature = "__wasm_api", not(feature = "parallel-wasm-api")))]
        {
            use crate::core_crypto::prelude::encrypt_lwe_compact_ciphertext_list_with_compact_public_key;
-            ShortintEngine::with_thread_local_mut(|engine| {
-                encrypt_lwe_compact_ciphertext_list_with_compact_public_key(
-                    &self.key,
-                    &mut ct_list,
-                    &plaintext_list,
-                    encryption_noise_distribution,
-                    encryption_noise_distribution,
-                    &mut engine.secret_generator,
-                    &mut engine.encryption_generator,
-                );
-            });
+            encrypt_lwe_compact_ciphertext_list_with_compact_public_key(
+                &self.key,
+                &mut ct_list,
+                &plaintext_list,
+                encryption_noise_distribution,
+                encryption_noise_distribution,
+                &mut engine.secret_generator,
+                &mut engine.encryption_generator,
+            );
        }

        // Parallelism allowed
        #[cfg(any(not(feature = "__wasm_api"), feature = "parallel-wasm-api"))]
        {
            use crate::core_crypto::prelude::par_encrypt_lwe_compact_ciphertext_list_with_compact_public_key;
-            ShortintEngine::with_thread_local_mut(|engine| {
-                par_encrypt_lwe_compact_ciphertext_list_with_compact_public_key(
-                    &self.key,
-                    &mut ct_list,
-                    &plaintext_list,
-                    encryption_noise_distribution,
-                    encryption_noise_distribution,
-                    &mut engine.secret_generator,
-                    &mut engine.encryption_generator,
-                );
-            });
+            par_encrypt_lwe_compact_ciphertext_list_with_compact_public_key(
+                &self.key,
+                &mut ct_list,
+                &plaintext_list,
+                encryption_noise_distribution,
+                encryption_noise_distribution,
+                &mut engine.secret_generator,
+                &mut engine.encryption_generator,
+            );
        }

        let message_modulus = self.parameters.message_modulus;
--- a/tfhe/src/shortint/server_key/tests/mod.rs
+++ b/tfhe/src/shortint/server_key/tests/mod.rs
@@ -1,4 +1,5 @@
 pub mod modulus_switch_compression;
+pub mod noise_distribution;
 pub mod noise_level;
 pub mod parameterized_test;
 pub mod parameterized_test_bivariate_pbs_compliant;
--- a/tfhe/src/shortint/server_key/tests/noise_distribution/atomic_pattern.rs
+++ b/tfhe/src/shortint/server_key/tests/noise_distribution/atomic_pattern.rs
--- a/tfhe/src/shortint/server_key/tests/noise_distribution/mod.rs
+++ b/tfhe/src/shortint/server_key/tests/noise_distribution/mod.rs
@@ -0,0 +1,32 @@
+pub mod atomic_pattern;
+
+use crate::core_crypto::commons::dispersion::Variance;
+
+/// Return the [`Variance`] if a [`Ciphertext`](`crate::shortint::Ciphertext`) with input variance
+/// `input_variance` is multiplied by the provided `scalar`.
+pub fn scalar_multiplication_variance(input_variance: Variance, scalar: u64) -> Variance {
+    let multiplicative_factor = scalar.checked_pow(2).unwrap();
+    Variance(input_variance.0 * multiplicative_factor as f64)
+}
+
+pub fn should_use_one_key_per_sample() -> bool {
+    static ONE_KEY_PER_SAMPLE: std::sync::OnceLock<bool> = std::sync::OnceLock::new();
+
+    *ONE_KEY_PER_SAMPLE.get_or_init(|| {
+        std::env::var("NOISE_MEASUREMENT_USE_PER_SAMPLE_KEY").map_or(false, |val| {
+            let val = val.parse::<u32>();
+            val.map_or(false, |val| val != 0)
+        })
+    })
+}
+
+pub fn should_run_long_pfail_tests() -> bool {
+    static LONG_PFAIL_TESTS: std::sync::OnceLock<bool> = std::sync::OnceLock::new();
+
+    *LONG_PFAIL_TESTS.get_or_init(|| {
+        std::env::var("NOISE_MEASUREMENT_LONG_PFAIL_TESTS").map_or(false, |val| {
+            let val = val.parse::<u32>();
+            val.map_or(false, |val| val != 0)
+        })
+    })
+}
Author	SHA1	Message	Date
Guillermo Oyarzun	35b3430404	chore(gpu): fix some pke bugs	2025-06-19 12:22:39 +02:00
Guillermo Oyarzun	a56a8107ab	chore(gpu): reduce the pool size	2025-06-17 13:20:58 +02:00
Guillermo Oyarzun	b322708cbd	chore(gpu): add drift to pke gpu tests	2025-06-16 17:36:11 +02:00
Guillermo Oyarzun	25da5ef721	chore(gpu): add pke gpu tests	2025-06-16 15:17:54 +02:00
Guillermo Oyarzun	43b95aaf7c	chore(gpu): fix input_br variance formula for multi-bit	2025-06-16 12:27:27 +02:00
Guillermo Oyarzun	cfc19d3bd2	chore(gpu): correct ms formula in pbs128	2025-06-16 12:27:27 +02:00
Guillermo Oyarzun	51427fc9ae	chore(gpu): add multiplication and extra logs to pbs128 test	2025-06-16 12:27:26 +02:00
Guillermo Oyarzun	a45baaa3d0	chore(gpu): fixes after rebase	2025-06-16 12:27:26 +02:00
Guillermo Oyarzun	a24adf528b	chore(gpu): use the m128 params	2025-06-16 12:27:26 +02:00
Guillermo Oyarzun	3ddf9bdba6	chore(gpu): fix pbs128 after rebasing	2025-06-16 12:27:25 +02:00
Guillermo Oyarzun	de4902fb9f	chore(gpu): add noise squashing params without using it	2025-06-16 12:27:25 +02:00
Guillermo Oyarzun	b89bca0d13	chore(gpu): fix pbs128 with old params	2025-06-16 12:27:25 +02:00
Guillermo Oyarzun	f0dc0e18ab	chore(gpu): add pbs128	2025-06-16 12:27:24 +02:00
Guillermo Oyarzun	98ca66581e	chore(gpu): fix errors after rebasing	2025-06-16 12:27:24 +02:00
Guillermo Oyarzun	c94ccc3a23	feat(gpu): fix bugs and add extra logs	2025-06-16 12:27:24 +02:00
Guillermo Oyarzun	7e6573a1d2	feat(gpu): fix compression tests after rebase	2025-06-16 12:27:23 +02:00
Guillermo Oyarzun	94ff21b089	feat(gpu): add noise checks with multi-bit pbs	2025-06-16 12:27:23 +02:00
Guillermo Oyarzun	71420f0d92	feat(gpu): add noise checks	2025-06-16 12:27:23 +02:00
Arthur Meyre	d81bd4ebd6	wip: long pfail runs	2025-06-16 12:27:22 +02:00
Nicolas Sarlin	83d1d6a46c	wip: fix packing ks noise formula	2025-06-16 12:27:22 +02:00
Nicolas Sarlin	ac693f97e1	wip: remove check that makes compression pfail test fail	2025-06-16 12:27:22 +02:00
Nicolas Sarlin	d06656cfb4	wip: remove check that makes compression pfail test fail	2025-06-16 12:27:21 +02:00
Nicolas Sarlin	472ea682ae	wip: update noise formulas	2025-06-16 12:27:21 +02:00
Nicolas Sarlin	258524f5e2	chore(zk): add noise tests for zkv1	2025-06-16 12:27:21 +02:00
Nicolas Sarlin	6b98865515	fix: new param naming	2025-06-16 12:27:20 +02:00
Arthur Meyre	e47731b1ee	fix(shortint): fix sample count in pbs128 pfail	2025-06-16 12:27:20 +02:00
Arthur Meyre	d1b9bc676d	test(shortint): add variance check after KS in classic PBS AP	2025-06-16 12:27:19 +02:00
Arthur Meyre	405952e323	test(shortint): add normality check after KS in classic AP	2025-06-16 12:27:19 +02:00
Arthur Meyre	7556a8e05f	fix(test): fix slighlty wrong log message	2025-06-16 12:27:19 +02:00
Arthur Meyre	ebda1426e4	fix(test): fix test parameters for PBS 128 Disable the bound check as our computation disagrees with the RO	2025-06-16 12:27:18 +02:00
Arthur Meyre	da91075b26	test(shortint): add pfail estimation after ms in compression	2025-06-16 12:27:18 +02:00
Arthur Meyre	f95eb2cf2c	wip: re-exported pbs 128 with symbolic mantissa - add pbs 128 params with a mantissa setting - noise with this settings is in line with RO prediction	2025-06-16 12:27:17 +02:00
Arthur Meyre	45da14c7dd	wip: add pbs 128 tests (noise and pfail)	2025-06-16 12:27:16 +02:00
Arthur Meyre	75e03ae800	feat(core): add pbs 128 noise formulas	2025-06-16 12:27:16 +02:00
Arthur Meyre	9976cbe1f2	test(shortint): add pfail measurement for full compression + AP	2025-06-16 12:27:15 +02:00
Arthur Meyre	a9006486e8	test(shortint): slightly change decryption logic to also get the padding bit out	2025-06-16 12:27:14 +02:00
Arthur Meyre	7f8778f178	test(shortint): add pfail check for first part of compression	2025-06-16 12:27:14 +02:00
Arthur Meyre	370e4ae2e6	test(shortint): add first part of the compression AP	2025-06-16 12:27:13 +02:00
Arthur Meyre	b6db9d8ba0	chore: rename helper function to match test usage	2025-06-16 12:27:12 +02:00
Arthur Meyre	54b139b1b5	feat(core): add noise formula for packing keyswitch	2025-06-16 12:27:12 +02:00
Arthur Meyre	d3ccf08f2c	test(shortint): add pfail test for PKE -> KS -> (DP -> KS Compute) -> MS	2025-06-16 12:27:11 +02:00
Arthur Meyre	6f1492766f	refactor(test): also accept PKE -> KS to big -> DP -> KS -> MS	2025-06-16 12:27:10 +02:00
Arthur Meyre	03fa607209	test(shortint): add noise check for PKE -> KS to small key + MS	2025-06-16 12:27:09 +02:00
Arthur Meyre	29ab6c0709	wip: hotfix for borrow mut error, this needs a design pass/reflection	2025-06-16 12:27:09 +02:00
Arthur Meyre	1c0b428cd3	feat: add noise formulas for TUniform 132 bits of security	2025-06-16 12:27:08 +02:00
Arthur Meyre	dab2d39749	test(hl): add noise check for CompactPublicKey encryption w/ TUniform param	2025-06-16 12:27:07 +02:00
Arthur Meyre	ea81ef5d15	test(shortint): add pfail test for the classic AP	2025-06-16 12:27:06 +02:00
Arthur Meyre	57a7a5a084	refactor: prepare code factorization for noise and pfail	2025-06-16 12:27:06 +02:00
Arthur Meyre	9010ded3d5	test: add shortint atomic pattern noise measurement	2025-06-16 12:27:05 +02:00
Arthur Meyre	1ebd2848ad	test(hl): test encryption noise of FheUint ciphertext in HL API	2025-06-16 12:27:04 +02:00
Nicolas Sarlin	6a1a024e6d	chore(zk)!: store inside the pke params the supported zk scheme BREAKING_CHANGE: - Zk for compact PKE now requires dedicated encryption parameters	2025-06-16 12:27:04 +02:00