chore(gpu): add a benchmark for 128-bit multi-bit noise squashing

- Also, remove the lut indexes concept from the 128-bit multi-bit pbs. It's assumed not to exist by the entire backend (as it doesn't for classical PBS). So to keep it here would be a bit error prone.
2026-01-09 14:47:56 -05:00 · 2025-09-04 12:40:32 -03:00
parent b566d78621
commit c78cc2d2e9
13 changed files with 259 additions and 213 deletions
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
@@ -1066,8 +1066,18 @@ template <typename InputTorus> struct int_noise_squashing_lut {
    release_radix_ciphertext_async(streams[0], gpu_indexes[0],
                                   tmp_lwe_before_ks, gpu_memory_allocated);
    for (int i = 0; i < pbs_buffer.size(); i++) {
-      cleanup_cuda_programmable_bootstrap_128(streams[i], gpu_indexes[i],
-                                              &pbs_buffer[i]);
+      switch (params.pbs_type) {
+      case MULTI_BIT:
+        cleanup_cuda_multi_bit_programmable_bootstrap_128(
+            streams[i], gpu_indexes[i], &pbs_buffer[i]);
+        break;
+      case CLASSICAL:
+        cleanup_cuda_programmable_bootstrap_128(streams[i], gpu_indexes[i],
+                                                &pbs_buffer[i]);
+        break;
+      default:
+        PANIC("Cuda error (PBS): unknown PBS type. ")
+      }
      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
    }
    if (lwe_aligned_gather_vec.size() > 0) {
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h
@@ -47,12 +47,11 @@ uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64(
 void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_output_indexes, void const *lut_vector,
-    void const *lut_vector_indexes, void const *lwe_array_in,
-    void const *lwe_input_indexes, void const *bootstrapping_key,
-    int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
-    uint32_t lut_stride);
+    void const *lwe_array_in, void const *lwe_input_indexes,
+    void const *bootstrapping_key, int8_t *mem_ptr, uint32_t lwe_dimension,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t num_many_lut, uint32_t lut_stride);

 void cleanup_cuda_multi_bit_programmable_bootstrap_128(void *stream,
                                                       const uint32_t gpu_index,
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
@@ -347,18 +347,12 @@ void execute_pbs_async(
        auto current_lwe_input_indexes =
            get_variant_element(lwe_input_indexes, i);

-        int gpu_offset =
-            get_gpu_offset(input_lwe_ciphertext_count, i, gpu_count);
-        auto d_lut_vector_indexes =
-            lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);
-
        cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
            streams[i], gpu_indexes[i], current_lwe_array_out,
-            current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
-            current_lwe_array_in, current_lwe_input_indexes,
-            bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
-            polynomial_size, grouping_factor, base_log, level_count,
-            num_inputs_on_gpu, num_many_lut, lut_stride);
+            current_lwe_output_indexes, lut_vec[i], current_lwe_array_in,
+            current_lwe_input_indexes, bootstrapping_keys[i], pbs_buffer[i],
+            lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
+            base_log, level_count, num_inputs_on_gpu, num_many_lut, lut_stride);
      }
      break;
    case CLASSICAL:
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cu
@@ -120,8 +120,8 @@ template <typename InputTorus>
 void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
    void *stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
    InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector,
-    InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in,
-    InputTorus const *lwe_input_indexes, __uint128_t const *bootstrapping_key,
+    InputTorus const *lwe_array_in, InputTorus const *lwe_input_indexes,
+    __uint128_t const *bootstrapping_key,
    pbs_buffer_128<InputTorus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
@@ -131,45 +131,45 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
  case 256:
    host_multi_bit_programmable_bootstrap_128<InputTorus, AmortizedDegree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
-        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
-        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_many_lut, lut_stride);
+        lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
+        bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
+        polynomial_size, grouping_factor, base_log, level_count, num_samples,
+        num_many_lut, lut_stride);
    break;
  case 512:
    host_multi_bit_programmable_bootstrap_128<InputTorus, AmortizedDegree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
-        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
-        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_many_lut, lut_stride);
+        lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
+        bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
+        polynomial_size, grouping_factor, base_log, level_count, num_samples,
+        num_many_lut, lut_stride);
    break;
  case 1024:
    host_multi_bit_programmable_bootstrap_128<InputTorus,
                                              AmortizedDegree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
-        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
-        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_many_lut, lut_stride);
+        lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
+        bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
+        polynomial_size, grouping_factor, base_log, level_count, num_samples,
+        num_many_lut, lut_stride);
    break;
  case 2048:
    host_multi_bit_programmable_bootstrap_128<InputTorus,
                                              AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
-        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
-        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_many_lut, lut_stride);
+        lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
+        bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
+        polynomial_size, grouping_factor, base_log, level_count, num_samples,
+        num_many_lut, lut_stride);
    break;
  case 4096:
    host_multi_bit_programmable_bootstrap_128<InputTorus,
                                              AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
-        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
-        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_many_lut, lut_stride);
+        lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
+        bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
+        polynomial_size, grouping_factor, base_log, level_count, num_samples,
+        num_many_lut, lut_stride);
    break;
  default:
    PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
@@ -182,8 +182,8 @@ template <typename InputTorus>
 void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
    void *stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
    InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector,
-    InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in,
-    InputTorus const *lwe_input_indexes, __uint128_t const *bootstrapping_key,
+    InputTorus const *lwe_array_in, InputTorus const *lwe_input_indexes,
+    __uint128_t const *bootstrapping_key,
    pbs_buffer_128<InputTorus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
@@ -194,46 +194,46 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
    host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
                                                 AmortizedDegree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
-        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
-        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_many_lut, lut_stride);
+        lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
+        bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
+        polynomial_size, grouping_factor, base_log, level_count, num_samples,
+        num_many_lut, lut_stride);
    break;
  case 512:
    host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
                                                 AmortizedDegree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
-        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
-        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_many_lut, lut_stride);
+        lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
+        bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
+        polynomial_size, grouping_factor, base_log, level_count, num_samples,
+        num_many_lut, lut_stride);
    break;
  case 1024:
    host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
                                                 AmortizedDegree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
-        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
-        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_many_lut, lut_stride);
+        lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
+        bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
+        polynomial_size, grouping_factor, base_log, level_count, num_samples,
+        num_many_lut, lut_stride);
    break;
  case 2048:
    host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
                                                 AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
-        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
-        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_many_lut, lut_stride);
+        lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
+        bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
+        polynomial_size, grouping_factor, base_log, level_count, num_samples,
+        num_many_lut, lut_stride);
    break;
  case 4096:
    host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
                                                 AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
-        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
-        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_many_lut, lut_stride);
+        lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
+        bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
+        polynomial_size, grouping_factor, base_log, level_count, num_samples,
+        num_many_lut, lut_stride);
    break;
  default:
    PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
@@ -245,12 +245,11 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
 void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_output_indexes, void const *lut_vector,
-    void const *lut_vector_indexes, void const *lwe_array_in,
-    void const *lwe_input_indexes, void const *bootstrapping_key,
-    int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
-    uint32_t lut_stride) {
+    void const *lwe_array_in, void const *lwe_input_indexes,
+    void const *bootstrapping_key, int8_t *mem_ptr, uint32_t lwe_dimension,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t num_many_lut, uint32_t lut_stride) {

  if (base_log > 64)
    PANIC("Cuda error (multi-bit PBS): base log should be <= 64")
@@ -263,7 +262,6 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
        uint64_t>(stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
                  static_cast<const uint64_t *>(lwe_output_indexes),
                  static_cast<const __uint128_t *>(lut_vector),
-                  static_cast<const uint64_t *>(lut_vector_indexes),
                  static_cast<const uint64_t *>(lwe_array_in),
                  static_cast<const uint64_t *>(lwe_input_indexes),
                  static_cast<const __uint128_t *>(bootstrapping_key), buffer,
@@ -276,7 +274,6 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
        stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
        static_cast<const uint64_t *>(lwe_output_indexes),
        static_cast<const __uint128_t *>(lut_vector),
-        static_cast<const uint64_t *>(lut_vector_indexes),
        static_cast<const uint64_t *>(lwe_array_in),
        static_cast<const uint64_t *>(lwe_input_indexes),
        static_cast<const __uint128_t *>(bootstrapping_key), buffer,
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cuh
@@ -136,7 +136,6 @@ __global__ void __launch_bounds__(params::degree / params::opt)
        const InputTorus *__restrict__ lwe_array_in,
        const InputTorus *__restrict__ lwe_input_indexes,
        const __uint128_t *__restrict__ lut_vector,
-        const InputTorus *__restrict__ lut_vector_indexes,
        __uint128_t *global_accumulator, double *global_accumulator_fft,
        uint32_t lwe_dimension, uint32_t glwe_dimension,
        uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
@@ -169,8 +168,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
  auto block_lwe_array_in =
      &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];

-  auto block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.x] *
-                                      params::degree * (glwe_dimension + 1)];
+  auto block_lut_vector = lut_vector;

  auto global_slice =
      &global_accumulator[(blockIdx.y + blockIdx.x * (glwe_dimension + 1)) *
@@ -368,7 +366,6 @@ __global__ void __launch_bounds__(params::degree / params::opt)
        __uint128_t *lwe_array_out,
        const InputTorus *__restrict__ lwe_output_indexes,
        const __uint128_t *__restrict__ lut_vector,
-        const InputTorus *__restrict__ lut_vector_indexes,
        const InputTorus *__restrict__ lwe_array_in,
        const InputTorus *__restrict__ lwe_input_indexes,
        const double *__restrict__ keybundle_array, double *join_buffer,
@@ -409,8 +406,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
  auto block_lwe_array_in =
      &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];

-  auto block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.x] *
-                                      params::degree * (glwe_dimension + 1)];
+  auto block_lut_vector = lut_vector;

  auto block_join_buffer =
      &join_buffer[blockIdx.x * level_count * (glwe_dimension + 1) *
@@ -591,8 +587,7 @@ __host__ void execute_compute_keybundle_128(
 template <typename InputTorus, class params, bool is_first_iter>
 __host__ void execute_step_one_128(
    cudaStream_t stream, uint32_t gpu_index, __uint128_t const *lut_vector,
-    InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in,
-    InputTorus const *lwe_input_indexes,
+    InputTorus const *lwe_array_in, InputTorus const *lwe_input_indexes,
    pbs_buffer_128<InputTorus, MULTI_BIT> *buffer, uint32_t num_samples,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t base_log, uint32_t level_count) {
@@ -618,27 +613,26 @@ __host__ void execute_step_one_128(
    device_multi_bit_programmable_bootstrap_accumulate_step_one_128<
        InputTorus, params, NOSM, is_first_iter>
        <<<grid_accumulate_step_one, thds, 0, stream>>>(
-            lwe_array_in, lwe_input_indexes, lut_vector, lut_vector_indexes,
-            global_accumulator, global_accumulator_fft, lwe_dimension,
-            glwe_dimension, polynomial_size, base_log, level_count, d_mem,
+            lwe_array_in, lwe_input_indexes, lut_vector, global_accumulator,
+            global_accumulator_fft, lwe_dimension, glwe_dimension,
+            polynomial_size, base_log, level_count, d_mem,
            full_sm_accumulate_step_one);
  else if (max_shared_memory < full_sm_accumulate_step_one)
    device_multi_bit_programmable_bootstrap_accumulate_step_one_128<
        InputTorus, params, PARTIALSM, is_first_iter>
        <<<grid_accumulate_step_one, thds, partial_sm_accumulate_step_one,
           stream>>>(lwe_array_in, lwe_input_indexes, lut_vector,
-                     lut_vector_indexes, global_accumulator,
-                     global_accumulator_fft, lwe_dimension, glwe_dimension,
-                     polynomial_size, base_log, level_count, d_mem,
-                     partial_sm_accumulate_step_one);
+                     global_accumulator, global_accumulator_fft, lwe_dimension,
+                     glwe_dimension, polynomial_size, base_log, level_count,
+                     d_mem, partial_sm_accumulate_step_one);
  else
    device_multi_bit_programmable_bootstrap_accumulate_step_one_128<
        InputTorus, params, FULLSM, is_first_iter>
        <<<grid_accumulate_step_one, thds, full_sm_accumulate_step_one,
           stream>>>(lwe_array_in, lwe_input_indexes, lut_vector,
-                     lut_vector_indexes, global_accumulator,
-                     global_accumulator_fft, lwe_dimension, glwe_dimension,
-                     polynomial_size, base_log, level_count, d_mem, 0);
+                     global_accumulator, global_accumulator_fft, lwe_dimension,
+                     glwe_dimension, polynomial_size, base_log, level_count,
+                     d_mem, 0);
  check_cuda_error(cudaGetLastError());
 }

@@ -691,8 +685,8 @@ template <typename InputTorus, class params>
 __host__ void host_multi_bit_programmable_bootstrap_128(
    cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
    InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector,
-    InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in,
-    InputTorus const *lwe_input_indexes, __uint128_t const *bootstrapping_key,
+    InputTorus const *lwe_array_in, InputTorus const *lwe_input_indexes,
+    __uint128_t const *bootstrapping_key,
    pbs_buffer_128<InputTorus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
@@ -717,14 +711,14 @@ __host__ void host_multi_bit_programmable_bootstrap_128(
          (j + lwe_offset) + 1 == (lwe_dimension / grouping_factor);
      if (is_first_iter) {
        execute_step_one_128<InputTorus, params, true>(
-            stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
-            lwe_input_indexes, buffer, num_samples, lwe_dimension,
-            glwe_dimension, polynomial_size, base_log, level_count);
+            stream, gpu_index, lut_vector, lwe_array_in, lwe_input_indexes,
+            buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
+            base_log, level_count);
      } else {
        execute_step_one_128<InputTorus, params, false>(
-            stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
-            lwe_input_indexes, buffer, num_samples, lwe_dimension,
-            glwe_dimension, polynomial_size, base_log, level_count);
+            stream, gpu_index, lut_vector, lwe_array_in, lwe_input_indexes,
+            buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
+            base_log, level_count);
      }

      if (is_last_iter) {
@@ -745,9 +739,8 @@ __host__ void host_multi_bit_programmable_bootstrap_128(
 template <typename InputTorus, class params>
 __host__ void execute_cg_external_product_loop_128(
    cudaStream_t stream, uint32_t gpu_index, __uint128_t const *lut_vector,
-    InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in,
-    InputTorus const *lwe_input_indexes, __uint128_t *lwe_array_out,
-    InputTorus const *lwe_output_indexes,
+    InputTorus const *lwe_array_in, InputTorus const *lwe_input_indexes,
+    __uint128_t *lwe_array_out, InputTorus const *lwe_output_indexes,
    pbs_buffer_128<InputTorus, MULTI_BIT> *buffer, uint32_t num_samples,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
@@ -780,46 +773,45 @@ __host__ void execute_cg_external_product_loop_128(
  auto global_accumulator = buffer->global_accumulator;
  auto join_buffer = buffer->global_join_buffer;

-  void *kernel_args[22];
+  void *kernel_args[21];
  kernel_args[0] = &lwe_array_out;
  kernel_args[1] = &lwe_output_indexes;
  kernel_args[2] = &lut_vector;
-  kernel_args[3] = &lut_vector_indexes;
-  kernel_args[4] = &lwe_array_in;
-  kernel_args[5] = &lwe_input_indexes;
-  kernel_args[6] = &keybundle_fft;
-  kernel_args[7] = &join_buffer;
-  kernel_args[8] = &global_accumulator;
-  kernel_args[9] = &lwe_dimension;
-  kernel_args[10] = &glwe_dimension;
-  kernel_args[11] = &polynomial_size;
-  kernel_args[12] = &base_log;
-  kernel_args[13] = &level_count;
-  kernel_args[14] = &grouping_factor;
-  kernel_args[15] = &lwe_offset;
-  kernel_args[16] = &chunk_size;
-  kernel_args[17] = &keybundle_size_per_input;
-  kernel_args[18] = &d_mem;
-  kernel_args[20] = &num_many_lut;
-  kernel_args[21] = &lut_stride;
+  kernel_args[3] = &lwe_array_in;
+  kernel_args[4] = &lwe_input_indexes;
+  kernel_args[5] = &keybundle_fft;
+  kernel_args[6] = &join_buffer;
+  kernel_args[7] = &global_accumulator;
+  kernel_args[8] = &lwe_dimension;
+  kernel_args[9] = &glwe_dimension;
+  kernel_args[10] = &polynomial_size;
+  kernel_args[11] = &base_log;
+  kernel_args[12] = &level_count;
+  kernel_args[13] = &grouping_factor;
+  kernel_args[14] = &lwe_offset;
+  kernel_args[15] = &chunk_size;
+  kernel_args[16] = &keybundle_size_per_input;
+  kernel_args[17] = &d_mem;
+  kernel_args[19] = &num_many_lut;
+  kernel_args[20] = &lut_stride;

  dim3 grid_accumulate(num_samples, glwe_dimension + 1, level_count);
  dim3 thds(polynomial_size / params::opt, 1, 1);

  if (max_shared_memory < partial_dm) {
-    kernel_args[19] = &full_dm;
+    kernel_args[18] = &full_dm;
    check_cuda_error(cudaLaunchCooperativeKernel(
        (void *)device_multi_bit_programmable_bootstrap_cg_accumulate_128<
            InputTorus, params, NOSM>,
        grid_accumulate, thds, (void **)kernel_args, 0, stream));
  } else if (max_shared_memory < full_dm) {
-    kernel_args[19] = &partial_dm;
+    kernel_args[18] = &partial_dm;
    check_cuda_error(cudaLaunchCooperativeKernel(
        (void *)device_multi_bit_programmable_bootstrap_cg_accumulate_128<
            InputTorus, params, PARTIALSM>,
        grid_accumulate, thds, (void **)kernel_args, partial_sm, stream));
  } else {
-    kernel_args[19] = &no_dm;
+    kernel_args[18] = &no_dm;
    check_cuda_error(cudaLaunchCooperativeKernel(
        (void *)device_multi_bit_programmable_bootstrap_cg_accumulate_128<
            InputTorus, params, FULLSM>,
@@ -831,8 +823,8 @@ template <typename InputTorus, class params>
 __host__ void host_cg_multi_bit_programmable_bootstrap_128(
    cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
    InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector,
-    InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in,
-    InputTorus const *lwe_input_indexes, __uint128_t const *bootstrapping_key,
+    InputTorus const *lwe_array_in, InputTorus const *lwe_input_indexes,
+    __uint128_t const *bootstrapping_key,
    pbs_buffer_128<InputTorus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
@@ -851,11 +843,10 @@ __host__ void host_cg_multi_bit_programmable_bootstrap_128(

    // Accumulate
    execute_cg_external_product_loop_128<InputTorus, params>(
-        stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
-        lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
-        num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-        grouping_factor, base_log, level_count, lwe_offset, num_many_lut,
-        lut_stride);
+        stream, gpu_index, lut_vector, lwe_array_in, lwe_input_indexes,
+        lwe_array_out, lwe_output_indexes, buffer, num_samples, lwe_dimension,
+        glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
+        lwe_offset, num_many_lut, lut_stride);
  }
 }

--- a/backends/tfhe-cuda-backend/src/bindings.rs
+++ b/backends/tfhe-cuda-backend/src/bindings.rs
@@ -2698,7 +2698,6 @@ unsafe extern "C" {
        lwe_array_out: *mut ffi::c_void,
        lwe_output_indexes: *const ffi::c_void,
        lut_vector: *const ffi::c_void,
-        lut_vector_indexes: *const ffi::c_void,
        lwe_array_in: *const ffi::c_void,
        lwe_input_indexes: *const ffi::c_void,
        bootstrapping_key: *const ffi::c_void,
--- a/tfhe-benchmark/benches/high_level_api/noise_squash.rs
+++ b/tfhe-benchmark/benches/high_level_api/noise_squash.rs
@@ -1,8 +1,12 @@
+#[cfg(all(not(feature = "hpu"), not(feature = "gpu")))]
+use benchmark::params_aliases::BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
+
 #[cfg(feature = "gpu")]
 use benchmark::params_aliases::{
    BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
    BENCH_COMP_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
    BENCH_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
+    BENCH_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
    BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
 };
 #[cfg(not(feature = "gpu"))]
@@ -25,8 +29,12 @@ use tfhe::prelude::*;

 #[cfg(feature = "gpu")]
 use tfhe::core_crypto::gpu::get_number_of_gpus;
+use tfhe::shortint::parameters::{
+    CompressionParameters, NoiseSquashingCompressionParameters, NoiseSquashingParameters,
+};
+use tfhe::shortint::PBSParameters;
 #[cfg(feature = "gpu")]
-use tfhe::{set_server_key, GpuIndex};
+use tfhe::GpuIndex;
 use tfhe::{
    ClientKey, CompressedCiphertextListBuilder, CompressedServerKey,
    CompressedSquashedNoiseCiphertextListBuilder, FheUint10, FheUint12, FheUint128, FheUint14,
@@ -36,18 +44,42 @@ use tfhe::{

 fn bench_sns_only_fhe_type<FheType>(
    c: &mut Criterion,
-    client_key: &ClientKey,
+    params: (
+        PBSParameters,
+        NoiseSquashingParameters,
+        NoiseSquashingCompressionParameters,
+        CompressionParameters,
+    ),
    type_name: &str,
    num_bits: usize,
 ) where
    FheType: FheEncrypt<u128, ClientKey> + Send + Sync,
    FheType: SquashNoise,
 {
+    let (param, noise_param, _, _) = params;
+
+    use tfhe::{set_server_key, ConfigBuilder};
+    let config = ConfigBuilder::with_custom_parameters(param)
+        .enable_noise_squashing(noise_param)
+        .build();
+    let client_key = ClientKey::generate(config);
+    let compressed_sks = CompressedServerKey::new(&client_key);
+
+    #[cfg(feature = "gpu")]
+    set_server_key(compressed_sks.decompress_to_gpu());
+
+    #[cfg(all(not(feature = "hpu"), not(feature = "gpu")))]
+    {
+        let decompressed_sks = compressed_sks.decompress();
+        rayon::broadcast(|_| set_server_key(decompressed_sks.clone()));
+        set_server_key(decompressed_sks);
+    }
+
    let mut bench_group = c.benchmark_group(type_name);
    let bench_id_prefix = if cfg!(feature = "gpu") {
-        "hlapi::cuda"
+        format!("hlapi::cuda::{}", noise_param.name())
    } else {
-        "hlapi"
+        "hlapi".to_string()
    };
    let bench_id_suffix = format!("noise_squash::{type_name}");

@@ -60,9 +92,9 @@ fn bench_sns_only_fhe_type<FheType>(
            bench_id = format!("{bench_id_prefix}::{bench_id_suffix}");

            #[cfg(feature = "gpu")]
-            configure_gpu(client_key);
+            configure_gpu(&client_key);

-            let input = FheType::encrypt(rng.gen(), client_key);
+            let input = FheType::encrypt(rng.gen(), &client_key);

            bench_group.bench_function(&bench_id, |b| {
                b.iter(|| {
@@ -82,7 +114,7 @@ fn bench_sns_only_fhe_type<FheType>(
                bench_group.throughput(Throughput::Elements(elements));
                println!("elements: {elements}");
                let gpu_count = get_number_of_gpus() as usize;
-                let compressed_server_key = CompressedServerKey::new(client_key);
+                let compressed_server_key = CompressedServerKey::new(&client_key);
                let sks_vec = (0..gpu_count)
                    .map(|i| {
                        compressed_server_key.decompress_to_specific_gpu(GpuIndex::new(i as u32))
@@ -92,7 +124,7 @@ fn bench_sns_only_fhe_type<FheType>(
                bench_group.bench_function(&bench_id, |b| {
                    let encrypt_values = || {
                        (0..elements)
-                            .map(|_| FheType::encrypt(rng.gen(), client_key))
+                            .map(|_| FheType::encrypt(rng.gen(), &client_key))
                            .collect::<Vec<_>>()
                    };

@@ -118,7 +150,7 @@ fn bench_sns_only_fhe_type<FheType>(
                bench_group.bench_function(&bench_id, |b| {
                    let encrypt_values = || {
                        (0..elements)
-                            .map(|_| FheType::encrypt(rng.gen(), client_key))
+                            .map(|_| FheType::encrypt(rng.gen(), &client_key))
                            .collect::<Vec<_>>()
                    };

@@ -150,7 +182,12 @@ fn bench_sns_only_fhe_type<FheType>(

 fn bench_decomp_sns_comp_fhe_type<FheType>(
    c: &mut Criterion,
-    client_key: &ClientKey,
+    params: (
+        PBSParameters,
+        NoiseSquashingParameters,
+        NoiseSquashingCompressionParameters,
+        CompressionParameters,
+    ),
    type_name: &str,
    num_bits: usize,
 ) where
@@ -158,11 +195,32 @@ fn bench_decomp_sns_comp_fhe_type<FheType>(
    FheType: SquashNoise + Tagged + HlExpandable + HlCompressible,
    <FheType as SquashNoise>::Output: HlSquashedNoiseCompressible,
 {
+    let (param, noise_param, comp_noise_param, comp_param) = params;
+
+    use tfhe::{set_server_key, ConfigBuilder};
+    let config = ConfigBuilder::with_custom_parameters(param)
+        .enable_noise_squashing(noise_param)
+        .enable_noise_squashing_compression(comp_noise_param)
+        .enable_compression(comp_param)
+        .build();
+    let client_key = ClientKey::generate(config);
+    let compressed_sks = CompressedServerKey::new(&client_key);
+
+    #[cfg(feature = "gpu")]
+    set_server_key(compressed_sks.decompress_to_gpu());
+
+    #[cfg(all(not(feature = "hpu"), not(feature = "gpu")))]
+    {
+        let decompressed_sks = compressed_sks.decompress();
+        rayon::broadcast(|_| set_server_key(decompressed_sks.clone()));
+        set_server_key(decompressed_sks);
+    }
+
    let mut bench_group = c.benchmark_group(type_name);
    let bench_id_prefix = if cfg!(feature = "gpu") {
-        "hlapi::cuda"
+        format!("hlapi::cuda::{}", noise_param.name())
    } else {
-        "hlapi"
+        "hlapi".to_string()
    };
    let bench_id_suffix = format!("decomp_noise_squash_comp::{type_name}");

@@ -175,9 +233,9 @@ fn bench_decomp_sns_comp_fhe_type<FheType>(
            bench_id = format!("{bench_id_prefix}::{bench_id_suffix}");

            #[cfg(feature = "gpu")]
-            configure_gpu(client_key);
+            configure_gpu(&client_key);

-            let input = FheType::encrypt(rng.gen(), client_key);
+            let input = FheType::encrypt(rng.gen(), &client_key);

            let mut builder = CompressedCiphertextListBuilder::new();
            builder.push(input);
@@ -205,7 +263,7 @@ fn bench_decomp_sns_comp_fhe_type<FheType>(
                bench_group.throughput(Throughput::Elements(elements));
                println!("elements: {elements}");
                let gpu_count = get_number_of_gpus() as usize;
-                let compressed_server_key = CompressedServerKey::new(client_key);
+                let compressed_server_key = CompressedServerKey::new(&client_key);
                let sks_vec = (0..gpu_count)
                    .map(|i| {
                        compressed_server_key.decompress_to_specific_gpu(GpuIndex::new(i as u32))
@@ -216,7 +274,7 @@ fn bench_decomp_sns_comp_fhe_type<FheType>(
                    let compressed_values = || {
                        (0..elements)
                            .map(|_| {
-                                let input = FheType::encrypt(rng.gen(), client_key);
+                                let input = FheType::encrypt(rng.gen(), &client_key);
                                let mut builder = CompressedCiphertextListBuilder::new();
                                builder.push(input);
                                builder.build().unwrap()
@@ -254,7 +312,7 @@ fn bench_decomp_sns_comp_fhe_type<FheType>(
                    let compressed_values = || {
                        (0..elements)
                            .map(|_| {
-                                let input = FheType::encrypt(rng.gen(), client_key);
+                                let input = FheType::encrypt(rng.gen(), &client_key);
                                let mut builder = CompressedCiphertextListBuilder::new();
                                builder.push(input);
                                builder.build().unwrap()
@@ -296,8 +354,10 @@ fn bench_decomp_sns_comp_fhe_type<FheType>(
 macro_rules! bench_sns_only_type {
    ($fhe_type:ident) => {
        ::paste::paste! {
-            fn [<bench_sns_only_ $fhe_type:snake>](c: &mut Criterion, cks: &ClientKey) {
-                bench_sns_only_fhe_type::<$fhe_type>(c, cks, stringify!($fhe_type), $fhe_type::num_bits());
+            fn [<bench_sns_only_ $fhe_type:snake>](c: &mut Criterion, params: &[(PBSParameters, NoiseSquashingParameters, NoiseSquashingCompressionParameters, CompressionParameters)]) {
+                for param in params {
+                    bench_sns_only_fhe_type::<$fhe_type>(c, *param, stringify!($fhe_type), $fhe_type::num_bits());
+                }
            }
        }
    };
@@ -306,8 +366,10 @@ macro_rules! bench_sns_only_type {
 macro_rules! bench_decomp_sns_comp_type {
    ($fhe_type:ident) => {
        ::paste::paste! {
-            fn [<bench_decomp_sns_comp_ $fhe_type:snake>](c: &mut Criterion, cks: &ClientKey) {
-                bench_decomp_sns_comp_fhe_type::<$fhe_type>(c, cks, stringify!($fhe_type), $fhe_type::num_bits());
+            fn [<bench_decomp_sns_comp_ $fhe_type:snake>](c: &mut Criterion, params: &[(PBSParameters, NoiseSquashingParameters, NoiseSquashingCompressionParameters, CompressionParameters)]) {
+                for param in params {
+                bench_decomp_sns_comp_fhe_type::<$fhe_type>(c, *param, stringify!($fhe_type), $fhe_type::num_bits());
+    }
            }
        }
    };
@@ -330,65 +392,55 @@ bench_decomp_sns_comp_type!(FheUint64);
 fn main() {
    #[cfg(feature = "hpu")]
    panic!("Noise squashing is not supported on HPU");
-    #[cfg(all(not(feature = "hpu"), not(feature = "gpu")))]
-    let cks = {
-        use benchmark::params_aliases::BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
-        use tfhe::{set_server_key, ConfigBuilder};
-        let config = ConfigBuilder::with_custom_parameters(
-            BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
-        )
-        .enable_noise_squashing(BENCH_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128)
-        .enable_noise_squashing_compression(
-            BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
-        )
-        .enable_compression(BENCH_COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128)
-        .build();
-        let cks = ClientKey::generate(config);
-        let compressed_sks = CompressedServerKey::new(&cks);

-        let decompressed_sks = compressed_sks.decompress();
-        rayon::broadcast(|_| set_server_key(decompressed_sks.clone()));
-        set_server_key(decompressed_sks);
-        cks
-    };
-    #[cfg(feature = "gpu")]
-    let cks = {
-        use tfhe::{set_server_key, ConfigBuilder};
-        let config = ConfigBuilder::with_custom_parameters(
-            BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
-        )
-        .enable_noise_squashing(
-            BENCH_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
-        )
-        .enable_noise_squashing_compression(
-            BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
-        )
-        .enable_compression(
-            BENCH_COMP_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
-        )
-        .build();
-        let cks = ClientKey::generate(config);
-        let compressed_sks = CompressedServerKey::new(&cks);
+    let params: Vec<(
+        PBSParameters,
+        NoiseSquashingParameters,
+        NoiseSquashingCompressionParameters,
+        CompressionParameters,
+    )> = {
+        #[cfg(all(not(feature = "hpu"), not(feature = "gpu")))]
+        {
+            vec![(
+                BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128.into(),
+                BENCH_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
+                BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
+                BENCH_COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
+            )]
+        }

-        set_server_key(compressed_sks.decompress_to_gpu());
-        cks
+        #[cfg(feature = "gpu")]
+        {
+            vec![(
+                     BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128.into(),
+                     BENCH_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
+                     BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
+                     BENCH_COMP_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
+                 ), (
+                     BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128.into(),
+                     BENCH_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
+                     BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
+                     BENCH_COMP_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
+                 ),
+            ]
+        }
    };

    let mut c = Criterion::default().configure_from_args();

-    bench_sns_only_fhe_uint2(&mut c, &cks);
-    bench_sns_only_fhe_uint4(&mut c, &cks);
-    bench_sns_only_fhe_uint6(&mut c, &cks);
-    bench_sns_only_fhe_uint8(&mut c, &cks);
-    bench_sns_only_fhe_uint10(&mut c, &cks);
-    bench_sns_only_fhe_uint12(&mut c, &cks);
-    bench_sns_only_fhe_uint14(&mut c, &cks);
-    bench_sns_only_fhe_uint16(&mut c, &cks);
-    bench_sns_only_fhe_uint32(&mut c, &cks);
-    bench_sns_only_fhe_uint64(&mut c, &cks);
-    bench_sns_only_fhe_uint128(&mut c, &cks);
+    bench_sns_only_fhe_uint2(&mut c, params.as_slice());
+    bench_sns_only_fhe_uint4(&mut c, params.as_slice());
+    bench_sns_only_fhe_uint6(&mut c, params.as_slice());
+    bench_sns_only_fhe_uint8(&mut c, params.as_slice());
+    bench_sns_only_fhe_uint10(&mut c, params.as_slice());
+    bench_sns_only_fhe_uint12(&mut c, params.as_slice());
+    bench_sns_only_fhe_uint14(&mut c, params.as_slice());
+    bench_sns_only_fhe_uint16(&mut c, params.as_slice());
+    bench_sns_only_fhe_uint32(&mut c, params.as_slice());
+    bench_sns_only_fhe_uint64(&mut c, params.as_slice());
+    bench_sns_only_fhe_uint128(&mut c, params.as_slice());

-    bench_decomp_sns_comp_fhe_uint64(&mut c, &cks);
+    bench_decomp_sns_comp_fhe_uint64(&mut c, params.as_slice());

    c.final_summary();
 }
--- a/tfhe-benchmark/src/params_aliases.rs
+++ b/tfhe-benchmark/src/params_aliases.rs
@@ -7,6 +7,7 @@ pub mod shortint_params_aliases {
        ClassicPBSParameters, CompactPublicKeyEncryptionParameters, CompressionParameters,
        MultiBitPBSParameters, NoiseSquashingCompressionParameters, NoiseSquashingParameters,
        ShortintKeySwitchingParameters,
+        NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
    };
    // KS PBS Gaussian
    pub const BENCH_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128: ClassicPBSParameters =
@@ -143,6 +144,10 @@ pub mod shortint_params_aliases {
        NoiseSquashingParameters =
        V1_4_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;

+    pub const BENCH_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128:
+        NoiseSquashingParameters =
+    NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
+
    pub const BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128:
        NoiseSquashingCompressionParameters =
        V1_4_NOISE_SQUASHING_COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
--- a/tfhe/src/core_crypto/gpu/algorithms/test/lwe_multi_bit_programmable_bootstrapping_128.rs
+++ b/tfhe/src/core_crypto/gpu/algorithms/test/lwe_multi_bit_programmable_bootstrapping_128.rs
@@ -140,13 +140,8 @@ fn execute_multibit_bootstrap_u128(
                let d_accumulator =
                    CudaGlweCiphertextList::from_glwe_ciphertext(&accumulator, &stream);

-                let test_vector_indexes: Vec<u64> = vec![0; par_lwe_list.lwe_ciphertext_count().0];
-
-                let mut d_test_vector_indexes =
-                    unsafe { CudaVec::<u64>::new_async(number_of_messages, &stream, 0) };
-                unsafe {
-                    d_test_vector_indexes.copy_from_cpu_async(&test_vector_indexes, &stream, 0)
-                };
+                // We initialize it so cargo won't complain, but we don't use it internally
+                let d_test_vector_indexes = unsafe { CudaVec::<u64>::new_async(1, &stream, 0) };

                let num_blocks = d_lwe_ciphertext_in.lwe_ciphertext_count().0;
                let lwe_indexes_usize: Vec<usize> = (0..num_blocks).collect_vec();
--- a/tfhe/src/core_crypto/gpu/mod.rs
+++ b/tfhe/src/core_crypto/gpu/mod.rs
@@ -399,7 +399,6 @@ pub unsafe fn programmable_bootstrap_multi_bit_async<
            lwe_array_out.as_mut_c_ptr(0),
            output_indexes.as_c_ptr(0),
            test_vector.as_c_ptr(0),
-            test_vector_indexes.as_c_ptr(0),
            lwe_array_in.as_c_ptr(0),
            input_indexes.as_c_ptr(0),
            bootstrapping_key.as_c_ptr(0),
--- a/tfhe/src/high_level_api/tests/noise_squashing.rs
+++ b/tfhe/src/high_level_api/tests/noise_squashing.rs
@@ -4,9 +4,12 @@ use crate::high_level_api::{
 };
 use crate::integer::U256;
 use crate::set_server_key;
+#[cfg(feature = "gpu")]
 use crate::shortint::parameters::{
    NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
    NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
+};
+use crate::shortint::parameters::{
    NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
    PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
 };
--- a/tfhe/src/integer/gpu/server_key/mod.rs
+++ b/tfhe/src/integer/gpu/server_key/mod.rs
@@ -32,8 +32,8 @@ pub enum CudaBootstrappingKey<Scalar: UnsignedInteger> {
 impl<Scalar: UnsignedInteger> CudaBootstrappingKey<Scalar> {
    pub(crate) fn output_lwe_dimension(&self) -> LweDimension {
        match self {
-            CudaBootstrappingKey::Classic(bsk) => bsk.output_lwe_dimension(),
-            CudaBootstrappingKey::MultiBit(mb_bsk) => mb_bsk.output_lwe_dimension(),
+            Self::Classic(bsk) => bsk.output_lwe_dimension(),
+            Self::MultiBit(mb_bsk) => mb_bsk.output_lwe_dimension(),
        }
    }
 }
--- a/tfhe/src/shortint/keycache.rs
+++ b/tfhe/src/shortint/keycache.rs
@@ -487,7 +487,9 @@ fn cpke_params_default_name(params: &CompactPublicKeyEncryptionParameters) -> St
 }

 named_params_impl!( NoiseSquashingParameters =>
-    V1_4_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,V1_4_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
+    V1_4_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
+    V1_4_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
+    V1_4_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
 );

 named_params_impl!( NoiseSquashingCompressionParameters =>