feat(gpu): support keyswitch 64/32 in PBS

Andrei Stoian
2025-10-10 17:25:03 +02:00
parent 78d1ce18c1
commit e43528db71
27 changed files with 1446 additions and 776 deletions
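Context for the per-file diffs below: the change threads a second template parameter, InputTorus, through the classical and multi-bit PBS so that a 64-to-32-bit keyswitch can feed 32-bit LWE ciphertexts into a bootstrap that still accumulates and outputs on 64 bits; the C entry points encode the pair as a _<input>_<output> suffix. A minimal type-level sketch of that convention (the function name here is hypothetical; the real entry points appear in the diffs below):

#include <cstdint>

// Sketch only: the PBS is now parameterized on two torus types.
//   InputTorus - width of the incoming LWE ciphertexts (possibly narrowed
//                by a 64->32 keyswitch)
//   Torus      - width used for the accumulator and the output LWE
template <typename InputTorus, typename Torus>
void pbs_sketch(Torus * /*lwe_out*/, const InputTorus * /*lwe_in*/) {}

// The two classical instantiations this commit exposes:
template void pbs_sketch<uint64_t, uint64_t>(uint64_t *, const uint64_t *); // *_64_64
template void pbs_sketch<uint32_t, uint64_t>(uint64_t *, const uint32_t *); // *_32_64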

View File

@@ -7,23 +7,23 @@ template <typename Torus>
bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
uint32_t polynomial_size, uint32_t max_shared_memory);
template <typename Torus>
template <typename InputTorus, typename Torus>
bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_shared_memory);
#if CUDA_ARCH >= 900
template <typename Torus>
template <typename InputTorus, typename Torus>
uint64_t scratch_cuda_tbc_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
template <typename Torus>
template <typename InputTorus, typename Torus>
void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
@@ -48,7 +48,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_many_lut, uint32_t lut_stride);
template <typename Torus>
template <typename InputTorus, typename Torus>
uint64_t scratch_cuda_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
@@ -96,7 +96,7 @@ template <typename Torus>
uint64_t get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
uint32_t polynomial_size);
template <typename Torus, class params>
template <typename InputTorus, typename Torus, class params>
uint64_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
uint32_t polynomial_size, uint32_t glwe_dimension,
uint32_t level_count, uint64_t full_sm_keybundle);
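Throughout this header the rule is the same: InputTorus appears only where the input LWE ciphertexts are read; LUTs, indexes, bootstrapping keys and outputs stay on the computation torus. A compile-checkable sketch of that convention, with a hypothetical name and body:

#include <cstdint>

template <typename InputTorus, typename Torus>
void pbs_signature_sketch(Torus *lwe_array_out,           // output:  Torus
                          const Torus *lut_vector,        // LUT:     Torus
                          const InputTorus *lwe_array_in, // input:   InputTorus
                          const Torus *lwe_input_indexes, // indexes: Torus
                          uint32_t lwe_dimension) {
  // A real implementation modulus-switches each InputTorus coefficient and
  // accumulates on Torus; elided here.
  (void)lwe_array_out; (void)lut_vector; (void)lwe_array_in;
  (void)lwe_input_indexes; (void)lwe_dimension;
}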

View File

@@ -428,29 +428,29 @@ uint64_t get_buffer_size_programmable_bootstrap_cg(
return buffer_size + buffer_size % sizeof(double2);
}
template <typename Torus>
template <typename InputTorus, typename Torus>
bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t level_count,
uint32_t num_samples,
uint32_t max_shared_memory);
template <typename Torus>
template <typename InputTorus, typename Torus>
void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride);
template <typename Torus>
template <typename InputTorus, typename Torus>
void cuda_programmable_bootstrap_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
@@ -458,11 +458,11 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
uint32_t lut_stride);
#if (CUDA_ARCH >= 900)
template <typename Torus>
template <typename InputTorus, typename Torus>
void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
@@ -477,14 +477,14 @@ uint64_t scratch_cuda_programmable_bootstrap_tbc(
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
#endif
template <typename Torus>
template <typename InputTorus, typename Torus>
uint64_t scratch_cuda_programmable_bootstrap_cg(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
template <typename Torus>
template <typename InputTorus, typename Torus>
uint64_t scratch_cuda_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,

View File

@@ -69,13 +69,19 @@ uint64_t scratch_cuda_programmable_bootstrap_64(
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
uint64_t scratch_cuda_programmable_bootstrap_32_64(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
uint64_t scratch_cuda_programmable_bootstrap_128(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
void cuda_programmable_bootstrap_lwe_ciphertext_vector_64_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
@@ -84,7 +90,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride);
void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
void cuda_programmable_bootstrap_lwe_ciphertext_vector_32_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,

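The C ABI now encodes the input/output widths in the suffix: _64_64 for 64-bit inputs and outputs, _32_64 for 32-bit inputs (keyswitched down from 64 bits, per the commit title) with 64-bit outputs. A hedged selection sketch over the symbol names declared above; argument lists are elided because only the suffix choice is being illustrated:

#include <cstdint>

enum class InputWidth { U32, U64 };

inline const char *classical_pbs_symbol(InputWidth w) {
  switch (w) {
  case InputWidth::U32:
    return "cuda_programmable_bootstrap_lwe_ciphertext_vector_32_64";
  case InputWidth::U64:
    return "cuda_programmable_bootstrap_lwe_ciphertext_vector_64_64";
  }
  return nullptr;
}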
View File

@@ -4,12 +4,12 @@
#include "pbs_enums.h"
#include "stdint.h"
extern "C" {
bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t num_samples, uint32_t max_shared_memory);
extern "C" {
void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
void *stream, uint32_t gpu_index, void *dest, void const *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
@@ -20,6 +20,11 @@ void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_128(
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size, uint32_t grouping_factor);
uint64_t scratch_cuda_multi_bit_programmable_bootstrap_32_64(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
uint64_t scratch_cuda_multi_bit_programmable_bootstrap_64(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
@@ -35,6 +40,16 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride);
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_32_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride);
void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
uint32_t gpu_index,
int8_t **pbs_buffer);
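A hedged call sketch for the new multi-bit 32/64 entry point declared above; every numeric parameter below is an illustrative placeholder, not a validated parameter set:

#include <cstdint>

extern "C" void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_32_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_output_indexes, void const *lut_vector,
    void const *lut_vector_indexes, void const *lwe_array_in,
    void const *lwe_input_indexes, void const *bootstrapping_key,
    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
    uint32_t lut_stride);

// Placeholder call: lwe_array_in points at uint32_t ciphertexts; the LUT,
// key, indexes and output are on 64 bits.
void call_sketch(void *stream, void *out, const void *in, const void *lut,
                 const void *lut_idx, const void *out_idx, const void *in_idx,
                 const void *bsk, int8_t *scratch) {
  cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_32_64(
      stream, /*gpu_index=*/0, out, out_idx, lut, lut_idx, in, in_idx, bsk,
      scratch, /*lwe_dimension=*/742, /*glwe_dimension=*/1,
      /*polynomial_size=*/2048, /*grouping_factor=*/2, /*base_log=*/23,
      /*level_count=*/1, /*num_samples=*/1, /*num_many_lut=*/1,
      /*lut_stride=*/0);
}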

View File

@@ -535,7 +535,6 @@ __host__ void host_gemm_keyswitch_lwe_ciphertext_vector(
dim3 grid_negate(CEIL_DIV(lwe_dimension_out + 1, BLOCK_SIZE_DECOMP),
CEIL_DIV(num_samples, BLOCK_SIZE_DECOMP));
dim3 threads_negate(BLOCK_SIZE_DECOMP, BLOCK_SIZE_DECOMP);
// Negate all outputs in the output LWEs. This is the final step in the GEMM
// keyswitch computed as: -(-b + sum(a_i A_KSK))
keyswitch_negate_with_output_indices<Torus, KSTorus>

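The comment above compresses the sign bookkeeping into one formula: the GEMM accumulates -b + sum(a_i A_KSK), and the final negate kernel turns that into b - sum(a_i A_KSK). A scalar sketch of that identity (decomposition levels omitted; uint64_t arithmetic wraps mod 2^64, so unary negation is well-defined torus negation):

#include <cstddef>
#include <cstdint>

inline uint64_t gemm_keyswitch_output_sketch(uint64_t b, const uint64_t *a,
                                             const uint64_t *ksk_column,
                                             size_t n) {
  uint64_t acc = 0 - b; // the GEMM starts from -b ...
  for (size_t i = 0; i < n; ++i)
    acc += a[i] * ksk_column[i]; // ... and adds sum(a_i * A_KSK)
  return 0 - acc; // final negate: -(-b + sum) == b - sum
}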
View File

@@ -263,42 +263,7 @@ void execute_pbs_async(CudaStreams streams,
uint32_t num_many_lut, uint32_t lut_stride) {
if constexpr (std::is_same_v<OutputTorus, uint32_t>) {
// 32 bits
switch (pbs_type) {
case MULTI_BIT:
PANIC("Error: 32-bit multibit PBS is not supported.\n")
case CLASSICAL:
for (uint i = 0; i < streams.count(); i++) {
int num_inputs_on_gpu = get_num_inputs_on_gpu(
input_lwe_ciphertext_count, i, streams.count());
int gpu_offset =
get_gpu_offset(input_lwe_ciphertext_count, i, streams.count());
auto d_lut_vector_indexes =
lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);
// Use the macro to get the correct elements for the current iteration
// Handles the case when the input/output are scattered through
// different gpus and when it is not
auto current_lwe_array_out = get_variant_element(lwe_array_out, i);
auto current_lwe_output_indexes =
get_variant_element(lwe_output_indexes, i);
auto current_lwe_array_in = get_variant_element(lwe_array_in, i);
auto current_lwe_input_indexes =
get_variant_element(lwe_input_indexes, i);
cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
streams.stream(i), streams.gpu_index(i), current_lwe_array_out,
current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
current_lwe_array_in, current_lwe_input_indexes,
bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
polynomial_size, base_log, level_count, num_inputs_on_gpu,
num_many_lut, lut_stride);
}
break;
default:
PANIC("Error: unsupported cuda PBS type.")
}
PANIC("Error: unsupported 32b CUDA PBS type.")
} else if constexpr (std::is_same_v<OutputTorus, uint64_t>) {
// 64 bits
switch (pbs_type) {
@@ -353,7 +318,7 @@ void execute_pbs_async(CudaStreams streams,
auto d_lut_vector_indexes =
lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);
cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
cuda_programmable_bootstrap_lwe_ciphertext_vector_64_64(
streams.stream(i), streams.gpu_index(i), current_lwe_array_out,
current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
current_lwe_array_in, current_lwe_input_indexes,

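After this change the 32-bit output branch of execute_pbs_async panics unconditionally; only 64-bit outputs reach a kernel launch. A reduced sketch of the if constexpr dispatch, with the launch stubbed out:

#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <type_traits>

template <typename OutputTorus> void execute_pbs_sketch() {
  if constexpr (std::is_same_v<OutputTorus, uint32_t>) {
    // 32-bit outputs are no longer supported.
    std::fprintf(stderr, "Error: unsupported 32b CUDA PBS type.\n");
    std::abort();
  } else if constexpr (std::is_same_v<OutputTorus, uint64_t>) {
    // cuda_programmable_bootstrap_lwe_ciphertext_vector_64_64(...)
  }
}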
View File

@@ -35,12 +35,13 @@ namespace cg = cooperative_groups;
*
* Each y-block computes one element of the lwe_array_out.
*/
template <typename Torus, class params, sharedMemDegree SMD>
template <typename InputTorus, typename Torus, class params,
sharedMemDegree SMD>
__global__ void device_programmable_bootstrap_cg(
Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
const Torus *__restrict__ lut_vector,
const Torus *__restrict__ lut_vector_indexes,
const Torus *__restrict__ lwe_array_in,
const InputTorus *__restrict__ lwe_array_in,
const Torus *__restrict__ lwe_input_indexes,
const double2 *__restrict__ bootstrapping_key, double2 *join_buffer,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
@@ -79,7 +80,7 @@ __global__ void device_programmable_bootstrap_cg(
// The third dimension of the block is used to determine on which ciphertext
// this block is operating, in the case of batch bootstraps
const Torus *block_lwe_array_in =
const InputTorus *block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
const Torus *block_lut_vector =
@@ -95,8 +96,8 @@ __global__ void device_programmable_bootstrap_cg(
// Put "b" in [0, 2N[
constexpr auto log_modulus = params::log2_degree + 1;
Torus b_hat = 0;
Torus correction = 0;
InputTorus b_hat = 0;
InputTorus correction = 0;
if (noise_reduction_type == PBS_MS_REDUCTION_T::CENTERED) {
correction = centered_binary_modulus_switch_body_correction_to_add(
block_lwe_array_in, lwe_dimension, log_modulus);
@@ -113,7 +114,7 @@ __global__ void device_programmable_bootstrap_cg(
__syncthreads();
// Put "a" in [0, 2N[
Torus a_hat = 0;
InputTorus a_hat = 0;
modulus_switch(block_lwe_array_in[i], a_hat, params::log2_degree + 1);
// Perform ACC * (X^â - 1)
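The kernel above modulus-switches each InputTorus coefficient into [0, 2N[ before rotating the accumulator. A minimal sketch of what modulus_switch is expected to compute for a power-of-two target modulus 2N with log_modulus = log2(2N), assuming log_modulus is smaller than the torus width and round-to-nearest:

#include <cstdint>

// Keep the top log_modulus bits of the torus value, rounding to nearest.
// Wraparound on the addition is the correct mod-2N behavior. Works for
// InputTorus = uint32_t or uint64_t.
template <typename InputTorus>
inline void modulus_switch_sketch(InputTorus input, InputTorus &out,
                                  uint32_t log_modulus) {
  constexpr uint32_t width = sizeof(InputTorus) * 8;
  out = (input + (static_cast<InputTorus>(1) << (width - log_modulus - 1))) >>
        (width - log_modulus);
}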
@@ -193,7 +194,7 @@ __global__ void device_programmable_bootstrap_cg(
}
}
template <typename Torus, typename params>
template <typename InputTorus, typename Torus, typename params>
__host__ uint64_t scratch_programmable_bootstrap_cg(
cudaStream_t stream, uint32_t gpu_index,
pbs_buffer<Torus, CLASSICAL> **buffer, uint32_t lwe_dimension,
@@ -210,18 +211,18 @@ __host__ uint64_t scratch_programmable_bootstrap_cg(
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_cg<Torus, params, PARTIALSM>,
device_programmable_bootstrap_cg<InputTorus, Torus, params, PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_cg<Torus, params, PARTIALSM>,
device_programmable_bootstrap_cg<InputTorus, Torus, params, PARTIALSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory >= partial_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_cg<Torus, params, FULLSM>,
device_programmable_bootstrap_cg<InputTorus, Torus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_cg<Torus, params, FULLSM>,
device_programmable_bootstrap_cg<InputTorus, Torus, params, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
@@ -237,11 +238,11 @@ __host__ uint64_t scratch_programmable_bootstrap_cg(
/*
* Host wrapper
*/
template <typename Torus, class params>
template <typename InputTorus, typename Torus, class params>
__host__ void host_programmable_bootstrap_cg(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
@@ -293,26 +294,29 @@ __host__ void host_programmable_bootstrap_cg(
if (max_shared_memory < partial_sm) {
kernel_args[13] = &full_dm;
check_cuda_error(cudaLaunchCooperativeKernel(
(void *)device_programmable_bootstrap_cg<Torus, params, NOSM>, grid,
thds, (void **)kernel_args, 0, stream));
(void *)
device_programmable_bootstrap_cg<InputTorus, Torus, params, NOSM>,
grid, thds, (void **)kernel_args, 0, stream));
} else if (max_shared_memory < full_sm) {
kernel_args[13] = &partial_dm;
check_cuda_error(cudaLaunchCooperativeKernel(
(void *)device_programmable_bootstrap_cg<Torus, params, PARTIALSM>,
(void *)device_programmable_bootstrap_cg<InputTorus, Torus, params,
PARTIALSM>,
grid, thds, (void **)kernel_args, partial_sm, stream));
} else {
int no_dm = 0;
kernel_args[13] = &no_dm;
check_cuda_error(cudaLaunchCooperativeKernel(
(void *)device_programmable_bootstrap_cg<Torus, params, FULLSM>, grid,
thds, (void **)kernel_args, full_sm, stream));
(void *)
device_programmable_bootstrap_cg<InputTorus, Torus, params, FULLSM>,
grid, thds, (void **)kernel_args, full_sm, stream));
}
check_cuda_error(cudaGetLastError());
}
// Verify if the grid size satisfies the cooperative group constraints
template <typename Torus, class params>
template <typename InputTorus, typename Torus, class params>
__host__ bool verify_cuda_programmable_bootstrap_cg_grid_size(
int glwe_dimension, int level_count, int num_samples,
uint32_t max_shared_memory) {
@@ -338,30 +342,34 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_grid_size(
if (max_shared_memory < partial_sm) {
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_programmable_bootstrap_cg<Torus, params, NOSM>, thds, 0);
(void *)
device_programmable_bootstrap_cg<InputTorus, Torus, params, NOSM>,
thds, 0);
} else if (max_shared_memory < full_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_cg<Torus, params, PARTIALSM>,
device_programmable_bootstrap_cg<InputTorus, Torus, params, PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_cg<Torus, params, PARTIALSM>,
device_programmable_bootstrap_cg<InputTorus, Torus, params, PARTIALSM>,
cudaFuncCachePreferShared);
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_programmable_bootstrap_cg<Torus, params, PARTIALSM>,
(void *)device_programmable_bootstrap_cg<InputTorus, Torus, params,
PARTIALSM>,
thds, partial_sm);
check_cuda_error(cudaGetLastError());
} else {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_cg<Torus, params, FULLSM>,
device_programmable_bootstrap_cg<InputTorus, Torus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_cg<Torus, params, FULLSM>,
device_programmable_bootstrap_cg<InputTorus, Torus, params, FULLSM>,
cudaFuncCachePreferShared);
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_programmable_bootstrap_cg<Torus, params, FULLSM>, thds,
full_sm);
(void *)
device_programmable_bootstrap_cg<InputTorus, Torus, params, FULLSM>,
thds, full_sm);
check_cuda_error(cudaGetLastError());
}
@@ -372,39 +380,39 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_grid_size(
}
// Verify if the grid size satisfies the cooperative group constraints
template <typename Torus>
template <typename InputTorus, typename Torus>
__host__ bool supports_cooperative_groups_on_programmable_bootstrap(
int glwe_dimension, int polynomial_size, int level_count, int num_samples,
uint32_t max_shared_memory) {
switch (polynomial_size) {
case 256:
return verify_cuda_programmable_bootstrap_cg_grid_size<
Torus, AmortizedDegree<256>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<256>>(
glwe_dimension, level_count, num_samples, max_shared_memory);
case 512:
return verify_cuda_programmable_bootstrap_cg_grid_size<
Torus, AmortizedDegree<512>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<512>>(
glwe_dimension, level_count, num_samples, max_shared_memory);
case 1024:
return verify_cuda_programmable_bootstrap_cg_grid_size<
Torus, AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<1024>>(
glwe_dimension, level_count, num_samples, max_shared_memory);
case 2048:
return verify_cuda_programmable_bootstrap_cg_grid_size<
Torus, AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<2048>>(
glwe_dimension, level_count, num_samples, max_shared_memory);
case 4096:
return verify_cuda_programmable_bootstrap_cg_grid_size<
Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<4096>>(
glwe_dimension, level_count, num_samples, max_shared_memory);
case 8192:
return verify_cuda_programmable_bootstrap_cg_grid_size<
Torus, AmortizedDegree<8192>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<8192>>(
glwe_dimension, level_count, num_samples, max_shared_memory);
case 16384:
return verify_cuda_programmable_bootstrap_cg_grid_size<
Torus, AmortizedDegree<16384>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<16384>>(
glwe_dimension, level_count, num_samples, max_shared_memory);
default:
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
"Supported N's are powers of two"

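The switch above converts a runtime polynomial size into a compile-time AmortizedDegree<N> template argument. A condensed sketch of the same runtime-to-compile-time dispatch, with a hypothetical leaf standing in for verify_cuda_programmable_bootstrap_cg_grid_size and only a subset of the cases:

#include <cstdint>

template <uint32_t N> struct AmortizedDegreeSketch {
  static constexpr uint32_t degree = N;
};

template <typename InputTorus, typename Torus, class params>
bool grid_size_ok_sketch() {
  return params::degree >= 256; // placeholder check
}

template <typename InputTorus, typename Torus>
bool supports_cg_sketch(uint32_t polynomial_size) {
  switch (polynomial_size) {
  case 256:
    return grid_size_ok_sketch<InputTorus, Torus, AmortizedDegreeSketch<256>>();
  case 1024:
    return grid_size_ok_sketch<InputTorus, Torus,
                               AmortizedDegreeSketch<1024>>();
  case 16384:
    return grid_size_ok_sketch<InputTorus, Torus,
                               AmortizedDegreeSketch<16384>>();
  default:
    return false; // the real code PANICs on unsupported sizes
  }
}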
View File

@@ -18,13 +18,14 @@
#include "types/complex/operations.cuh"
#include <vector>
template <typename Torus, class params, sharedMemDegree SMD>
template <typename InputTorus, typename Torus, class params,
sharedMemDegree SMD>
__global__ void __launch_bounds__(params::degree / params::opt)
device_multi_bit_programmable_bootstrap_cg_accumulate(
Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
const Torus *__restrict__ lut_vector,
const Torus *__restrict__ lut_vector_indexes,
const Torus *__restrict__ lwe_array_in,
const InputTorus *__restrict__ lwe_array_in,
const Torus *__restrict__ lwe_input_indexes,
const double2 *__restrict__ keybundle_array, double2 *join_buffer,
Torus *global_accumulator, uint32_t lwe_dimension,
@@ -60,7 +61,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
// The third dimension of the block is used to determine on which ciphertext
// this block is operating, in the case of batch bootstraps
const Torus *block_lwe_array_in =
const InputTorus *block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
const Torus *block_lut_vector =
@@ -80,7 +81,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
if (lwe_offset == 0) {
// Put "b" in [0, 2N[
Torus b_hat = 0;
InputTorus b_hat = 0;
modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
params::log2_degree + 1);
@@ -208,7 +209,7 @@ uint64_t get_buffer_size_cg_multibit_programmable_bootstrap(
return buffer_size + buffer_size % sizeof(double2);
}
template <typename Torus, typename params>
template <typename InputTorus, typename Torus, typename params>
__host__ uint64_t scratch_cg_multi_bit_programmable_bootstrap(
cudaStream_t stream, uint32_t gpu_index,
pbs_buffer<Torus, MULTI_BIT> **buffer, uint32_t glwe_dimension,
@@ -230,57 +231,59 @@ __host__ uint64_t scratch_cg_multi_bit_programmable_bootstrap(
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
if (max_shared_memory < full_sm_keybundle) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
params, NOSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
params, NOSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
FULLSM>,
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
FULLSM>,
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
params, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
if (max_shared_memory < partial_sm_cg_accumulate) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
NOSM>,
device_multi_bit_programmable_bootstrap_cg_accumulate<InputTorus, Torus,
params, NOSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
NOSM>,
device_multi_bit_programmable_bootstrap_cg_accumulate<InputTorus, Torus,
params, NOSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory < full_sm_cg_accumulate) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
PARTIALSM>,
device_multi_bit_programmable_bootstrap_cg_accumulate<
InputTorus, Torus, params, PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm_cg_accumulate));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
PARTIALSM>,
device_multi_bit_programmable_bootstrap_cg_accumulate<
InputTorus, Torus, params, PARTIALSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
FULLSM>,
device_multi_bit_programmable_bootstrap_cg_accumulate<InputTorus, Torus,
params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_cg_accumulate));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
FULLSM>,
device_multi_bit_programmable_bootstrap_cg_accumulate<InputTorus, Torus,
params, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
auto lwe_chunk_size = get_lwe_chunk_size<InputTorus, Torus, params>(
gpu_index, input_lwe_ciphertext_count, polynomial_size, glwe_dimension,
level_count, full_sm_keybundle);
uint64_t size_tracker = 0;
@@ -291,10 +294,10 @@ __host__ uint64_t scratch_cg_multi_bit_programmable_bootstrap(
return size_tracker;
}
template <typename Torus, class params>
template <typename InputTorus, typename Torus, class params>
__host__ void execute_cg_external_product_loop(
cudaStream_t stream, uint32_t gpu_index, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus *lwe_array_out,
Torus const *lwe_output_indexes, pbs_buffer<Torus, MULTI_BIT> *buffer,
uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension,
@@ -359,28 +362,28 @@ __host__ void execute_cg_external_product_loop(
kernel_args[19] = &full_dm;
check_cuda_error(cudaLaunchCooperativeKernel(
(void *)device_multi_bit_programmable_bootstrap_cg_accumulate<
Torus, params, NOSM>,
InputTorus, Torus, params, NOSM>,
grid_accumulate, thds, (void **)kernel_args, 0, stream));
} else if (max_shared_memory < full_dm) {
kernel_args[19] = &partial_dm;
check_cuda_error(cudaLaunchCooperativeKernel(
(void *)device_multi_bit_programmable_bootstrap_cg_accumulate<
Torus, params, PARTIALSM>,
InputTorus, Torus, params, PARTIALSM>,
grid_accumulate, thds, (void **)kernel_args, partial_sm, stream));
} else {
kernel_args[19] = &no_dm;
check_cuda_error(cudaLaunchCooperativeKernel(
(void *)device_multi_bit_programmable_bootstrap_cg_accumulate<
Torus, params, FULLSM>,
InputTorus, Torus, params, FULLSM>,
grid_accumulate, thds, (void **)kernel_args, full_sm, stream));
}
}
template <typename Torus, class params>
template <typename InputTorus, typename Torus, class params>
__host__ void host_cg_multi_bit_programmable_bootstrap(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
Torus const *lwe_input_indexes, uint64_t const *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
@@ -393,13 +396,13 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
lwe_offset += lwe_chunk_size) {
// Compute a keybundle
execute_compute_keybundle<Torus, params>(
execute_compute_keybundle<InputTorus, Torus, params>(
stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, level_count, lwe_offset);
// Accumulate
execute_cg_external_product_loop<Torus, params>(
execute_cg_external_product_loop<InputTorus, Torus, params>(
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
num_samples, lwe_dimension, glwe_dimension, polynomial_size,
@@ -409,7 +412,7 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
}
// Verify if the grid size satisfies the cooperative group constraints
template <typename Torus, class params>
template <typename InputTorus, typename Torus, class params>
__host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size(
int glwe_dimension, int level_count, int num_samples,
uint32_t max_shared_memory) {
@@ -436,36 +439,36 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size(
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_multi_bit_programmable_bootstrap_cg_accumulate<
Torus, params, NOSM>,
InputTorus, Torus, params, NOSM>,
thds, 0);
} else if (max_shared_memory < full_sm_cg_accumulate) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
PARTIALSM>,
device_multi_bit_programmable_bootstrap_cg_accumulate<
InputTorus, Torus, params, PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm_cg_accumulate));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
PARTIALSM>,
device_multi_bit_programmable_bootstrap_cg_accumulate<
InputTorus, Torus, params, PARTIALSM>,
cudaFuncCachePreferShared);
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_multi_bit_programmable_bootstrap_cg_accumulate<
Torus, params, PARTIALSM>,
InputTorus, Torus, params, PARTIALSM>,
thds, partial_sm_cg_accumulate);
check_cuda_error(cudaGetLastError());
} else {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
FULLSM>,
device_multi_bit_programmable_bootstrap_cg_accumulate<InputTorus, Torus,
params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_cg_accumulate));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
FULLSM>,
device_multi_bit_programmable_bootstrap_cg_accumulate<InputTorus, Torus,
params, FULLSM>,
cudaFuncCachePreferShared);
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_multi_bit_programmable_bootstrap_cg_accumulate<
Torus, params, FULLSM>,
InputTorus, Torus, params, FULLSM>,
thds, full_sm_cg_accumulate);
check_cuda_error(cudaGetLastError());
}
@@ -478,39 +481,39 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size(
// Verify if the grid size for the multi-bit kernel satisfies the cooperative
// group constraints
template <typename Torus>
template <typename InputTorus, typename Torus>
__host__ bool supports_cooperative_groups_on_multibit_programmable_bootstrap(
int glwe_dimension, int polynomial_size, int level_count, int num_samples,
uint32_t max_shared_memory) {
switch (polynomial_size) {
case 256:
return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size<
Torus, AmortizedDegree<256>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<256>>(
glwe_dimension, level_count, num_samples, max_shared_memory);
case 512:
return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size<
Torus, AmortizedDegree<512>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<512>>(
glwe_dimension, level_count, num_samples, max_shared_memory);
case 1024:
return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size<
Torus, AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<1024>>(
glwe_dimension, level_count, num_samples, max_shared_memory);
case 2048:
return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size<
Torus, AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<2048>>(
glwe_dimension, level_count, num_samples, max_shared_memory);
case 4096:
return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size<
Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<4096>>(
glwe_dimension, level_count, num_samples, max_shared_memory);
case 8192:
return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size<
Torus, AmortizedDegree<8192>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<8192>>(
glwe_dimension, level_count, num_samples, max_shared_memory);
case 16384:
return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size<
Torus, AmortizedDegree<16384>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<16384>>(
glwe_dimension, level_count, num_samples, max_shared_memory);
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
"N's are powers of two"

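The scratch path above configures each kernel for one of three shared-memory tiers (NOSM, PARTIALSM, FULLSM) depending on how much dynamic shared memory the device offers, then requests the matching amount at launch. A hedged sketch of that tiering with a stand-in kernel (the real calls target the keybundle and accumulate kernels):

#include <cstdint>
#include <cuda_runtime.h>

__global__ void kernel_sketch(int *out) { *out = 0; }

enum SmTierSketch { NOSM_T, PARTIALSM_T, FULLSM_T };

SmTierSketch configure_sm_sketch(int max_shared_memory, int partial_sm,
                                 int full_sm) {
  if (max_shared_memory < partial_sm)
    return NOSM_T; // everything stays in global memory
  if (max_shared_memory < full_sm) {
    cudaFuncSetAttribute(kernel_sketch,
                         cudaFuncAttributeMaxDynamicSharedMemorySize,
                         partial_sm);
    cudaFuncSetCacheConfig(kernel_sketch, cudaFuncCachePreferShared);
    return PARTIALSM_T;
  }
  cudaFuncSetAttribute(kernel_sketch,
                       cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm);
  cudaFuncSetCacheConfig(kernel_sketch, cudaFuncCachePreferShared);
  return FULLSM_T;
}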
View File

@@ -7,18 +7,19 @@
#include <stdio.h>
template <typename Torus>
template <typename InputTorus, typename Torus>
bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t level_count,
uint32_t num_samples,
uint32_t max_shared_memory) {
return supports_cooperative_groups_on_programmable_bootstrap<Torus>(
return supports_cooperative_groups_on_programmable_bootstrap<InputTorus,
Torus>(
glwe_dimension, polynomial_size, level_count, num_samples,
max_shared_memory);
}
template <typename Torus>
template <typename InputTorus, typename Torus>
bool has_support_to_cuda_programmable_bootstrap_tbc(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_shared_memory) {
@@ -28,38 +29,39 @@ bool has_support_to_cuda_programmable_bootstrap_tbc(
switch (polynomial_size) {
case 256:
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
Torus, AmortizedDegree<256>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<256>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 512:
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
Torus, AmortizedDegree<512>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<512>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 1024:
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
Torus, AmortizedDegree<1024>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<1024>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 2048:
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
Torus, Degree<2048>>(num_samples, glwe_dimension, polynomial_size,
level_count, max_shared_memory);
InputTorus, Torus, Degree<2048>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 4096:
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
Torus, AmortizedDegree<4096>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<4096>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 8192:
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
Torus, AmortizedDegree<8192>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<8192>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 16384:
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
Torus, AmortizedDegree<16384>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<16384>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
default:
PANIC("Cuda error (classical PBS): unsupported polynomial size. Supported "
"N's are powers of two"
@@ -71,7 +73,7 @@ bool has_support_to_cuda_programmable_bootstrap_tbc(
}
#if (CUDA_ARCH >= 900)
template <typename Torus>
template <typename InputTorus, typename Torus>
uint64_t scratch_cuda_programmable_bootstrap_tbc(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
@@ -80,37 +82,43 @@ uint64_t scratch_cuda_programmable_bootstrap_tbc(
switch (polynomial_size) {
case 256:
return scratch_programmable_bootstrap_tbc<Torus, AmortizedDegree<256>>(
return scratch_programmable_bootstrap_tbc<InputTorus, Torus,
AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
case 512:
return scratch_programmable_bootstrap_tbc<Torus, AmortizedDegree<512>>(
return scratch_programmable_bootstrap_tbc<InputTorus, Torus,
AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
case 1024:
return scratch_programmable_bootstrap_tbc<Torus, AmortizedDegree<1024>>(
return scratch_programmable_bootstrap_tbc<InputTorus, Torus,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
case 2048:
return scratch_programmable_bootstrap_tbc<Torus, Degree<2048>>(
return scratch_programmable_bootstrap_tbc<InputTorus, Torus, Degree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
case 4096:
return scratch_programmable_bootstrap_tbc<Torus, AmortizedDegree<4096>>(
return scratch_programmable_bootstrap_tbc<InputTorus, Torus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
case 8192:
return scratch_programmable_bootstrap_tbc<Torus, AmortizedDegree<8192>>(
return scratch_programmable_bootstrap_tbc<InputTorus, Torus,
AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
case 16384:
return scratch_programmable_bootstrap_tbc<Torus, AmortizedDegree<16384>>(
return scratch_programmable_bootstrap_tbc<InputTorus, Torus,
AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
@@ -121,11 +129,11 @@ uint64_t scratch_cuda_programmable_bootstrap_tbc(
}
}
template <typename Torus>
template <typename InputTorus, typename Torus>
void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
@@ -134,7 +142,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
switch (polynomial_size) {
case 256:
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<256>>(
host_programmable_bootstrap_tbc<InputTorus, Torus, AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -142,7 +150,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
num_many_lut, lut_stride);
break;
case 512:
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<512>>(
host_programmable_bootstrap_tbc<InputTorus, Torus, AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -150,7 +158,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
num_many_lut, lut_stride);
break;
case 1024:
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<1024>>(
host_programmable_bootstrap_tbc<InputTorus, Torus, AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -158,7 +166,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
num_many_lut, lut_stride);
break;
case 2048:
host_programmable_bootstrap_tbc<Torus, Degree<2048>>(
host_programmable_bootstrap_tbc<InputTorus, Torus, Degree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -166,7 +174,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
num_many_lut, lut_stride);
break;
case 4096:
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<4096>>(
host_programmable_bootstrap_tbc<InputTorus, Torus, AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -174,7 +182,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
num_many_lut, lut_stride);
break;
case 8192:
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<8192>>(
host_programmable_bootstrap_tbc<InputTorus, Torus, AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -182,7 +190,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
num_many_lut, lut_stride);
break;
case 16384:
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<16384>>(
host_programmable_bootstrap_tbc<InputTorus, Torus, AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -197,7 +205,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
}
#endif
template <typename Torus>
template <typename InputTorus, typename Torus>
uint64_t scratch_cuda_programmable_bootstrap_cg(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
@@ -206,37 +214,44 @@ uint64_t scratch_cuda_programmable_bootstrap_cg(
switch (polynomial_size) {
case 256:
return scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<256>>(
return scratch_programmable_bootstrap_cg<InputTorus, Torus,
AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
case 512:
return scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<512>>(
return scratch_programmable_bootstrap_cg<InputTorus, Torus,
AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
case 1024:
return scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<1024>>(
return scratch_programmable_bootstrap_cg<InputTorus, Torus,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
case 2048:
return scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<2048>>(
return scratch_programmable_bootstrap_cg<InputTorus, Torus,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
case 4096:
return scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<4096>>(
return scratch_programmable_bootstrap_cg<InputTorus, Torus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
case 8192:
return scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<8192>>(
return scratch_programmable_bootstrap_cg<InputTorus, Torus,
AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
case 16384:
return scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<16384>>(
return scratch_programmable_bootstrap_cg<InputTorus, Torus,
AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
@@ -247,7 +262,7 @@ uint64_t scratch_cuda_programmable_bootstrap_cg(
}
}
template <typename Torus>
template <typename InputTorus, typename Torus>
uint64_t scratch_cuda_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
@@ -256,37 +271,44 @@ uint64_t scratch_cuda_programmable_bootstrap(
switch (polynomial_size) {
case 256:
return scratch_programmable_bootstrap<Torus, AmortizedDegree<256>>(
return scratch_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
case 512:
return scratch_programmable_bootstrap<Torus, AmortizedDegree<512>>(
return scratch_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
case 1024:
return scratch_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
return scratch_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
case 2048:
return scratch_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
return scratch_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
case 4096:
return scratch_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
return scratch_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
case 8192:
return scratch_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
return scratch_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
case 16384:
return scratch_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
return scratch_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
@@ -299,11 +321,10 @@ uint64_t scratch_cuda_programmable_bootstrap(
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the classical PBS on 32 bits inputs, into `buffer`. It also
* configures SM options on the GPU in case FULLSM or PARTIALSM mode is going to
* be used.
* the classical PBS on 32-bit inputs with 64-bit outputs, into `buffer`. It
* also configures SM options on the GPU in case FULLSM or PARTIALSM mode is
* going to be used.
*/
uint64_t scratch_cuda_programmable_bootstrap_32(
uint64_t scratch_cuda_programmable_bootstrap_32_64(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
@@ -311,25 +332,25 @@ uint64_t scratch_cuda_programmable_bootstrap_32(
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
#if (CUDA_ARCH >= 900)
if (has_support_to_cuda_programmable_bootstrap_tbc<uint32_t>(
if (has_support_to_cuda_programmable_bootstrap_tbc<uint32_t, uint64_t>(
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
level_count, max_shared_memory))
return scratch_cuda_programmable_bootstrap_tbc<uint32_t>(
stream, gpu_index, (pbs_buffer<uint32_t, CLASSICAL> **)buffer,
return scratch_cuda_programmable_bootstrap_tbc<uint32_t, uint64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)buffer,
lwe_dimension, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
else
#endif
if (has_support_to_cuda_programmable_bootstrap_cg<uint32_t>(
if (has_support_to_cuda_programmable_bootstrap_cg<uint32_t, uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory))
return scratch_cuda_programmable_bootstrap_cg<uint32_t>(
stream, gpu_index, (pbs_buffer<uint32_t, CLASSICAL> **)buffer,
return scratch_cuda_programmable_bootstrap_cg<uint32_t, uint64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)buffer,
lwe_dimension, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
else
return scratch_cuda_programmable_bootstrap<uint32_t>(
stream, gpu_index, (pbs_buffer<uint32_t, CLASSICAL> **)buffer,
return scratch_cuda_programmable_bootstrap<uint32_t, uint64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)buffer,
lwe_dimension, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
}
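The body above fixes the selection order the scratch functions use: thread block clusters on SM 9.0+ when supported, else cooperative groups, else the default kernel. A condensed sketch of that fallback with the support predicates stubbed (the real ones, has_support_to_cuda_programmable_bootstrap_tbc / _cg, query shared memory and grid occupancy):

#include <cstdint>

enum class PbsVariantSketch { TBC, CG, DEFAULT };

bool tbc_supported_sketch() {
#if (CUDA_ARCH >= 900)
  return true; // plus runtime occupancy checks, elided here
#else
  return false;
#endif
}
bool cg_supported_sketch() { return true; } // stub

PbsVariantSketch select_variant_sketch() {
  if (tbc_supported_sketch())
    return PbsVariantSketch::TBC;
  if (cg_supported_sketch())
    return PbsVariantSketch::CG;
  return PbsVariantSketch::DEFAULT;
}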
@@ -347,34 +368,34 @@ uint64_t scratch_cuda_programmable_bootstrap_64(
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
#if (CUDA_ARCH >= 900)
if (has_support_to_cuda_programmable_bootstrap_tbc<uint64_t>(
if (has_support_to_cuda_programmable_bootstrap_tbc<uint64_t, uint64_t>(
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
level_count, max_shared_memory))
return scratch_cuda_programmable_bootstrap_tbc<uint64_t>(
return scratch_cuda_programmable_bootstrap_tbc<uint64_t, uint64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)buffer,
lwe_dimension, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
else
#endif
if (has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
if (has_support_to_cuda_programmable_bootstrap_cg<uint64_t, uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory))
return scratch_cuda_programmable_bootstrap_cg<uint64_t>(
return scratch_cuda_programmable_bootstrap_cg<uint64_t, uint64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)buffer,
lwe_dimension, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
else
return scratch_cuda_programmable_bootstrap<uint64_t>(
return scratch_cuda_programmable_bootstrap<uint64_t, uint64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)buffer,
lwe_dimension, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
}
template <typename Torus>
template <typename InputTorus, typename Torus>
void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
@@ -383,7 +404,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
switch (polynomial_size) {
case 256:
host_programmable_bootstrap_cg<Torus, AmortizedDegree<256>>(
host_programmable_bootstrap_cg<InputTorus, Torus, AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -391,7 +412,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
num_many_lut, lut_stride);
break;
case 512:
host_programmable_bootstrap_cg<Torus, AmortizedDegree<512>>(
host_programmable_bootstrap_cg<InputTorus, Torus, AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -399,7 +420,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
num_many_lut, lut_stride);
break;
case 1024:
host_programmable_bootstrap_cg<Torus, AmortizedDegree<1024>>(
host_programmable_bootstrap_cg<InputTorus, Torus, AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -407,7 +428,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
num_many_lut, lut_stride);
break;
case 2048:
host_programmable_bootstrap_cg<Torus, AmortizedDegree<2048>>(
host_programmable_bootstrap_cg<InputTorus, Torus, AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -415,7 +436,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
num_many_lut, lut_stride);
break;
case 4096:
host_programmable_bootstrap_cg<Torus, AmortizedDegree<4096>>(
host_programmable_bootstrap_cg<InputTorus, Torus, AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -423,7 +444,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
num_many_lut, lut_stride);
break;
case 8192:
host_programmable_bootstrap_cg<Torus, AmortizedDegree<8192>>(
host_programmable_bootstrap_cg<InputTorus, Torus, AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -431,7 +452,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
num_many_lut, lut_stride);
break;
case 16384:
host_programmable_bootstrap_cg<Torus, AmortizedDegree<16384>>(
host_programmable_bootstrap_cg<InputTorus, Torus, AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -445,11 +466,11 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
}
}
template <typename Torus>
template <typename InputTorus, typename Torus>
void cuda_programmable_bootstrap_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
@@ -458,7 +479,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
switch (polynomial_size) {
case 256:
host_programmable_bootstrap<Torus, AmortizedDegree<256>>(
host_programmable_bootstrap<InputTorus, Torus, AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -466,7 +487,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
num_many_lut, lut_stride);
break;
case 512:
host_programmable_bootstrap<Torus, AmortizedDegree<512>>(
host_programmable_bootstrap<InputTorus, Torus, AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -474,7 +495,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
num_many_lut, lut_stride);
break;
case 1024:
host_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
host_programmable_bootstrap<InputTorus, Torus, AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -482,7 +503,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
num_many_lut, lut_stride);
break;
case 2048:
host_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
host_programmable_bootstrap<InputTorus, Torus, AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -490,7 +511,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
num_many_lut, lut_stride);
break;
case 4096:
host_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
host_programmable_bootstrap<InputTorus, Torus, AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -498,7 +519,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
num_many_lut, lut_stride);
break;
case 8192:
host_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
host_programmable_bootstrap<InputTorus, Torus, AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -506,7 +527,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
num_many_lut, lut_stride);
break;
case 16384:
host_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
host_programmable_bootstrap<InputTorus, Torus, AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -522,7 +543,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
/* Perform bootstrapping on a batch of u32 input LWE ciphertexts, producing
 * u64 output LWE ciphertexts.
 */
void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
void cuda_programmable_bootstrap_lwe_ciphertext_vector_32_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
@@ -530,23 +551,24 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride) {
if (base_log > 32)
PANIC("Cuda error (classical PBS): base log should be <= 32")
if (base_log > 64)
PANIC("Cuda error (classical PBS): base log should be <= 64")
pbs_buffer<uint32_t, CLASSICAL> *buffer =
(pbs_buffer<uint32_t, CLASSICAL> *)mem_ptr;
pbs_buffer<uint64_t, CLASSICAL> *buffer =
(pbs_buffer<uint64_t, CLASSICAL> *)mem_ptr;
check_cuda_error(cudaGetLastError());
switch (buffer->pbs_variant) {
case TBC:
#if CUDA_ARCH >= 900
cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t>(
stream, gpu_index, static_cast<uint32_t *>(lwe_array_out),
static_cast<const uint32_t *>(lwe_output_indexes),
static_cast<const uint32_t *>(lut_vector),
static_cast<const uint32_t *>(lut_vector_indexes),
case PBS_VARIANT::TBC:
#if (CUDA_ARCH >= 900)
cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t, uint64_t>(
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const uint64_t *>(lut_vector),
static_cast<const uint64_t *>(lut_vector_indexes),
static_cast<const uint32_t *>(lwe_array_in),
static_cast<const uint32_t *>(lwe_input_indexes),
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_many_lut, lut_stride);
@@ -554,26 +576,26 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
#else
PANIC("Cuda error (PBS): TBC pbs is not supported.")
#endif
case CG:
cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint32_t>(
stream, gpu_index, static_cast<uint32_t *>(lwe_array_out),
static_cast<const uint32_t *>(lwe_output_indexes),
static_cast<const uint32_t *>(lut_vector),
static_cast<const uint32_t *>(lut_vector_indexes),
case PBS_VARIANT::CG:
cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint32_t, uint64_t>(
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const uint64_t *>(lut_vector),
static_cast<const uint64_t *>(lut_vector_indexes),
static_cast<const uint32_t *>(lwe_array_in),
static_cast<const uint32_t *>(lwe_input_indexes),
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_many_lut, lut_stride);
break;
case DEFAULT:
cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
stream, gpu_index, static_cast<uint32_t *>(lwe_array_out),
static_cast<const uint32_t *>(lwe_output_indexes),
static_cast<const uint32_t *>(lut_vector),
static_cast<const uint32_t *>(lut_vector_indexes),
case PBS_VARIANT::DEFAULT:
cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t, uint64_t>(
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const uint64_t *>(lut_vector),
static_cast<const uint64_t *>(lut_vector_indexes),
static_cast<const uint32_t *>(lwe_array_in),
static_cast<const uint32_t *>(lwe_input_indexes),
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_many_lut, lut_stride);
@@ -644,7 +666,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
* - the constant memory (64K) is used for storing the roots of identity
* values for the FFT
*/
void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
void cuda_programmable_bootstrap_lwe_ciphertext_vector_64_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
@@ -663,7 +685,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
switch (buffer->pbs_variant) {
case PBS_VARIANT::TBC:
#if (CUDA_ARCH >= 900)
cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t, uint64_t>(
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const uint64_t *>(lut_vector),
@@ -678,7 +700,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
PANIC("Cuda error (PBS): TBC pbs is not supported.")
#endif
case PBS_VARIANT::CG:
cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t, uint64_t>(
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const uint64_t *>(lut_vector),
@@ -690,7 +712,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
num_many_lut, lut_stride);
break;
case PBS_VARIANT::DEFAULT:
cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t, uint64_t>(
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const uint64_t *>(lut_vector),
@@ -705,7 +727,6 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
PANIC("Cuda error (PBS): unknown pbs variant.")
}
}
/*
* This cleanup function frees the data on GPU for the PBS buffer for 32 or 64
* bits inputs.
@@ -718,11 +739,16 @@ void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
*buffer = nullptr;
}
template bool has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
template bool has_support_to_cuda_programmable_bootstrap_cg<uint32_t, uint64_t>(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t num_samples, uint32_t max_shared_memory);
template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
template bool has_support_to_cuda_programmable_bootstrap_cg<uint64_t, uint64_t>(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t num_samples, uint32_t max_shared_memory);
template void
cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t, uint64_t>(
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
uint64_t const *lwe_output_indexes, uint64_t const *lut_vector,
uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in,
@@ -732,7 +758,8 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride);
template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
template void
cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t, uint64_t>(
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
uint64_t const *lwe_output_indexes, uint64_t const *lut_vector,
uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in,
@@ -742,14 +769,14 @@ template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride);
template uint64_t scratch_cuda_programmable_bootstrap_cg<uint64_t>(
template uint64_t scratch_cuda_programmable_bootstrap_cg<uint64_t, uint64_t>(
void *stream, uint32_t gpu_index,
pbs_buffer<uint64_t, CLASSICAL> **pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
template uint64_t scratch_cuda_programmable_bootstrap<uint64_t>(
template uint64_t scratch_cuda_programmable_bootstrap<uint64_t, uint64_t>(
void *stream, uint32_t gpu_index, pbs_buffer<uint64_t, CLASSICAL> **buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
@@ -765,38 +792,31 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint32_t>(
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride);
template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
void *stream, uint32_t gpu_index, uint32_t *lwe_array_out,
uint32_t const *lwe_output_indexes, uint32_t const *lut_vector,
uint32_t const *lut_vector_indexes, uint32_t const *lwe_array_in,
uint32_t const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<uint32_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride);
template uint64_t scratch_cuda_programmable_bootstrap_cg<uint32_t>(
template uint64_t scratch_cuda_programmable_bootstrap_cg<uint32_t, uint64_t>(
void *stream, uint32_t gpu_index,
pbs_buffer<uint32_t, CLASSICAL> **pbs_buffer, uint32_t lwe_dimension,
pbs_buffer<uint64_t, CLASSICAL> **pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
template uint64_t scratch_cuda_programmable_bootstrap<uint32_t>(
template uint64_t scratch_cuda_programmable_bootstrap<uint32_t, uint32_t>(
void *stream, uint32_t gpu_index, pbs_buffer<uint32_t, CLASSICAL> **buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
template bool has_support_to_cuda_programmable_bootstrap_tbc<uint32_t>(
template bool
has_support_to_cuda_programmable_bootstrap_tbc<uint32_t, uint64_t>(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_shared_memory);
template bool has_support_to_cuda_programmable_bootstrap_tbc<uint64_t>(
template bool
has_support_to_cuda_programmable_bootstrap_tbc<uint64_t, uint64_t>(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_shared_memory);
#if CUDA_ARCH >= 900
template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t>(
template void
cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t, uint32_t>(
void *stream, uint32_t gpu_index, uint32_t *lwe_array_out,
uint32_t const *lwe_output_indexes, uint32_t const *lut_vector,
uint32_t const *lut_vector_indexes, uint32_t const *lwe_array_in,
@@ -805,7 +825,8 @@ template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t>(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride);
template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
template void
cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t, uint64_t>(
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
uint64_t const *lwe_output_indexes, uint64_t const *lut_vector,
uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in,
@@ -814,18 +835,35 @@ template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride);
template uint64_t scratch_cuda_programmable_bootstrap_tbc<uint32_t>(
template void
cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t, uint64_t>(
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
uint64_t const *lwe_output_indexes, uint64_t const *lut_vector,
uint64_t const *lut_vector_indexes, uint32_t const *lwe_array_in,
uint64_t const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<uint64_t, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride);
template uint64_t scratch_cuda_programmable_bootstrap_tbc<uint32_t, uint32_t>(
void *stream, uint32_t gpu_index,
pbs_buffer<uint32_t, CLASSICAL> **pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
template uint64_t scratch_cuda_programmable_bootstrap_tbc<uint64_t>(
template uint64_t scratch_cuda_programmable_bootstrap_tbc<uint64_t, uint64_t>(
void *stream, uint32_t gpu_index,
pbs_buffer<uint64_t, CLASSICAL> **pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
template uint64_t scratch_cuda_programmable_bootstrap_tbc<uint32_t, uint64_t>(
void *stream, uint32_t gpu_index,
pbs_buffer<uint64_t, CLASSICAL> **pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
template bool
supports_distributed_shared_memory_on_classic_programmable_bootstrap<
__uint128_t>(uint32_t polynomial_size, uint32_t max_shared_memory);
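
The explicit instantiations above are what let the untemplated C entry points link against the templated kernels: every supported (InputTorus, Torus) pair must be instantiated in this translation unit. A minimal sketch of the pattern, with a hypothetical pbs_entry standing in for the real PBS functions:

#include <cstddef>
#include <cstdint>

// Hypothetical stand-in for the templated entry points in this file.
template <typename InputTorus, typename Torus>
void pbs_entry(const InputTorus *in, Torus *out, size_t n) {
  // Widening an input torus element into the output torus scales it:
  // embedding Z/2^32 into Z/2^64 multiplies by 2^32 (a shift of 0 when
  // the widths already match).
  constexpr unsigned shift = 8 * (sizeof(Torus) - sizeof(InputTorus));
  for (size_t i = 0; i < n; ++i)
    out[i] = static_cast<Torus>(in[i]) << shift;
}

// One explicit instantiation per supported width pair, mirroring the
// <uint32_t, uint64_t> and <uint64_t, uint64_t> lists above.
template void pbs_entry<uint32_t, uint64_t>(const uint32_t *, uint64_t *, size_t);
template void pbs_entry<uint64_t, uint64_t>(const uint64_t *, uint64_t *, size_t);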

View File

@@ -17,12 +17,13 @@
#include "polynomial/polynomial_math.cuh"
#include "types/complex/operations.cuh"
template <typename Torus, class params, sharedMemDegree SMD, bool first_iter>
template <typename InputTorus, typename Torus, class params,
sharedMemDegree SMD, bool first_iter>
__global__ void __launch_bounds__(params::degree / params::opt)
device_programmable_bootstrap_step_one(
const Torus *__restrict__ lut_vector,
const Torus *__restrict__ lut_vector_indexes,
const Torus *__restrict__ lwe_array_in,
const InputTorus *__restrict__ lwe_array_in,
const Torus *__restrict__ lwe_input_indexes, Torus *global_accumulator,
double2 *global_join_buffer, uint32_t lwe_iteration,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
@@ -55,7 +56,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
// The third dimension of the block is used to determine on which ciphertext
// this block is operating, in the case of batch bootstraps
const Torus *block_lwe_array_in =
const InputTorus *block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
const Torus *block_lut_vector =
@@ -75,8 +76,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
// First iteration
// Put "b" in [0, 2N[
constexpr auto log_modulus = params::log2_degree + 1;
Torus b_hat = 0;
Torus correction = 0;
InputTorus b_hat = 0;
InputTorus correction = 0;
if (noise_reduction_type == PBS_MS_REDUCTION_T::CENTERED) {
correction = centered_binary_modulus_switch_body_correction_to_add(
block_lwe_array_in, lwe_dimension, log_modulus);
@@ -86,6 +87,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
// The y-dimension is used to select the element of the GLWE this block will
// compute
// b_hat is cast from InputTorus (i.e. 64-bit or 32-bit) to uint32_t here
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
@@ -100,7 +102,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
}
// Put "a" in [0, 2N[
Torus a_hat = 0;
InputTorus a_hat = 0;
modulus_switch(block_lwe_array_in[lwe_iteration], a_hat,
params::log2_degree + 1); // 2 * params::log2_degree + 1);
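
Both b_hat and a_hat now live in the input torus width, so the modulus switch works directly on 32-bit or 64-bit coefficients. A minimal sketch of a round-to-nearest modulus switch, assuming the in-tree modulus_switch follows the usual TFHE formula (modulus_switch_sketch is a hypothetical name, not the device function used above):

#include <cstdint>

// Maps x in Z/2^w (w = 32 or 64, depending on InputTorus) onto
// [0, 2N) by rounding to its top log_modulus = log2(2N) bits.
template <typename InputTorus>
__host__ __device__ uint32_t modulus_switch_sketch(InputTorus x,
                                                   uint32_t log_modulus) {
  constexpr uint32_t w = 8 * sizeof(InputTorus);
  // Add half of the discarded interval to round to nearest ...
  x += InputTorus(1) << (w - log_modulus - 1);
  // ... then keep only the top log_modulus bits.
  return static_cast<uint32_t>(x >> (w - log_modulus));
}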
@@ -307,7 +309,7 @@ uint64_t get_buffer_size_programmable_bootstrap(
return buffer_size + buffer_size % sizeof(double2);
}
template <typename Torus, typename params>
template <typename InputTorus, typename Torus, typename params>
__host__ uint64_t scratch_programmable_bootstrap(
cudaStream_t stream, uint32_t gpu_index,
pbs_buffer<Torus, CLASSICAL> **buffer, uint32_t lwe_dimension,
@@ -331,30 +333,38 @@ __host__ uint64_t scratch_programmable_bootstrap(
// Configure step one
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_one) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_step_one<Torus, params, PARTIALSM, true>,
device_programmable_bootstrap_step_one<InputTorus, Torus, params,
PARTIALSM, true>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_step_one<Torus, params, PARTIALSM, true>,
device_programmable_bootstrap_step_one<InputTorus, Torus, params,
PARTIALSM, true>,
cudaFuncCachePreferShared);
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_step_one<Torus, params, PARTIALSM, false>,
device_programmable_bootstrap_step_one<InputTorus, Torus, params,
PARTIALSM, false>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_step_one<Torus, params, PARTIALSM, false>,
device_programmable_bootstrap_step_one<InputTorus, Torus, params,
PARTIALSM, false>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory >= partial_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_step_one<Torus, params, FULLSM, true>,
device_programmable_bootstrap_step_one<InputTorus, Torus, params,
FULLSM, true>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_one));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_step_one<Torus, params, FULLSM, true>,
device_programmable_bootstrap_step_one<InputTorus, Torus, params,
FULLSM, true>,
cudaFuncCachePreferShared);
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_step_one<Torus, params, FULLSM, false>,
device_programmable_bootstrap_step_one<InputTorus, Torus, params,
FULLSM, false>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_one));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_step_one<Torus, params, FULLSM, false>,
device_programmable_bootstrap_step_one<InputTorus, Torus, params,
FULLSM, false>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
@@ -398,10 +408,10 @@ __host__ uint64_t scratch_programmable_bootstrap(
return size_tracker;
}
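
scratch_programmable_bootstrap raises each kernel's dynamic shared-memory cap up front, and execute_step_one below picks the NOSM, PARTIALSM, or FULLSM variant at launch time. A minimal sketch of this opt-in-then-launch pattern, assuming only the standard CUDA runtime API (my_kernel, the grid shape, and full_sm are hypothetical):

#include <cstdint>
#include <cuda_runtime.h>

__global__ void my_kernel(int *out) {
  extern __shared__ int8_t smem[];
  smem[threadIdx.x] = (int8_t)threadIdx.x;
  __syncthreads();
  out[threadIdx.x] = smem[threadIdx.x];
}

void launch(int *d_out, int full_sm, cudaStream_t stream, int gpu_index) {
  int budget = 0;
  cudaDeviceGetAttribute(&budget, cudaDevAttrMaxSharedMemoryPerBlockOptin,
                         gpu_index);
  // Launches above the default 48KB cap need an explicit opt-in first.
  if (budget >= full_sm) {
    cudaFuncSetAttribute(my_kernel,
                         cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm);
    my_kernel<<<1, 256, full_sm, stream>>>(d_out);
  } else {
    // Fall back to a small dynamic allocation that fits everywhere
    // (the real code switches to a NOSM kernel variant instead).
    my_kernel<<<1, 256, 256 * sizeof(int8_t), stream>>>(d_out);
  }
}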
template <typename Torus, class params, bool first_iter>
template <typename InputTorus, typename Torus, class params, bool first_iter>
__host__ void execute_step_one(
cudaStream_t stream, uint32_t gpu_index, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
Torus *global_accumulator, double2 *global_join_buffer,
uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
@@ -416,21 +426,24 @@ __host__ void execute_step_one(
dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1, level_count);
if (max_shared_memory < partial_sm) {
device_programmable_bootstrap_step_one<Torus, params, NOSM, first_iter>
device_programmable_bootstrap_step_one<InputTorus, Torus, params, NOSM,
first_iter>
<<<grid, thds, 0, stream>>>(
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
global_accumulator, global_join_buffer, lwe_iteration,
lwe_dimension, polynomial_size, base_log, level_count, d_mem,
full_dm, noise_reduction_type);
} else if (max_shared_memory < full_sm) {
device_programmable_bootstrap_step_one<Torus, params, PARTIALSM, first_iter>
device_programmable_bootstrap_step_one<InputTorus, Torus, params, PARTIALSM,
first_iter>
<<<grid, thds, partial_sm, stream>>>(
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
global_accumulator, global_join_buffer, lwe_iteration,
lwe_dimension, polynomial_size, base_log, level_count, d_mem,
partial_dm, noise_reduction_type);
} else {
device_programmable_bootstrap_step_one<Torus, params, FULLSM, first_iter>
device_programmable_bootstrap_step_one<InputTorus, Torus, params, FULLSM,
first_iter>
<<<grid, thds, full_sm, stream>>>(
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
global_accumulator, global_join_buffer, lwe_iteration,
@@ -484,11 +497,11 @@ __host__ void execute_step_two(
/*
* Host wrapper to the programmable bootstrap
*/
template <typename Torus, class params>
template <typename InputTorus, typename Torus, class params>
__host__ void host_programmable_bootstrap(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *pbs_buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
@@ -520,7 +533,7 @@ __host__ void host_programmable_bootstrap(
for (int i = 0; i < lwe_dimension; i++) {
if (i == 0) {
execute_step_one<Torus, params, true>(
execute_step_one<InputTorus, Torus, params, true>(
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, global_accumulator,
global_join_buffer, input_lwe_ciphertext_count, lwe_dimension,
@@ -528,7 +541,7 @@ __host__ void host_programmable_bootstrap(
partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one,
noise_reduction_type);
} else {
execute_step_one<Torus, params, false>(
execute_step_one<InputTorus, Torus, params, false>(
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, global_accumulator,
global_join_buffer, input_lwe_ciphertext_count, lwe_dimension,

View File

@@ -7,15 +7,16 @@
#include "programmable_bootstrap_tbc_multibit.cuh"
#endif
// Used in C++ benchmarks, not used in Rust code
bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t num_samples, uint32_t max_shared_memory) {
return supports_cooperative_groups_on_multibit_programmable_bootstrap<
uint64_t>(glwe_dimension, polynomial_size, level_count, num_samples,
max_shared_memory);
uint64_t, uint64_t>(glwe_dimension, polynomial_size, level_count,
num_samples, max_shared_memory);
}
template <typename Torus>
template <typename InputTorus, typename Torus>
bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_shared_memory) {
@@ -26,39 +27,39 @@ bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
switch (polynomial_size) {
case 256:
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
Torus, AmortizedDegree<256>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<256>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 512:
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
Torus, AmortizedDegree<512>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<512>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 1024:
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
Torus, AmortizedDegree<1024>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<1024>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 2048:
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
Torus, AmortizedDegree<2048>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<2048>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 4096:
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
Torus, AmortizedDegree<4096>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<4096>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 8192:
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
Torus, AmortizedDegree<8192>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<8192>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 16384:
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
Torus, AmortizedDegree<16384>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<16384>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
"N's are powers of two"
@@ -69,11 +70,11 @@ bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
#endif
}
template <typename Torus>
template <typename InputTorus, typename Torus>
void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
@@ -82,7 +83,8 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
switch (polynomial_size) {
case 256:
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<256>>(
host_cg_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
@@ -90,7 +92,8 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
num_samples, num_many_lut, lut_stride);
break;
case 512:
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
host_cg_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
@@ -98,7 +101,8 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
num_samples, num_many_lut, lut_stride);
break;
case 1024:
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
host_cg_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
@@ -106,7 +110,8 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
num_samples, num_many_lut, lut_stride);
break;
case 2048:
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
host_cg_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
@@ -114,7 +119,8 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
num_samples, num_many_lut, lut_stride);
break;
case 4096:
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
host_cg_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
@@ -122,7 +128,8 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
num_samples, num_many_lut, lut_stride);
break;
case 8192:
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
host_cg_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
@@ -130,7 +137,8 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
num_samples, num_many_lut, lut_stride);
break;
case 16384:
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
host_cg_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
@@ -144,11 +152,11 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
}
}
template <typename Torus>
template <typename InputTorus, typename Torus>
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
@@ -157,7 +165,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
switch (polynomial_size) {
case 256:
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<256>>(
host_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
@@ -165,7 +174,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
num_samples, num_many_lut, lut_stride);
break;
case 512:
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
host_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
@@ -173,7 +183,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
num_samples, num_many_lut, lut_stride);
break;
case 1024:
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
host_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
@@ -181,7 +192,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
num_samples, num_many_lut, lut_stride);
break;
case 2048:
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
host_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
@@ -189,7 +201,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
num_samples, num_many_lut, lut_stride);
break;
case 4096:
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
host_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
@@ -197,7 +210,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
num_samples, num_many_lut, lut_stride);
break;
case 8192:
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
host_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
@@ -205,7 +219,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
num_samples, num_many_lut, lut_stride);
break;
case 16384:
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
host_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
@@ -239,7 +254,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
switch (buffer->pbs_variant) {
case PBS_VARIANT::TBC:
#if CUDA_ARCH >= 900
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t,
uint64_t>(
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const uint64_t *>(lut_vector),
@@ -254,7 +270,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.")
#endif
case PBS_VARIANT::CG:
cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t,
uint64_t>(
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const uint64_t *>(lut_vector),
@@ -266,7 +283,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
num_samples, num_many_lut, lut_stride);
break;
case PBS_VARIANT::DEFAULT:
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t,
uint64_t>(
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const uint64_t *>(lut_vector),
@@ -282,7 +300,73 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
}
}
template <typename Torus>
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_32_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride) {
PANIC_IF_FALSE(base_log <= 64,
"Cuda error (multi-bit PBS): base log (%d) should be <= 64",
base_log);
pbs_buffer<uint64_t, MULTI_BIT> *buffer =
(pbs_buffer<uint64_t, MULTI_BIT> *)mem_ptr;
switch (buffer->pbs_variant) {
case PBS_VARIANT::TBC:
#if CUDA_ARCH >= 900
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint32_t,
uint64_t>(
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const uint64_t *>(lut_vector),
static_cast<const uint64_t *>(lut_vector_indexes),
static_cast<const uint32_t *>(lwe_array_in),
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
#else
PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.")
#endif
case PBS_VARIANT::CG:
cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint32_t,
uint64_t>(
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const uint64_t *>(lut_vector),
static_cast<const uint64_t *>(lut_vector_indexes),
static_cast<const uint32_t *>(lwe_array_in),
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
case PBS_VARIANT::DEFAULT:
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint32_t,
uint64_t>(
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const uint64_t *>(lut_vector),
static_cast<const uint64_t *>(lut_vector_indexes),
static_cast<const uint32_t *>(lwe_array_in),
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.")
}
}
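
For reference, a hedged end-to-end sketch of driving this new 32-to-64 multi-bit path from the host; the d_* device pointers and sizing parameters are placeholders the caller must allocate and fill, and the cleanup signature is assumed by analogy with cleanup_cuda_programmable_bootstrap:

// All d_* pointers are assumed to be valid device allocations filled by
// the caller; parameter values are illustrative only.
int8_t *pbs_mem = nullptr;
scratch_cuda_multi_bit_programmable_bootstrap_32_64(
    stream, gpu_index, &pbs_mem, glwe_dimension, polynomial_size,
    level_count, num_samples, /*allocate_gpu_memory=*/true);

cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_32_64(
    stream, gpu_index, d_lwe_out_u64, d_out_indexes_u64, d_lut_u64,
    d_lut_indexes_u64, d_lwe_in_u32, d_in_indexes_u64, d_bsk_u64, pbs_mem,
    lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
    base_log, level_count, num_samples, num_many_lut, lut_stride);

// Assumed cleanup entry point, mirroring the classic-PBS counterpart.
cleanup_cuda_multi_bit_programmable_bootstrap(stream, gpu_index, &pbs_mem);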
template <typename InputTorus, typename Torus>
uint64_t scratch_cuda_cg_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
@@ -290,43 +374,43 @@ uint64_t scratch_cuda_cg_multi_bit_programmable_bootstrap(
switch (polynomial_size) {
case 256:
return scratch_cg_multi_bit_programmable_bootstrap<Torus,
return scratch_cg_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 512:
return scratch_cg_multi_bit_programmable_bootstrap<Torus,
return scratch_cg_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 1024:
return scratch_cg_multi_bit_programmable_bootstrap<Torus,
return scratch_cg_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 2048:
return scratch_cg_multi_bit_programmable_bootstrap<Torus,
return scratch_cg_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 4096:
return scratch_cg_multi_bit_programmable_bootstrap<Torus,
return scratch_cg_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 8192:
return scratch_cg_multi_bit_programmable_bootstrap<Torus,
return scratch_cg_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 16384:
return scratch_cg_multi_bit_programmable_bootstrap<Torus,
return scratch_cg_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
@@ -338,7 +422,7 @@ uint64_t scratch_cuda_cg_multi_bit_programmable_bootstrap(
}
}
template <typename Torus>
template <typename InputTorus, typename Torus>
uint64_t scratch_cuda_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
@@ -346,43 +430,43 @@ uint64_t scratch_cuda_multi_bit_programmable_bootstrap(
switch (polynomial_size) {
case 256:
return scratch_multi_bit_programmable_bootstrap<Torus,
return scratch_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 512:
return scratch_multi_bit_programmable_bootstrap<Torus,
return scratch_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 1024:
return scratch_multi_bit_programmable_bootstrap<Torus,
return scratch_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 2048:
return scratch_multi_bit_programmable_bootstrap<Torus,
return scratch_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 4096:
return scratch_multi_bit_programmable_bootstrap<Torus,
return scratch_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 8192:
return scratch_multi_bit_programmable_bootstrap<Torus,
return scratch_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 16384:
return scratch_multi_bit_programmable_bootstrap<Torus,
return scratch_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
@@ -400,7 +484,8 @@ uint64_t scratch_cuda_multi_bit_programmable_bootstrap_64(
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
bool supports_cg =
supports_cooperative_groups_on_multibit_programmable_bootstrap<uint64_t>(
supports_cooperative_groups_on_multibit_programmable_bootstrap<uint64_t,
uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index));
#if (CUDA_ARCH >= 900)
@@ -412,24 +497,70 @@ uint64_t scratch_cuda_multi_bit_programmable_bootstrap_64(
&num_sms, cudaDevAttrMultiProcessorCount, gpu_index));
bool supports_tbc =
has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t,
uint64_t>(
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
level_count, cuda_get_max_shared_memory(gpu_index));
if (supports_tbc)
return scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t>(
return scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t,
uint64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory);
else
#endif
if (supports_cg)
return scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t>(
return scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t, uint64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory);
else
return scratch_cuda_multi_bit_programmable_bootstrap<uint64_t>(
return scratch_cuda_multi_bit_programmable_bootstrap<uint64_t, uint64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory);
}
uint64_t scratch_cuda_multi_bit_programmable_bootstrap_32_64(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
bool supports_cg =
supports_cooperative_groups_on_multibit_programmable_bootstrap<uint32_t,
uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index));
#if (CUDA_ARCH >= 900)
// On H100s we should be using TBC while num_samples < num_sms / 2.
// After that we switch to CG until it is no longer supported,
// at which point we return to TBC.
int num_sms = 0;
check_cuda_error(cudaDeviceGetAttribute(
&num_sms, cudaDevAttrMultiProcessorCount, gpu_index));
bool supports_tbc =
has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint32_t,
uint64_t>(
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
level_count, cuda_get_max_shared_memory(gpu_index));
if (supports_tbc)
return scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint32_t,
uint64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory);
else
#endif
if (supports_cg)
return scratch_cuda_cg_multi_bit_programmable_bootstrap<uint32_t, uint64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory);
else
return scratch_cuda_multi_bit_programmable_bootstrap<uint32_t, uint64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory);
@@ -455,7 +586,7 @@ void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
* The value 13 was empirically determined based on memory requirements for
* benchmarking on an RTX 4090 GPU, balancing performance and resource use.
*/
template <typename Torus, class params>
template <typename InputTorus, typename Torus, class params>
uint64_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
uint32_t polynomial_size, uint32_t glwe_dimension,
uint32_t level_count, uint64_t full_sm_keybundle) {
@@ -466,13 +597,14 @@ uint64_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
if (max_shared_memory < full_sm_keybundle)
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_blocks_per_sm,
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
params, NOSM>,
polynomial_size / params::opt, full_sm_keybundle);
else
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_blocks_per_sm,
device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
FULLSM>,
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
params, FULLSM>,
polynomial_size / params::opt, 0);
int num_sms = 0;
@@ -535,14 +667,15 @@ uint64_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
return (max_num_chunks > divisor) ? divisor : max_num_chunks;
}
template uint64_t scratch_cuda_multi_bit_programmable_bootstrap<uint64_t>(
template uint64_t
scratch_cuda_multi_bit_programmable_bootstrap<uint64_t, uint64_t>(
void *stream, uint32_t gpu_index,
pbs_buffer<uint64_t, MULTI_BIT> **pbs_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
template void
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t, uint64_t>(
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
uint64_t const *lwe_output_indexes, uint64_t const *lut_vector,
uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in,
@@ -552,14 +685,16 @@ cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_many_lut, uint32_t lut_stride);
template uint64_t scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t>(
template uint64_t
scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t, uint64_t>(
void *stream, uint32_t gpu_index,
pbs_buffer<uint64_t, MULTI_BIT> **pbs_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
template void
cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t,
uint64_t>(
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
uint64_t const *lwe_output_indexes, uint64_t const *lut_vector,
uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in,
@@ -570,12 +705,12 @@ cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
uint32_t num_many_lut, uint32_t lut_stride);
template bool
has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint32_t, uint64_t>(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_shared_memory);
#if (CUDA_ARCH >= 900)
template <typename Torus>
template <typename InputTorus, typename Torus>
uint64_t scratch_cuda_tbc_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
@@ -583,43 +718,43 @@ uint64_t scratch_cuda_tbc_multi_bit_programmable_bootstrap(
switch (polynomial_size) {
case 256:
return scratch_tbc_multi_bit_programmable_bootstrap<Torus,
return scratch_tbc_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 512:
return scratch_tbc_multi_bit_programmable_bootstrap<Torus,
return scratch_tbc_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 1024:
return scratch_tbc_multi_bit_programmable_bootstrap<Torus,
return scratch_tbc_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 2048:
return scratch_tbc_multi_bit_programmable_bootstrap<Torus,
return scratch_tbc_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 4096:
return scratch_tbc_multi_bit_programmable_bootstrap<Torus,
return scratch_tbc_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 8192:
return scratch_tbc_multi_bit_programmable_bootstrap<Torus,
return scratch_tbc_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 16384:
return scratch_tbc_multi_bit_programmable_bootstrap<Torus,
return scratch_tbc_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
@@ -630,11 +765,11 @@ uint64_t scratch_cuda_tbc_multi_bit_programmable_bootstrap(
" in the interval [256..16384].")
}
}
template <typename Torus>
template <typename InputTorus, typename Torus>
void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
@@ -646,7 +781,8 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
switch (polynomial_size) {
case 256:
host_tbc_multi_bit_programmable_bootstrap<uint64_t, AmortizedDegree<256>>(
host_tbc_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
@@ -654,7 +790,8 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
num_samples, num_many_lut, lut_stride);
break;
case 512:
host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
host_tbc_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
@@ -662,7 +799,8 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
num_samples, num_many_lut, lut_stride);
break;
case 1024:
host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
host_tbc_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
@@ -675,14 +813,16 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
&num_sms, cudaDevAttrMultiProcessorCount, gpu_index));
if (4 * num_sms < num_samples * level_count * (glwe_dimension + 1))
host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
host_tbc_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log,
level_count, num_samples, num_many_lut, lut_stride);
else
host_tbc_multi_bit_programmable_bootstrap<Torus, Degree<2048>>(
host_tbc_multi_bit_programmable_bootstrap<InputTorus, Torus,
Degree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
@@ -692,7 +832,8 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
break;
}
case 4096:
host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
host_tbc_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
@@ -700,7 +841,8 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
num_samples, num_many_lut, lut_stride);
break;
case 8192:
host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
host_tbc_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
@@ -708,7 +850,8 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
num_samples, num_many_lut, lut_stride);
break;
case 16384:
host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
host_tbc_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
@@ -722,13 +865,15 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
}
}
template uint64_t scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t>(
template uint64_t
scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t, uint64_t>(
void *stream, uint32_t gpu_index, pbs_buffer<uint64_t, MULTI_BIT> **buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
template void
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t,
uint64_t>(
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
uint64_t const *lwe_output_indexes, uint64_t const *lut_vector,
uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in,
@@ -737,4 +882,16 @@ cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_many_lut, uint32_t lut_stride);
template void
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint32_t,
uint64_t>(
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
uint64_t const *lwe_output_indexes, uint64_t const *lut_vector,
uint64_t const *lut_vector_indexes, uint32_t const *lwe_array_in,
uint64_t const *lwe_input_indexes, uint64_t const *bootstrapping_key,
pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_many_lut, uint32_t lut_stride);
#endif
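
The `<uint32_t, uint64_t>` instantiation above is the 64/32 path this commit adds: a 32-bit keyswitched ciphertext drives a 64-bit bootstrap. A minimal host-side sketch of the intended call sequence (illustrative only: it assumes a matching `<uint32_t, uint64_t>` scratch instantiation is provided elsewhere in the commit, that the library headers are included, and that all pointers are device-resident):

// Hypothetical driver: 32-bit LWE inputs in, 64-bit LWE outputs out.
void pbs_32_in_64_out(void *stream, uint32_t gpu_index, uint64_t *lwe_out,
                      const uint64_t *out_idx, const uint64_t *lut,
                      const uint64_t *lut_idx, const uint32_t *lwe_in_32,
                      const uint64_t *in_idx, const uint64_t *bsk,
                      uint32_t lwe_dim, uint32_t glwe_dim, uint32_t N,
                      uint32_t grouping_factor, uint32_t base_log,
                      uint32_t levels, uint32_t num_samples) {
  pbs_buffer<uint64_t, MULTI_BIT> *buf = nullptr;
  scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint32_t, uint64_t>(
      stream, gpu_index, &buf, glwe_dim, N, levels, num_samples,
      /*allocate_gpu_memory=*/true);
  cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint32_t,
                                                                  uint64_t>(
      stream, gpu_index, lwe_out, out_idx, lut, lut_idx, lwe_in_32, in_idx,
      bsk, buf, lwe_dim, glwe_dim, N, grouping_factor, base_log, levels,
      num_samples, /*num_many_lut=*/1, /*lut_stride=*/0);
}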

View File

@@ -25,9 +25,10 @@ get_start_ith_ggsw_offset(uint32_t polynomial_size, int glwe_dimension,
level_count;
}
template <typename Torus, class params, sharedMemDegree SMD>
template <typename InputTorus, typename Torus, class params,
sharedMemDegree SMD>
__global__ void device_multi_bit_programmable_bootstrap_keybundle(
const Torus *__restrict__ lwe_array_in,
const InputTorus *__restrict__ lwe_array_in,
const Torus *__restrict__ lwe_input_indexes, double2 *keybundle_array,
const Torus *__restrict__ bootstrapping_key, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
@@ -55,7 +56,7 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
if (lwe_iteration < (lwe_dimension / grouping_factor)) {
const Torus *block_lwe_array_in =
const InputTorus *block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[input_idx] * (lwe_dimension + 1)];
double2 *keybundle = keybundle_array +
@@ -86,10 +87,11 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
// Precalculate the monomial degrees and store them in shared memory
uint32_t *monomial_degrees = (uint32_t *)selected_memory;
if (threadIdx.x < (1 << grouping_factor)) {
const Torus *lwe_array_group =
const InputTorus *lwe_array_group =
block_lwe_array_in + rev_lwe_iteration * grouping_factor;
monomial_degrees[threadIdx.x] = calculates_monomial_degree<Torus, params>(
lwe_array_group, threadIdx.x, grouping_factor);
monomial_degrees[threadIdx.x] =
calculates_monomial_degree<InputTorus, params>(
lwe_array_group, threadIdx.x, grouping_factor);
}
__syncthreads();
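
For reference, a CPU-side analogue of what calculates_monomial_degree<InputTorus, params> produces per selector; the subset-sum semantics and the bit order inside the group are assumptions here, not read off this diff:

#include <cstdint>
template <typename InputTorus>
uint32_t monomial_degree_sketch(const InputTorus *group, uint32_t idx,
                                uint32_t grouping_factor,
                                uint32_t log_modulus) {
  InputTorus acc = 0;
  for (uint32_t i = 0; i < grouping_factor; i++)
    if ((idx >> i) & 1)
      acc += group[i]; // wrapping subset sum on the input torus
  constexpr uint32_t BITS = sizeof(InputTorus) * 8;
  acc += (InputTorus)1 << (BITS - log_modulus - 1); // round half-up
  return (uint32_t)(acc >> (BITS - log_modulus));   // degree in [0, 2N)
}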
@@ -145,9 +147,10 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
// Then we can just calculate the offset needed to apply these coefficients,
// and the operation transforms into a pointwise vector multiplication,
// avoiding extra instructions other than MADD
template <typename Torus, class params, sharedMemDegree SMD>
template <typename InputTorus, typename Torus, class params,
sharedMemDegree SMD>
__global__ void device_multi_bit_programmable_bootstrap_keybundle_2_2_params(
const Torus *__restrict__ lwe_array_in,
const InputTorus *__restrict__ lwe_array_in,
const Torus *__restrict__ lwe_input_indexes, double2 *keybundle_array,
const Torus *__restrict__ bootstrapping_key, uint32_t lwe_dimension,
uint32_t lwe_offset, uint64_t lwe_chunk_size,
@@ -188,7 +191,7 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle_2_2_params(
if (lwe_iteration < (lwe_dimension / grouping_factor)) {
const Torus *block_lwe_array_in =
const InputTorus *block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[input_idx] * (lwe_dimension + 1)];
double2 *keybundle = keybundle_array +
@@ -219,10 +222,11 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle_2_2_params(
uint32_t *monomial_degrees = (uint32_t *)selected_memory;
if (threadIdx.x < (1 << grouping_factor)) {
const Torus *lwe_array_group =
const InputTorus *lwe_array_group =
block_lwe_array_in + rev_lwe_iteration * grouping_factor;
monomial_degrees[threadIdx.x] = calculates_monomial_degree<Torus, params>(
lwe_array_group, threadIdx.x, grouping_factor);
monomial_degrees[threadIdx.x] =
calculates_monomial_degree<InputTorus, params>(
lwe_array_group, threadIdx.x, grouping_factor);
}
__syncthreads();
@@ -269,10 +273,11 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle_2_2_params(
}
}
template <typename Torus, class params, sharedMemDegree SMD, bool is_first_iter>
template <typename InputTorus, typename Torus, class params,
sharedMemDegree SMD, bool is_first_iter>
__global__ void __launch_bounds__(params::degree / params::opt)
device_multi_bit_programmable_bootstrap_accumulate_step_one(
const Torus *__restrict__ lwe_array_in,
const InputTorus *__restrict__ lwe_array_in,
const Torus *__restrict__ lwe_input_indexes,
const Torus *__restrict__ lut_vector,
const Torus *__restrict__ lut_vector_indexes, Torus *global_accumulator,
@@ -305,7 +310,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
if constexpr (SMD == PARTIALSM)
accumulator_fft = (double2 *)sharedmem;
const Torus *block_lwe_array_in =
const InputTorus *block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
const Torus *block_lut_vector =
@@ -327,7 +332,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
////////////////////////////////////////////////////////////
// Initializes the accumulator with the body of the LWE ciphertext
// Put "b" in [0, 2N[
Torus b_hat = 0;
InputTorus b_hat = 0;
modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
params::log2_degree + 1);
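
b_hat is where the kernel first touches the raw input torus, so widening the template to InputTorus is the whole change. Below is an assumed shape of modulus_switch for a generic torus width (round-half-up into Z_{2N}); with InputTorus = uint32_t and N = 2048 it keeps the top 12 bits of a 32-bit word instead of a 64-bit one:

#include <cstdint>
template <typename InputTorus>
void modulus_switch_sketch(InputTorus x, InputTorus &out,
                           uint32_t log_modulus) {
  constexpr uint32_t BITS = sizeof(InputTorus) * 8;
  x += (InputTorus)1 << (BITS - log_modulus - 1); // rounding bit
  out = x >> (BITS - log_modulus);                // top log_modulus bits
}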
@@ -501,7 +506,7 @@ uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two(
return sizeof(Torus) * polynomial_size; // accumulator
}
template <typename Torus, typename params>
template <typename InputTorus, typename Torus, typename params>
__host__ uint64_t scratch_multi_bit_programmable_bootstrap(
cudaStream_t stream, uint32_t gpu_index,
pbs_buffer<Torus, MULTI_BIT> **buffer, uint32_t glwe_dimension,
@@ -526,20 +531,22 @@ __host__ uint64_t scratch_multi_bit_programmable_bootstrap(
if (max_shared_memory < full_sm_keybundle) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
params, NOSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
params, NOSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
FULLSM>,
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
FULLSM>,
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
params, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
@@ -547,59 +554,59 @@ __host__ uint64_t scratch_multi_bit_programmable_bootstrap(
if (max_shared_memory < partial_sm_accumulate_step_one) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_accumulate_step_one<
Torus, params, NOSM, false>,
InputTorus, Torus, params, NOSM, false>,
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_accumulate_step_one<
Torus, params, NOSM, false>,
InputTorus, Torus, params, NOSM, false>,
cudaFuncCachePreferShared);
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_accumulate_step_one<
Torus, params, NOSM, true>,
InputTorus, Torus, params, NOSM, true>,
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_accumulate_step_one<
Torus, params, NOSM, true>,
InputTorus, Torus, params, NOSM, true>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory < full_sm_accumulate_step_one) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_accumulate_step_one<
Torus, params, PARTIALSM, false>,
InputTorus, Torus, params, PARTIALSM, false>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
partial_sm_accumulate_step_one));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_accumulate_step_one<
Torus, params, PARTIALSM, false>,
InputTorus, Torus, params, PARTIALSM, false>,
cudaFuncCachePreferShared);
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_accumulate_step_one<
Torus, params, PARTIALSM, true>,
InputTorus, Torus, params, PARTIALSM, true>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
partial_sm_accumulate_step_one));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_accumulate_step_one<
Torus, params, PARTIALSM, true>,
InputTorus, Torus, params, PARTIALSM, true>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_accumulate_step_one<
Torus, params, FULLSM, false>,
InputTorus, Torus, params, FULLSM, false>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
full_sm_accumulate_step_one));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_accumulate_step_one<
Torus, params, FULLSM, false>,
InputTorus, Torus, params, FULLSM, false>,
cudaFuncCachePreferShared);
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_accumulate_step_one<
Torus, params, FULLSM, true>,
InputTorus, Torus, params, FULLSM, true>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
full_sm_accumulate_step_one));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_accumulate_step_one<
Torus, params, FULLSM, true>,
InputTorus, Torus, params, FULLSM, true>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
@@ -644,7 +651,7 @@ __host__ uint64_t scratch_multi_bit_programmable_bootstrap(
check_cuda_error(cudaGetLastError());
}
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
auto lwe_chunk_size = get_lwe_chunk_size<InputTorus, Torus, params>(
gpu_index, input_lwe_ciphertext_count, polynomial_size, glwe_dimension,
level_count, full_sm_keybundle);
uint64_t size_tracker = 0;
@@ -655,9 +662,9 @@ __host__ uint64_t scratch_multi_bit_programmable_bootstrap(
return size_tracker;
}
template <typename Torus, class params>
template <typename InputTorus, typename Torus, class params>
__host__ void execute_compute_keybundle(
cudaStream_t stream, uint32_t gpu_index, Torus const *lwe_array_in,
cudaStream_t stream, uint32_t gpu_index, InputTorus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
@@ -686,7 +693,8 @@ __host__ void execute_compute_keybundle(
dim3 thds(polynomial_size / params::opt, 1, 1);
if (max_shared_memory < full_sm_keybundle) {
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus, params,
NOSM>
<<<grid_keybundle, thds, 0, stream>>>(
lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
@@ -694,7 +702,8 @@ __host__ void execute_compute_keybundle(
d_mem, full_sm_keybundle);
} else {
bool supports_tbc =
has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<InputTorus,
Torus>(
num_samples, glwe_dimension, polynomial_size, level_count,
cuda_get_max_shared_memory(gpu_index));
@@ -703,20 +712,22 @@ __host__ void execute_compute_keybundle(
dim3 thds_new_keybundle(512, 1, 1);
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
Torus, Degree<2048>, FULLSM>,
InputTorus, Torus, Degree<2048>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, 3 * full_sm_keybundle));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
Torus, Degree<2048>, FULLSM>,
InputTorus, Torus, Degree<2048>, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
Torus, Degree<2048>, FULLSM><<<grid_keybundle, thds_new_keybundle,
3 * full_sm_keybundle, stream>>>(
lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
lwe_dimension, lwe_offset, chunk_size, keybundle_size_per_input);
InputTorus, Torus, Degree<2048>, FULLSM>
<<<grid_keybundle, thds_new_keybundle, 3 * full_sm_keybundle,
stream>>>(lwe_array_in, lwe_input_indexes, keybundle_fft,
bootstrapping_key, lwe_dimension, lwe_offset, chunk_size,
keybundle_size_per_input);
} else {
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, FULLSM>
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
params, FULLSM>
<<<grid_keybundle, thds, full_sm_keybundle, stream>>>(
lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
@@ -727,10 +738,10 @@ __host__ void execute_compute_keybundle(
check_cuda_error(cudaGetLastError());
}
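
The NOSM / PARTIALSM / FULLSM branching repeated throughout these wrappers is one pattern: pick the kernel variant by how much dynamic shared memory the device can opt into, spilling to a global-memory scratch (d_mem) when there is none. Distilled to a stand-in kernel using the library's sharedMemDegree enum (illustrative; the real wrappers also raise cudaFuncAttributeMaxDynamicSharedMemorySize before launching, as shown above):

template <sharedMemDegree SMD>
__global__ void stand_in_kernel(int8_t *d_mem) {
  extern __shared__ int8_t smem[];
  int8_t *work = (SMD == NOSM) ? d_mem : smem; // no shared memory -> global
  if (threadIdx.x == 0)
    work[0] = 0;
}

inline void launch_by_shared_memory(uint32_t gpu_index, uint64_t partial_sm,
                                    uint64_t full_sm, int8_t *d_mem,
                                    cudaStream_t stream) {
  int max_sm = 0;
  cudaDeviceGetAttribute(&max_sm, cudaDevAttrMaxSharedMemoryPerBlockOptin,
                         gpu_index);
  if ((uint64_t)max_sm < partial_sm)
    stand_in_kernel<NOSM><<<1, 256, 0, stream>>>(d_mem);
  else if ((uint64_t)max_sm < full_sm)
    stand_in_kernel<PARTIALSM><<<1, 256, partial_sm, stream>>>(d_mem);
  else
    stand_in_kernel<FULLSM><<<1, 256, full_sm, stream>>>(d_mem);
}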
template <typename Torus, class params, bool is_first_iter>
template <typename InputTorus, typename Torus, class params, bool is_first_iter>
__host__ void execute_step_one(
cudaStream_t stream, uint32_t gpu_index, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
Torus const *lwe_input_indexes, pbs_buffer<Torus, MULTI_BIT> *buffer,
uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count) {
@@ -754,7 +765,7 @@ __host__ void execute_step_one(
if (max_shared_memory < partial_sm_accumulate_step_one)
device_multi_bit_programmable_bootstrap_accumulate_step_one<
Torus, params, NOSM, is_first_iter>
InputTorus, Torus, params, NOSM, is_first_iter>
<<<grid_accumulate_step_one, thds, 0, stream>>>(
lwe_array_in, lwe_input_indexes, lut_vector, lut_vector_indexes,
global_accumulator, global_accumulator_fft, lwe_dimension,
@@ -762,7 +773,7 @@ __host__ void execute_step_one(
full_sm_accumulate_step_one);
else if (max_shared_memory < full_sm_accumulate_step_one)
device_multi_bit_programmable_bootstrap_accumulate_step_one<
Torus, params, PARTIALSM, is_first_iter>
InputTorus, Torus, params, PARTIALSM, is_first_iter>
<<<grid_accumulate_step_one, thds, partial_sm_accumulate_step_one,
stream>>>(lwe_array_in, lwe_input_indexes, lut_vector,
lut_vector_indexes, global_accumulator,
@@ -771,7 +782,7 @@ __host__ void execute_step_one(
partial_sm_accumulate_step_one);
else
device_multi_bit_programmable_bootstrap_accumulate_step_one<
Torus, params, FULLSM, is_first_iter>
InputTorus, Torus, params, FULLSM, is_first_iter>
<<<grid_accumulate_step_one, thds, full_sm_accumulate_step_one,
stream>>>(lwe_array_in, lwe_input_indexes, lut_vector,
lut_vector_indexes, global_accumulator,
@@ -823,11 +834,11 @@ execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
check_cuda_error(cudaGetLastError());
}
template <typename Torus, class params>
template <typename InputTorus, typename Torus, class params>
__host__ void host_multi_bit_programmable_bootstrap(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
@@ -840,7 +851,7 @@ __host__ void host_multi_bit_programmable_bootstrap(
lwe_offset += lwe_chunk_size) {
// Compute a keybundle
execute_compute_keybundle<Torus, params>(
execute_compute_keybundle<InputTorus, Torus, params>(
stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, level_count, lwe_offset);
@@ -853,12 +864,12 @@ __host__ void host_multi_bit_programmable_bootstrap(
bool is_last_iter =
(j + lwe_offset) + 1 == (lwe_dimension / grouping_factor);
if (is_first_iter) {
execute_step_one<Torus, params, true>(
execute_step_one<InputTorus, Torus, params, true>(
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, buffer, num_samples, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count);
} else {
execute_step_one<Torus, params, false>(
execute_step_one<InputTorus, Torus, params, false>(
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, buffer, num_samples, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count);
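
Reduced to its control flow, the driver above walks the bootstrapping key in chunks and flags the first and last group; the callbacks below stand in for the execute_* wrappers and hide all kernel plumbing (a sketch, not the real interface):

#include <algorithm>
#include <cstdint>
#include <functional>
void chunked_pbs_loop(uint32_t lwe_dimension, uint32_t grouping_factor,
                      uint32_t lwe_chunk_size,
                      const std::function<void(uint32_t)> &compute_keybundle,
                      const std::function<void(bool)> &step_one,
                      const std::function<void(bool)> &step_two) {
  const uint32_t num_groups = lwe_dimension / grouping_factor;
  for (uint32_t lwe_offset = 0; lwe_offset < num_groups;
       lwe_offset += lwe_chunk_size) {
    compute_keybundle(lwe_offset); // FFT of the BSK slice for this chunk
    const uint32_t chunk = std::min(lwe_chunk_size, num_groups - lwe_offset);
    for (uint32_t j = 0; j < chunk; j++) {
      const bool is_first = (j + lwe_offset) == 0;
      const bool is_last = (j + lwe_offset) + 1 == num_groups;
      step_one(is_first); // initialize the accumulator only on the first group
      step_two(is_last);  // produce the output only after the last group
    }
  }
}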

View File

@@ -98,9 +98,9 @@ uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64(
bool supports_cg =
supports_cooperative_groups_on_multibit_programmable_bootstrap<
__uint128_t>(glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count,
cuda_get_max_shared_memory(gpu_index));
uint64_t, __uint128_t>(glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count,
cuda_get_max_shared_memory(gpu_index));
if (supports_cg)
return scratch_cuda_cg_multi_bit_programmable_bootstrap_128<uint64_t>(

View File

@@ -35,12 +35,13 @@ namespace cg = cooperative_groups;
*
* Each y-block computes one element of the lwe_array_out.
*/
template <typename Torus, class params, sharedMemDegree SMD>
template <typename InputTorus, typename Torus, class params,
sharedMemDegree SMD>
__global__ void device_programmable_bootstrap_tbc(
Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
const Torus *__restrict__ lut_vector,
const Torus *__restrict__ lut_vector_indexes,
const Torus *__restrict__ lwe_array_in,
const InputTorus *__restrict__ lwe_array_in,
const Torus *__restrict__ lwe_input_indexes,
const double2 *__restrict__ bootstrapping_key, double2 *join_buffer,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
@@ -83,7 +84,7 @@ __global__ void device_programmable_bootstrap_tbc(
// The third dimension of the block is used to determine on which ciphertext
// this block is operating, in the case of batch bootstraps
const Torus *block_lwe_array_in =
const InputTorus *block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
const Torus *block_lut_vector =
@@ -99,8 +100,8 @@ __global__ void device_programmable_bootstrap_tbc(
// Put "b" in [0, 2N[
constexpr auto log_modulus = params::log2_degree + 1;
Torus b_hat = 0;
Torus correction = 0;
InputTorus b_hat = 0;
InputTorus correction = 0;
if (noise_reduction_type == PBS_MS_REDUCTION_T::CENTERED) {
correction = centered_binary_modulus_switch_body_correction_to_add(
block_lwe_array_in, lwe_dimension, log_modulus);
@@ -117,7 +118,7 @@ __global__ void device_programmable_bootstrap_tbc(
__syncthreads();
// Put "a" in [0, 2N[
Torus a_hat = 0;
InputTorus a_hat = 0;
modulus_switch(block_lwe_array_in[i], a_hat, log_modulus);
// Perform ACC * (X^{a_hat} - 1)
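
A scalar model of the rotation each block applies here: multiply the accumulator by (X^{a_hat} - 1) in Z[X]/(X^N + 1), with a_hat the modulus-switched mask coefficient. Illustrative only; the kernel does this batched, in registers, per polynomial chunk.

#include <cstdint>
#include <vector>
std::vector<int64_t> mul_by_x_pow_minus_one(const std::vector<int64_t> &acc,
                                            uint32_t a_hat) { // a_hat < 2N
  const size_t N = acc.size();
  std::vector<int64_t> out(N, 0);
  for (size_t i = 0; i < N; i++) {
    const size_t e = i + a_hat;                  // exponent before reduction
    const int64_t sign = ((e / N) % 2) ? -1 : 1; // X^N == -1 (negacyclic)
    out[e % N] += sign * acc[i];                 // X^{a_hat} * acc
    out[i] -= acc[i];                            // ... minus acc
  }
  return out;
}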
@@ -200,12 +201,13 @@ __global__ void device_programmable_bootstrap_tbc(
}
}
template <typename Torus, class params, sharedMemDegree SMD>
template <typename InputTorus, typename Torus, class params,
sharedMemDegree SMD>
__global__ void device_programmable_bootstrap_tbc_2_2_params(
Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
const Torus *__restrict__ lut_vector,
const Torus *__restrict__ lut_vector_indexes,
const Torus *__restrict__ lwe_array_in,
const InputTorus *__restrict__ lwe_array_in,
const Torus *__restrict__ lwe_input_indexes,
const double2 *__restrict__ bootstrapping_key, double2 *join_buffer,
uint32_t lwe_dimension, uint32_t num_many_lut, uint32_t lut_stride,
@@ -247,7 +249,7 @@ __global__ void device_programmable_bootstrap_tbc_2_2_params(
}
// The third dimension of the block is used to determine on which ciphertext
// this block is operating, in the case of batch bootstraps
const Torus *block_lwe_array_in =
const InputTorus *block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
const Torus *block_lut_vector =
@@ -263,8 +265,8 @@ __global__ void device_programmable_bootstrap_tbc_2_2_params(
// Put "b" in [0, 2N[
constexpr auto log_modulus = params::log2_degree + 1;
Torus b_hat = 0;
Torus correction = 0;
InputTorus b_hat = 0;
InputTorus correction = 0;
if (noise_reduction_type == PBS_MS_REDUCTION_T::CENTERED) {
correction = centered_binary_modulus_switch_body_correction_to_add(
block_lwe_array_in, lwe_dimension, log_modulus);
@@ -276,7 +278,7 @@ __global__ void device_programmable_bootstrap_tbc_2_2_params(
params::degree / params::opt>(
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
false);
Torus temp_a_hat = 0;
InputTorus temp_a_hat = 0;
for (int i = 0; i < lwe_dimension; i++) {
// We calculate the modulus switch of a warp size of elements
@@ -382,7 +384,7 @@ __global__ void device_programmable_bootstrap_tbc_2_2_params(
cluster.sync();
}
template <typename Torus, typename params>
template <typename InputTorus, typename Torus, typename params>
__host__ uint64_t scratch_programmable_bootstrap_tbc(
cudaStream_t stream, uint32_t gpu_index,
pbs_buffer<Torus, CLASSICAL> **buffer, uint32_t lwe_dimension,
@@ -410,27 +412,27 @@ __host__ uint64_t scratch_programmable_bootstrap_tbc(
if (max_shared_memory >= full_sm + minimum_sm_tbc) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_tbc<Torus, params, FULLSM>,
device_programmable_bootstrap_tbc<InputTorus, Torus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm + minimum_sm_tbc));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_tbc<Torus, params, FULLSM>,
device_programmable_bootstrap_tbc<InputTorus, Torus, params, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory >= partial_sm + minimum_sm_tbc) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>,
device_programmable_bootstrap_tbc<InputTorus, Torus, params, PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
partial_sm + minimum_sm_tbc));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>,
device_programmable_bootstrap_tbc<InputTorus, Torus, params, PARTIALSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_tbc<Torus, params, NOSM>,
device_programmable_bootstrap_tbc<InputTorus, Torus, params, NOSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, minimum_sm_tbc));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_tbc<Torus, params, NOSM>,
device_programmable_bootstrap_tbc<InputTorus, Torus, params, NOSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
@@ -446,11 +448,11 @@ __host__ uint64_t scratch_programmable_bootstrap_tbc(
/*
* Host wrapper
*/
template <typename Torus, class params>
template <typename InputTorus, typename Torus, class params>
__host__ void host_programmable_bootstrap_tbc(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
@@ -506,7 +508,8 @@ __host__ void host_programmable_bootstrap_tbc(
config.dynamicSmemBytes = minimum_sm_tbc;
check_cuda_error(cudaLaunchKernelEx(
&config, device_programmable_bootstrap_tbc<Torus, params, NOSM>,
&config,
device_programmable_bootstrap_tbc<InputTorus, Torus, params, NOSM>,
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft,
lwe_dimension, polynomial_size, base_log, level_count, d_mem, full_dm,
@@ -515,7 +518,8 @@ __host__ void host_programmable_bootstrap_tbc(
config.dynamicSmemBytes = partial_sm + minimum_sm_tbc;
check_cuda_error(cudaLaunchKernelEx(
&config, device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>,
&config,
device_programmable_bootstrap_tbc<InputTorus, Torus, params, PARTIALSM>,
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft,
lwe_dimension, polynomial_size, base_log, level_count, d_mem,
@@ -530,18 +534,22 @@ __host__ void host_programmable_bootstrap_tbc(
config.dynamicSmemBytes = full_sm_2_2;
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_tbc_2_2_params<Torus, params, FULLSM>,
device_programmable_bootstrap_tbc_2_2_params<InputTorus, Torus,
params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_2_2));
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_tbc_2_2_params<Torus, params, FULLSM>,
device_programmable_bootstrap_tbc_2_2_params<InputTorus, Torus,
params, FULLSM>,
cudaFuncAttributePreferredSharedMemoryCarveout,
cudaSharedmemCarveoutMaxShared));
check_cuda_error(cudaFuncSetCacheConfig(
device_programmable_bootstrap_tbc_2_2_params<Torus, params, FULLSM>,
device_programmable_bootstrap_tbc_2_2_params<InputTorus, Torus,
params, FULLSM>,
cudaFuncCachePreferShared));
check_cuda_error(cudaLaunchKernelEx(
&config,
device_programmable_bootstrap_tbc_2_2_params<Torus, params, FULLSM>,
device_programmable_bootstrap_tbc_2_2_params<InputTorus, Torus,
params, FULLSM>,
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft,
lwe_dimension, num_many_lut, lut_stride, noise_reduction_type));
@@ -549,7 +557,8 @@ __host__ void host_programmable_bootstrap_tbc(
config.dynamicSmemBytes = full_sm + minimum_sm_tbc;
check_cuda_error(cudaLaunchKernelEx(
&config, device_programmable_bootstrap_tbc<Torus, params, FULLSM>,
&config,
device_programmable_bootstrap_tbc<InputTorus, Torus, params, FULLSM>,
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft,
lwe_dimension, polynomial_size, base_log, level_count, d_mem, 0,
@@ -560,7 +569,7 @@ __host__ void host_programmable_bootstrap_tbc(
}
// Verify if the grid size satisfies the cooperative group constraints
template <typename Torus, class params>
template <typename InputTorus, typename Torus, class params>
__host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size(
int glwe_dimension, int level_count, int num_samples,
uint32_t max_shared_memory) {
@@ -585,18 +594,21 @@ __host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size(
if (max_shared_memory < partial_sm) {
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_programmable_bootstrap_tbc<Torus, params, NOSM>, thds,
0);
(void *)
device_programmable_bootstrap_tbc<InputTorus, Torus, params, NOSM>,
thds, 0);
} else if (max_shared_memory < full_sm) {
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>,
(void *)device_programmable_bootstrap_tbc<InputTorus, Torus, params,
PARTIALSM>,
thds, partial_sm);
} else {
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_programmable_bootstrap_tbc<Torus, params, FULLSM>, thds,
full_sm);
(void *)device_programmable_bootstrap_tbc<InputTorus, Torus, params,
FULLSM>,
thds, full_sm);
}
// Get the number of streaming multiprocessors
@@ -621,7 +633,7 @@ bool supports_distributed_shared_memory_on_classic_programmable_bootstrap(
}
}
template <typename Torus, class params>
template <typename InputTorus, typename Torus, class params>
__host__ bool supports_thread_block_clusters_on_classic_programmable_bootstrap(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_shared_memory) {
@@ -661,34 +673,39 @@ __host__ bool supports_thread_block_clusters_on_classic_programmable_bootstrap(
* disable cudaFuncAttributeNonPortableClusterSizeAllowed */
if (max_shared_memory < partial_sm + minimum_sm_tbc) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_tbc<Torus, params, NOSM>,
cudaFuncAttributeNonPortableClusterSizeAllowed, false));
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
&cluster_size, device_programmable_bootstrap_tbc<Torus, params, NOSM>,
&config));
} else if (max_shared_memory < full_sm + minimum_sm_tbc) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>,
device_programmable_bootstrap_tbc<InputTorus, Torus, params, NOSM>,
cudaFuncAttributeNonPortableClusterSizeAllowed, false));
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
&cluster_size,
device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>, &config));
device_programmable_bootstrap_tbc<InputTorus, Torus, params, NOSM>,
&config));
} else if (max_shared_memory < full_sm + minimum_sm_tbc) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_tbc<InputTorus, Torus, params, PARTIALSM>,
cudaFuncAttributeNonPortableClusterSizeAllowed, false));
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
&cluster_size,
device_programmable_bootstrap_tbc<InputTorus, Torus, params, PARTIALSM>,
&config));
} else {
if (polynomial_size == 2048 && level_count == 1 && glwe_dimension == 1) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_tbc_2_2_params<Torus, params, FULLSM>,
device_programmable_bootstrap_tbc_2_2_params<InputTorus, Torus,
params, FULLSM>,
cudaFuncAttributeNonPortableClusterSizeAllowed, false));
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
&cluster_size,
device_programmable_bootstrap_tbc_2_2_params<Torus, params, FULLSM>,
device_programmable_bootstrap_tbc_2_2_params<InputTorus, Torus,
params, FULLSM>,
&config));
} else {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_tbc<Torus, params, FULLSM>,
device_programmable_bootstrap_tbc<InputTorus, Torus, params, FULLSM>,
cudaFuncAttributeNonPortableClusterSizeAllowed, false));
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
&cluster_size,
device_programmable_bootstrap_tbc<Torus, params, FULLSM>, &config));
device_programmable_bootstrap_tbc<InputTorus, Torus, params, FULLSM>,
&config));
}
}

View File

@@ -18,13 +18,14 @@
#include "types/complex/operations.cuh"
#include <vector>
template <typename Torus, class params, sharedMemDegree SMD>
template <typename InputTorus, typename Torus, class params,
sharedMemDegree SMD>
__global__ void __launch_bounds__(params::degree / params::opt)
device_multi_bit_programmable_bootstrap_tbc_accumulate(
Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
const Torus *__restrict__ lut_vector,
const Torus *__restrict__ lut_vector_indexes,
const Torus *__restrict__ lwe_array_in,
const InputTorus *__restrict__ lwe_array_in,
const Torus *__restrict__ lwe_input_indexes,
const double2 *__restrict__ keybundle_array, double2 *join_buffer,
Torus *global_accumulator, uint32_t lwe_dimension,
@@ -67,7 +68,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
// The first dimension of the block is used to determine on which ciphertext
// this block is operating, in the case of batch bootstraps
const Torus *block_lwe_array_in =
const InputTorus *block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
const Torus *block_lut_vector =
@@ -87,7 +88,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
if (lwe_offset == 0) {
// Put "b" in [0, 2N[
Torus b_hat = 0;
InputTorus b_hat = 0;
modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
params::log2_degree + 1);
@@ -197,13 +198,14 @@ __global__ void __launch_bounds__(params::degree / params::opt)
//- Use a register-based FFT that needs only minimal synchronization
//- Register-based Fourier-domain multiplication; transfer FFTs between blocks
//  instead of the accumulator.
template <typename Torus, class params, sharedMemDegree SMD>
template <typename InputTorus, typename Torus, class params,
sharedMemDegree SMD>
__global__ void __launch_bounds__(params::degree / params::opt)
device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params(
Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
const Torus *__restrict__ lut_vector,
const Torus *__restrict__ lut_vector_indexes,
const Torus *__restrict__ lwe_array_in,
const InputTorus *__restrict__ lwe_array_in,
const Torus *__restrict__ lwe_input_indexes,
const double2 *__restrict__ keybundle_array, double2 *join_buffer,
Torus *global_accumulator, uint32_t lwe_dimension, uint32_t lwe_offset,
@@ -247,7 +249,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
// The first dimension of the block is used to determine on which ciphertext
// this block is operating, in the case of batch bootstraps
const Torus *block_lwe_array_in =
const InputTorus *block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
const Torus *block_lut_vector =
@@ -266,7 +268,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
Torus reg_acc_rotated[params::opt];
if (lwe_offset == 0) {
// Put "b" in [0, 2N[
Torus b_hat = 0;
InputTorus b_hat = 0;
modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
params::log2_degree + 1);
@@ -400,7 +402,7 @@ uint64_t get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
return sizeof(Torus) * polynomial_size * 2; // accumulator
}
template <typename Torus, typename params>
template <typename InputTorus, typename Torus, typename params>
__host__ uint64_t scratch_tbc_multi_bit_programmable_bootstrap(
cudaStream_t stream, uint32_t gpu_index,
pbs_buffer<uint64_t, MULTI_BIT> **buffer, uint32_t glwe_dimension,
@@ -430,20 +432,22 @@ __host__ uint64_t scratch_tbc_multi_bit_programmable_bootstrap(
if (max_shared_memory < full_sm_keybundle) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
params, NOSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
params, NOSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
FULLSM>,
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
FULLSM>,
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
params, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
@@ -451,58 +455,58 @@ __host__ uint64_t scratch_tbc_multi_bit_programmable_bootstrap(
if (max_shared_memory <
partial_sm_tbc_accumulate + minimum_sm_tbc_accumulate) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
NOSM>,
device_multi_bit_programmable_bootstrap_tbc_accumulate<
InputTorus, Torus, params, NOSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
minimum_sm_tbc_accumulate));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
NOSM>,
device_multi_bit_programmable_bootstrap_tbc_accumulate<
InputTorus, Torus, params, NOSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory <
full_sm_tbc_accumulate + minimum_sm_tbc_accumulate) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
PARTIALSM>,
device_multi_bit_programmable_bootstrap_tbc_accumulate<
InputTorus, Torus, params, PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
partial_sm_tbc_accumulate + minimum_sm_tbc_accumulate));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
PARTIALSM>,
device_multi_bit_programmable_bootstrap_tbc_accumulate<
InputTorus, Torus, params, PARTIALSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else {
if (polynomial_size == 2048 && level_count == 1 && glwe_dimension == 1) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params<
Torus, params, FULLSM>,
InputTorus, Torus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
full_sm_tbc_accumulate + 2 * minimum_sm_tbc_accumulate));
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params<
Torus, params, FULLSM>,
InputTorus, Torus, params, FULLSM>,
cudaFuncAttributePreferredSharedMemoryCarveout,
cudaSharedmemCarveoutMaxShared));
check_cuda_error(cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params<
Torus, params, FULLSM>,
InputTorus, Torus, params, FULLSM>,
cudaFuncCachePreferShared));
} else {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
FULLSM>,
device_multi_bit_programmable_bootstrap_tbc_accumulate<
InputTorus, Torus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
full_sm_tbc_accumulate + minimum_sm_tbc_accumulate));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
FULLSM>,
device_multi_bit_programmable_bootstrap_tbc_accumulate<
InputTorus, Torus, params, FULLSM>,
cudaFuncCachePreferShared);
}
check_cuda_error(cudaGetLastError());
}
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
auto lwe_chunk_size = get_lwe_chunk_size<InputTorus, Torus, params>(
gpu_index, input_lwe_ciphertext_count, polynomial_size, glwe_dimension,
level_count, full_sm_keybundle);
uint64_t size_tracker = 0;
@@ -513,10 +517,10 @@ __host__ uint64_t scratch_tbc_multi_bit_programmable_bootstrap(
return size_tracker;
}
template <typename Torus, class params>
template <typename InputTorus, typename Torus, class params>
__host__ void execute_tbc_external_product_loop(
cudaStream_t stream, uint32_t gpu_index, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus *lwe_array_out,
Torus const *lwe_output_indexes, pbs_buffer<Torus, MULTI_BIT> *buffer,
uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension,
@@ -579,8 +583,8 @@ __host__ void execute_tbc_external_product_loop(
config.dynamicSmemBytes = minimum_dm;
check_cuda_error(cudaLaunchKernelEx(
&config,
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
NOSM>,
device_multi_bit_programmable_bootstrap_tbc_accumulate<
InputTorus, Torus, params, NOSM>,
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft,
global_accumulator, lwe_dimension, glwe_dimension, polynomial_size,
@@ -591,8 +595,8 @@ __host__ void execute_tbc_external_product_loop(
config.dynamicSmemBytes = partial_dm + minimum_dm;
check_cuda_error(cudaLaunchKernelEx(
&config,
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
PARTIALSM>,
device_multi_bit_programmable_bootstrap_tbc_accumulate<
InputTorus, Torus, params, PARTIALSM>,
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft,
global_accumulator, lwe_dimension, glwe_dimension, polynomial_size,
@@ -607,22 +611,22 @@ __host__ void execute_tbc_external_product_loop(
config.dynamicSmemBytes = full_dm + 2 * minimum_dm;
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params<
Torus, params, FULLSM>,
InputTorus, Torus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
full_dm + 2 * minimum_dm));
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params<
Torus, params, FULLSM>,
InputTorus, Torus, params, FULLSM>,
cudaFuncAttributePreferredSharedMemoryCarveout,
cudaSharedmemCarveoutMaxShared));
check_cuda_error(cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params<
Torus, params, FULLSM>,
InputTorus, Torus, params, FULLSM>,
cudaFuncCachePreferShared));
check_cuda_error(cudaLaunchKernelEx(
&config,
device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params<
Torus, params, FULLSM>,
InputTorus, Torus, params, FULLSM>,
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft,
global_accumulator, lwe_dimension, lwe_offset, chunk_size,
@@ -630,8 +634,8 @@ __host__ void execute_tbc_external_product_loop(
} else {
check_cuda_error(cudaLaunchKernelEx(
&config,
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
FULLSM>,
device_multi_bit_programmable_bootstrap_tbc_accumulate<
InputTorus, Torus, params, FULLSM>,
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft,
global_accumulator, lwe_dimension, glwe_dimension, polynomial_size,
@@ -642,11 +646,11 @@ __host__ void execute_tbc_external_product_loop(
}
}
template <typename Torus, class params>
template <typename InputTorus, typename Torus, class params>
__host__ void host_tbc_multi_bit_programmable_bootstrap(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
@@ -659,13 +663,13 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap(
lwe_offset += lwe_chunk_size) {
// Compute a keybundle
execute_compute_keybundle<Torus, params>(
execute_compute_keybundle<InputTorus, Torus, params>(
stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, level_count, lwe_offset);
// Accumulate
execute_tbc_external_product_loop<Torus, params>(
execute_tbc_external_product_loop<InputTorus, Torus, params>(
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
num_samples, lwe_dimension, glwe_dimension, polynomial_size,
@@ -690,7 +694,7 @@ bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
}
}
template <typename Torus, class params>
template <typename InputTorus, typename Torus, class params>
__host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_shared_memory) {
@@ -732,45 +736,45 @@ __host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap(
if (max_shared_memory <
partial_sm_tbc_accumulate + minimum_sm_tbc_accumulate) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
NOSM>,
device_multi_bit_programmable_bootstrap_tbc_accumulate<
InputTorus, Torus, params, NOSM>,
cudaFuncAttributeNonPortableClusterSizeAllowed, false));
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
&cluster_size,
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
NOSM>,
device_multi_bit_programmable_bootstrap_tbc_accumulate<
InputTorus, Torus, params, NOSM>,
&config));
} else if (max_shared_memory <
full_sm_tbc_accumulate + minimum_sm_tbc_accumulate) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
PARTIALSM>,
device_multi_bit_programmable_bootstrap_tbc_accumulate<
InputTorus, Torus, params, PARTIALSM>,
cudaFuncAttributeNonPortableClusterSizeAllowed, false));
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
&cluster_size,
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
PARTIALSM>,
device_multi_bit_programmable_bootstrap_tbc_accumulate<
InputTorus, Torus, params, PARTIALSM>,
&config));
} else {
if (polynomial_size == 2048 && level_count == 1 && glwe_dimension == 1) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params<
Torus, params, FULLSM>,
InputTorus, Torus, params, FULLSM>,
cudaFuncAttributeNonPortableClusterSizeAllowed, false));
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
&cluster_size,
device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params<
Torus, params, FULLSM>,
InputTorus, Torus, params, FULLSM>,
&config));
} else {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
FULLSM>,
device_multi_bit_programmable_bootstrap_tbc_accumulate<
InputTorus, Torus, params, FULLSM>,
cudaFuncAttributeNonPortableClusterSizeAllowed, false));
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
&cluster_size,
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
FULLSM>,
device_multi_bit_programmable_bootstrap_tbc_accumulate<
InputTorus, Torus, params, FULLSM>,
&config));
}
}

View File

@@ -166,14 +166,15 @@ public:
#if CUDA_ARCH >= 900
BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, TbcMultiBit)
(benchmark::State &st) {
if (!has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
if (!has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t,
uint64_t>(
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
pbs_level, cuda_get_max_shared_memory(0))) {
st.SkipWithError("Configuration not supported for tbc operation");
return;
}
scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t>(
scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t, uint64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)&buffer,
glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count,
true);
@@ -181,7 +182,8 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, TbcMultiBit)
uint32_t lut_stride = 0;
for (auto _ : st) {
// Execute PBS
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t,
uint64_t>(
stream, gpu_index, d_lwe_ct_out_array, d_lwe_output_indexes,
d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array,
d_lwe_input_indexes, d_bsk, (pbs_buffer<uint64_t, MULTI_BIT> *)buffer,
@@ -197,7 +199,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, TbcMultiBit)
BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, CgMultiBit)
(benchmark::State &st) {
if (!has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
if (!has_support_to_cuda_programmable_bootstrap_cg_multi_bit_64(
glwe_dimension, polynomial_size, pbs_level,
input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index))) {
st.SkipWithError("Configuration not supported for fast operation");
@@ -254,7 +256,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, DefaultMultiBit)
#if CUDA_ARCH >= 900
BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, TbcPBC)
(benchmark::State &st) {
if (!has_support_to_cuda_programmable_bootstrap_tbc<uint64_t>(
if (!has_support_to_cuda_programmable_bootstrap_tbc<uint64_t, uint64_t>(
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
pbs_level, cuda_get_max_shared_memory(0))) {
st.SkipWithError("Configuration not supported for tbc operation");
@@ -269,7 +271,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, TbcPBC)
uint32_t lut_stride = 0;
for (auto _ : st) {
// Execute PBS
cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t, uint64_t>(
stream, gpu_index, (uint64_t *)d_lwe_ct_out_array,
(uint64_t *)d_lwe_output_indexes, (uint64_t *)d_lut_pbs_identity,
(uint64_t *)d_lut_pbs_indexes, (uint64_t *)d_lwe_ct_in_array,
@@ -301,7 +303,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, CgPBS)
uint32_t lut_stride = 0;
for (auto _ : st) {
// Execute PBS
cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t, uint64_t>(
stream, gpu_index, (uint64_t *)d_lwe_ct_out_array,
(uint64_t *)d_lwe_output_indexes, (uint64_t *)d_lut_pbs_identity,
(uint64_t *)d_lut_pbs_indexes, (uint64_t *)d_lwe_ct_in_array,

View File

@@ -187,7 +187,7 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, bootstrap) {
(ptrdiff_t)((r * samples * number_of_inputs + s * number_of_inputs) *
(lwe_dimension + 1));
// Execute PBS
cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
cuda_programmable_bootstrap_lwe_ciphertext_vector_64_64(
stream, gpu_index, (void *)d_lwe_ct_out_array,
(void *)d_lwe_output_indexes, (void *)d_lut_pbs_identity,
(void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in,

View File

@@ -3020,6 +3020,20 @@ unsafe extern "C" {
noise_reduction_type: PBS_MS_REDUCTION_T,
) -> u64;
}
unsafe extern "C" {
pub fn scratch_cuda_programmable_bootstrap_32_64(
stream: *mut ffi::c_void,
gpu_index: u32,
buffer: *mut *mut i8,
lwe_dimension: u32,
glwe_dimension: u32,
polynomial_size: u32,
level_count: u32,
input_lwe_ciphertext_count: u32,
allocate_gpu_memory: bool,
noise_reduction_type: PBS_MS_REDUCTION_T,
) -> u64;
}
unsafe extern "C" {
pub fn scratch_cuda_programmable_bootstrap_128(
stream: *mut ffi::c_void,
@@ -3035,7 +3049,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
pub fn cuda_programmable_bootstrap_lwe_ciphertext_vector_64_64(
stream: *mut ffi::c_void,
gpu_index: u32,
lwe_array_out: *mut ffi::c_void,
@@ -3057,7 +3071,7 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
pub fn cuda_programmable_bootstrap_lwe_ciphertext_vector_32_64(
stream: *mut ffi::c_void,
gpu_index: u32,
lwe_array_out: *mut ffi::c_void,
@@ -3110,6 +3124,7 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
#[link_name = "\u{1}_Z55has_support_to_cuda_programmable_bootstrap_cg_multi_bitjjjjj"]
pub fn has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
glwe_dimension: u32,
polynomial_size: u32,
@@ -3144,6 +3159,18 @@ unsafe extern "C" {
grouping_factor: u32,
);
}
unsafe extern "C" {
pub fn scratch_cuda_multi_bit_programmable_bootstrap_32_64(
stream: *mut ffi::c_void,
gpu_index: u32,
pbs_buffer: *mut *mut i8,
glwe_dimension: u32,
polynomial_size: u32,
level_count: u32,
input_lwe_ciphertext_count: u32,
allocate_gpu_memory: bool,
) -> u64;
}
unsafe extern "C" {
pub fn scratch_cuda_multi_bit_programmable_bootstrap_64(
stream: *mut ffi::c_void,
@@ -3179,6 +3206,29 @@ unsafe extern "C" {
lut_stride: u32,
);
}
unsafe extern "C" {
pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_32_64(
stream: *mut ffi::c_void,
gpu_index: u32,
lwe_array_out: *mut ffi::c_void,
lwe_output_indexes: *const ffi::c_void,
lut_vector: *const ffi::c_void,
lut_vector_indexes: *const ffi::c_void,
lwe_array_in: *const ffi::c_void,
lwe_input_indexes: *const ffi::c_void,
bootstrapping_key: *const ffi::c_void,
buffer: *mut i8,
lwe_dimension: u32,
glwe_dimension: u32,
polynomial_size: u32,
grouping_factor: u32,
base_log: u32,
level_count: u32,
num_samples: u32,
num_many_lut: u32,
lut_stride: u32,
);
}
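// Note on the suffix convention (an inference from the `<InputTorus, Torus>`
// C++ templates, not stated in the bindings themselves): the first number is
// the keyswitch/input torus width and the second the PBS/output torus width,
// so `_32_64` consumes 32-bit input LWEs and produces 64-bit outputs, while
// `_64_64` is the legacy all-64-bit path. A minimal dispatch sketch over the
// raw bindings (the wrapper name is hypothetical):
unsafe fn scratch_pbs_for_input_bits(
    input_bits: u32,
    stream: *mut ffi::c_void,
    gpu_index: u32,
    buffer: *mut *mut i8,
    lwe_dimension: u32,
    glwe_dimension: u32,
    polynomial_size: u32,
    level_count: u32,
    input_lwe_ciphertext_count: u32,
    noise_reduction_type: PBS_MS_REDUCTION_T,
) -> u64 {
    match input_bits {
        // 32-bit inputs: use the new keyswitch-32 scratch entry point.
        32 => scratch_cuda_programmable_bootstrap_32_64(
            stream, gpu_index, buffer, lwe_dimension, glwe_dimension,
            polynomial_size, level_count, input_lwe_ciphertext_count,
            true, noise_reduction_type,
        ),
        // 64-bit inputs: keep the existing 64-bit scratch entry point.
        64 => scratch_cuda_programmable_bootstrap_64(
            stream, gpu_index, buffer, lwe_dimension, glwe_dimension,
            polynomial_size, level_count, input_lwe_ciphertext_count,
            true, noise_reduction_type,
        ),
        bits => panic!("Unsupported PBS input bitwidth {bits}"),
    }
}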
unsafe extern "C" {
pub fn cleanup_cuda_multi_bit_programmable_bootstrap(
stream: *mut ffi::c_void,

View File

@@ -415,7 +415,7 @@ mod cuda {
let big_lwe_sk_64 = glwe_sk_64.into_lwe_secret_key();
let ciphertext_modulus_out = CiphertextModulus::<KeyswitchScalar>::new_native();
let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new()
let cpu_keys: CpuKeys<Scalar, KeyswitchScalar> = CpuKeysBuilder::new()
.keyswitch_key(ksk_big_to_small)
.build();
@@ -667,7 +667,8 @@ mod cuda {
&mut encryption_generator,
);
let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new().packing_keyswitch_key(pksk).build();
let cpu_keys: CpuKeys<Scalar, Scalar> =
CpuKeysBuilder::new().packing_keyswitch_key(pksk).build();
let bench_id;
match get_bench_type() {

View File

@@ -511,12 +511,19 @@ mod cuda {
};
use tfhe::core_crypto::prelude::*;
fn cuda_ks_pbs<Scalar: UnsignedTorus + CastInto<usize> + CastFrom<u64> + Serialize>(
fn cuda_ks_pbs<
Scalar: UnsignedTorus + CastInto<usize> + CastFrom<u64> + Serialize,
KeyswitchScalar: UnsignedTorus + CastFrom<Scalar> + CastInto<usize>,
>(
c: &mut Criterion,
parameters: &[(String, CryptoParametersRecord<Scalar>)],
) {
let bench_name = "core_crypto::cuda::ks_pbs";
let mut bench_group = c.benchmark_group(bench_name);
let bench_name = format!(
"core_crypto::cuda::ks_pbs::ks{}::pbs{}",
KeyswitchScalar::BITS,
Scalar::BITS
);
let mut bench_group = c.benchmark_group(bench_name.clone());
bench_group
.sample_size(10)
.measurement_time(std::time::Duration::from_secs(30));
@@ -535,29 +542,57 @@ mod cuda {
continue;
}
// Create the LweSecretKey
let input_lwe_secret_key = allocate_and_generate_new_binary_lwe_secret_key(
params.lwe_dimension.unwrap(),
&mut secret_generator,
);
let output_glwe_secret_key: GlweSecretKeyOwned<Scalar> =
let lwe_noise_distribution_ksk = match params.lwe_noise_distribution.unwrap() {
DynamicDistribution::Gaussian(gaussian_lwe_noise_distribution) => {
DynamicDistribution::<KeyswitchScalar>::new_gaussian(
gaussian_lwe_noise_distribution.standard_dev(),
)
}
DynamicDistribution::TUniform(uniform_lwe_noise_distribution) => {
DynamicDistribution::<KeyswitchScalar>::new_t_uniform(
match KeyswitchScalar::BITS {
32 => uniform_lwe_noise_distribution.bound_log2() - 32,
64 => uniform_lwe_noise_distribution.bound_log2(),
_ => panic!("Unsupported Keyswitch scalar input dtype"),
},
)
}
};
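// Why `bound_log2() - 32` above: the TUniform bound is absolute, while the
// meaningful quantity is the noise relative to the torus, 2^(bound_log2 - BITS).
// Shrinking the torus from 64 to 32 bits therefore shifts the log2 bound down
// by 32. Worked example with a hypothetical 64-bit bound_log2 of 46:
//     2^46 / 2^64 = 2^-18 = 2^14 / 2^32,  and 14 = 46 - 32.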
let lwe_sk: LweSecretKeyOwned<KeyswitchScalar> =
allocate_and_generate_new_binary_lwe_secret_key(
params.lwe_dimension.unwrap(),
&mut secret_generator,
);
let glwe_sk: GlweSecretKeyOwned<KeyswitchScalar> =
allocate_and_generate_new_binary_glwe_secret_key(
params.glwe_dimension.unwrap(),
params.polynomial_size.unwrap(),
&mut secret_generator,
);
let output_lwe_secret_key = output_glwe_secret_key.into_lwe_secret_key();
let big_lwe_sk = glwe_sk.into_lwe_secret_key();
let ksk_big_to_small = allocate_and_generate_new_lwe_keyswitch_key(
&output_lwe_secret_key,
&input_lwe_secret_key,
&big_lwe_sk,
&lwe_sk,
params.ks_base_log.unwrap(),
params.ks_level.unwrap(),
params.lwe_noise_distribution.unwrap(),
lwe_noise_distribution_ksk,
CiphertextModulus::new_native(),
&mut encryption_generator,
);
let glwe_sk_64: GlweSecretKeyOwned<Scalar> =
allocate_and_generate_new_binary_glwe_secret_key(
params.glwe_dimension.unwrap(),
params.polynomial_size.unwrap(),
&mut secret_generator,
);
let big_lwe_sk_64 = glwe_sk_64.into_lwe_secret_key();
let ciphertext_modulus_out = CiphertextModulus::<KeyswitchScalar>::new_native();
let bsk = LweBootstrapKey::new(
Scalar::ZERO,
params.glwe_dimension.unwrap().to_glwe_size(),
@@ -568,7 +603,7 @@ mod cuda {
params.ciphertext_modulus.unwrap(),
);
let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new()
let cpu_keys: CpuKeys<_, _> = CpuKeysBuilder::new()
.keyswitch_key(ksk_big_to_small)
.bootstrap_key(bsk)
.build();
@@ -582,7 +617,7 @@ mod cuda {
// Allocate a new LweCiphertext and encrypt our plaintext
let input_ks_ct = allocate_and_encrypt_new_lwe_ciphertext(
&output_lwe_secret_key,
&big_lwe_sk_64,
Plaintext(Scalar::ZERO),
params.lwe_noise_distribution.unwrap(),
params.ciphertext_modulus.unwrap(),
@@ -591,10 +626,10 @@ mod cuda {
let input_ks_ct_gpu =
CudaLweCiphertextList::from_lwe_ciphertext(&input_ks_ct, &streams);
let output_ks_ct: LweCiphertextOwned<Scalar> = LweCiphertext::new(
Scalar::ZERO,
input_lwe_secret_key.lwe_dimension().to_lwe_size(),
params.ciphertext_modulus.unwrap(),
let output_ks_ct: LweCiphertextOwned<KeyswitchScalar> = LweCiphertext::new(
KeyswitchScalar::ZERO,
lwe_sk.lwe_dimension().to_lwe_size(),
ciphertext_modulus_out,
);
let mut output_ks_ct_gpu =
CudaLweCiphertextList::from_lwe_ciphertext(&output_ks_ct, &streams);
@@ -611,7 +646,7 @@ mod cuda {
// Allocate the LweCiphertext to store the result of the PBS
let output_pbs_ct = LweCiphertext::new(
Scalar::ZERO,
output_lwe_secret_key.lwe_dimension().to_lwe_size(),
big_lwe_sk_64.lwe_dimension().to_lwe_size(),
params.ciphertext_modulus.unwrap(),
);
let mut output_pbs_ct_gpu =
@@ -671,12 +706,12 @@ mod cuda {
.map(|i| {
let mut input_ks_list = LweCiphertextList::new(
Scalar::ZERO,
output_lwe_secret_key.lwe_dimension().to_lwe_size(),
big_lwe_sk_64.lwe_dimension().to_lwe_size(),
LweCiphertextCount(elements_per_stream),
params.ciphertext_modulus.unwrap(),
);
encrypt_lwe_ciphertext_list(
&output_lwe_secret_key,
&big_lwe_sk_64,
&mut input_ks_list,
&plaintext_list,
params.lwe_noise_distribution.unwrap(),
@@ -692,10 +727,10 @@ mod cuda {
let output_ks_cts = (0..gpu_count)
.map(|i| {
let output_ks_list = LweCiphertextList::new(
Scalar::ZERO,
input_lwe_secret_key.lwe_dimension().to_lwe_size(),
KeyswitchScalar::ZERO,
lwe_sk.lwe_dimension().to_lwe_size(),
LweCiphertextCount(elements_per_stream),
params.ciphertext_modulus.unwrap(),
ciphertext_modulus_out,
);
CudaLweCiphertextList::from_lwe_ciphertext_list(
&output_ks_list,
@@ -724,7 +759,7 @@ mod cuda {
.map(|i| {
let output_pbs_ct = LweCiphertextList::new(
Scalar::ZERO,
output_lwe_secret_key.lwe_dimension().to_lwe_size(),
big_lwe_sk_64.lwe_dimension().to_lwe_size(),
LweCiphertextCount(elements_per_stream),
params.ciphertext_modulus.unwrap(),
);
@@ -822,12 +857,18 @@ mod cuda {
fn cuda_multi_bit_ks_pbs<
Scalar: UnsignedTorus + CastInto<usize> + CastFrom<u64> + Default + Serialize + Sync,
KeyswitchScalar: UnsignedTorus + CastFrom<Scalar> + CastInto<usize>,
>(
c: &mut Criterion,
parameters: &[(String, CryptoParametersRecord<Scalar>, LweBskGroupingFactor)],
) {
let bench_name = "core_crypto::cuda::multi_bit_ks_pbs";
let mut bench_group = c.benchmark_group(bench_name);
let bench_name = format!(
"core_crypto::cuda::multi_bit_ks_pbs::ks_pbs::ks{}::pbs{}",
KeyswitchScalar::BITS,
Scalar::BITS
);
let mut bench_group = c.benchmark_group(bench_name.clone());
bench_group
.sample_size(10)
.measurement_time(std::time::Duration::from_secs(30));
@@ -846,29 +887,57 @@ mod cuda {
continue;
}
// Create the LweSecretKey
let input_lwe_secret_key = allocate_and_generate_new_binary_lwe_secret_key(
params.lwe_dimension.unwrap(),
&mut secret_generator,
);
let output_glwe_secret_key: GlweSecretKeyOwned<Scalar> =
let lwe_noise_distribution_ksk = match params.lwe_noise_distribution.unwrap() {
DynamicDistribution::Gaussian(gaussian_lwe_noise_distribution) => {
DynamicDistribution::<KeyswitchScalar>::new_gaussian(
gaussian_lwe_noise_distribution.standard_dev(),
)
}
DynamicDistribution::TUniform(uniform_lwe_noise_distribution) => {
DynamicDistribution::<KeyswitchScalar>::new_t_uniform(
match KeyswitchScalar::BITS {
32 => uniform_lwe_noise_distribution.bound_log2() - 32,
64 => uniform_lwe_noise_distribution.bound_log2(),
_ => panic!("Unsupported Keyswitch scalar input dtype"),
},
)
}
};
let lwe_sk: LweSecretKeyOwned<KeyswitchScalar> =
allocate_and_generate_new_binary_lwe_secret_key(
params.lwe_dimension.unwrap(),
&mut secret_generator,
);
let glwe_sk: GlweSecretKeyOwned<KeyswitchScalar> =
allocate_and_generate_new_binary_glwe_secret_key(
params.glwe_dimension.unwrap(),
params.polynomial_size.unwrap(),
&mut secret_generator,
);
let output_lwe_secret_key = output_glwe_secret_key.into_lwe_secret_key();
let big_lwe_sk = glwe_sk.into_lwe_secret_key();
let ksk_big_to_small = allocate_and_generate_new_lwe_keyswitch_key(
&output_lwe_secret_key,
&input_lwe_secret_key,
&big_lwe_sk,
&lwe_sk,
params.ks_base_log.unwrap(),
params.ks_level.unwrap(),
params.lwe_noise_distribution.unwrap(),
lwe_noise_distribution_ksk,
CiphertextModulus::new_native(),
&mut encryption_generator,
);
let glwe_sk_64: GlweSecretKeyOwned<Scalar> =
allocate_and_generate_new_binary_glwe_secret_key(
params.glwe_dimension.unwrap(),
params.polynomial_size.unwrap(),
&mut secret_generator,
);
let big_lwe_sk_64 = glwe_sk_64.into_lwe_secret_key();
let ciphertext_modulus_out = CiphertextModulus::<KeyswitchScalar>::new_native();
let multi_bit_bsk = LweMultiBitBootstrapKey::new(
Scalar::ZERO,
params.glwe_dimension.unwrap().to_glwe_size(),
@@ -880,7 +949,7 @@ mod cuda {
params.ciphertext_modulus.unwrap(),
);
let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new()
let cpu_keys: CpuKeys<_, _> = CpuKeysBuilder::new()
.keyswitch_key(ksk_big_to_small)
.multi_bit_bootstrap_key(multi_bit_bsk)
.build();
@@ -894,7 +963,7 @@ mod cuda {
// Allocate a new LweCiphertext and encrypt our plaintext
let input_ks_ct = allocate_and_encrypt_new_lwe_ciphertext(
&output_lwe_secret_key,
&big_lwe_sk_64,
Plaintext(Scalar::ZERO),
params.lwe_noise_distribution.unwrap(),
params.ciphertext_modulus.unwrap(),
@@ -903,10 +972,10 @@ mod cuda {
let input_ks_ct_gpu =
CudaLweCiphertextList::from_lwe_ciphertext(&input_ks_ct, &streams);
let output_ks_ct: LweCiphertextOwned<Scalar> = LweCiphertext::new(
Scalar::ZERO,
input_lwe_secret_key.lwe_dimension().to_lwe_size(),
params.ciphertext_modulus.unwrap(),
let output_ks_ct: LweCiphertextOwned<KeyswitchScalar> = LweCiphertext::new(
KeyswitchScalar::ZERO,
lwe_sk.lwe_dimension().to_lwe_size(),
ciphertext_modulus_out,
);
let mut output_ks_ct_gpu =
CudaLweCiphertextList::from_lwe_ciphertext(&output_ks_ct, &streams);
@@ -923,7 +992,7 @@ mod cuda {
// Allocate the LweCiphertext to store the result of the PBS
let output_pbs_ct = LweCiphertext::new(
Scalar::ZERO,
output_lwe_secret_key.lwe_dimension().to_lwe_size(),
big_lwe_sk_64.lwe_dimension().to_lwe_size(),
params.ciphertext_modulus.unwrap(),
);
let mut output_pbs_ct_gpu =
@@ -981,12 +1050,12 @@ mod cuda {
.map(|i| {
let mut input_ks_list = LweCiphertextList::new(
Scalar::ZERO,
output_lwe_secret_key.lwe_dimension().to_lwe_size(),
big_lwe_sk_64.lwe_dimension().to_lwe_size(),
LweCiphertextCount(elements_per_stream),
params.ciphertext_modulus.unwrap(),
);
encrypt_lwe_ciphertext_list(
&output_lwe_secret_key,
&big_lwe_sk_64,
&mut input_ks_list,
&plaintext_list,
params.lwe_noise_distribution.unwrap(),
@@ -1002,10 +1071,10 @@ mod cuda {
let output_ks_cts = (0..gpu_count)
.map(|i| {
let output_ks_list = LweCiphertextList::new(
Scalar::ZERO,
input_lwe_secret_key.lwe_dimension().to_lwe_size(),
KeyswitchScalar::ZERO,
lwe_sk.lwe_dimension().to_lwe_size(),
LweCiphertextCount(elements_per_stream),
params.ciphertext_modulus.unwrap(),
ciphertext_modulus_out,
);
CudaLweCiphertextList::from_lwe_ciphertext_list(
&output_ks_list,
@@ -1034,7 +1103,7 @@ mod cuda {
.map(|i| {
let output_pbs_ct = LweCiphertextList::new(
Scalar::ZERO,
output_lwe_secret_key.lwe_dimension().to_lwe_size(),
big_lwe_sk_64.lwe_dimension().to_lwe_size(),
LweCiphertextCount(elements_per_stream),
params.ciphertext_modulus.unwrap(),
);
@@ -1132,12 +1201,17 @@ mod cuda {
pub fn cuda_ks_pbs_group() {
let mut criterion: Criterion<_> = (Criterion::default()).configure_from_args();
cuda_ks_pbs(&mut criterion, &benchmark_parameters());
cuda_ks_pbs::<u64, u32>(&mut criterion, &benchmark_parameters());
cuda_ks_pbs::<u64, u64>(&mut criterion, &benchmark_parameters());
}
pub fn cuda_multi_bit_ks_pbs_group() {
let mut criterion: Criterion<_> = (Criterion::default()).configure_from_args();
cuda_multi_bit_ks_pbs(
cuda_multi_bit_ks_pbs::<u64, u32>(
&mut criterion,
&multi_bit_benchmark_parameters_with_grouping(),
);
cuda_multi_bit_ks_pbs::<u64, u64>(
&mut criterion,
&multi_bit_benchmark_parameters_with_grouping(),
);

View File

@@ -250,7 +250,7 @@ mod cuda {
}
};
let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new().bootstrap_key(bsk).build();
let cpu_keys: CpuKeys<_, u64> = CpuKeysBuilder::new().bootstrap_key(bsk).build();
let message_modulus: u64 = 1 << 4;
let input_message: u64 = 3;
@@ -493,7 +493,7 @@ mod cuda {
squash_params.ciphertext_modulus,
);
let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new()
let cpu_keys: CpuKeys<_, u64> = CpuKeysBuilder::new()
.multi_bit_bootstrap_key(multi_bit_bsk)
.build();

View File

@@ -978,7 +978,8 @@ mod cuda {
params.ciphertext_modulus.unwrap(),
);
let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new().bootstrap_key(bsk).build();
let cpu_keys: CpuKeys<Scalar, Scalar> =
CpuKeysBuilder::new().bootstrap_key(bsk).build();
let bench_id;
@@ -1236,7 +1237,7 @@ mod cuda {
params.ciphertext_modulus.unwrap(),
);
let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new()
let cpu_keys: CpuKeys<Scalar, Scalar> = CpuKeysBuilder::new()
.multi_bit_bootstrap_key(multi_bit_bsk)
.build();

View File

@@ -575,28 +575,28 @@ mod cuda_utils {
}
/// Computing keys in their CPU flavor.
pub struct CpuKeys<T: UnsignedInteger> {
ksk: Option<LweKeyswitchKeyOwned<T>>,
pub struct CpuKeys<T: UnsignedInteger, KST: UnsignedInteger> {
ksk: Option<LweKeyswitchKeyOwned<KST>>,
pksk: Option<LwePackingKeyswitchKeyOwned<T>>,
bsk: Option<LweBootstrapKeyOwned<T>>,
multi_bit_bsk: Option<LweMultiBitBootstrapKeyOwned<T>>,
}
impl<T: UnsignedInteger> CpuKeys<T> {
pub fn builder() -> CpuKeysBuilder<T> {
impl<T: UnsignedInteger, KST: UnsignedInteger> CpuKeys<T, KST> {
pub fn builder() -> CpuKeysBuilder<T, KST> {
CpuKeysBuilder::new()
}
}
pub struct CpuKeysBuilder<T: UnsignedInteger> {
ksk: Option<LweKeyswitchKeyOwned<T>>,
pub struct CpuKeysBuilder<T: UnsignedInteger, KST: UnsignedInteger> {
ksk: Option<LweKeyswitchKeyOwned<KST>>,
pksk: Option<LwePackingKeyswitchKeyOwned<T>>,
bsk: Option<LweBootstrapKeyOwned<T>>,
multi_bit_bsk: Option<LweMultiBitBootstrapKeyOwned<T>>,
}
impl<T: UnsignedInteger> CpuKeysBuilder<T> {
pub fn new() -> CpuKeysBuilder<T> {
impl<T: UnsignedInteger, KST: UnsignedInteger> CpuKeysBuilder<T, KST> {
pub fn new() -> CpuKeysBuilder<T, KST> {
Self {
ksk: None,
pksk: None,
@@ -605,7 +605,7 @@ mod cuda_utils {
}
}
pub fn keyswitch_key(mut self, ksk: LweKeyswitchKeyOwned<T>) -> CpuKeysBuilder<T> {
pub fn keyswitch_key(mut self, ksk: LweKeyswitchKeyOwned<KST>) -> CpuKeysBuilder<T, KST> {
self.ksk = Some(ksk);
self
}
@@ -613,12 +613,12 @@ mod cuda_utils {
pub fn packing_keyswitch_key(
mut self,
pksk: LwePackingKeyswitchKeyOwned<T>,
) -> CpuKeysBuilder<T> {
) -> CpuKeysBuilder<T, KST> {
self.pksk = Some(pksk);
self
}
pub fn bootstrap_key(mut self, bsk: LweBootstrapKeyOwned<T>) -> CpuKeysBuilder<T> {
pub fn bootstrap_key(mut self, bsk: LweBootstrapKeyOwned<T>) -> CpuKeysBuilder<T, KST> {
self.bsk = Some(bsk);
self
}
@@ -626,12 +626,12 @@ mod cuda_utils {
pub fn multi_bit_bootstrap_key(
mut self,
mb_bsk: LweMultiBitBootstrapKeyOwned<T>,
) -> CpuKeysBuilder<T> {
) -> CpuKeysBuilder<T, KST> {
self.multi_bit_bsk = Some(mb_bsk);
self
}
pub fn build(self) -> CpuKeys<T> {
pub fn build(self) -> CpuKeys<T, KST> {
CpuKeys {
ksk: self.ksk,
pksk: self.pksk,
@@ -640,7 +640,7 @@ mod cuda_utils {
}
}
}
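// Usage sketch for the widened builder (a 64-bit PBS paired with a 32-bit
// keyswitch key; `ksk_big_to_small` is an LweKeyswitchKeyOwned<u32> and `bsk`
// an LweBootstrapKeyOwned<u64>, as in the benchmarks above):
// let cpu_keys: CpuKeys<u64, u32> = CpuKeysBuilder::new()
//     .keyswitch_key(ksk_big_to_small)
//     .bootstrap_key(bsk)
//     .build();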
impl<T: UnsignedInteger> Default for CpuKeysBuilder<T> {
impl<T: UnsignedInteger, KST: UnsignedInteger> Default for CpuKeysBuilder<T, KST> {
fn default() -> Self {
Self::new()
}
@@ -648,17 +648,17 @@ mod cuda_utils {
/// Computing keys in their Cuda flavor.
#[allow(dead_code)]
pub struct CudaLocalKeys<T: UnsignedInteger> {
pub ksk: Option<CudaLweKeyswitchKey<T>>,
pub struct CudaLocalKeys<T: UnsignedInteger, KST: UnsignedInteger> {
pub ksk: Option<CudaLweKeyswitchKey<KST>>,
pub pksk: Option<CudaLwePackingKeyswitchKey<T>>,
pub bsk: Option<CudaLweBootstrapKey>,
pub multi_bit_bsk: Option<CudaLweMultiBitBootstrapKey<T>>,
}
#[allow(dead_code)]
impl<T: UnsignedInteger> CudaLocalKeys<T> {
impl<T: UnsignedInteger, KST: UnsignedInteger> CudaLocalKeys<T, KST> {
pub fn from_cpu_keys(
cpu_keys: &CpuKeys<T>,
cpu_keys: &CpuKeys<T, KST>,
ms_noise_reduction: Option<CudaModulusSwitchNoiseReductionConfiguration>,
stream: &CudaStreams,
) -> Self {
@@ -681,10 +681,10 @@ mod cuda_utils {
}
/// Instantiate Cuda computing keys on each available GPU.
pub fn cuda_local_keys_core<T: UnsignedInteger>(
cpu_keys: &CpuKeys<T>,
pub fn cuda_local_keys_core<T: UnsignedInteger, KST: UnsignedInteger>(
cpu_keys: &CpuKeys<T, KST>,
ms_noise_reduction: Option<CudaModulusSwitchNoiseReductionConfiguration>,
) -> Vec<CudaLocalKeys<T>> {
) -> Vec<CudaLocalKeys<T, KST>> {
let gpu_count = get_number_of_gpus() as usize;
let mut gpu_keys_vec = Vec::with_capacity(gpu_count);
for i in 0..gpu_count {

View File

@@ -10,8 +10,8 @@ use crate::core_crypto::prelude::{CastInto, UnsignedTorus};
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must not
/// be dropped until streams is synchronized
#[allow(clippy::too_many_arguments)]
pub unsafe fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_async<Scalar>(
input: &CudaLweCiphertextList<Scalar>,
pub unsafe fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_async<InputScalar, Scalar>(
input: &CudaLweCiphertextList<InputScalar>,
output: &mut CudaLweCiphertextList<Scalar>,
accumulator: &CudaGlweCiphertextList<Scalar>,
lut_indexes: &CudaVec<Scalar>,
@@ -20,6 +20,7 @@ pub unsafe fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_async<Scalar>
multi_bit_bsk: &CudaLweMultiBitBootstrapKey<Scalar>,
streams: &CudaStreams,
) where
InputScalar: UnsignedTorus + CastInto<usize>,
// CastInto required for PBS modulus switch which returns a usize
Scalar: UnsignedTorus + CastInto<usize>,
{
@@ -59,21 +60,22 @@ pub unsafe fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_async<Scalar>
multi_bit_bsk.polynomial_size(),
);
assert_eq!(
input.ciphertext_modulus(),
output.ciphertext_modulus(),
assert!(
input.ciphertext_modulus().associated_scalar_bits()
<= output.ciphertext_modulus().associated_scalar_bits(),
"Mismatched CiphertextModulus between input ({:?}) and output ({:?})",
input.ciphertext_modulus(),
output.ciphertext_modulus(),
);
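// The old input/output modulus equality no longer holds by construction: a
// 32-bit keyswitched input can legitimately feed a 64-bit PBS, so the check
// above is relaxed to "input scalar width <= output scalar width".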
assert_eq!(
input.ciphertext_modulus(),
output.ciphertext_modulus(),
accumulator.ciphertext_modulus(),
"Mismatched CiphertextModulus between input ({:?}) and accumulator ({:?})",
"Mismatched CiphertextModulus between output ({:?}) and accumulator ({:?})",
input.ciphertext_modulus(),
output.ciphertext_modulus(),
accumulator.ciphertext_modulus(),
);
assert_eq!(
streams.gpu_indexes[0],
multi_bit_bsk.d_vec.gpu_index(0),
@@ -144,8 +146,8 @@ pub unsafe fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_async<Scalar>
}
#[allow(clippy::too_many_arguments)]
pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext<Scalar>(
input: &CudaLweCiphertextList<Scalar>,
pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext<InputScalar, Scalar>(
input: &CudaLweCiphertextList<InputScalar>,
output: &mut CudaLweCiphertextList<Scalar>,
accumulator: &CudaGlweCiphertextList<Scalar>,
lut_indexes: &CudaVec<Scalar>,
@@ -154,6 +156,7 @@ pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext<Scalar>(
multi_bit_bsk: &CudaLweMultiBitBootstrapKey<Scalar>,
streams: &CudaStreams,
) where
InputScalar: UnsignedTorus + CastInto<usize>,
// CastInto required for PBS modulus switch which returns a usize
Scalar: UnsignedTorus + CastInto<usize>,
{

View File

@@ -12,8 +12,8 @@ use crate::core_crypto::prelude::{CastInto, UnsignedTorus};
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must not
/// be dropped until streams is synchronized
#[allow(clippy::too_many_arguments)]
pub unsafe fn cuda_programmable_bootstrap_lwe_ciphertext_async<Scalar>(
input: &CudaLweCiphertextList<Scalar>,
pub unsafe fn cuda_programmable_bootstrap_lwe_ciphertext_async<InputScalar, Scalar>(
input: &CudaLweCiphertextList<InputScalar>,
output: &mut CudaLweCiphertextList<Scalar>,
accumulator: &CudaGlweCiphertextList<Scalar>,
lut_indexes: &CudaVec<Scalar>,
@@ -22,6 +22,7 @@ pub unsafe fn cuda_programmable_bootstrap_lwe_ciphertext_async<Scalar>(
bsk: &CudaLweBootstrapKey,
streams: &CudaStreams,
) where
InputScalar: UnsignedTorus + CastInto<usize>,
// CastInto required for PBS modulus switch which returns a usize
Scalar: UnsignedTorus + CastInto<usize>,
{
@@ -259,8 +260,8 @@ pub unsafe fn cuda_programmable_bootstrap_128_lwe_ciphertext_async<Scalar>(
}
#[allow(clippy::too_many_arguments)]
pub fn cuda_programmable_bootstrap_lwe_ciphertext<Scalar>(
input: &CudaLweCiphertextList<Scalar>,
pub fn cuda_programmable_bootstrap_lwe_ciphertext<InputScalar, Scalar>(
input: &CudaLweCiphertextList<InputScalar>,
output: &mut CudaLweCiphertextList<Scalar>,
accumulator: &CudaGlweCiphertextList<Scalar>,
lut_indexes: &CudaVec<Scalar>,
@@ -269,6 +270,7 @@ pub fn cuda_programmable_bootstrap_lwe_ciphertext<Scalar>(
bsk: &CudaLweBootstrapKey,
streams: &CudaStreams,
) where
InputScalar: UnsignedTorus + CastInto<usize>,
Scalar: UnsignedTorus + CastInto<usize>,
{
unsafe {

View File

@@ -4,8 +4,11 @@ use crate::core_crypto::gpu::lwe_bootstrap_key::{
CudaLweBootstrapKey, CudaModulusSwitchNoiseReductionConfiguration,
};
use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
use crate::core_crypto::gpu::lwe_keyswitch_key::CudaLweKeyswitchKey;
use crate::core_crypto::gpu::vec::{CudaVec, GpuIndex};
use crate::core_crypto::gpu::{cuda_programmable_bootstrap_lwe_ciphertext, CudaStreams};
use crate::core_crypto::gpu::{
cuda_keyswitch_lwe_ciphertext, cuda_programmable_bootstrap_lwe_ciphertext, CudaStreams,
};
use crate::core_crypto::prelude::misc::check_encrypted_content_respects_mod;
use itertools::Itertools;
@@ -174,6 +177,218 @@ fn lwe_encrypt_pbs_decrypt<
}
}
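// New test below: encrypt under the big 64-bit LWE key, keyswitch down to a
// 32-bit intermediate key (with the TUniform noise bound rescaled by 32 bits),
// run the 32->64 PBS, then decrypt and check the output still equals f(msg).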
fn lwe_ks_encrypt_pbs_decrypt_compare_32_64<
Scalar: UnsignedTorus + Sync + Send + CastFrom<usize> + CastInto<usize> + CastFrom<u32> + CastInto<u32>,
>(
params: ClassicTestParams<Scalar>,
) {
assert!(Scalar::BITS <= 64);
let input_lwe_dimension = params.lwe_dimension;
let lwe_noise_distribution = params.lwe_noise_distribution;
let glwe_noise_distribution = params.glwe_noise_distribution;
let ciphertext_modulus = params.ciphertext_modulus;
let message_modulus_log = params.message_modulus_log;
let msg_modulus = Scalar::ONE.shl(message_modulus_log.0);
let encoding_with_padding = get_encoding_with_padding(ciphertext_modulus);
let glwe_dimension = params.glwe_dimension;
let polynomial_size = params.polynomial_size;
let decomp_base_log = params.pbs_base_log;
let decomp_level_count = params.pbs_level;
let gpu_index = 0;
let stream = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
let mut rsc = TestResources::new();
let f = |x: Scalar| {
x.wrapping_mul(Scalar::TWO)
.wrapping_sub(Scalar::ONE)
.wrapping_rem(msg_modulus)
};
let delta: Scalar = encoding_with_padding / msg_modulus;
let mut msg = msg_modulus;
const NB_TESTS: usize = 10;
let number_of_messages = 1;
let accumulator = generate_programmable_bootstrap_glwe_lut(
polynomial_size,
glwe_dimension.to_glwe_size(),
msg_modulus.cast_into(),
ciphertext_modulus,
delta,
f,
);
assert!(check_encrypted_content_respects_mod(
&accumulator,
ciphertext_modulus
));
let lwe_noise_distribution_ksk_32 = match params.lwe_noise_distribution {
DynamicDistribution::Gaussian(gaussian_lwe_noise_distribution) => {
DynamicDistribution::<u32>::new_gaussian(gaussian_lwe_noise_distribution.standard_dev())
}
DynamicDistribution::TUniform(uniform_lwe_noise_distribution) => {
DynamicDistribution::<u32>::new_t_uniform(
uniform_lwe_noise_distribution.bound_log2() - 32,
)
}
};
// Create the LweSecretKey
let intermediate_lwe_secret_key_32: LweSecretKeyOwned<u32> =
allocate_and_generate_new_binary_lwe_secret_key(
input_lwe_dimension,
&mut rsc.secret_random_generator,
); // lwe_sk
let glwe_sk_64: GlweSecretKeyOwned<Scalar> = allocate_and_generate_new_binary_glwe_secret_key(
params.glwe_dimension,
params.polynomial_size,
&mut rsc.secret_random_generator,
);
let big_lwe_sk_64 = glwe_sk_64.clone().into_lwe_secret_key();
let ksk_big_to_small = allocate_and_generate_new_lwe_keyswitch_key(
&big_lwe_sk_64,
&intermediate_lwe_secret_key_32,
params.ks_base_log,
params.ks_level,
lwe_noise_distribution_ksk_32,
CiphertextModulus::<u32>::new_native(),
&mut rsc.encryption_random_generator,
);
let d_ksk_big_to_small =
CudaLweKeyswitchKey::from_lwe_keyswitch_key(&ksk_big_to_small, &stream);
let ciphertext_modulus_ks32 = CiphertextModulus::<u32>::new_native();
let output_lwe_dimension = big_lwe_sk_64.lwe_dimension();
let mut bsk = LweBootstrapKey::new(
Scalar::ZERO,
glwe_dimension.to_glwe_size(),
polynomial_size,
decomp_base_log,
decomp_level_count,
input_lwe_dimension,
ciphertext_modulus,
);
par_generate_lwe_bootstrap_key(
&intermediate_lwe_secret_key_32,
&glwe_sk_64,
&mut bsk,
glwe_noise_distribution,
&mut rsc.encryption_random_generator,
);
assert!(check_encrypted_content_respects_mod(
&*bsk,
ciphertext_modulus
));
let d_bsk = CudaLweBootstrapKey::from_lwe_bootstrap_key(&bsk, None, &stream);
while msg != Scalar::ZERO {
msg = msg.wrapping_sub(Scalar::ONE);
for _ in 0..NB_TESTS {
let plaintext = Plaintext(msg * delta);
let lwe_ciphertext_in = allocate_and_encrypt_new_lwe_ciphertext(
&big_lwe_sk_64,
plaintext,
lwe_noise_distribution,
ciphertext_modulus,
&mut rsc.encryption_random_generator,
);
assert!(check_encrypted_content_respects_mod(
&lwe_ciphertext_in,
ciphertext_modulus
));
let d_lwe_ciphertext_in =
CudaLweCiphertextList::from_lwe_ciphertext(&lwe_ciphertext_in, &stream);
let output_ks_ct: LweCiphertextOwned<u32> = LweCiphertext::new(
0u32,
intermediate_lwe_secret_key_32.lwe_dimension().to_lwe_size(),
ciphertext_modulus_ks32,
);
let mut output_ks_ct_gpu =
CudaLweCiphertextList::from_lwe_ciphertext(&output_ks_ct, &stream);
let mut d_out_pbs_ct = CudaLweCiphertextList::new(
output_lwe_dimension,
LweCiphertextCount(1),
ciphertext_modulus,
&stream,
);
let d_accumulator = CudaGlweCiphertextList::from_glwe_ciphertext(&accumulator, &stream);
let test_vector_indexes: Vec<Scalar> = vec![Scalar::ZERO; number_of_messages];
let mut d_test_vector_indexes =
unsafe { CudaVec::<Scalar>::new_async(number_of_messages, &stream, 0) };
unsafe { d_test_vector_indexes.copy_from_cpu_async(&test_vector_indexes, &stream, 0) };
let num_blocks = d_lwe_ciphertext_in.0.lwe_ciphertext_count.0;
let lwe_indexes_usize: Vec<usize> = (0..num_blocks).collect_vec();
let lwe_indexes = lwe_indexes_usize
.iter()
.map(|&x| <usize as CastInto<Scalar>>::cast_into(x))
.collect_vec();
let mut d_output_indexes =
unsafe { CudaVec::<Scalar>::new_async(num_blocks, &stream, 0) };
let mut d_input_indexes =
unsafe { CudaVec::<Scalar>::new_async(num_blocks, &stream, 0) };
unsafe {
d_input_indexes.copy_from_cpu_async(&lwe_indexes, &stream, 0);
d_output_indexes.copy_from_cpu_async(&lwe_indexes, &stream, 0);
}
cuda_keyswitch_lwe_ciphertext(
&d_ksk_big_to_small,
&d_lwe_ciphertext_in,
&mut output_ks_ct_gpu,
&d_input_indexes,
&d_output_indexes,
true,
&stream,
false,
);
cuda_programmable_bootstrap_lwe_ciphertext(
&output_ks_ct_gpu,
&mut d_out_pbs_ct,
&d_accumulator,
&d_test_vector_indexes,
&d_output_indexes,
&d_input_indexes,
&d_bsk,
&stream,
);
let out_pbs_ct = d_out_pbs_ct.into_lwe_ciphertext(&stream);
assert!(check_encrypted_content_respects_mod(
&out_pbs_ct,
ciphertext_modulus
));
let decrypted = decrypt_lwe_ciphertext(&big_lwe_sk_64, &out_pbs_ct);
let decoded = round_decode(decrypted.0, delta) % msg_modulus;
assert_eq!(decoded, f(msg));
}
}
}
fn lwe_encrypt_centered_ms_pbs_decrypt<
Scalar: UnsignedTorus + Sync + Send + CastFrom<usize> + CastInto<usize>,
>(
@@ -327,6 +542,6 @@ fn lwe_encrypt_centered_ms_pbs_decrypt<
}
}
}
create_gpu_parameterized_test!(lwe_ks_encrypt_pbs_decrypt_compare_32_64);
create_gpu_parameterized_test!(lwe_encrypt_pbs_decrypt);
create_gpu_parameterized_test!(lwe_encrypt_centered_ms_pbs_decrypt);

View File

@@ -166,13 +166,13 @@ impl Drop for CudaStreams {
/// [CudaStreams::synchronize] __must__ be called as soon as synchronization is
/// required
#[allow(clippy::too_many_arguments)]
pub unsafe fn programmable_bootstrap_async<T: UnsignedInteger>(
pub unsafe fn programmable_bootstrap_async<KST: UnsignedInteger, T: UnsignedInteger>(
streams: &CudaStreams,
lwe_array_out: &mut CudaVec<T>,
lwe_out_indexes: &CudaVec<T>,
test_vector: &CudaVec<T>,
test_vector_indexes: &CudaVec<T>,
lwe_array_in: &CudaVec<T>,
lwe_array_in: &CudaVec<KST>,
lwe_in_indexes: &CudaVec<T>,
bootstrapping_key: &CudaVec<f64>,
lwe_dimension: LweDimension,
@@ -193,45 +193,89 @@ pub unsafe fn programmable_bootstrap_async<T: UnsignedInteger>(
PBSMSNoiseReductionType::Centered
});
scratch_cuda_programmable_bootstrap_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
lwe_dimension.0 as u32,
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
level.0 as u32,
num_samples,
true,
noise_reduction_type as u32,
);
if KST::BITS == 32 {
scratch_cuda_programmable_bootstrap_32_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
lwe_dimension.0 as u32,
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
level.0 as u32,
num_samples,
true,
noise_reduction_type as u32,
);
cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
lwe_array_out.as_mut_c_ptr(0),
lwe_out_indexes.as_c_ptr(0),
test_vector.as_c_ptr(0),
test_vector_indexes.as_c_ptr(0),
lwe_array_in.as_c_ptr(0),
lwe_in_indexes.as_c_ptr(0),
bootstrapping_key.as_c_ptr(0),
pbs_buffer,
lwe_dimension.0 as u32,
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
base_log.0 as u32,
level.0 as u32,
num_samples,
num_many_lut,
lut_stride,
);
cuda_programmable_bootstrap_lwe_ciphertext_vector_32_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
lwe_array_out.as_mut_c_ptr(0),
lwe_out_indexes.as_c_ptr(0),
test_vector.as_c_ptr(0),
test_vector_indexes.as_c_ptr(0),
lwe_array_in.as_c_ptr(0),
lwe_in_indexes.as_c_ptr(0),
bootstrapping_key.as_c_ptr(0),
pbs_buffer,
lwe_dimension.0 as u32,
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
base_log.0 as u32,
level.0 as u32,
num_samples,
num_many_lut,
lut_stride,
);
cleanup_cuda_programmable_bootstrap(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
);
cleanup_cuda_programmable_bootstrap(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
);
} else if KST::BITS == 64 {
scratch_cuda_programmable_bootstrap_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
lwe_dimension.0 as u32,
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
level.0 as u32,
num_samples,
true,
noise_reduction_type as u32,
);
cuda_programmable_bootstrap_lwe_ciphertext_vector_64_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
lwe_array_out.as_mut_c_ptr(0),
lwe_out_indexes.as_c_ptr(0),
test_vector.as_c_ptr(0),
test_vector_indexes.as_c_ptr(0),
lwe_array_in.as_c_ptr(0),
lwe_in_indexes.as_c_ptr(0),
bootstrapping_key.as_c_ptr(0),
pbs_buffer,
lwe_dimension.0 as u32,
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
base_log.0 as u32,
level.0 as u32,
num_samples,
num_many_lut,
lut_stride,
);
cleanup_cuda_programmable_bootstrap(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
);
} else {
panic!("Unsupported PBS input bitwidth {}", KST::BITS);
}
}
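// Callers pick the input torus through the new KST type parameter, e.g.
// `programmable_bootstrap_async::<u32, u64>(..)` for 32-bit inputs into a
// 64-bit PBS, or `::<u64, u64>(..)` for the legacy path; any other width
// panics at runtime as above.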
#[allow(clippy::too_many_arguments)]
@@ -348,6 +392,7 @@ pub unsafe fn programmable_bootstrap_128_async<T: UnsignedInteger>(
#[allow(clippy::too_many_arguments)]
pub unsafe fn programmable_bootstrap_multi_bit_async<
T: UnsignedInteger,
KST: UnsignedInteger,
B: Any + UnsignedInteger,
>(
streams: &CudaStreams,
@@ -355,7 +400,7 @@ pub unsafe fn programmable_bootstrap_multi_bit_async<
output_indexes: &CudaVec<T>,
test_vector: &CudaVec<B>,
test_vector_indexes: &CudaVec<T>,
lwe_array_in: &CudaVec<T>,
lwe_array_in: &CudaVec<KST>,
input_indexes: &CudaVec<T>,
bootstrapping_key: &CudaVec<B>,
lwe_dimension: LweDimension,
@@ -406,42 +451,83 @@ pub unsafe fn programmable_bootstrap_multi_bit_async<
std::ptr::addr_of_mut!(pbs_buffer),
);
} else if TypeId::of::<B>() == TypeId::of::<u64>() {
scratch_cuda_multi_bit_programmable_bootstrap_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
level.0 as u32,
num_samples,
true,
);
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
lwe_array_out.as_mut_c_ptr(0),
output_indexes.as_c_ptr(0),
test_vector.as_c_ptr(0),
test_vector_indexes.as_c_ptr(0),
lwe_array_in.as_c_ptr(0),
input_indexes.as_c_ptr(0),
bootstrapping_key.as_c_ptr(0),
pbs_buffer,
lwe_dimension.0 as u32,
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
grouping_factor.0 as u32,
base_log.0 as u32,
level.0 as u32,
num_samples,
num_many_lut,
lut_stride,
);
cleanup_cuda_multi_bit_programmable_bootstrap(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
);
if TypeId::of::<KST>() == TypeId::of::<u32>() {
scratch_cuda_multi_bit_programmable_bootstrap_32_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
level.0 as u32,
num_samples,
true,
);
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_32_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
lwe_array_out.as_mut_c_ptr(0),
output_indexes.as_c_ptr(0),
test_vector.as_c_ptr(0),
test_vector_indexes.as_c_ptr(0),
lwe_array_in.as_c_ptr(0),
input_indexes.as_c_ptr(0),
bootstrapping_key.as_c_ptr(0),
pbs_buffer,
lwe_dimension.0 as u32,
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
grouping_factor.0 as u32,
base_log.0 as u32,
level.0 as u32,
num_samples,
num_many_lut,
lut_stride,
);
cleanup_cuda_multi_bit_programmable_bootstrap(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
);
} else if TypeId::of::<KST>() == TypeId::of::<u64>() {
scratch_cuda_multi_bit_programmable_bootstrap_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
level.0 as u32,
num_samples,
true,
);
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
lwe_array_out.as_mut_c_ptr(0),
output_indexes.as_c_ptr(0),
test_vector.as_c_ptr(0),
test_vector_indexes.as_c_ptr(0),
lwe_array_in.as_c_ptr(0),
input_indexes.as_c_ptr(0),
bootstrapping_key.as_c_ptr(0),
pbs_buffer,
lwe_dimension.0 as u32,
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
grouping_factor.0 as u32,
base_log.0 as u32,
level.0 as u32,
num_samples,
num_many_lut,
lut_stride,
);
cleanup_cuda_multi_bit_programmable_bootstrap(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
);
} else {
panic!("Unsupported MB PBS input torus size");
}
} else {
panic!("Unsupported torus size")
}
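// Net effect: the multi-bit path dispatches twice: first on the
// bootstrapping-key torus B (u128 handled earlier, u64 here), then on the
// input torus KST (u32 selects the new _32_64 entry points, u64 the
// existing _64 ones).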