diff --git a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h
index 0c8c187fc..21889dde4 100644
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h
@@ -7,23 +7,23 @@ template <typename Torus>
 bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
     uint32_t polynomial_size, uint32_t max_shared_memory);
 
-template <typename Torus>
+template <typename InputTorus, typename Torus>
 bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
     uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t level_count, uint32_t max_shared_memory);
 
 #if CUDA_ARCH >= 900
-template <typename Torus>
+template <typename InputTorus, typename Torus>
 uint64_t scratch_cuda_tbc_multi_bit_programmable_bootstrap(
     void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
     uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
 
-template <typename Torus>
+template <typename InputTorus, typename Torus>
 void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
     void *stream, uint32_t gpu_index, Torus *lwe_array_out,
     Torus const *lwe_output_indexes, Torus const *lut_vector,
-    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
+    Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
     Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
     pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
@@ -48,7 +48,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
     uint32_t base_log, uint32_t level_count, uint32_t num_samples,
     uint32_t num_many_lut, uint32_t lut_stride);
 
-template <typename Torus>
+template <typename InputTorus, typename Torus>
 uint64_t scratch_cuda_multi_bit_programmable_bootstrap(
     void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
@@ -96,7 +96,7 @@ template <typename Torus>
 uint64_t get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
     uint32_t polynomial_size);
 
-template <typename Torus>
+template <typename InputTorus, typename Torus>
 uint64_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
                             uint32_t polynomial_size, uint32_t glwe_dimension,
                             uint32_t level_count, uint64_t full_sm_keybundle);
diff --git a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h
index b131e8ff2..75587618f 100644
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h
@@ -428,29 +428,29 @@ uint64_t get_buffer_size_programmable_bootstrap_cg(
   return buffer_size + buffer_size % sizeof(double2);
 }
 
-template <typename Torus>
+template <typename InputTorus, typename Torus>
 bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
                                                    uint32_t polynomial_size,
                                                    uint32_t level_count,
                                                    uint32_t num_samples,
                                                    uint32_t max_shared_memory);
 
-template <typename Torus>
+template <typename InputTorus, typename Torus>
 void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
     void *stream, uint32_t gpu_index, Torus *lwe_array_out,
     Torus const *lwe_output_indexes, Torus const *lut_vector,
-    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
+    Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
     Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
     pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
     uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
     uint32_t lut_stride);
 
-template <typename Torus>
+template <typename InputTorus, typename Torus>
 void cuda_programmable_bootstrap_lwe_ciphertext_vector(
     void *stream, uint32_t gpu_index, Torus *lwe_array_out,
     Torus const *lwe_output_indexes, Torus const *lut_vector,
-    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
+    Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
     Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
     pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
@@ -458,11 +458,11 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
     uint32_t lut_stride);
 
 #if (CUDA_ARCH >= 900)
-template <typename Torus>
+template <typename InputTorus, typename Torus>
 void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
     void *stream, uint32_t gpu_index, Torus *lwe_array_out,
     Torus const *lwe_output_indexes, Torus const *lut_vector,
-    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
+    Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
     Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
     pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
@@ -477,14 +477,14 @@ uint64_t scratch_cuda_programmable_bootstrap_tbc(
     bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
 #endif
 
-template <typename Torus>
+template <typename InputTorus, typename Torus>
 uint64_t scratch_cuda_programmable_bootstrap_cg(
     void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
     uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t level_count, uint32_t input_lwe_ciphertext_count,
     bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
 
-template <typename Torus>
+template <typename InputTorus, typename Torus>
 uint64_t scratch_cuda_programmable_bootstrap(
     void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **buffer,
     uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
diff --git a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap.h b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap.h
index 0b87999f6..5cc3cb509 100644
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap.h
@@ -69,13 +69,19 @@ uint64_t scratch_cuda_programmable_bootstrap_64(
     uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
     PBS_MS_REDUCTION_T noise_reduction_type);
 
+uint64_t scratch_cuda_programmable_bootstrap_32_64(
+    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type);
+
 uint64_t scratch_cuda_programmable_bootstrap_128(
     void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
     uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
     PBS_MS_REDUCTION_T noise_reduction_type);
 
-void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
+void cuda_programmable_bootstrap_lwe_ciphertext_vector_64_64(
     void *stream, uint32_t gpu_index, void *lwe_array_out,
     void const *lwe_output_indexes, void const *lut_vector,
     void const *lut_vector_indexes, void const *lwe_array_in,
@@ -84,7 +90,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
     uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
     uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride);
 
-void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
+void cuda_programmable_bootstrap_lwe_ciphertext_vector_32_64(
     void *stream, uint32_t gpu_index, void *lwe_array_out,
     void const *lwe_output_indexes, void const *lut_vector,
     void const *lut_vector_indexes, void const *lwe_array_in,
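For orientation, this is roughly how the new 32-to-64-bit classic PBS entry points declared above are driven from the host. A minimal sketch, assuming device pointers, keys and PBS parameters were set up elsewhere (the `d_*` variable names are illustrative, not part of this change); note that the input ciphertexts are u32 while the LUTs, indexes and outputs stay u64:

    // Allocate scratch for a batch of num_samples bootstraps (sketch).
    int8_t *pbs_scratch = nullptr;
    scratch_cuda_programmable_bootstrap_32_64(
        stream, gpu_index, &pbs_scratch, lwe_dimension, glwe_dimension,
        polynomial_size, level_count, num_samples,
        /*allocate_gpu_memory=*/true, noise_reduction_type);

    // Bootstrap u32 input LWEs into u64 output LWEs.
    cuda_programmable_bootstrap_lwe_ciphertext_vector_32_64(
        stream, gpu_index, d_lwe_out_u64, d_output_indexes, d_lut_vector,
        d_lut_vector_indexes, d_lwe_in_u32, d_input_indexes,
        d_bootstrapping_key, pbs_scratch, lwe_dimension, glwe_dimension,
        polynomial_size, base_log, level_count, num_samples, num_many_lut,
        lut_stride);

    // Release the scratch buffer once the stream has been synchronized.
    cleanup_cuda_programmable_bootstrap(stream, gpu_index, &pbs_scratch);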
diff --git a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h
index 6679ed09c..ca7fc6214 100644
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h
@@ -4,12 +4,12 @@
 #include "pbs_enums.h"
 #include "stdint.h"
 
-extern "C" {
-
 bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
     uint32_t num_samples, uint32_t max_shared_memory);
 
+extern "C" {
+
 void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
     void *stream, uint32_t gpu_index, void *dest, void const *src,
     uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
@@ -20,6 +20,11 @@ void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_128(
     uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
     uint32_t polynomial_size, uint32_t grouping_factor);
 
+uint64_t scratch_cuda_multi_bit_programmable_bootstrap_32_64(
+    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
+
 uint64_t scratch_cuda_multi_bit_programmable_bootstrap_64(
     void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
@@ -35,6 +40,16 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
     uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
     uint32_t lut_stride);
 
+void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_32_64(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_output_indexes, void const *lut_vector,
+    void const *lut_vector_indexes, void const *lwe_array_in,
+    void const *lwe_input_indexes, void const *bootstrapping_key,
+    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
+    uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
+    uint32_t lut_stride);
+
 void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
                                                    uint32_t gpu_index,
                                                    int8_t **pbs_buffer);
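The multi-bit path mirrors the classic one. A minimal sketch of driving the new 32-to-64-bit multi-bit entry points, under the same assumptions as the classic sketch above (illustrative variable names; u32 inputs, u64 outputs, plus the extra `grouping_factor` parameter and no noise-reduction argument on the scratch call):

    int8_t *mb_scratch = nullptr;
    scratch_cuda_multi_bit_programmable_bootstrap_32_64(
        stream, gpu_index, &mb_scratch, glwe_dimension, polynomial_size,
        level_count, num_samples, /*allocate_gpu_memory=*/true);

    cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_32_64(
        stream, gpu_index, d_lwe_out_u64, d_output_indexes, d_lut_vector,
        d_lut_vector_indexes, d_lwe_in_u32, d_input_indexes,
        d_bootstrapping_key, mb_scratch, lwe_dimension, glwe_dimension,
        polynomial_size, grouping_factor, base_log, level_count, num_samples,
        num_many_lut, lut_stride);

    cleanup_cuda_multi_bit_programmable_bootstrap(stream, gpu_index,
                                                  &mb_scratch);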
diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
index 9b024c73b..efb83da0c 100644
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
@@ -535,7 +535,6 @@ __host__ void host_gemm_keyswitch_lwe_ciphertext_vector(
   dim3 grid_negate(CEIL_DIV(lwe_dimension_out + 1, BLOCK_SIZE_DECOMP),
                    CEIL_DIV(num_samples, BLOCK_SIZE_DECOMP));
   dim3 threads_negate(BLOCK_SIZE_DECOMP, BLOCK_SIZE_DECOMP);
-  // Negate all outputs in the output LWEs. This is the final step in the GEMM
   // keyswitch computed as: -(-b + sum(a_i A_KSK))
   keyswitch_negate_with_output_indices
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
index 9d9aa757e..ea2f76a34 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
@@ -263,42 +263,7 @@ void execute_pbs_async(CudaStreams streams,
                        uint32_t num_many_lut, uint32_t lut_stride) {
 
   if constexpr (std::is_same_v<Torus, uint32_t>) {
-    // 32 bits
-    switch (pbs_type) {
-    case MULTI_BIT:
-      PANIC("Error: 32-bit multibit PBS is not supported.\n")
-    case CLASSICAL:
-      for (uint i = 0; i < streams.count(); i++) {
-        int num_inputs_on_gpu = get_num_inputs_on_gpu(
-            input_lwe_ciphertext_count, i, streams.count());
-
-        int gpu_offset =
-            get_gpu_offset(input_lwe_ciphertext_count, i, streams.count());
-        auto d_lut_vector_indexes =
-            lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);
-
-        // Use the macro to get the correct elements for the current iteration
-        // Handles the case when the input/output are scattered through
-        // different gpus and when it is not
-        auto current_lwe_array_out = get_variant_element(lwe_array_out, i);
-        auto current_lwe_output_indexes =
-            get_variant_element(lwe_output_indexes, i);
-        auto current_lwe_array_in = get_variant_element(lwe_array_in, i);
-        auto current_lwe_input_indexes =
-            get_variant_element(lwe_input_indexes, i);
-
-        cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
-            streams.stream(i), streams.gpu_index(i), current_lwe_array_out,
-            current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
-            current_lwe_array_in, current_lwe_input_indexes,
-            bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
-            polynomial_size, base_log, level_count, num_inputs_on_gpu,
-            num_many_lut, lut_stride);
-      }
-      break;
-    default:
-      PANIC("Error: unsupported cuda PBS type.")
-    }
+    PANIC("Error: 32-bit CUDA PBS is not supported.")
   } else if constexpr (std::is_same_v<Torus, uint64_t>) {
     // 64 bits
     switch (pbs_type) {
@@ -353,7 +318,7 @@ void execute_pbs_async(CudaStreams streams,
         auto d_lut_vector_indexes =
             lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);
 
-        cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
+        cuda_programmable_bootstrap_lwe_ciphertext_vector_64_64(
            streams.stream(i), streams.gpu_index(i), current_lwe_array_out,
             current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
             current_lwe_array_in, current_lwe_input_indexes,
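The convention introduced by this refactor is that `InputTorus` is the torus of the incoming LWE ciphertext (u32 or u64), while `Torus` stays the accumulator/output torus (u64 everywhere below). A minimal, hypothetical illustration of the split, not a verbatim excerpt from the patch:

    // Only the input ciphertext, and the values the modulus switch derives
    // from it (a_hat, b_hat, correction), live on the input torus; LUTs,
    // indexes and outputs remain on the output torus.
    template <typename InputTorus, typename Torus>
    void pbs_shape_example(Torus *lwe_array_out,           // u64 outputs
                           const Torus *lut_vector,        // u64 accumulators
                           const InputTorus *lwe_array_in  // u32 or u64 inputs
    ) {
      InputTorus b_hat = 0; // modulus-switched body stays on the input torus
      // ... rotate the u64 accumulator by b_hat, then run external products
    }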
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
index 48601c3ab..309004ee2 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
@@ -35,12 +35,13 @@ namespace cg = cooperative_groups;
 *
 * Each y-block computes one element of the lwe_array_out.
 */
-template <typename Torus, class params, sharedMemDegree SMD>
+template <typename InputTorus, typename Torus, class params,
+          sharedMemDegree SMD>
__global__ void device_programmable_bootstrap_cg(
    Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
    const Torus *__restrict__ lut_vector,
    const Torus *__restrict__ lut_vector_indexes,
-    const Torus *__restrict__ lwe_array_in,
+    const InputTorus *__restrict__ lwe_array_in,
    const Torus *__restrict__ lwe_input_indexes,
    const double2 *__restrict__ bootstrapping_key, double2 *join_buffer,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
@@ -79,7 +80,7 @@ __global__ void device_programmable_bootstrap_cg(
 
  // The third dimension of the block is used to determine on which ciphertext
  // this block is operating, in the case of batch bootstraps
-  const Torus *block_lwe_array_in =
+  const InputTorus *block_lwe_array_in =
      &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
 
  const Torus *block_lut_vector =
@@ -95,8 +96,8 @@ __global__ void device_programmable_bootstrap_cg(
 
    // Put "b" in [0, 2N[
    constexpr auto log_modulus = params::log2_degree + 1;
-    Torus b_hat = 0;
-    Torus correction = 0;
+    InputTorus b_hat = 0;
+    InputTorus correction = 0;
    if (noise_reduction_type == PBS_MS_REDUCTION_T::CENTERED) {
      correction = centered_binary_modulus_switch_body_correction_to_add(
          block_lwe_array_in, lwe_dimension, log_modulus);
@@ -113,7 +114,7 @@ __global__ void device_programmable_bootstrap_cg(
    __syncthreads();
 
    // Put "a" in [0, 2N[
-    Torus a_hat = 0;
+    InputTorus a_hat = 0;
    modulus_switch(block_lwe_array_in[i], a_hat, params::log2_degree + 1);
 
    // Perform ACC * (X^ä - 1)
@@ -193,7 +194,7 @@ __global__ void device_programmable_bootstrap_cg(
  }
}
 
-template <typename Torus, class params>
+template <typename InputTorus, typename Torus, class params>
__host__ uint64_t scratch_programmable_bootstrap_cg(
    cudaStream_t stream, uint32_t gpu_index,
    pbs_buffer<Torus, CLASSICAL> **buffer, uint32_t lwe_dimension,
@@ -210,18 +211,18 @@ __host__ uint64_t scratch_programmable_bootstrap_cg(
  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
  if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
    check_cuda_error(cudaFuncSetAttribute(
-        device_programmable_bootstrap_cg<Torus, params, PARTIALSM>,
+        device_programmable_bootstrap_cg<InputTorus, Torus, params, PARTIALSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
    cudaFuncSetCacheConfig(
-        device_programmable_bootstrap_cg<Torus, params, PARTIALSM>,
+        device_programmable_bootstrap_cg<InputTorus, Torus, params, PARTIALSM>,
        cudaFuncCachePreferShared);
    check_cuda_error(cudaGetLastError());
  } else if (max_shared_memory >= partial_sm) {
    check_cuda_error(cudaFuncSetAttribute(
-        device_programmable_bootstrap_cg<Torus, params, FULLSM>,
+        device_programmable_bootstrap_cg<InputTorus, Torus, params, FULLSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm));
    cudaFuncSetCacheConfig(
-        device_programmable_bootstrap_cg<Torus, params, FULLSM>,
+        device_programmable_bootstrap_cg<InputTorus, Torus, params, FULLSM>,
        cudaFuncCachePreferShared);
    check_cuda_error(cudaGetLastError());
  }
@@ -237,11 +238,11 @@ __host__ uint64_t scratch_programmable_bootstrap_cg(
/*
 * Host wrapper
 */
-template <typename Torus, class params>
+template <typename InputTorus, typename Torus, class params>
__host__ void host_programmable_bootstrap_cg(
    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus const *lwe_output_indexes, Torus const *lut_vector,
-    Torus const *lut_vector_indexes, Torus const *lwe_array_in,
+    Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
    Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
@@ -293,26 +294,29 @@ __host__ void host_programmable_bootstrap_cg(
  if (max_shared_memory < partial_sm) {
    kernel_args[13] = &full_dm;
    check_cuda_error(cudaLaunchCooperativeKernel(
-        (void *)device_programmable_bootstrap_cg<Torus, params, NOSM>, grid,
-        thds, (void **)kernel_args, 0,
stream)); + (void *) + device_programmable_bootstrap_cg, + grid, thds, (void **)kernel_args, 0, stream)); } else if (max_shared_memory < full_sm) { kernel_args[13] = &partial_dm; check_cuda_error(cudaLaunchCooperativeKernel( - (void *)device_programmable_bootstrap_cg, + (void *)device_programmable_bootstrap_cg, grid, thds, (void **)kernel_args, partial_sm, stream)); } else { int no_dm = 0; kernel_args[13] = &no_dm; check_cuda_error(cudaLaunchCooperativeKernel( - (void *)device_programmable_bootstrap_cg, grid, - thds, (void **)kernel_args, full_sm, stream)); + (void *) + device_programmable_bootstrap_cg, + grid, thds, (void **)kernel_args, full_sm, stream)); } check_cuda_error(cudaGetLastError()); } // Verify if the grid size satisfies the cooperative group constraints -template +template __host__ bool verify_cuda_programmable_bootstrap_cg_grid_size( int glwe_dimension, int level_count, int num_samples, uint32_t max_shared_memory) { @@ -338,30 +342,34 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_grid_size( if (max_shared_memory < partial_sm) { cudaOccupancyMaxActiveBlocksPerMultiprocessor( &max_active_blocks_per_sm, - (void *)device_programmable_bootstrap_cg, thds, 0); + (void *) + device_programmable_bootstrap_cg, + thds, 0); } else if (max_shared_memory < full_sm) { check_cuda_error(cudaFuncSetAttribute( - device_programmable_bootstrap_cg, + device_programmable_bootstrap_cg, cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm)); cudaFuncSetCacheConfig( - device_programmable_bootstrap_cg, + device_programmable_bootstrap_cg, cudaFuncCachePreferShared); cudaOccupancyMaxActiveBlocksPerMultiprocessor( &max_active_blocks_per_sm, - (void *)device_programmable_bootstrap_cg, + (void *)device_programmable_bootstrap_cg, thds, partial_sm); check_cuda_error(cudaGetLastError()); } else { check_cuda_error(cudaFuncSetAttribute( - device_programmable_bootstrap_cg, + device_programmable_bootstrap_cg, cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm)); cudaFuncSetCacheConfig( - device_programmable_bootstrap_cg, + device_programmable_bootstrap_cg, cudaFuncCachePreferShared); cudaOccupancyMaxActiveBlocksPerMultiprocessor( &max_active_blocks_per_sm, - (void *)device_programmable_bootstrap_cg, thds, - full_sm); + (void *) + device_programmable_bootstrap_cg, + thds, full_sm); check_cuda_error(cudaGetLastError()); } @@ -372,39 +380,39 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_grid_size( } // Verify if the grid size satisfies the cooperative group constraints -template +template __host__ bool supports_cooperative_groups_on_programmable_bootstrap( int glwe_dimension, int polynomial_size, int level_count, int num_samples, uint32_t max_shared_memory) { switch (polynomial_size) { case 256: return verify_cuda_programmable_bootstrap_cg_grid_size< - Torus, AmortizedDegree<256>>(glwe_dimension, level_count, num_samples, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<256>>( + glwe_dimension, level_count, num_samples, max_shared_memory); case 512: return verify_cuda_programmable_bootstrap_cg_grid_size< - Torus, AmortizedDegree<512>>(glwe_dimension, level_count, num_samples, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<512>>( + glwe_dimension, level_count, num_samples, max_shared_memory); case 1024: return verify_cuda_programmable_bootstrap_cg_grid_size< - Torus, AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<1024>>( + glwe_dimension, level_count, num_samples, max_shared_memory); case 2048: 
return verify_cuda_programmable_bootstrap_cg_grid_size< - Torus, AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<2048>>( + glwe_dimension, level_count, num_samples, max_shared_memory); case 4096: return verify_cuda_programmable_bootstrap_cg_grid_size< - Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<4096>>( + glwe_dimension, level_count, num_samples, max_shared_memory); case 8192: return verify_cuda_programmable_bootstrap_cg_grid_size< - Torus, AmortizedDegree<8192>>(glwe_dimension, level_count, num_samples, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<8192>>( + glwe_dimension, level_count, num_samples, max_shared_memory); case 16384: return verify_cuda_programmable_bootstrap_cg_grid_size< - Torus, AmortizedDegree<16384>>(glwe_dimension, level_count, num_samples, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<16384>>( + glwe_dimension, level_count, num_samples, max_shared_memory); default: PANIC("Cuda error (classical PBS): unsupported polynomial size. " "Supported N's are powers of two" diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh index db0c27959..bbdacfce6 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh @@ -18,13 +18,14 @@ #include "types/complex/operations.cuh" #include -template +template __global__ void __launch_bounds__(params::degree / params::opt) device_multi_bit_programmable_bootstrap_cg_accumulate( Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes, const Torus *__restrict__ lut_vector, const Torus *__restrict__ lut_vector_indexes, - const Torus *__restrict__ lwe_array_in, + const InputTorus *__restrict__ lwe_array_in, const Torus *__restrict__ lwe_input_indexes, const double2 *__restrict__ keybundle_array, double2 *join_buffer, Torus *global_accumulator, uint32_t lwe_dimension, @@ -60,7 +61,7 @@ __global__ void __launch_bounds__(params::degree / params::opt) // The third dimension of the block is used to determine on which ciphertext // this block is operating, in the case of batch bootstraps - const Torus *block_lwe_array_in = + const InputTorus *block_lwe_array_in = &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)]; const Torus *block_lut_vector = @@ -80,7 +81,7 @@ __global__ void __launch_bounds__(params::degree / params::opt) if (lwe_offset == 0) { // Put "b" in [0, 2N[ - Torus b_hat = 0; + InputTorus b_hat = 0; modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, params::log2_degree + 1); @@ -208,7 +209,7 @@ uint64_t get_buffer_size_cg_multibit_programmable_bootstrap( return buffer_size + buffer_size % sizeof(double2); } -template +template __host__ uint64_t scratch_cg_multi_bit_programmable_bootstrap( cudaStream_t stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t glwe_dimension, @@ -230,57 +231,59 @@ __host__ uint64_t scratch_cg_multi_bit_programmable_bootstrap( auto max_shared_memory = cuda_get_max_shared_memory(gpu_index); if (max_shared_memory < full_sm_keybundle) { check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_keybundle, + device_multi_bit_programmable_bootstrap_keybundle, cudaFuncAttributeMaxDynamicSharedMemorySize, 0)); cudaFuncSetCacheConfig( - 
device_multi_bit_programmable_bootstrap_keybundle, + device_multi_bit_programmable_bootstrap_keybundle, cudaFuncCachePreferShared); check_cuda_error(cudaGetLastError()); } else { check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_keybundle, + device_multi_bit_programmable_bootstrap_keybundle, cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle)); cudaFuncSetCacheConfig( - device_multi_bit_programmable_bootstrap_keybundle, + device_multi_bit_programmable_bootstrap_keybundle, cudaFuncCachePreferShared); check_cuda_error(cudaGetLastError()); } if (max_shared_memory < partial_sm_cg_accumulate) { check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_cg_accumulate, + device_multi_bit_programmable_bootstrap_cg_accumulate, cudaFuncAttributeMaxDynamicSharedMemorySize, 0)); cudaFuncSetCacheConfig( - device_multi_bit_programmable_bootstrap_cg_accumulate, + device_multi_bit_programmable_bootstrap_cg_accumulate, cudaFuncCachePreferShared); check_cuda_error(cudaGetLastError()); } else if (max_shared_memory < full_sm_cg_accumulate) { check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_cg_accumulate, + device_multi_bit_programmable_bootstrap_cg_accumulate< + InputTorus, Torus, params, PARTIALSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm_cg_accumulate)); cudaFuncSetCacheConfig( - device_multi_bit_programmable_bootstrap_cg_accumulate, + device_multi_bit_programmable_bootstrap_cg_accumulate< + InputTorus, Torus, params, PARTIALSM>, cudaFuncCachePreferShared); check_cuda_error(cudaGetLastError()); } else { check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_cg_accumulate, + device_multi_bit_programmable_bootstrap_cg_accumulate, cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_cg_accumulate)); cudaFuncSetCacheConfig( - device_multi_bit_programmable_bootstrap_cg_accumulate, + device_multi_bit_programmable_bootstrap_cg_accumulate, cudaFuncCachePreferShared); check_cuda_error(cudaGetLastError()); } - auto lwe_chunk_size = get_lwe_chunk_size( + auto lwe_chunk_size = get_lwe_chunk_size( gpu_index, input_lwe_ciphertext_count, polynomial_size, glwe_dimension, level_count, full_sm_keybundle); uint64_t size_tracker = 0; @@ -291,10 +294,10 @@ __host__ uint64_t scratch_cg_multi_bit_programmable_bootstrap( return size_tracker; } -template +template __host__ void execute_cg_external_product_loop( cudaStream_t stream, uint32_t gpu_index, Torus const *lut_vector, - Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lut_vector_indexes, InputTorus const *lwe_array_in, Torus const *lwe_input_indexes, Torus *lwe_array_out, Torus const *lwe_output_indexes, pbs_buffer *buffer, uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension, @@ -359,28 +362,28 @@ __host__ void execute_cg_external_product_loop( kernel_args[19] = &full_dm; check_cuda_error(cudaLaunchCooperativeKernel( (void *)device_multi_bit_programmable_bootstrap_cg_accumulate< - Torus, params, NOSM>, + InputTorus, Torus, params, NOSM>, grid_accumulate, thds, (void **)kernel_args, 0, stream)); } else if (max_shared_memory < full_dm) { kernel_args[19] = &partial_dm; check_cuda_error(cudaLaunchCooperativeKernel( (void *)device_multi_bit_programmable_bootstrap_cg_accumulate< - Torus, params, PARTIALSM>, + InputTorus, Torus, params, PARTIALSM>, grid_accumulate, thds, (void **)kernel_args, partial_sm, stream)); } else { kernel_args[19] = &no_dm; check_cuda_error(cudaLaunchCooperativeKernel( (void 
*)device_multi_bit_programmable_bootstrap_cg_accumulate< - Torus, params, FULLSM>, + InputTorus, Torus, params, FULLSM>, grid_accumulate, thds, (void **)kernel_args, full_sm, stream)); } } -template +template __host__ void host_cg_multi_bit_programmable_bootstrap( cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, Torus const *lwe_output_indexes, Torus const *lut_vector, - Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lut_vector_indexes, InputTorus const *lwe_array_in, Torus const *lwe_input_indexes, uint64_t const *bootstrapping_key, pbs_buffer *buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, @@ -393,13 +396,13 @@ __host__ void host_cg_multi_bit_programmable_bootstrap( lwe_offset += lwe_chunk_size) { // Compute a keybundle - execute_compute_keybundle( + execute_compute_keybundle( stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, level_count, lwe_offset); // Accumulate - execute_cg_external_product_loop( + execute_cg_external_product_loop( stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, @@ -409,7 +412,7 @@ __host__ void host_cg_multi_bit_programmable_bootstrap( } // Verify if the grid size satisfies the cooperative group constraints -template +template __host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size( int glwe_dimension, int level_count, int num_samples, uint32_t max_shared_memory) { @@ -436,36 +439,36 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size( cudaOccupancyMaxActiveBlocksPerMultiprocessor( &max_active_blocks_per_sm, (void *)device_multi_bit_programmable_bootstrap_cg_accumulate< - Torus, params, NOSM>, + InputTorus, Torus, params, NOSM>, thds, 0); } else if (max_shared_memory < full_sm_cg_accumulate) { check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_cg_accumulate, + device_multi_bit_programmable_bootstrap_cg_accumulate< + InputTorus, Torus, params, PARTIALSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm_cg_accumulate)); cudaFuncSetCacheConfig( - device_multi_bit_programmable_bootstrap_cg_accumulate, + device_multi_bit_programmable_bootstrap_cg_accumulate< + InputTorus, Torus, params, PARTIALSM>, cudaFuncCachePreferShared); cudaOccupancyMaxActiveBlocksPerMultiprocessor( &max_active_blocks_per_sm, (void *)device_multi_bit_programmable_bootstrap_cg_accumulate< - Torus, params, PARTIALSM>, + InputTorus, Torus, params, PARTIALSM>, thds, partial_sm_cg_accumulate); check_cuda_error(cudaGetLastError()); } else { check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_cg_accumulate, + device_multi_bit_programmable_bootstrap_cg_accumulate, cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_cg_accumulate)); cudaFuncSetCacheConfig( - device_multi_bit_programmable_bootstrap_cg_accumulate, + device_multi_bit_programmable_bootstrap_cg_accumulate, cudaFuncCachePreferShared); cudaOccupancyMaxActiveBlocksPerMultiprocessor( &max_active_blocks_per_sm, (void *)device_multi_bit_programmable_bootstrap_cg_accumulate< - Torus, params, FULLSM>, + InputTorus, Torus, params, FULLSM>, thds, full_sm_cg_accumulate); check_cuda_error(cudaGetLastError()); } @@ -478,39 +481,39 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size( // 
Verify if the grid size for the multi-bit kernel satisfies the cooperative // group constraints -template +template __host__ bool supports_cooperative_groups_on_multibit_programmable_bootstrap( int glwe_dimension, int polynomial_size, int level_count, int num_samples, uint32_t max_shared_memory) { switch (polynomial_size) { case 256: return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size< - Torus, AmortizedDegree<256>>(glwe_dimension, level_count, num_samples, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<256>>( + glwe_dimension, level_count, num_samples, max_shared_memory); case 512: return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size< - Torus, AmortizedDegree<512>>(glwe_dimension, level_count, num_samples, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<512>>( + glwe_dimension, level_count, num_samples, max_shared_memory); case 1024: return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size< - Torus, AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<1024>>( + glwe_dimension, level_count, num_samples, max_shared_memory); case 2048: return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size< - Torus, AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<2048>>( + glwe_dimension, level_count, num_samples, max_shared_memory); case 4096: return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size< - Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<4096>>( + glwe_dimension, level_count, num_samples, max_shared_memory); case 8192: return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size< - Torus, AmortizedDegree<8192>>(glwe_dimension, level_count, num_samples, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<8192>>( + glwe_dimension, level_count, num_samples, max_shared_memory); case 16384: return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size< - Torus, AmortizedDegree<16384>>(glwe_dimension, level_count, num_samples, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<16384>>( + glwe_dimension, level_count, num_samples, max_shared_memory); default: PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. 
Supported " "N's are powers of two" diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu index b95260f33..5ac3913f5 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu @@ -7,18 +7,19 @@ #include -template +template bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory) { - return supports_cooperative_groups_on_programmable_bootstrap( + return supports_cooperative_groups_on_programmable_bootstrap( glwe_dimension, polynomial_size, level_count, num_samples, max_shared_memory); } -template +template bool has_support_to_cuda_programmable_bootstrap_tbc( uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t max_shared_memory) { @@ -28,38 +29,39 @@ bool has_support_to_cuda_programmable_bootstrap_tbc( switch (polynomial_size) { case 256: return supports_thread_block_clusters_on_classic_programmable_bootstrap< - Torus, AmortizedDegree<256>>(num_samples, glwe_dimension, - polynomial_size, level_count, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<256>>(num_samples, glwe_dimension, + polynomial_size, level_count, + max_shared_memory); case 512: return supports_thread_block_clusters_on_classic_programmable_bootstrap< - Torus, AmortizedDegree<512>>(num_samples, glwe_dimension, - polynomial_size, level_count, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<512>>(num_samples, glwe_dimension, + polynomial_size, level_count, + max_shared_memory); case 1024: return supports_thread_block_clusters_on_classic_programmable_bootstrap< - Torus, AmortizedDegree<1024>>(num_samples, glwe_dimension, - polynomial_size, level_count, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<1024>>(num_samples, glwe_dimension, + polynomial_size, level_count, + max_shared_memory); case 2048: return supports_thread_block_clusters_on_classic_programmable_bootstrap< - Torus, Degree<2048>>(num_samples, glwe_dimension, polynomial_size, - level_count, max_shared_memory); + InputTorus, Torus, Degree<2048>>(num_samples, glwe_dimension, + polynomial_size, level_count, + max_shared_memory); case 4096: return supports_thread_block_clusters_on_classic_programmable_bootstrap< - Torus, AmortizedDegree<4096>>(num_samples, glwe_dimension, - polynomial_size, level_count, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<4096>>(num_samples, glwe_dimension, + polynomial_size, level_count, + max_shared_memory); case 8192: return supports_thread_block_clusters_on_classic_programmable_bootstrap< - Torus, AmortizedDegree<8192>>(num_samples, glwe_dimension, - polynomial_size, level_count, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<8192>>(num_samples, glwe_dimension, + polynomial_size, level_count, + max_shared_memory); case 16384: return supports_thread_block_clusters_on_classic_programmable_bootstrap< - Torus, AmortizedDegree<16384>>(num_samples, glwe_dimension, - polynomial_size, level_count, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<16384>>(num_samples, glwe_dimension, + polynomial_size, level_count, + max_shared_memory); default: PANIC("Cuda error (classical PBS): unsupported polynomial size. 
Supported " "N's are powers of two" @@ -71,7 +73,7 @@ bool has_support_to_cuda_programmable_bootstrap_tbc( } #if (CUDA_ARCH >= 900) -template +template uint64_t scratch_cuda_programmable_bootstrap_tbc( void *stream, uint32_t gpu_index, pbs_buffer **pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, @@ -80,37 +82,43 @@ uint64_t scratch_cuda_programmable_bootstrap_tbc( switch (polynomial_size) { case 256: - return scratch_programmable_bootstrap_tbc>( + return scratch_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type); case 512: - return scratch_programmable_bootstrap_tbc>( + return scratch_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type); case 1024: - return scratch_programmable_bootstrap_tbc>( + return scratch_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type); case 2048: - return scratch_programmable_bootstrap_tbc>( + return scratch_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type); case 4096: - return scratch_programmable_bootstrap_tbc>( + return scratch_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type); case 8192: - return scratch_programmable_bootstrap_tbc>( + return scratch_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type); case 16384: - return scratch_programmable_bootstrap_tbc>( + return scratch_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type); @@ -121,11 +129,11 @@ uint64_t scratch_cuda_programmable_bootstrap_tbc( } } -template +template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, Torus *lwe_array_out, Torus const *lwe_output_indexes, Torus const *lut_vector, - Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lut_vector_indexes, InputTorus const *lwe_array_in, Torus const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, @@ -134,7 +142,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( switch (polynomial_size) { case 256: - host_programmable_bootstrap_tbc>( + host_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, @@ -142,7 +150,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( num_many_lut, lut_stride); break; case 512: - host_programmable_bootstrap_tbc>( + host_programmable_bootstrap_tbc>( static_cast(stream), 
gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, @@ -150,7 +158,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( num_many_lut, lut_stride); break; case 1024: - host_programmable_bootstrap_tbc>( + host_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, @@ -158,7 +166,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( num_many_lut, lut_stride); break; case 2048: - host_programmable_bootstrap_tbc>( + host_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, @@ -166,7 +174,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( num_many_lut, lut_stride); break; case 4096: - host_programmable_bootstrap_tbc>( + host_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, @@ -174,7 +182,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( num_many_lut, lut_stride); break; case 8192: - host_programmable_bootstrap_tbc>( + host_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, @@ -182,7 +190,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( num_many_lut, lut_stride); break; case 16384: - host_programmable_bootstrap_tbc>( + host_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, @@ -197,7 +205,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( } #endif -template +template uint64_t scratch_cuda_programmable_bootstrap_cg( void *stream, uint32_t gpu_index, pbs_buffer **pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, @@ -206,37 +214,44 @@ uint64_t scratch_cuda_programmable_bootstrap_cg( switch (polynomial_size) { case 256: - return scratch_programmable_bootstrap_cg>( + return scratch_programmable_bootstrap_cg>( static_cast(stream), gpu_index, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type); case 512: - return scratch_programmable_bootstrap_cg>( + return scratch_programmable_bootstrap_cg>( static_cast(stream), gpu_index, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type); case 1024: - return scratch_programmable_bootstrap_cg>( + return scratch_programmable_bootstrap_cg>( static_cast(stream), gpu_index, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type); case 2048: - return scratch_programmable_bootstrap_cg>( + return scratch_programmable_bootstrap_cg>( static_cast(stream), gpu_index, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type); case 4096: - return 
scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<4096>>(
+    return scratch_programmable_bootstrap_cg<InputTorus, Torus,
+                                             AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        lwe_dimension, glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
  case 8192:
-    return scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<8192>>(
+    return scratch_programmable_bootstrap_cg<InputTorus, Torus,
+                                             AmortizedDegree<8192>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        lwe_dimension, glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
  case 16384:
-    return scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<16384>>(
+    return scratch_programmable_bootstrap_cg<InputTorus, Torus,
+                                             AmortizedDegree<16384>>(
        static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
        lwe_dimension, glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
@@ -247,7 +262,7 @@ uint64_t scratch_cuda_programmable_bootstrap_cg(
  }
}
 
-template <typename Torus>
+template <typename InputTorus, typename Torus>
uint64_t scratch_cuda_programmable_bootstrap(
    void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
@@ -256,37 +271,44 @@ uint64_t scratch_cuda_programmable_bootstrap(
 
  switch (polynomial_size) {
  case 256:
-    return scratch_programmable_bootstrap<Torus, AmortizedDegree<256>>(
+    return scratch_programmable_bootstrap<InputTorus, Torus,
+                                          AmortizedDegree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
        glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
  case 512:
-    return scratch_programmable_bootstrap<Torus, AmortizedDegree<512>>(
+    return scratch_programmable_bootstrap<InputTorus, Torus,
+                                          AmortizedDegree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
        glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
  case 1024:
-    return scratch_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
+    return scratch_programmable_bootstrap<InputTorus, Torus,
+                                          AmortizedDegree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
        glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
  case 2048:
-    return scratch_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
+    return scratch_programmable_bootstrap<InputTorus, Torus,
+                                          AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
        glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
  case 4096:
-    return scratch_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
+    return scratch_programmable_bootstrap<InputTorus, Torus,
+                                          AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
        glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
  case 8192:
-    return scratch_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
+    return scratch_programmable_bootstrap<InputTorus, Torus,
+                                          AmortizedDegree<8192>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
        glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
  case 16384:
-    return scratch_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
+    return scratch_programmable_bootstrap<InputTorus, Torus,
+                                          AmortizedDegree<16384>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
        glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
@@ -299,11 +321,10 @@ uint64_t scratch_cuda_programmable_bootstrap(
 
/*
 * This scratch function allocates the necessary amount of data on the GPU for
- * the classical PBS on 32 bits inputs, into `buffer`. It also
- * configures SM options on the GPU in case FULLSM or PARTIALSM mode is going to
- * be used.
+ * the PBS with 32-bit inputs and 64-bit outputs, into `buffer`. It also
+ * configures SM options on the GPU in case FULLSM or PARTIALSM mode is used.
*/ -uint64_t scratch_cuda_programmable_bootstrap_32( +uint64_t scratch_cuda_programmable_bootstrap_32_64( void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory, @@ -311,25 +332,25 @@ uint64_t scratch_cuda_programmable_bootstrap_32( auto max_shared_memory = cuda_get_max_shared_memory(gpu_index); #if (CUDA_ARCH >= 900) - if (has_support_to_cuda_programmable_bootstrap_tbc( + if (has_support_to_cuda_programmable_bootstrap_tbc( input_lwe_ciphertext_count, glwe_dimension, polynomial_size, level_count, max_shared_memory)) - return scratch_cuda_programmable_bootstrap_tbc( - stream, gpu_index, (pbs_buffer **)buffer, + return scratch_cuda_programmable_bootstrap_tbc( + stream, gpu_index, (pbs_buffer **)buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type); else #endif - if (has_support_to_cuda_programmable_bootstrap_cg( + if (has_support_to_cuda_programmable_bootstrap_cg( glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, max_shared_memory)) - return scratch_cuda_programmable_bootstrap_cg( - stream, gpu_index, (pbs_buffer **)buffer, + return scratch_cuda_programmable_bootstrap_cg( + stream, gpu_index, (pbs_buffer **)buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type); else - return scratch_cuda_programmable_bootstrap( - stream, gpu_index, (pbs_buffer **)buffer, + return scratch_cuda_programmable_bootstrap( + stream, gpu_index, (pbs_buffer **)buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type); } @@ -347,34 +368,34 @@ uint64_t scratch_cuda_programmable_bootstrap_64( auto max_shared_memory = cuda_get_max_shared_memory(gpu_index); #if (CUDA_ARCH >= 900) - if (has_support_to_cuda_programmable_bootstrap_tbc( + if (has_support_to_cuda_programmable_bootstrap_tbc( input_lwe_ciphertext_count, glwe_dimension, polynomial_size, level_count, max_shared_memory)) - return scratch_cuda_programmable_bootstrap_tbc( + return scratch_cuda_programmable_bootstrap_tbc( stream, gpu_index, (pbs_buffer **)buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type); else #endif - if (has_support_to_cuda_programmable_bootstrap_cg( + if (has_support_to_cuda_programmable_bootstrap_cg( glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, max_shared_memory)) - return scratch_cuda_programmable_bootstrap_cg( + return scratch_cuda_programmable_bootstrap_cg( stream, gpu_index, (pbs_buffer **)buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type); else - return scratch_cuda_programmable_bootstrap( + return scratch_cuda_programmable_bootstrap( stream, gpu_index, (pbs_buffer **)buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type); } -template +template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, Torus *lwe_array_out, Torus const *lwe_output_indexes, Torus const *lut_vector, - Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lut_vector_indexes, InputTorus const 
*lwe_array_in, Torus const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, @@ -383,7 +404,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( switch (polynomial_size) { case 256: - host_programmable_bootstrap_cg>( + host_programmable_bootstrap_cg>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, @@ -391,7 +412,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( num_many_lut, lut_stride); break; case 512: - host_programmable_bootstrap_cg>( + host_programmable_bootstrap_cg>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, @@ -399,7 +420,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( num_many_lut, lut_stride); break; case 1024: - host_programmable_bootstrap_cg>( + host_programmable_bootstrap_cg>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, @@ -407,7 +428,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( num_many_lut, lut_stride); break; case 2048: - host_programmable_bootstrap_cg>( + host_programmable_bootstrap_cg>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, @@ -415,7 +436,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( num_many_lut, lut_stride); break; case 4096: - host_programmable_bootstrap_cg>( + host_programmable_bootstrap_cg>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, @@ -423,7 +444,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( num_many_lut, lut_stride); break; case 8192: - host_programmable_bootstrap_cg>( + host_programmable_bootstrap_cg>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, @@ -431,7 +452,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( num_many_lut, lut_stride); break; case 16384: - host_programmable_bootstrap_cg>( + host_programmable_bootstrap_cg>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, @@ -445,11 +466,11 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( } } -template +template void cuda_programmable_bootstrap_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, Torus *lwe_array_out, Torus const *lwe_output_indexes, Torus const *lut_vector, - Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lut_vector_indexes, InputTorus const *lwe_array_in, Torus const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, @@ -458,7 +479,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( switch (polynomial_size) { case 256: - host_programmable_bootstrap>( + host_programmable_bootstrap>( static_cast(stream), 
gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -466,7 +487,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
        num_many_lut, lut_stride);
    break;
  case 512:
-    host_programmable_bootstrap<Torus, AmortizedDegree<512>>(
+    host_programmable_bootstrap<InputTorus, Torus, AmortizedDegree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -474,7 +495,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
        num_many_lut, lut_stride);
    break;
  case 1024:
-    host_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
+    host_programmable_bootstrap<InputTorus, Torus, AmortizedDegree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -482,7 +503,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
        num_many_lut, lut_stride);
    break;
  case 2048:
-    host_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
+    host_programmable_bootstrap<InputTorus, Torus, AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -490,7 +511,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
        num_many_lut, lut_stride);
    break;
  case 4096:
-    host_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
+    host_programmable_bootstrap<InputTorus, Torus, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -498,7 +519,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
        num_many_lut, lut_stride);
    break;
  case 8192:
-    host_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
+    host_programmable_bootstrap<InputTorus, Torus, AmortizedDegree<8192>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -506,7 +527,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
        num_many_lut, lut_stride);
    break;
  case 16384:
-    host_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
+    host_programmable_bootstrap<InputTorus, Torus, AmortizedDegree<16384>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
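Before the wrapper itself, it helps to pin down what the 32/64 suffix pair means for callers: only the input ciphertext list is on the 32-bit torus; LUTs, indexes, outputs and the accumulator the kernels rotate all stay on the 64-bit torus. A minimal host-side sizing sketch under the standard LWE layout assumption (one mask-plus-body row of lwe_dimension + 1 words per sample; names are illustrative):

    // u32 inputs: num_samples rows of (lwe_dimension + 1) u32 words.
    size_t in_bytes =
        (size_t)num_samples * (lwe_dimension + 1) * sizeof(uint32_t);
    // u64 outputs: num_samples rows of (glwe_dimension * polynomial_size + 1)
    // u64 words, i.e. the big-LWE dimension produced by bootstrapping.
    size_t out_bytes = (size_t)num_samples *
                       (glwe_dimension * polynomial_size + 1) *
                       sizeof(uint64_t);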
@@ -522,7 +543,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
 
/* Perform bootstrapping on a batch of input u32 LWE ciphertexts.
 */
-void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
+void cuda_programmable_bootstrap_lwe_ciphertext_vector_32_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_output_indexes, void const *lut_vector,
    void const *lut_vector_indexes, void const *lwe_array_in,
@@ -530,23 +551,24 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
    int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride) {
+  if (base_log > 64)
+    PANIC("Cuda error (classical PBS): base log should be <= 64")
 
-  if (base_log > 32)
-    PANIC("Cuda error (classical PBS): base log should be <= 32")
+  pbs_buffer<uint64_t, CLASSICAL> *buffer =
+      (pbs_buffer<uint64_t, CLASSICAL> *)mem_ptr;
 
-  pbs_buffer<uint32_t, CLASSICAL> *buffer =
-      (pbs_buffer<uint32_t, CLASSICAL> *)mem_ptr;
+  check_cuda_error(cudaGetLastError());
 
  switch (buffer->pbs_variant) {
-  case TBC:
-#if CUDA_ARCH >= 900
-    cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t>(
-        stream, gpu_index, static_cast<uint32_t *>(lwe_array_out),
-        static_cast<const uint32_t *>(lwe_output_indexes),
-        static_cast<const uint32_t *>(lut_vector),
-        static_cast<const uint32_t *>(lut_vector_indexes),
+  case PBS_VARIANT::TBC:
+#if (CUDA_ARCH >= 900)
+    cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t, uint64_t>(
+        stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
+        static_cast<const uint64_t *>(lwe_output_indexes),
+        static_cast<const uint64_t *>(lut_vector),
+        static_cast<const uint64_t *>(lut_vector_indexes),
        static_cast<const uint32_t *>(lwe_array_in),
-        static_cast<const uint32_t *>(lwe_input_indexes),
+        static_cast<const uint64_t *>(lwe_input_indexes),
        static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
        glwe_dimension, polynomial_size, base_log, level_count, num_samples,
        num_many_lut, lut_stride);
@@ -554,26 +576,26 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
#else
    PANIC("Cuda error (PBS): TBC pbs is not supported.")
#endif
-  case CG:
-    cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint32_t>(
-        stream, gpu_index, static_cast<uint32_t *>(lwe_array_out),
-        static_cast<const uint32_t *>(lwe_output_indexes),
-        static_cast<const uint32_t *>(lut_vector),
-        static_cast<const uint32_t *>(lut_vector_indexes),
+  case PBS_VARIANT::CG:
+    cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint32_t, uint64_t>(
+        stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
+        static_cast<const uint64_t *>(lwe_output_indexes),
+        static_cast<const uint64_t *>(lut_vector),
+        static_cast<const uint64_t *>(lut_vector_indexes),
        static_cast<const uint32_t *>(lwe_array_in),
-        static_cast<const uint32_t *>(lwe_input_indexes),
+        static_cast<const uint64_t *>(lwe_input_indexes),
        static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
        glwe_dimension, polynomial_size, base_log, level_count, num_samples,
        num_many_lut, lut_stride);
    break;
-  case DEFAULT:
-    cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
-        stream, gpu_index, static_cast<uint32_t *>(lwe_array_out),
-        static_cast<const uint32_t *>(lwe_output_indexes),
-        static_cast<const uint32_t *>(lut_vector),
-        static_cast<const uint32_t *>(lut_vector_indexes),
+  case PBS_VARIANT::DEFAULT:
+    cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t, uint64_t>(
+        stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
+        static_cast<const uint64_t *>(lwe_output_indexes),
+        static_cast<const uint64_t *>(lut_vector),
+        static_cast<const uint64_t *>(lut_vector_indexes),
        static_cast<const uint32_t *>(lwe_array_in),
-        static_cast<const uint32_t *>(lwe_input_indexes),
+        static_cast<const uint64_t *>(lwe_input_indexes),
        static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
        glwe_dimension, polynomial_size, base_log, level_count, num_samples,
        num_many_lut, lut_stride);
@@ -644,7 +666,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
 * - the constant memory (64K) is used for storing the roots of identity
 *   values for the FFT
 */
-void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
+void cuda_programmable_bootstrap_lwe_ciphertext_vector_64_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const
*lwe_output_indexes, void const *lut_vector, void const *lut_vector_indexes, void const *lwe_array_in, @@ -663,7 +685,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( switch (buffer->pbs_variant) { case PBS_VARIANT::TBC: #if (CUDA_ARCH >= 900) - cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( + cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( stream, gpu_index, static_cast(lwe_array_out), static_cast(lwe_output_indexes), static_cast(lut_vector), @@ -678,7 +700,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( PANIC("Cuda error (PBS): TBC pbs is not supported.") #endif case PBS_VARIANT::CG: - cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( + cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( stream, gpu_index, static_cast(lwe_array_out), static_cast(lwe_output_indexes), static_cast(lut_vector), @@ -690,7 +712,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( num_many_lut, lut_stride); break; case PBS_VARIANT::DEFAULT: - cuda_programmable_bootstrap_lwe_ciphertext_vector( + cuda_programmable_bootstrap_lwe_ciphertext_vector( stream, gpu_index, static_cast(lwe_array_out), static_cast(lwe_output_indexes), static_cast(lut_vector), @@ -705,7 +727,6 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( PANIC("Cuda error (PBS): unknown pbs variant.") } } - /* * This cleanup function frees the data on GPU for the PBS buffer for 32 or 64 * bits inputs. @@ -718,11 +739,16 @@ void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index, *buffer = nullptr; } -template bool has_support_to_cuda_programmable_bootstrap_cg( +template bool has_support_to_cuda_programmable_bootstrap_cg( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory); -template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( +template bool has_support_to_cuda_programmable_bootstrap_cg( + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, + uint32_t num_samples, uint32_t max_shared_memory); + +template void +cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, uint64_t *lwe_array_out, uint64_t const *lwe_output_indexes, uint64_t const *lut_vector, uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in, @@ -732,7 +758,8 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride); -template void cuda_programmable_bootstrap_lwe_ciphertext_vector( +template void +cuda_programmable_bootstrap_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, uint64_t *lwe_array_out, uint64_t const *lwe_output_indexes, uint64_t const *lut_vector, uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in, @@ -742,14 +769,14 @@ template void cuda_programmable_bootstrap_lwe_ciphertext_vector( uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride); -template uint64_t scratch_cuda_programmable_bootstrap_cg( +template uint64_t scratch_cuda_programmable_bootstrap_cg( void *stream, uint32_t gpu_index, pbs_buffer **pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type); -template uint64_t scratch_cuda_programmable_bootstrap( +template uint64_t scratch_cuda_programmable_bootstrap( void *stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t 
lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t input_lwe_ciphertext_count, @@ -765,38 +792,31 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride); -template void cuda_programmable_bootstrap_lwe_ciphertext_vector( - void *stream, uint32_t gpu_index, uint32_t *lwe_array_out, - uint32_t const *lwe_output_indexes, uint32_t const *lut_vector, - uint32_t const *lut_vector_indexes, uint32_t const *lwe_array_in, - uint32_t const *lwe_input_indexes, double2 const *bootstrapping_key, - pbs_buffer *pbs_buffer, uint32_t lwe_dimension, - uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, - uint32_t lut_stride); - -template uint64_t scratch_cuda_programmable_bootstrap_cg( +template uint64_t scratch_cuda_programmable_bootstrap_cg( void *stream, uint32_t gpu_index, - pbs_buffer **pbs_buffer, uint32_t lwe_dimension, + pbs_buffer **pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type); -template uint64_t scratch_cuda_programmable_bootstrap( +template uint64_t scratch_cuda_programmable_bootstrap( void *stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type); -template bool has_support_to_cuda_programmable_bootstrap_tbc( +template bool +has_support_to_cuda_programmable_bootstrap_tbc( uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t max_shared_memory); -template bool has_support_to_cuda_programmable_bootstrap_tbc( +template bool +has_support_to_cuda_programmable_bootstrap_tbc( uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t max_shared_memory); #if CUDA_ARCH >= 900 -template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( +template void +cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, uint32_t *lwe_array_out, uint32_t const *lwe_output_indexes, uint32_t const *lut_vector, uint32_t const *lut_vector_indexes, uint32_t const *lwe_array_in, @@ -805,7 +825,8 @@ template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride); -template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( +template void +cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, uint64_t *lwe_array_out, uint64_t const *lwe_output_indexes, uint64_t const *lut_vector, uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in, @@ -814,18 +835,35 @@ template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride); -template uint64_t scratch_cuda_programmable_bootstrap_tbc( +template void +cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( + void *stream, uint32_t gpu_index, uint64_t *lwe_array_out, + uint64_t const 
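The `template void ...` and `template uint64_t ...` lines in this region are explicit instantiation definitions: the templated host functions are defined in .cu/.cuh translation units, so each (InputTorus, Torus) combination reachable from the untyped C entry points has to be materialized here by hand. A toy, self-contained illustration of the same pattern; `convert_sample` is hypothetical, not part of the backend:

#include <cstddef>
#include <cstdint>

// Generic conversion helper, templated the same way as the PBS hosts.
template <typename InputTorus, typename Torus>
void convert_sample(Torus *out, InputTorus const *in, size_t n) {
  for (size_t i = 0; i < n; i++)
    // Embed a narrower torus element into a wider one (shift is 0 when the
    // two types coincide).
    out[i] = static_cast<Torus>(in[i])
             << (8 * (sizeof(Torus) - sizeof(InputTorus)));
}

// One explicit instantiation per pair used by the C API, mirroring the
// <uint32_t, uint64_t> additions above.
template void convert_sample<uint32_t, uint64_t>(uint64_t *, uint32_t const *,
                                                 size_t);
template void convert_sample<uint64_t, uint64_t>(uint64_t *, uint64_t const *,
                                                 size_t);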
*lwe_output_indexes, uint64_t const *lut_vector, + uint64_t const *lut_vector_indexes, uint32_t const *lwe_array_in, + uint64_t const *lwe_input_indexes, double2 const *bootstrapping_key, + pbs_buffer *buffer, uint32_t lwe_dimension, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, + uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, + uint32_t lut_stride); +template uint64_t scratch_cuda_programmable_bootstrap_tbc( void *stream, uint32_t gpu_index, pbs_buffer **pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type); -template uint64_t scratch_cuda_programmable_bootstrap_tbc( +template uint64_t scratch_cuda_programmable_bootstrap_tbc( void *stream, uint32_t gpu_index, pbs_buffer **pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type); +template uint64_t scratch_cuda_programmable_bootstrap_tbc( + void *stream, uint32_t gpu_index, + pbs_buffer **pbs_buffer, uint32_t lwe_dimension, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory, + PBS_MS_REDUCTION_T noise_reduction_type); + template bool supports_distributed_shared_memory_on_classic_programmable_bootstrap< __uint128_t>(uint32_t polynomial_size, uint32_t max_shared_memory); diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh index 79e389056..d52a951b4 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh @@ -17,12 +17,13 @@ #include "polynomial/polynomial_math.cuh" #include "types/complex/operations.cuh" -template +template __global__ void __launch_bounds__(params::degree / params::opt) device_programmable_bootstrap_step_one( const Torus *__restrict__ lut_vector, const Torus *__restrict__ lut_vector_indexes, - const Torus *__restrict__ lwe_array_in, + const InputTorus *__restrict__ lwe_array_in, const Torus *__restrict__ lwe_input_indexes, Torus *global_accumulator, double2 *global_join_buffer, uint32_t lwe_iteration, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, @@ -55,7 +56,7 @@ __global__ void __launch_bounds__(params::degree / params::opt) // The third dimension of the block is used to determine on which ciphertext // this block is operating, in the case of batch bootstraps - const Torus *block_lwe_array_in = + const InputTorus *block_lwe_array_in = &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)]; const Torus *block_lut_vector = @@ -75,8 +76,8 @@ __global__ void __launch_bounds__(params::degree / params::opt) // First iteration // Put "b" in [0, 2N[ constexpr auto log_modulus = params::log2_degree + 1; - Torus b_hat = 0; - Torus correction = 0; + InputTorus b_hat = 0; + InputTorus correction = 0; if (noise_reduction_type == PBS_MS_REDUCTION_T::CENTERED) { correction = centered_binary_modulus_switch_body_correction_to_add( block_lwe_array_in, lwe_dimension, log_modulus); @@ -86,6 +87,7 @@ __global__ void __launch_bounds__(params::degree / params::opt) // The y-dimension is used to select the element of the GLWE this block will // 
compute + // b_hat is cast from InputTorus (i.e. 64b or 32b) to uint32_t here divide_by_monomial_negacyclic_inplace( accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat, @@ -100,7 +102,7 @@ __global__ void __launch_bounds__(params::degree / params::opt) } // Put "a" in [0, 2N[ - Torus a_hat = 0; + InputTorus a_hat = 0; modulus_switch(block_lwe_array_in[lwe_iteration], a_hat, params::log2_degree + 1); // 2 * params::log2_degree + 1); @@ -307,7 +309,7 @@ uint64_t get_buffer_size_programmable_bootstrap( return buffer_size + buffer_size % sizeof(double2); } -template +template __host__ uint64_t scratch_programmable_bootstrap( cudaStream_t stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t lwe_dimension, @@ -331,30 +333,38 @@ __host__ uint64_t scratch_programmable_bootstrap( // Configure step one if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_one) { check_cuda_error(cudaFuncSetAttribute( - device_programmable_bootstrap_step_one, + device_programmable_bootstrap_step_one, cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm)); cudaFuncSetCacheConfig( - device_programmable_bootstrap_step_one, + device_programmable_bootstrap_step_one, cudaFuncCachePreferShared); check_cuda_error(cudaFuncSetAttribute( - device_programmable_bootstrap_step_one, + device_programmable_bootstrap_step_one, cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm)); cudaFuncSetCacheConfig( - device_programmable_bootstrap_step_one, + device_programmable_bootstrap_step_one, cudaFuncCachePreferShared); check_cuda_error(cudaGetLastError()); } else if (max_shared_memory >= partial_sm) { check_cuda_error(cudaFuncSetAttribute( - device_programmable_bootstrap_step_one, + device_programmable_bootstrap_step_one, cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_one)); cudaFuncSetCacheConfig( - device_programmable_bootstrap_step_one, + device_programmable_bootstrap_step_one, cudaFuncCachePreferShared); check_cuda_error(cudaFuncSetAttribute( - device_programmable_bootstrap_step_one, + device_programmable_bootstrap_step_one, cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_one)); cudaFuncSetCacheConfig( - device_programmable_bootstrap_step_one, + device_programmable_bootstrap_step_one, cudaFuncCachePreferShared); check_cuda_error(cudaGetLastError()); } @@ -398,10 +408,10 @@ __host__ uint64_t scratch_programmable_bootstrap( return size_tracker; } -template +template __host__ void execute_step_one( cudaStream_t stream, uint32_t gpu_index, Torus const *lut_vector, - Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lut_vector_indexes, InputTorus const *lwe_array_in, Torus const *lwe_input_indexes, double2 const *bootstrapping_key, Torus *global_accumulator, double2 *global_join_buffer, uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension, @@ -416,21 +426,24 @@ __host__ void execute_step_one( dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1, level_count); if (max_shared_memory < partial_sm) { - device_programmable_bootstrap_step_one + device_programmable_bootstrap_step_one <<>>( lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, global_accumulator, global_join_buffer, lwe_iteration, lwe_dimension, polynomial_size, base_log, level_count, d_mem, full_dm, noise_reduction_type); } else if (max_shared_memory < full_sm) { - device_programmable_bootstrap_step_one + device_programmable_bootstrap_step_one <<>>( lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, global_accumulator, global_join_buffer, lwe_iteration, 
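Both `a_hat` and `b_hat` above come out of `modulus_switch`, which reduces a torus element living modulo 2^w into [0, 2N) by keeping its top log2(2N) bits with rounding. A minimal stand-alone sketch of that operation, assuming the usual rounding definition and log_modulus < w; the backend's actual helper lives in its crypto headers and may differ in detail:

#include <cstdint>

// Rounding modulus switch: keep the top `log_modulus` bits of `input`,
// rounding to nearest by adding half of the dropped range first.
template <typename InputTorus>
__host__ __device__ inline void
modulus_switch_sketch(InputTorus input, InputTorus &output,
                      uint32_t log_modulus) {
  constexpr uint32_t w = sizeof(InputTorus) * 8;
  output = (input + (InputTorus(1) << (w - log_modulus - 1))) >>
           (w - log_modulus);
}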
lwe_dimension, polynomial_size, base_log, level_count, d_mem, partial_dm, noise_reduction_type); } else { - device_programmable_bootstrap_step_one + device_programmable_bootstrap_step_one <<>>( lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, global_accumulator, global_join_buffer, lwe_iteration, @@ -484,11 +497,11 @@ __host__ void execute_step_two( /* * Host wrapper to the programmable bootstrap */ -template +template __host__ void host_programmable_bootstrap( cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, Torus const *lwe_output_indexes, Torus const *lut_vector, - Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lut_vector_indexes, InputTorus const *lwe_array_in, Torus const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, @@ -520,7 +533,7 @@ __host__ void host_programmable_bootstrap( for (int i = 0; i < lwe_dimension; i++) { if (i == 0) { - execute_step_one( + execute_step_one( stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, global_accumulator, global_join_buffer, input_lwe_ciphertext_count, lwe_dimension, @@ -528,7 +541,7 @@ __host__ void host_programmable_bootstrap( partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one, noise_reduction_type); } else { - execute_step_one( + execute_step_one( stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, global_accumulator, global_join_buffer, input_lwe_ciphertext_count, lwe_dimension, diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu index 0a37ca66e..498ea0bd7 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu @@ -7,15 +7,16 @@ #include "programmable_bootstrap_tbc_multibit.cuh" #endif +// Used in CPP benchmarks, not used in rust code bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory) { return supports_cooperative_groups_on_multibit_programmable_bootstrap< - uint64_t>(glwe_dimension, polynomial_size, level_count, num_samples, - max_shared_memory); + uint64_t, uint64_t>(glwe_dimension, polynomial_size, level_count, + num_samples, max_shared_memory); } -template +template bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t max_shared_memory) { @@ -26,39 +27,39 @@ bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( switch (polynomial_size) { case 256: return supports_thread_block_clusters_on_multibit_programmable_bootstrap< - Torus, AmortizedDegree<256>>(num_samples, glwe_dimension, - polynomial_size, level_count, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<256>>(num_samples, glwe_dimension, + polynomial_size, level_count, + max_shared_memory); case 512: return supports_thread_block_clusters_on_multibit_programmable_bootstrap< - Torus, AmortizedDegree<512>>(num_samples, glwe_dimension, - polynomial_size, level_count, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<512>>(num_samples, glwe_dimension, + polynomial_size, level_count, + 
max_shared_memory); case 1024: return supports_thread_block_clusters_on_multibit_programmable_bootstrap< - Torus, AmortizedDegree<1024>>(num_samples, glwe_dimension, - polynomial_size, level_count, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<1024>>(num_samples, glwe_dimension, + polynomial_size, level_count, + max_shared_memory); case 2048: return supports_thread_block_clusters_on_multibit_programmable_bootstrap< - Torus, AmortizedDegree<2048>>(num_samples, glwe_dimension, - polynomial_size, level_count, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<2048>>(num_samples, glwe_dimension, + polynomial_size, level_count, + max_shared_memory); case 4096: return supports_thread_block_clusters_on_multibit_programmable_bootstrap< - Torus, AmortizedDegree<4096>>(num_samples, glwe_dimension, - polynomial_size, level_count, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<4096>>(num_samples, glwe_dimension, + polynomial_size, level_count, + max_shared_memory); case 8192: return supports_thread_block_clusters_on_multibit_programmable_bootstrap< - Torus, AmortizedDegree<8192>>(num_samples, glwe_dimension, - polynomial_size, level_count, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<8192>>(num_samples, glwe_dimension, + polynomial_size, level_count, + max_shared_memory); case 16384: return supports_thread_block_clusters_on_multibit_programmable_bootstrap< - Torus, AmortizedDegree<16384>>(num_samples, glwe_dimension, - polynomial_size, level_count, - max_shared_memory); + InputTorus, Torus, AmortizedDegree<16384>>(num_samples, glwe_dimension, + polynomial_size, level_count, + max_shared_memory); default: PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported " "N's are powers of two" @@ -69,11 +70,11 @@ bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( #endif } -template +template void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, Torus *lwe_array_out, Torus const *lwe_output_indexes, Torus const *lut_vector, - Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lut_vector_indexes, InputTorus const *lwe_array_in, Torus const *lwe_input_indexes, Torus const *bootstrapping_key, pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, @@ -82,7 +83,8 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( switch (polynomial_size) { case 256: - host_cg_multi_bit_programmable_bootstrap>( + host_cg_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, @@ -90,7 +92,8 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( num_samples, num_many_lut, lut_stride); break; case 512: - host_cg_multi_bit_programmable_bootstrap>( + host_cg_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, @@ -98,7 +101,8 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( num_samples, num_many_lut, lut_stride); break; case 1024: - host_cg_multi_bit_programmable_bootstrap>( + host_cg_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, 
bootstrapping_key, pbs_buffer, glwe_dimension, @@ -106,7 +110,8 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( num_samples, num_many_lut, lut_stride); break; case 2048: - host_cg_multi_bit_programmable_bootstrap>( + host_cg_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, @@ -114,7 +119,8 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( num_samples, num_many_lut, lut_stride); break; case 4096: - host_cg_multi_bit_programmable_bootstrap>( + host_cg_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, @@ -122,7 +128,8 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( num_samples, num_many_lut, lut_stride); break; case 8192: - host_cg_multi_bit_programmable_bootstrap>( + host_cg_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, @@ -130,7 +137,8 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( num_samples, num_many_lut, lut_stride); break; case 16384: - host_cg_multi_bit_programmable_bootstrap>( + host_cg_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, @@ -144,11 +152,11 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( } } -template +template void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, Torus *lwe_array_out, Torus const *lwe_output_indexes, Torus const *lut_vector, - Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lut_vector_indexes, InputTorus const *lwe_array_in, Torus const *lwe_input_indexes, Torus const *bootstrapping_key, pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, @@ -157,7 +165,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( switch (polynomial_size) { case 256: - host_multi_bit_programmable_bootstrap>( + host_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, @@ -165,7 +174,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( num_samples, num_many_lut, lut_stride); break; case 512: - host_multi_bit_programmable_bootstrap>( + host_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, @@ -173,7 +183,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( num_samples, num_many_lut, lut_stride); break; case 1024: - host_multi_bit_programmable_bootstrap>( + host_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, @@ -181,7 +192,8 @@ void 
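`params::degree` has to be a compile-time constant (it feeds `__launch_bounds__` and the per-thread coefficient count `params::opt`), which is why every entry point funnels the runtime `polynomial_size` through a switch with one explicit instantiation per supported N. The same pattern in a toy, stand-alone form; `run_for_degree` is hypothetical:

#include <cstdint>
#include <stdexcept>

template <uint32_t N> void run_for_degree() { /* kernel launch for degree N */ }

// Map a runtime polynomial size onto a compile-time template parameter.
inline void dispatch_degree(uint32_t polynomial_size) {
  switch (polynomial_size) {
  case 256:   run_for_degree<256>();   break;
  case 512:   run_for_degree<512>();   break;
  case 1024:  run_for_degree<1024>();  break;
  case 2048:  run_for_degree<2048>();  break;
  case 4096:  run_for_degree<4096>();  break;
  case 8192:  run_for_degree<8192>();  break;
  case 16384: run_for_degree<16384>(); break;
  default:
    throw std::invalid_argument("unsupported polynomial size");
  }
}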
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( num_samples, num_many_lut, lut_stride); break; case 2048: - host_multi_bit_programmable_bootstrap>( + host_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, @@ -189,7 +201,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( num_samples, num_many_lut, lut_stride); break; case 4096: - host_multi_bit_programmable_bootstrap>( + host_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, @@ -197,7 +210,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( num_samples, num_many_lut, lut_stride); break; case 8192: - host_multi_bit_programmable_bootstrap>( + host_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, @@ -205,7 +219,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( num_samples, num_many_lut, lut_stride); break; case 16384: - host_multi_bit_programmable_bootstrap>( + host_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, @@ -239,7 +254,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( switch (buffer->pbs_variant) { case PBS_VARIANT::TBC: #if CUDA_ARCH >= 900 - cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( + cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( stream, gpu_index, static_cast(lwe_array_out), static_cast(lwe_output_indexes), static_cast(lut_vector), @@ -254,7 +270,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.") #endif case PBS_VARIANT::CG: - cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( + cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( stream, gpu_index, static_cast(lwe_array_out), static_cast(lwe_output_indexes), static_cast(lut_vector), @@ -266,7 +283,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( num_samples, num_many_lut, lut_stride); break; case PBS_VARIANT::DEFAULT: - cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( + cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( stream, gpu_index, static_cast(lwe_array_out), static_cast(lwe_output_indexes), static_cast(lut_vector), @@ -282,7 +300,73 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( } } -template +void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_32_64( + void *stream, uint32_t gpu_index, void *lwe_array_out, + void const *lwe_output_indexes, void const *lut_vector, + void const *lut_vector_indexes, void const *lwe_array_in, + void const *lwe_input_indexes, void const *bootstrapping_key, + int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, + uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, + uint32_t lut_stride) { + + PANIC_IF_FALSE(base_log <= 64, + "Cuda error (multi-bit PBS): base log (%d) 
should be <= 64", + base_log); + + pbs_buffer *buffer = + (pbs_buffer *)mem_ptr; + + switch (buffer->pbs_variant) { + case PBS_VARIANT::TBC: +#if CUDA_ARCH >= 900 + cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( + stream, gpu_index, static_cast(lwe_array_out), + static_cast(lwe_output_indexes), + static_cast(lut_vector), + static_cast(lut_vector_indexes), + static_cast(lwe_array_in), + static_cast(lwe_input_indexes), + static_cast(bootstrapping_key), buffer, lwe_dimension, + glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, + num_samples, num_many_lut, lut_stride); + break; +#else + PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.") +#endif + case PBS_VARIANT::CG: + cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( + stream, gpu_index, static_cast(lwe_array_out), + static_cast(lwe_output_indexes), + static_cast(lut_vector), + static_cast(lut_vector_indexes), + static_cast(lwe_array_in), + static_cast(lwe_input_indexes), + static_cast(bootstrapping_key), buffer, lwe_dimension, + glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, + num_samples, num_many_lut, lut_stride); + break; + case PBS_VARIANT::DEFAULT: + cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( + stream, gpu_index, static_cast(lwe_array_out), + static_cast(lwe_output_indexes), + static_cast(lut_vector), + static_cast(lut_vector_indexes), + static_cast(lwe_array_in), + static_cast(lwe_input_indexes), + static_cast(bootstrapping_key), buffer, lwe_dimension, + glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, + num_samples, num_many_lut, lut_stride); + break; + default: + PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.") + } +} + +template uint64_t scratch_cuda_cg_multi_bit_programmable_bootstrap( void *stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, @@ -290,43 +374,43 @@ uint64_t scratch_cuda_cg_multi_bit_programmable_bootstrap( switch (polynomial_size) { case 256: - return scratch_cg_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory); case 512: - return scratch_cg_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory); case 1024: - return scratch_cg_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory); case 2048: - return scratch_cg_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory); case 4096: - return scratch_cg_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory); case 8192: - return scratch_cg_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory); case 16384: - return scratch_cg_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, @@ -338,7 +422,7 @@ uint64_t scratch_cuda_cg_multi_bit_programmable_bootstrap( } } -template 
+template uint64_t scratch_cuda_multi_bit_programmable_bootstrap( void *stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, @@ -346,43 +430,43 @@ uint64_t scratch_cuda_multi_bit_programmable_bootstrap( switch (polynomial_size) { case 256: - return scratch_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory); case 512: - return scratch_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory); case 1024: - return scratch_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory); case 2048: - return scratch_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory); case 4096: - return scratch_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory); case 8192: - return scratch_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory); case 16384: - return scratch_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, @@ -400,7 +484,8 @@ uint64_t scratch_cuda_multi_bit_programmable_bootstrap_64( uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { bool supports_cg = - supports_cooperative_groups_on_multibit_programmable_bootstrap( + supports_cooperative_groups_on_multibit_programmable_bootstrap( glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index)); #if (CUDA_ARCH >= 900) @@ -412,24 +497,70 @@ uint64_t scratch_cuda_multi_bit_programmable_bootstrap_64( &num_sms, cudaDevAttrMultiProcessorCount, gpu_index)); bool supports_tbc = - has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( + has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( input_lwe_ciphertext_count, glwe_dimension, polynomial_size, level_count, cuda_get_max_shared_memory(gpu_index)); if (supports_tbc) - return scratch_cuda_tbc_multi_bit_programmable_bootstrap( + return scratch_cuda_tbc_multi_bit_programmable_bootstrap( stream, gpu_index, (pbs_buffer **)buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory); else #endif if (supports_cg) - return scratch_cuda_cg_multi_bit_programmable_bootstrap( + return scratch_cuda_cg_multi_bit_programmable_bootstrap( stream, gpu_index, (pbs_buffer **)buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory); else - return scratch_cuda_multi_bit_programmable_bootstrap( + return scratch_cuda_multi_bit_programmable_bootstrap( + stream, gpu_index, (pbs_buffer **)buffer, + glwe_dimension, polynomial_size, level_count, + input_lwe_ciphertext_count, allocate_gpu_memory); +} + +uint64_t scratch_cuda_multi_bit_programmable_bootstrap_32_64( + void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t level_count, + uint32_t input_lwe_ciphertext_count, bool 
    allocate_gpu_memory) {
+
+  bool supports_cg =
+      supports_cooperative_groups_on_multibit_programmable_bootstrap<
+          uint32_t, uint64_t>(glwe_dimension, polynomial_size, level_count,
+                              input_lwe_ciphertext_count,
+                              cuda_get_max_shared_memory(gpu_index));
+#if (CUDA_ARCH >= 900)
+  // On H100s, TBC should be used while num_samples < num_sms / 2; past that
+  // point we switch to CG for as long as it is supported, and fall back to
+  // TBC once it no longer is.
+  int num_sms = 0;
+  check_cuda_error(cudaDeviceGetAttribute(
+      &num_sms, cudaDevAttrMultiProcessorCount, gpu_index));
+
+  bool supports_tbc =
+      has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint32_t,
+                                                               uint64_t>(
+          input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
+          level_count, cuda_get_max_shared_memory(gpu_index));
+
+  if (supports_tbc)
+    return scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint32_t,
+                                                             uint64_t>(
+        stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
+        glwe_dimension, polynomial_size, level_count,
+        input_lwe_ciphertext_count, allocate_gpu_memory);
+  else
+#endif
+    if (supports_cg)
+    return scratch_cuda_cg_multi_bit_programmable_bootstrap<uint32_t,
+                                                            uint64_t>(
+        stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
+        glwe_dimension, polynomial_size, level_count,
+        input_lwe_ciphertext_count, allocate_gpu_memory);
+  else
+    return scratch_cuda_multi_bit_programmable_bootstrap<uint32_t, uint64_t>(
         stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
         glwe_dimension, polynomial_size, level_count,
         input_lwe_ciphertext_count, allocate_gpu_memory);
@@ -455,7 +586,7 @@ void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
  * The value 13 was empirically determined based on memory requirements for
  * benchmarking on an RTX 4090 GPU, balancing performance and resource use.
  */
-template <typename Torus, class params>
+template <typename InputTorus, typename Torus, class params>
 uint64_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
                             uint32_t polynomial_size, uint32_t glwe_dimension,
                             uint32_t level_count, uint64_t full_sm_keybundle) {
@@ -466,13 +597,14 @@ uint64_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
   if (max_shared_memory < full_sm_keybundle)
     cudaOccupancyMaxActiveBlocksPerMultiprocessor(
         &max_blocks_per_sm,
-        device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
+        device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
+                                                          params, NOSM>,
         polynomial_size / params::opt, full_sm_keybundle);
   else
     cudaOccupancyMaxActiveBlocksPerMultiprocessor(
         &max_blocks_per_sm,
-        device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
-                                                          FULLSM>,
+        device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
+                                                          params, FULLSM>,
         polynomial_size / params::opt, 0);
 
   int num_sms = 0;
@@ -535,14 +667,15 @@ uint64_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
   return (max_num_chunks > divisor) ?
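Distilling the branch structure of both scratch functions: the variant policy is identical for the 64→64 and 32→64 layouts, only the template arguments differ. A condensed, illustrative form, not a function in the backend:

// The two `supports_*` flags are computed by the helpers used above.
PBS_VARIANT select_multi_bit_variant_sketch(bool supports_tbc,
                                            bool supports_cg) {
#if CUDA_ARCH >= 900
  if (supports_tbc)
    return PBS_VARIANT::TBC; // Hopper+: thread block clusters first
#endif
  if (supports_cg)
    return PBS_VARIANT::CG; // cooperative groups as the next-best launch mode
  return PBS_VARIANT::DEFAULT; // always-available fallback
}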
divisor : max_num_chunks; } -template uint64_t scratch_cuda_multi_bit_programmable_bootstrap( +template uint64_t +scratch_cuda_multi_bit_programmable_bootstrap( void *stream, uint32_t gpu_index, pbs_buffer **pbs_buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory); template void -cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( +cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, uint64_t *lwe_array_out, uint64_t const *lwe_output_indexes, uint64_t const *lut_vector, uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in, @@ -552,14 +685,16 @@ cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride); -template uint64_t scratch_cuda_cg_multi_bit_programmable_bootstrap( +template uint64_t +scratch_cuda_cg_multi_bit_programmable_bootstrap( void *stream, uint32_t gpu_index, pbs_buffer **pbs_buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory); template void -cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( +cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, uint64_t *lwe_array_out, uint64_t const *lwe_output_indexes, uint64_t const *lut_vector, uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in, @@ -570,12 +705,12 @@ cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( uint32_t num_many_lut, uint32_t lut_stride); template bool -has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( +has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t max_shared_memory); #if (CUDA_ARCH >= 900) -template +template uint64_t scratch_cuda_tbc_multi_bit_programmable_bootstrap( void *stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, @@ -583,43 +718,43 @@ uint64_t scratch_cuda_tbc_multi_bit_programmable_bootstrap( switch (polynomial_size) { case 256: - return scratch_tbc_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory); case 512: - return scratch_tbc_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory); case 1024: - return scratch_tbc_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory); case 2048: - return scratch_tbc_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory); case 4096: - return scratch_tbc_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory); case 8192: - return scratch_tbc_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, allocate_gpu_memory); case 16384: - return scratch_tbc_multi_bit_programmable_bootstrap>( 
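In `get_lwe_chunk_size` above, the chunk budget is derived from kernel occupancy: how many keybundle blocks fit on one SM, times the SM count. A hedged sketch of that query; the kernel and helper names here are placeholders:

#include <cuda_runtime.h>

__global__ void keybundle_kernel_placeholder() {}

// Upper bound on concurrently resident keybundle blocks for this GPU.
inline uint64_t chunk_budget_sketch(uint32_t gpu_index, int block_size,
                                    size_t dynamic_smem_bytes) {
  int max_blocks_per_sm = 0;
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(
      &max_blocks_per_sm, keybundle_kernel_placeholder, block_size,
      dynamic_smem_bytes);
  int num_sms = 0;
  cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, gpu_index);
  return static_cast<uint64_t>(max_blocks_per_sm) * num_sms;
}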
static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, @@ -630,11 +765,11 @@ uint64_t scratch_cuda_tbc_multi_bit_programmable_bootstrap( " in the interval [256..16384].") } } -template +template void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, Torus *lwe_array_out, Torus const *lwe_output_indexes, Torus const *lut_vector, - Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lut_vector_indexes, InputTorus const *lwe_array_in, Torus const *lwe_input_indexes, Torus const *bootstrapping_key, pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, @@ -646,7 +781,8 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( switch (polynomial_size) { case 256: - host_tbc_multi_bit_programmable_bootstrap>( + host_tbc_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, @@ -654,7 +790,8 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( num_samples, num_many_lut, lut_stride); break; case 512: - host_tbc_multi_bit_programmable_bootstrap>( + host_tbc_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, @@ -662,7 +799,8 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( num_samples, num_many_lut, lut_stride); break; case 1024: - host_tbc_multi_bit_programmable_bootstrap>( + host_tbc_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, @@ -675,14 +813,16 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( &num_sms, cudaDevAttrMultiProcessorCount, gpu_index)); if (4 * num_sms < num_samples * level_count * (glwe_dimension + 1)) - host_tbc_multi_bit_programmable_bootstrap>( + host_tbc_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, num_samples, num_many_lut, lut_stride); else - host_tbc_multi_bit_programmable_bootstrap>( + host_tbc_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, @@ -692,7 +832,8 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( break; } case 4096: - host_tbc_multi_bit_programmable_bootstrap>( + host_tbc_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, @@ -700,7 +841,8 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( num_samples, num_many_lut, lut_stride); break; case 8192: - host_tbc_multi_bit_programmable_bootstrap>( + host_tbc_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, 
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, @@ -708,7 +850,8 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( num_samples, num_many_lut, lut_stride); break; case 16384: - host_tbc_multi_bit_programmable_bootstrap>( + host_tbc_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, @@ -722,13 +865,15 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( } } -template uint64_t scratch_cuda_tbc_multi_bit_programmable_bootstrap( +template uint64_t +scratch_cuda_tbc_multi_bit_programmable_bootstrap( void *stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory); template void -cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( +cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, uint64_t *lwe_array_out, uint64_t const *lwe_output_indexes, uint64_t const *lut_vector, uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in, @@ -737,4 +882,16 @@ cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride); + +template void +cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( + void *stream, uint32_t gpu_index, uint64_t *lwe_array_out, + uint64_t const *lwe_output_indexes, uint64_t const *lut_vector, + uint64_t const *lut_vector_indexes, uint32_t const *lwe_array_in, + uint64_t const *lwe_input_indexes, uint64_t const *bootstrapping_key, + pbs_buffer *pbs_buffer, uint32_t lwe_dimension, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, + uint32_t base_log, uint32_t level_count, uint32_t num_samples, + uint32_t num_many_lut, uint32_t lut_stride); #endif diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh index 64c04050c..50b155218 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh @@ -25,9 +25,10 @@ get_start_ith_ggsw_offset(uint32_t polynomial_size, int glwe_dimension, level_count; } -template +template __global__ void device_multi_bit_programmable_bootstrap_keybundle( - const Torus *__restrict__ lwe_array_in, + const InputTorus *__restrict__ lwe_array_in, const Torus *__restrict__ lwe_input_indexes, double2 *keybundle_array, const Torus *__restrict__ bootstrapping_key, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, @@ -55,7 +56,7 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle( if (lwe_iteration < (lwe_dimension / grouping_factor)) { - const Torus *block_lwe_array_in = + const InputTorus *block_lwe_array_in = &lwe_array_in[lwe_input_indexes[input_idx] * (lwe_dimension + 1)]; double2 *keybundle = keybundle_array + @@ -86,10 +87,11 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle( // Precalculate the monomial degrees and store them in shared memory uint32_t *monomial_degrees = (uint32_t 
*)selected_memory; if (threadIdx.x < (1 << grouping_factor)) { - const Torus *lwe_array_group = + const InputTorus *lwe_array_group = block_lwe_array_in + rev_lwe_iteration * grouping_factor; - monomial_degrees[threadIdx.x] = calculates_monomial_degree( - lwe_array_group, threadIdx.x, grouping_factor); + monomial_degrees[threadIdx.x] = + calculates_monomial_degree( + lwe_array_group, threadIdx.x, grouping_factor); } __syncthreads(); @@ -145,9 +147,10 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle( // Then we can just calculate the offset needed to apply this coefficients, and // the operation transforms into a pointwise vector multiplication, avoiding to // perform extra instructions other than MADD -template +template __global__ void device_multi_bit_programmable_bootstrap_keybundle_2_2_params( - const Torus *__restrict__ lwe_array_in, + const InputTorus *__restrict__ lwe_array_in, const Torus *__restrict__ lwe_input_indexes, double2 *keybundle_array, const Torus *__restrict__ bootstrapping_key, uint32_t lwe_dimension, uint32_t lwe_offset, uint64_t lwe_chunk_size, @@ -188,7 +191,7 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle_2_2_params( if (lwe_iteration < (lwe_dimension / grouping_factor)) { - const Torus *block_lwe_array_in = + const InputTorus *block_lwe_array_in = &lwe_array_in[lwe_input_indexes[input_idx] * (lwe_dimension + 1)]; double2 *keybundle = keybundle_array + @@ -219,10 +222,11 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle_2_2_params( uint32_t *monomial_degrees = (uint32_t *)selected_memory; if (threadIdx.x < (1 << grouping_factor)) { - const Torus *lwe_array_group = + const InputTorus *lwe_array_group = block_lwe_array_in + rev_lwe_iteration * grouping_factor; - monomial_degrees[threadIdx.x] = calculates_monomial_degree( - lwe_array_group, threadIdx.x, grouping_factor); + monomial_degrees[threadIdx.x] = + calculates_monomial_degree( + lwe_array_group, threadIdx.x, grouping_factor); } __syncthreads(); @@ -269,10 +273,11 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle_2_2_params( } } -template +template __global__ void __launch_bounds__(params::degree / params::opt) device_multi_bit_programmable_bootstrap_accumulate_step_one( - const Torus *__restrict__ lwe_array_in, + const InputTorus *__restrict__ lwe_array_in, const Torus *__restrict__ lwe_input_indexes, const Torus *__restrict__ lut_vector, const Torus *__restrict__ lut_vector_indexes, Torus *global_accumulator, @@ -305,7 +310,7 @@ __global__ void __launch_bounds__(params::degree / params::opt) if constexpr (SMD == PARTIALSM) accumulator_fft = (double2 *)sharedmem; - const Torus *block_lwe_array_in = + const InputTorus *block_lwe_array_in = &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)]; const Torus *block_lut_vector = @@ -327,7 +332,7 @@ __global__ void __launch_bounds__(params::degree / params::opt) //////////////////////////////////////////////////////////// // Initializes the accumulator with the body of LWE // Put "b" in [0, 2N[ - Torus b_hat = 0; + InputTorus b_hat = 0; modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, params::log2_degree + 1); @@ -501,7 +506,7 @@ uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two( return sizeof(Torus) * polynomial_size; // accumulator } -template +template __host__ uint64_t scratch_multi_bit_programmable_bootstrap( cudaStream_t stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t glwe_dimension, @@ -526,20 +531,22 @@ __host__ uint64_t 
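`calculates_monomial_degree` condenses one group of `grouping_factor` mask elements and a GGSW selector `t` into a rotation amount in [0, 2N). A hedged sketch of the computation it performs; the exact backend formula may differ, for instance in sign handling:

#include <cstdint>

// Sum the mask elements selected by the bits of `t`, then modulus-switch the
// sum into [0, 2N) with rounding, as in the classic path.
template <typename InputTorus>
uint32_t monomial_degree_sketch(const InputTorus *lwe_array_group, uint32_t t,
                                uint32_t grouping_factor,
                                uint32_t log_modulus) {
  InputTorus sum = 0;
  for (uint32_t i = 0; i < grouping_factor; i++)
    if (t & (uint32_t(1) << i))
      sum += lwe_array_group[i];
  constexpr uint32_t w = sizeof(InputTorus) * 8;
  return uint32_t((sum + (InputTorus(1) << (w - log_modulus - 1))) >>
                  (w - log_modulus));
}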
scratch_multi_bit_programmable_bootstrap( if (max_shared_memory < full_sm_keybundle) { check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_keybundle, + device_multi_bit_programmable_bootstrap_keybundle, cudaFuncAttributeMaxDynamicSharedMemorySize, 0)); cudaFuncSetCacheConfig( - device_multi_bit_programmable_bootstrap_keybundle, + device_multi_bit_programmable_bootstrap_keybundle, cudaFuncCachePreferShared); check_cuda_error(cudaGetLastError()); } else { check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_keybundle, + device_multi_bit_programmable_bootstrap_keybundle, cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle)); cudaFuncSetCacheConfig( - device_multi_bit_programmable_bootstrap_keybundle, + device_multi_bit_programmable_bootstrap_keybundle, cudaFuncCachePreferShared); check_cuda_error(cudaGetLastError()); } @@ -547,59 +554,59 @@ __host__ uint64_t scratch_multi_bit_programmable_bootstrap( if (max_shared_memory < partial_sm_accumulate_step_one) { check_cuda_error(cudaFuncSetAttribute( device_multi_bit_programmable_bootstrap_accumulate_step_one< - Torus, params, NOSM, false>, + InputTorus, Torus, params, NOSM, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, 0)); cudaFuncSetCacheConfig( device_multi_bit_programmable_bootstrap_accumulate_step_one< - Torus, params, NOSM, false>, + InputTorus, Torus, params, NOSM, false>, cudaFuncCachePreferShared); check_cuda_error(cudaFuncSetAttribute( device_multi_bit_programmable_bootstrap_accumulate_step_one< - Torus, params, NOSM, true>, + InputTorus, Torus, params, NOSM, true>, cudaFuncAttributeMaxDynamicSharedMemorySize, 0)); cudaFuncSetCacheConfig( device_multi_bit_programmable_bootstrap_accumulate_step_one< - Torus, params, NOSM, true>, + InputTorus, Torus, params, NOSM, true>, cudaFuncCachePreferShared); check_cuda_error(cudaGetLastError()); } else if (max_shared_memory < full_sm_accumulate_step_one) { check_cuda_error(cudaFuncSetAttribute( device_multi_bit_programmable_bootstrap_accumulate_step_one< - Torus, params, PARTIALSM, false>, + InputTorus, Torus, params, PARTIALSM, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm_accumulate_step_one)); cudaFuncSetCacheConfig( device_multi_bit_programmable_bootstrap_accumulate_step_one< - Torus, params, PARTIALSM, false>, + InputTorus, Torus, params, PARTIALSM, false>, cudaFuncCachePreferShared); check_cuda_error(cudaFuncSetAttribute( device_multi_bit_programmable_bootstrap_accumulate_step_one< - Torus, params, PARTIALSM, true>, + InputTorus, Torus, params, PARTIALSM, true>, cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm_accumulate_step_one)); cudaFuncSetCacheConfig( device_multi_bit_programmable_bootstrap_accumulate_step_one< - Torus, params, PARTIALSM, true>, + InputTorus, Torus, params, PARTIALSM, true>, cudaFuncCachePreferShared); check_cuda_error(cudaGetLastError()); } else { check_cuda_error(cudaFuncSetAttribute( device_multi_bit_programmable_bootstrap_accumulate_step_one< - Torus, params, FULLSM, false>, + InputTorus, Torus, params, FULLSM, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_accumulate_step_one)); cudaFuncSetCacheConfig( device_multi_bit_programmable_bootstrap_accumulate_step_one< - Torus, params, FULLSM, false>, + InputTorus, Torus, params, FULLSM, false>, cudaFuncCachePreferShared); check_cuda_error(cudaFuncSetAttribute( device_multi_bit_programmable_bootstrap_accumulate_step_one< - Torus, params, FULLSM, true>, + InputTorus, Torus, params, FULLSM, true>, 
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_accumulate_step_one)); cudaFuncSetCacheConfig( device_multi_bit_programmable_bootstrap_accumulate_step_one< - Torus, params, FULLSM, true>, + InputTorus, Torus, params, FULLSM, true>, cudaFuncCachePreferShared); check_cuda_error(cudaGetLastError()); } @@ -644,7 +651,7 @@ __host__ uint64_t scratch_multi_bit_programmable_bootstrap( check_cuda_error(cudaGetLastError()); } - auto lwe_chunk_size = get_lwe_chunk_size( + auto lwe_chunk_size = get_lwe_chunk_size( gpu_index, input_lwe_ciphertext_count, polynomial_size, glwe_dimension, level_count, full_sm_keybundle); uint64_t size_tracker = 0; @@ -655,9 +662,9 @@ __host__ uint64_t scratch_multi_bit_programmable_bootstrap( return size_tracker; } -template +template __host__ void execute_compute_keybundle( - cudaStream_t stream, uint32_t gpu_index, Torus const *lwe_array_in, + cudaStream_t stream, uint32_t gpu_index, InputTorus const *lwe_array_in, Torus const *lwe_input_indexes, Torus const *bootstrapping_key, pbs_buffer *buffer, uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, @@ -686,7 +693,8 @@ __host__ void execute_compute_keybundle( dim3 thds(polynomial_size / params::opt, 1, 1); if (max_shared_memory < full_sm_keybundle) { - device_multi_bit_programmable_bootstrap_keybundle + device_multi_bit_programmable_bootstrap_keybundle <<>>( lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, @@ -694,7 +702,8 @@ __host__ void execute_compute_keybundle( d_mem, full_sm_keybundle); } else { bool supports_tbc = - has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( + has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( num_samples, glwe_dimension, polynomial_size, level_count, cuda_get_max_shared_memory(gpu_index)); @@ -703,20 +712,22 @@ __host__ void execute_compute_keybundle( dim3 thds_new_keybundle(512, 1, 1); check_cuda_error(cudaFuncSetAttribute( device_multi_bit_programmable_bootstrap_keybundle_2_2_params< - Torus, Degree<2048>, FULLSM>, + InputTorus, Torus, Degree<2048>, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, 3 * full_sm_keybundle)); cudaFuncSetCacheConfig( device_multi_bit_programmable_bootstrap_keybundle_2_2_params< - Torus, Degree<2048>, FULLSM>, + InputTorus, Torus, Degree<2048>, FULLSM>, cudaFuncCachePreferShared); check_cuda_error(cudaGetLastError()); device_multi_bit_programmable_bootstrap_keybundle_2_2_params< - Torus, Degree<2048>, FULLSM><<>>( - lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key, - lwe_dimension, lwe_offset, chunk_size, keybundle_size_per_input); + InputTorus, Torus, Degree<2048>, FULLSM> + <<>>(lwe_array_in, lwe_input_indexes, keybundle_fft, + bootstrapping_key, lwe_dimension, lwe_offset, chunk_size, + keybundle_size_per_input); } else { - device_multi_bit_programmable_bootstrap_keybundle + device_multi_bit_programmable_bootstrap_keybundle <<>>( lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, @@ -727,10 +738,10 @@ __host__ void execute_compute_keybundle( check_cuda_error(cudaGetLastError()); } -template +template __host__ void execute_step_one( cudaStream_t stream, uint32_t gpu_index, Torus const *lut_vector, - Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lut_vector_indexes, InputTorus const *lwe_array_in, Torus const *lwe_input_indexes, pbs_buffer *buffer, uint32_t num_samples, uint32_t 
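All three branches above follow one pattern: opt the kernel into the largest dynamic shared-memory carveout its tier (NOSM, PARTIALSM, FULLSM) allows, then prefer shared memory in the cache configuration. A condensed, illustrative helper, not a function in the backend:

// `kernel` is any __global__ function pointer; the three byte counts are the
// per-tier shared-memory requirements computed by the scratch functions.
template <typename Kernel>
void configure_smem_tier_sketch(Kernel kernel, uint64_t max_shared_memory,
                                uint64_t partial_sm, uint64_t full_sm) {
  uint64_t dynamic_bytes = 0; // NOSM: work entirely out of global memory
  if (max_shared_memory >= full_sm)
    dynamic_bytes = full_sm; // FULLSM: whole working set in shared memory
  else if (max_shared_memory >= partial_sm)
    dynamic_bytes = partial_sm; // PARTIALSM: only the FFT buffer in shared
  check_cuda_error(cudaFuncSetAttribute(
      kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
      static_cast<int>(dynamic_bytes)));
  cudaFuncSetCacheConfig(kernel, cudaFuncCachePreferShared);
}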
lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count) { @@ -754,7 +765,7 @@ __host__ void execute_step_one( if (max_shared_memory < partial_sm_accumulate_step_one) device_multi_bit_programmable_bootstrap_accumulate_step_one< - Torus, params, NOSM, is_first_iter> + InputTorus, Torus, params, NOSM, is_first_iter> <<>>( lwe_array_in, lwe_input_indexes, lut_vector, lut_vector_indexes, global_accumulator, global_accumulator_fft, lwe_dimension, @@ -762,7 +773,7 @@ __host__ void execute_step_one( full_sm_accumulate_step_one); else if (max_shared_memory < full_sm_accumulate_step_one) device_multi_bit_programmable_bootstrap_accumulate_step_one< - Torus, params, PARTIALSM, is_first_iter> + InputTorus, Torus, params, PARTIALSM, is_first_iter> <<>>(lwe_array_in, lwe_input_indexes, lut_vector, lut_vector_indexes, global_accumulator, @@ -771,7 +782,7 @@ __host__ void execute_step_one( partial_sm_accumulate_step_one); else device_multi_bit_programmable_bootstrap_accumulate_step_one< - Torus, params, FULLSM, is_first_iter> + InputTorus, Torus, params, FULLSM, is_first_iter> <<>>(lwe_array_in, lwe_input_indexes, lut_vector, lut_vector_indexes, global_accumulator, @@ -823,11 +834,11 @@ execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, check_cuda_error(cudaGetLastError()); } -template +template __host__ void host_multi_bit_programmable_bootstrap( cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, Torus const *lwe_output_indexes, Torus const *lut_vector, - Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lut_vector_indexes, InputTorus const *lwe_array_in, Torus const *lwe_input_indexes, Torus const *bootstrapping_key, pbs_buffer *buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, @@ -840,7 +851,7 @@ __host__ void host_multi_bit_programmable_bootstrap( lwe_offset += lwe_chunk_size) { // Compute a keybundle - execute_compute_keybundle( + execute_compute_keybundle( stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, level_count, lwe_offset); @@ -853,12 +864,12 @@ __host__ void host_multi_bit_programmable_bootstrap( bool is_last_iter = (j + lwe_offset) + 1 == (lwe_dimension / grouping_factor); if (is_first_iter) { - execute_step_one( + execute_step_one( stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count); } else { - execute_step_one( + execute_step_one( stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count); diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cu index 0a3dc9ac4..d0702df53 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cu +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cu @@ -98,9 +98,9 @@ uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64( bool supports_cg = supports_cooperative_groups_on_multibit_programmable_bootstrap< - __uint128_t>(glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, - cuda_get_max_shared_memory(gpu_index)); + uint64_t, 
__uint128_t>(glwe_dimension, polynomial_size, level_count, + input_lwe_ciphertext_count, + cuda_get_max_shared_memory(gpu_index)); if (supports_cg) return scratch_cuda_cg_multi_bit_programmable_bootstrap_128( diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh index 758e7bd98..8e12342bf 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh @@ -35,12 +35,13 @@ namespace cg = cooperative_groups; * * Each y-block computes one element of the lwe_array_out. */ -template <typename Torus, class params, sharedMemDegree SMD> +template <typename InputTorus, typename Torus, class params, sharedMemDegree SMD> __global__ void device_programmable_bootstrap_tbc( Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes, const Torus *__restrict__ lut_vector, const Torus *__restrict__ lut_vector_indexes, - const Torus *__restrict__ lwe_array_in, + const InputTorus *__restrict__ lwe_array_in, const Torus *__restrict__ lwe_input_indexes, const double2 *__restrict__ bootstrapping_key, double2 *join_buffer, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, @@ -83,7 +84,7 @@ __global__ void device_programmable_bootstrap_tbc( // The third dimension of the block is used to determine on which ciphertext // this block is operating, in the case of batch bootstraps - const Torus *block_lwe_array_in = + const InputTorus *block_lwe_array_in = &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)]; const Torus *block_lut_vector = @@ -99,8 +100,8 @@ __global__ void device_programmable_bootstrap_tbc( // Put "b" in [0, 2N[ constexpr auto log_modulus = params::log2_degree + 1; - Torus b_hat = 0; - Torus correction = 0; + InputTorus b_hat = 0; + InputTorus correction = 0; if (noise_reduction_type == PBS_MS_REDUCTION_T::CENTERED) { correction = centered_binary_modulus_switch_body_correction_to_add( block_lwe_array_in, lwe_dimension, log_modulus); @@ -117,7 +118,7 @@ __global__ void device_programmable_bootstrap_tbc( __syncthreads(); // Put "a" in [0, 2N[ - Torus a_hat = 0; + InputTorus a_hat = 0; modulus_switch(block_lwe_array_in[i], a_hat, log_modulus); // Perform ACC * (X^ä - 1) @@ -200,12 +201,13 @@ __global__ void device_programmable_bootstrap_tbc( } } -template <typename Torus, class params, sharedMemDegree SMD> +template <typename InputTorus, typename Torus, class params, sharedMemDegree SMD> __global__ void device_programmable_bootstrap_tbc_2_2_params( Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes, const Torus *__restrict__ lut_vector, const Torus *__restrict__ lut_vector_indexes, - const Torus *__restrict__ lwe_array_in, + const InputTorus *__restrict__ lwe_array_in, const Torus *__restrict__ lwe_input_indexes, const double2 *__restrict__ bootstrapping_key, double2 *join_buffer, uint32_t lwe_dimension, uint32_t num_many_lut, uint32_t lut_stride, @@ -247,7 +249,7 @@ __global__ void device_programmable_bootstrap_tbc_2_2_params( } // The third dimension of the block is used to determine on which ciphertext // this block is operating, in the case of batch bootstraps - const Torus *block_lwe_array_in = + const InputTorus *block_lwe_array_in = &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)]; const Torus *block_lut_vector = @@ -263,8 +265,8 @@ __global__ void device_programmable_bootstrap_tbc_2_2_params( // Put "b" in [0, 2N[ constexpr auto log_modulus = params::log2_degree + 1; - Torus b_hat = 0; - Torus correction = 0; + InputTorus b_hat = 0; + InputTorus correction = 0; if (noise_reduction_type == PBS_MS_REDUCTION_T::CENTERED) { correction = 
centered_binary_modulus_switch_body_correction_to_add( block_lwe_array_in, lwe_dimension, log_modulus); @@ -276,7 +278,7 @@ __global__ void device_programmable_bootstrap_tbc_2_2_params( params::degree / params::opt>( accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat, false); - Torus temp_a_hat = 0; + InputTorus temp_a_hat = 0; for (int i = 0; i < lwe_dimension; i++) { // We calculate the modulus switch of a warp size of elements @@ -382,7 +384,7 @@ __global__ void device_programmable_bootstrap_tbc_2_2_params( cluster.sync(); } -template <typename Torus, class params> +template <typename InputTorus, typename Torus, class params> __host__ uint64_t scratch_programmable_bootstrap_tbc( cudaStream_t stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t lwe_dimension, @@ -410,27 +412,27 @@ __host__ uint64_t scratch_programmable_bootstrap_tbc( if (max_shared_memory >= full_sm + minimum_sm_tbc) { check_cuda_error(cudaFuncSetAttribute( - device_programmable_bootstrap_tbc<Torus, params, FULLSM>, + device_programmable_bootstrap_tbc<InputTorus, Torus, params, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm + minimum_sm_tbc)); cudaFuncSetCacheConfig( - device_programmable_bootstrap_tbc<Torus, params, FULLSM>, + device_programmable_bootstrap_tbc<InputTorus, Torus, params, FULLSM>, cudaFuncCachePreferShared); check_cuda_error(cudaGetLastError()); } else if (max_shared_memory >= partial_sm + minimum_sm_tbc) { check_cuda_error(cudaFuncSetAttribute( - device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>, + device_programmable_bootstrap_tbc<InputTorus, Torus, params, PARTIALSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm + minimum_sm_tbc)); cudaFuncSetCacheConfig( - device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>, + device_programmable_bootstrap_tbc<InputTorus, Torus, params, PARTIALSM>, cudaFuncCachePreferShared); check_cuda_error(cudaGetLastError()); } else { check_cuda_error(cudaFuncSetAttribute( - device_programmable_bootstrap_tbc<Torus, params, NOSM>, + device_programmable_bootstrap_tbc<InputTorus, Torus, params, NOSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, minimum_sm_tbc)); cudaFuncSetCacheConfig( - device_programmable_bootstrap_tbc<Torus, params, NOSM>, + device_programmable_bootstrap_tbc<InputTorus, Torus, params, NOSM>, cudaFuncCachePreferShared); check_cuda_error(cudaGetLastError()); } @@ -446,11 +448,11 @@ __host__ uint64_t scratch_programmable_bootstrap_tbc( /* * Host wrapper */ -template <typename Torus, class params> +template <typename InputTorus, typename Torus, class params> __host__ void host_programmable_bootstrap_tbc( cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, Torus const *lwe_output_indexes, Torus const *lut_vector, - Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lut_vector_indexes, InputTorus const *lwe_array_in, Torus const *lwe_input_indexes, double2 const *bootstrapping_key, pbs_buffer *buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, @@ -506,7 +508,8 @@ __host__ void host_programmable_bootstrap_tbc( config.dynamicSmemBytes = minimum_sm_tbc; check_cuda_error(cudaLaunchKernelEx( - &config, device_programmable_bootstrap_tbc<Torus, params, NOSM>, + &config, + device_programmable_bootstrap_tbc<InputTorus, Torus, params, NOSM>, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft, lwe_dimension, polynomial_size, base_log, level_count, d_mem, full_dm, @@ -515,7 +518,8 @@ __host__ void host_programmable_bootstrap_tbc( config.dynamicSmemBytes = partial_sm + minimum_sm_tbc; check_cuda_error(cudaLaunchKernelEx( - &config, device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>, + &config, + device_programmable_bootstrap_tbc<InputTorus, Torus, params, PARTIALSM>, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft, lwe_dimension, polynomial_size, base_log, level_count, d_mem, @@ -530,18 +534,22 @@ __host__ void host_programmable_bootstrap_tbc( config.dynamicSmemBytes = full_sm_2_2; 
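// Illustrative sketch (names and dimensions assumed, not taken from this
// patch): the cudaLaunchKernelEx calls above pair a dynamic shared-memory
// size with a thread-block-cluster attribute. Stripped to its essentials,
// the launch pattern used throughout these host wrappers looks like this:
#include <cuda_runtime.h>

__global__ void dummy_kernel() {}

cudaError_t launch_with_cluster(dim3 grid, dim3 block, size_t dynamic_smem) {
  cudaLaunchConfig_t config = {};
  config.gridDim = grid;
  config.blockDim = block;
  config.dynamicSmemBytes = dynamic_smem;

  // Request a cluster of 2 blocks along x (illustrative size); non-portable
  // cluster sizes must be explicitly enabled on the kernel beforehand.
  cudaLaunchAttribute attrs[1];
  attrs[0].id = cudaLaunchAttributeClusterDimension;
  attrs[0].val.clusterDim.x = 2;
  attrs[0].val.clusterDim.y = 1;
  attrs[0].val.clusterDim.z = 1;
  config.attrs = attrs;
  config.numAttrs = 1;

  return cudaLaunchKernelEx(&config, dummy_kernel);
}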
check_cuda_error(cudaFuncSetAttribute( - device_programmable_bootstrap_tbc_2_2_params, + device_programmable_bootstrap_tbc_2_2_params, cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_2_2)); check_cuda_error(cudaFuncSetAttribute( - device_programmable_bootstrap_tbc_2_2_params, + device_programmable_bootstrap_tbc_2_2_params, cudaFuncAttributePreferredSharedMemoryCarveout, cudaSharedmemCarveoutMaxShared)); check_cuda_error(cudaFuncSetCacheConfig( - device_programmable_bootstrap_tbc_2_2_params, + device_programmable_bootstrap_tbc_2_2_params, cudaFuncCachePreferShared)); check_cuda_error(cudaLaunchKernelEx( &config, - device_programmable_bootstrap_tbc_2_2_params, + device_programmable_bootstrap_tbc_2_2_params, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft, lwe_dimension, num_many_lut, lut_stride, noise_reduction_type)); @@ -549,7 +557,8 @@ __host__ void host_programmable_bootstrap_tbc( config.dynamicSmemBytes = full_sm + minimum_sm_tbc; check_cuda_error(cudaLaunchKernelEx( - &config, device_programmable_bootstrap_tbc, + &config, + device_programmable_bootstrap_tbc, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft, lwe_dimension, polynomial_size, base_log, level_count, d_mem, 0, @@ -560,7 +569,7 @@ __host__ void host_programmable_bootstrap_tbc( } // Verify if the grid size satisfies the cooperative group constraints -template +template __host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size( int glwe_dimension, int level_count, int num_samples, uint32_t max_shared_memory) { @@ -585,18 +594,21 @@ __host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size( if (max_shared_memory < partial_sm) { cudaOccupancyMaxActiveBlocksPerMultiprocessor( &max_active_blocks_per_sm, - (void *)device_programmable_bootstrap_tbc, thds, - 0); + (void *) + device_programmable_bootstrap_tbc, + thds, 0); } else if (max_shared_memory < full_sm) { cudaOccupancyMaxActiveBlocksPerMultiprocessor( &max_active_blocks_per_sm, - (void *)device_programmable_bootstrap_tbc, + (void *)device_programmable_bootstrap_tbc, thds, partial_sm); } else { cudaOccupancyMaxActiveBlocksPerMultiprocessor( &max_active_blocks_per_sm, - (void *)device_programmable_bootstrap_tbc, thds, - full_sm); + (void *)device_programmable_bootstrap_tbc, + thds, full_sm); } // Get the number of streaming multiprocessors @@ -621,7 +633,7 @@ bool supports_distributed_shared_memory_on_classic_programmable_bootstrap( } } -template +template __host__ bool supports_thread_block_clusters_on_classic_programmable_bootstrap( uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t max_shared_memory) { @@ -661,34 +673,39 @@ __host__ bool supports_thread_block_clusters_on_classic_programmable_bootstrap( * disable cudaFuncAttributeNonPortableClusterSizeAllowed */ if (max_shared_memory < partial_sm + minimum_sm_tbc) { check_cuda_error(cudaFuncSetAttribute( - device_programmable_bootstrap_tbc, - cudaFuncAttributeNonPortableClusterSizeAllowed, false)); - check_cuda_error(cudaOccupancyMaxPotentialClusterSize( - &cluster_size, device_programmable_bootstrap_tbc, - &config)); - } else if (max_shared_memory < full_sm + minimum_sm_tbc) { - check_cuda_error(cudaFuncSetAttribute( - device_programmable_bootstrap_tbc, + device_programmable_bootstrap_tbc, cudaFuncAttributeNonPortableClusterSizeAllowed, false)); check_cuda_error(cudaOccupancyMaxPotentialClusterSize( 
&cluster_size, - device_programmable_bootstrap_tbc, &config)); + device_programmable_bootstrap_tbc, + &config)); + } else if (max_shared_memory < full_sm + minimum_sm_tbc) { + check_cuda_error(cudaFuncSetAttribute( + device_programmable_bootstrap_tbc, + cudaFuncAttributeNonPortableClusterSizeAllowed, false)); + check_cuda_error(cudaOccupancyMaxPotentialClusterSize( + &cluster_size, + device_programmable_bootstrap_tbc, + &config)); } else { if (polynomial_size == 2048 && level_count == 1 && glwe_dimension == 1) { check_cuda_error(cudaFuncSetAttribute( - device_programmable_bootstrap_tbc_2_2_params, + device_programmable_bootstrap_tbc_2_2_params, cudaFuncAttributeNonPortableClusterSizeAllowed, false)); check_cuda_error(cudaOccupancyMaxPotentialClusterSize( &cluster_size, - device_programmable_bootstrap_tbc_2_2_params, + device_programmable_bootstrap_tbc_2_2_params, &config)); } else { check_cuda_error(cudaFuncSetAttribute( - device_programmable_bootstrap_tbc, + device_programmable_bootstrap_tbc, cudaFuncAttributeNonPortableClusterSizeAllowed, false)); check_cuda_error(cudaOccupancyMaxPotentialClusterSize( &cluster_size, - device_programmable_bootstrap_tbc, &config)); + device_programmable_bootstrap_tbc, + &config)); } } diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh index 9891dfff8..29ea2ca21 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh @@ -18,13 +18,14 @@ #include "types/complex/operations.cuh" #include -template +template __global__ void __launch_bounds__(params::degree / params::opt) device_multi_bit_programmable_bootstrap_tbc_accumulate( Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes, const Torus *__restrict__ lut_vector, const Torus *__restrict__ lut_vector_indexes, - const Torus *__restrict__ lwe_array_in, + const InputTorus *__restrict__ lwe_array_in, const Torus *__restrict__ lwe_input_indexes, const double2 *__restrict__ keybundle_array, double2 *join_buffer, Torus *global_accumulator, uint32_t lwe_dimension, @@ -67,7 +68,7 @@ __global__ void __launch_bounds__(params::degree / params::opt) // The first dimension of the block is used to determine on which ciphertext // this block is operating, in the case of batch bootstraps - const Torus *block_lwe_array_in = + const InputTorus *block_lwe_array_in = &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)]; const Torus *block_lut_vector = @@ -87,7 +88,7 @@ __global__ void __launch_bounds__(params::degree / params::opt) if (lwe_offset == 0) { // Put "b" in [0, 2N[ - Torus b_hat = 0; + InputTorus b_hat = 0; modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, params::log2_degree + 1); @@ -197,13 +198,14 @@ __global__ void __launch_bounds__(params::degree / params::opt) //- Use a register based fft that uses the minimal synchronizations //- Register based fourier domain multiplication. Transfer fft's between blocks // instead of accumulator. 
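// Illustrative sketch (not part of the patch): the modulus_switch calls in
// these kernels map a torus element stored in an InputTorus integer (now
// either 32- or 64-bit, which is the point of the InputTorus parameter) onto
// [0, 2N[. A common rounding formulation, assuming a power-of-two torus:
template <typename InputTorus>
__device__ inline void modulus_switch_sketch(InputTorus x, InputTorus &out,
                                             uint32_t log_modulus) {
  constexpr uint32_t BITS = sizeof(InputTorus) * 8;
  // Round x to the nearest multiple of 2^(BITS - log_modulus), keeping the
  // top log_modulus bits; wrap-around on overflow is the intended torus
  // arithmetic. Works identically for uint32_t and uint64_t inputs.
  out = (x + (InputTorus(1) << (BITS - log_modulus - 1))) >>
        (BITS - log_modulus);
}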
-template +template __global__ void __launch_bounds__(params::degree / params::opt) device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params( Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes, const Torus *__restrict__ lut_vector, const Torus *__restrict__ lut_vector_indexes, - const Torus *__restrict__ lwe_array_in, + const InputTorus *__restrict__ lwe_array_in, const Torus *__restrict__ lwe_input_indexes, const double2 *__restrict__ keybundle_array, double2 *join_buffer, Torus *global_accumulator, uint32_t lwe_dimension, uint32_t lwe_offset, @@ -247,7 +249,7 @@ __global__ void __launch_bounds__(params::degree / params::opt) // The first dimension of the block is used to determine on which ciphertext // this block is operating, in the case of batch bootstraps - const Torus *block_lwe_array_in = + const InputTorus *block_lwe_array_in = &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)]; const Torus *block_lut_vector = @@ -266,7 +268,7 @@ __global__ void __launch_bounds__(params::degree / params::opt) Torus reg_acc_rotated[params::opt]; if (lwe_offset == 0) { // Put "b" in [0, 2N[ - Torus b_hat = 0; + InputTorus b_hat = 0; modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, params::log2_degree + 1); @@ -400,7 +402,7 @@ uint64_t get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap( return sizeof(Torus) * polynomial_size * 2; // accumulator } -template +template __host__ uint64_t scratch_tbc_multi_bit_programmable_bootstrap( cudaStream_t stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t glwe_dimension, @@ -430,20 +432,22 @@ __host__ uint64_t scratch_tbc_multi_bit_programmable_bootstrap( if (max_shared_memory < full_sm_keybundle) { check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_keybundle, + device_multi_bit_programmable_bootstrap_keybundle, cudaFuncAttributeMaxDynamicSharedMemorySize, 0)); cudaFuncSetCacheConfig( - device_multi_bit_programmable_bootstrap_keybundle, + device_multi_bit_programmable_bootstrap_keybundle, cudaFuncCachePreferShared); check_cuda_error(cudaGetLastError()); } else { check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_keybundle, + device_multi_bit_programmable_bootstrap_keybundle, cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle)); cudaFuncSetCacheConfig( - device_multi_bit_programmable_bootstrap_keybundle, + device_multi_bit_programmable_bootstrap_keybundle, cudaFuncCachePreferShared); check_cuda_error(cudaGetLastError()); } @@ -451,58 +455,58 @@ __host__ uint64_t scratch_tbc_multi_bit_programmable_bootstrap( if (max_shared_memory < partial_sm_tbc_accumulate + minimum_sm_tbc_accumulate) { check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_tbc_accumulate, + device_multi_bit_programmable_bootstrap_tbc_accumulate< + InputTorus, Torus, params, NOSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, minimum_sm_tbc_accumulate)); cudaFuncSetCacheConfig( - device_multi_bit_programmable_bootstrap_tbc_accumulate, + device_multi_bit_programmable_bootstrap_tbc_accumulate< + InputTorus, Torus, params, NOSM>, cudaFuncCachePreferShared); check_cuda_error(cudaGetLastError()); } else if (max_shared_memory < full_sm_tbc_accumulate + minimum_sm_tbc_accumulate) { check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_tbc_accumulate, + device_multi_bit_programmable_bootstrap_tbc_accumulate< + InputTorus, Torus, params, PARTIALSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm_tbc_accumulate + 
minimum_sm_tbc_accumulate)); cudaFuncSetCacheConfig( - device_multi_bit_programmable_bootstrap_tbc_accumulate, + device_multi_bit_programmable_bootstrap_tbc_accumulate< + InputTorus, Torus, params, PARTIALSM>, cudaFuncCachePreferShared); check_cuda_error(cudaGetLastError()); } else { if (polynomial_size == 2048 && level_count == 1 && glwe_dimension == 1) { check_cuda_error(cudaFuncSetAttribute( device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params< - Torus, params, FULLSM>, + InputTorus, Torus, params, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_tbc_accumulate + 2 * minimum_sm_tbc_accumulate)); check_cuda_error(cudaFuncSetAttribute( device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params< - Torus, params, FULLSM>, + InputTorus, Torus, params, FULLSM>, cudaFuncAttributePreferredSharedMemoryCarveout, cudaSharedmemCarveoutMaxShared)); check_cuda_error(cudaFuncSetCacheConfig( device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params< - Torus, params, FULLSM>, + InputTorus, Torus, params, FULLSM>, cudaFuncCachePreferShared)); } else { check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_tbc_accumulate, + device_multi_bit_programmable_bootstrap_tbc_accumulate< + InputTorus, Torus, params, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_tbc_accumulate + minimum_sm_tbc_accumulate)); cudaFuncSetCacheConfig( - device_multi_bit_programmable_bootstrap_tbc_accumulate, + device_multi_bit_programmable_bootstrap_tbc_accumulate< + InputTorus, Torus, params, FULLSM>, cudaFuncCachePreferShared); } check_cuda_error(cudaGetLastError()); } - auto lwe_chunk_size = get_lwe_chunk_size( + auto lwe_chunk_size = get_lwe_chunk_size( gpu_index, input_lwe_ciphertext_count, polynomial_size, glwe_dimension, level_count, full_sm_keybundle); uint64_t size_tracker = 0; @@ -513,10 +517,10 @@ __host__ uint64_t scratch_tbc_multi_bit_programmable_bootstrap( return size_tracker; } -template +template __host__ void execute_tbc_external_product_loop( cudaStream_t stream, uint32_t gpu_index, Torus const *lut_vector, - Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lut_vector_indexes, InputTorus const *lwe_array_in, Torus const *lwe_input_indexes, Torus *lwe_array_out, Torus const *lwe_output_indexes, pbs_buffer *buffer, uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension, @@ -579,8 +583,8 @@ __host__ void execute_tbc_external_product_loop( config.dynamicSmemBytes = minimum_dm; check_cuda_error(cudaLaunchKernelEx( &config, - device_multi_bit_programmable_bootstrap_tbc_accumulate, + device_multi_bit_programmable_bootstrap_tbc_accumulate< + InputTorus, Torus, params, NOSM>, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft, global_accumulator, lwe_dimension, glwe_dimension, polynomial_size, @@ -591,8 +595,8 @@ __host__ void execute_tbc_external_product_loop( config.dynamicSmemBytes = partial_dm + minimum_dm; check_cuda_error(cudaLaunchKernelEx( &config, - device_multi_bit_programmable_bootstrap_tbc_accumulate, + device_multi_bit_programmable_bootstrap_tbc_accumulate< + InputTorus, Torus, params, PARTIALSM>, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft, global_accumulator, lwe_dimension, glwe_dimension, polynomial_size, @@ -607,22 +611,22 @@ __host__ void execute_tbc_external_product_loop( config.dynamicSmemBytes = full_dm + 2 * minimum_dm; 
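// Illustrative sketch (helper name and enum are assumptions, not from this
// patch): the NOSM / PARTIALSM / FULLSM branches above all follow one
// pattern: query the opt-in shared-memory limit once, then pick the largest
// kernel variant whose working set fits.
#include <cuda_runtime.h>

enum SharedMemTier { TIER_NOSM, TIER_PARTIALSM, TIER_FULLSM };

SharedMemTier pick_tier(int device, size_t partial_dm, size_t full_dm,
                        size_t minimum_dm) {
  int max_sm = 0;
  cudaDeviceGetAttribute(&max_sm, cudaDevAttrMaxSharedMemoryPerBlockOptin,
                         device);
  if ((size_t)max_sm < partial_dm + minimum_dm)
    return TIER_NOSM;      // everything through global-memory scratch (d_mem)
  if ((size_t)max_sm < full_dm + minimum_dm)
    return TIER_PARTIALSM; // accumulator in shared memory, rest in d_mem
  return TIER_FULLSM;      // whole working set in dynamic shared memory
}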
check_cuda_error(cudaFuncSetAttribute( device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params< - Torus, params, FULLSM>, + InputTorus, Torus, params, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, full_dm + 2 * minimum_dm)); check_cuda_error(cudaFuncSetAttribute( device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params< - Torus, params, FULLSM>, + InputTorus, Torus, params, FULLSM>, cudaFuncAttributePreferredSharedMemoryCarveout, cudaSharedmemCarveoutMaxShared)); check_cuda_error(cudaFuncSetCacheConfig( device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params< - Torus, params, FULLSM>, + InputTorus, Torus, params, FULLSM>, cudaFuncCachePreferShared)); check_cuda_error(cudaLaunchKernelEx( &config, device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params< - Torus, params, FULLSM>, + InputTorus, Torus, params, FULLSM>, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft, global_accumulator, lwe_dimension, lwe_offset, chunk_size, @@ -630,8 +634,8 @@ __host__ void execute_tbc_external_product_loop( } else { check_cuda_error(cudaLaunchKernelEx( &config, - device_multi_bit_programmable_bootstrap_tbc_accumulate, + device_multi_bit_programmable_bootstrap_tbc_accumulate< + InputTorus, Torus, params, FULLSM>, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft, global_accumulator, lwe_dimension, glwe_dimension, polynomial_size, @@ -642,11 +646,11 @@ __host__ void execute_tbc_external_product_loop( } } -template +template __host__ void host_tbc_multi_bit_programmable_bootstrap( cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, Torus const *lwe_output_indexes, Torus const *lut_vector, - Torus const *lut_vector_indexes, Torus const *lwe_array_in, + Torus const *lut_vector_indexes, InputTorus const *lwe_array_in, Torus const *lwe_input_indexes, Torus const *bootstrapping_key, pbs_buffer *buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, @@ -659,13 +663,13 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap( lwe_offset += lwe_chunk_size) { // Compute a keybundle - execute_compute_keybundle( + execute_compute_keybundle( stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, level_count, lwe_offset); // Accumulate - execute_tbc_external_product_loop( + execute_tbc_external_product_loop( stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, @@ -690,7 +694,7 @@ bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap( } } -template +template __host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap( uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t max_shared_memory) { @@ -732,45 +736,45 @@ __host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap( if (max_shared_memory < partial_sm_tbc_accumulate + minimum_sm_tbc_accumulate) { check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_tbc_accumulate, + device_multi_bit_programmable_bootstrap_tbc_accumulate< + InputTorus, Torus, params, NOSM>, cudaFuncAttributeNonPortableClusterSizeAllowed, false)); 
check_cuda_error(cudaOccupancyMaxPotentialClusterSize( &cluster_size, - device_multi_bit_programmable_bootstrap_tbc_accumulate, + device_multi_bit_programmable_bootstrap_tbc_accumulate< + InputTorus, Torus, params, NOSM>, &config)); } else if (max_shared_memory < full_sm_tbc_accumulate + minimum_sm_tbc_accumulate) { check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_tbc_accumulate, + device_multi_bit_programmable_bootstrap_tbc_accumulate< + InputTorus, Torus, params, PARTIALSM>, cudaFuncAttributeNonPortableClusterSizeAllowed, false)); check_cuda_error(cudaOccupancyMaxPotentialClusterSize( &cluster_size, - device_multi_bit_programmable_bootstrap_tbc_accumulate, + device_multi_bit_programmable_bootstrap_tbc_accumulate< + InputTorus, Torus, params, PARTIALSM>, &config)); } else { if (polynomial_size == 2048 && level_count == 1 && glwe_dimension == 1) { check_cuda_error(cudaFuncSetAttribute( device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params< - Torus, params, FULLSM>, + InputTorus, Torus, params, FULLSM>, cudaFuncAttributeNonPortableClusterSizeAllowed, false)); check_cuda_error(cudaOccupancyMaxPotentialClusterSize( &cluster_size, device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params< - Torus, params, FULLSM>, + InputTorus, Torus, params, FULLSM>, &config)); } else { check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_tbc_accumulate, + device_multi_bit_programmable_bootstrap_tbc_accumulate< + InputTorus, Torus, params, FULLSM>, cudaFuncAttributeNonPortableClusterSizeAllowed, false)); check_cuda_error(cudaOccupancyMaxPotentialClusterSize( &cluster_size, - device_multi_bit_programmable_bootstrap_tbc_accumulate, + device_multi_bit_programmable_bootstrap_tbc_accumulate< + InputTorus, Torus, params, FULLSM>, &config)); } } diff --git a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp index 7196b9210..8ad6ab768 100644 --- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp +++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp @@ -166,14 +166,15 @@ public: #if CUDA_ARCH >= 900 BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, TbcMultiBit) (benchmark::State &st) { - if (!has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( + if (!has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( input_lwe_ciphertext_count, glwe_dimension, polynomial_size, pbs_level, cuda_get_max_shared_memory(0))) { st.SkipWithError("Configuration not supported for tbc operation"); return; } - scratch_cuda_tbc_multi_bit_programmable_bootstrap( + scratch_cuda_tbc_multi_bit_programmable_bootstrap( stream, gpu_index, (pbs_buffer **)&buffer, glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count, true); @@ -181,7 +182,8 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, TbcMultiBit) uint32_t lut_stride = 0; for (auto _ : st) { // Execute PBS - cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( + cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( stream, gpu_index, d_lwe_ct_out_array, d_lwe_output_indexes, d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array, d_lwe_input_indexes, d_bsk, (pbs_buffer *)buffer, @@ -197,7 +199,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, TbcMultiBit) BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, CgMultiBit) (benchmark::State &st) { - if (!has_support_to_cuda_programmable_bootstrap_cg_multi_bit( + if 
(!has_support_to_cuda_programmable_bootstrap_cg_multi_bit_64( glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index))) { st.SkipWithError("Configuration not supported for fast operation"); @@ -254,7 +256,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, DefaultMultiBit) #if CUDA_ARCH >= 900 BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, TbcPBC) (benchmark::State &st) { - if (!has_support_to_cuda_programmable_bootstrap_tbc( + if (!has_support_to_cuda_programmable_bootstrap_tbc( input_lwe_ciphertext_count, glwe_dimension, polynomial_size, pbs_level, cuda_get_max_shared_memory(0))) { st.SkipWithError("Configuration not supported for tbc operation"); @@ -269,7 +271,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, TbcPBC) uint32_t lut_stride = 0; for (auto _ : st) { // Execute PBS - cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( + cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( stream, gpu_index, (uint64_t *)d_lwe_ct_out_array, (uint64_t *)d_lwe_output_indexes, (uint64_t *)d_lut_pbs_identity, (uint64_t *)d_lut_pbs_indexes, (uint64_t *)d_lwe_ct_in_array, @@ -301,7 +303,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, CgPBS) uint32_t lut_stride = 0; for (auto _ : st) { // Execute PBS - cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( + cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( stream, gpu_index, (uint64_t *)d_lwe_ct_out_array, (uint64_t *)d_lwe_output_indexes, (uint64_t *)d_lut_pbs_identity, (uint64_t *)d_lut_pbs_indexes, (uint64_t *)d_lwe_ct_in_array, diff --git a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp index f25653fcd..f671271bd 100644 --- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp +++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp @@ -187,7 +187,7 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, bootstrap) { (ptrdiff_t)((r * samples * number_of_inputs + s * number_of_inputs) * (lwe_dimension + 1)); // Execute PBS - cuda_programmable_bootstrap_lwe_ciphertext_vector_64( + cuda_programmable_bootstrap_lwe_ciphertext_vector_64_64( stream, gpu_index, (void *)d_lwe_ct_out_array, (void *)d_lwe_output_indexes, (void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in, diff --git a/backends/tfhe-cuda-backend/src/bindings.rs b/backends/tfhe-cuda-backend/src/bindings.rs index d58512a03..5eed4f0f8 100644 --- a/backends/tfhe-cuda-backend/src/bindings.rs +++ b/backends/tfhe-cuda-backend/src/bindings.rs @@ -3020,6 +3020,20 @@ unsafe extern "C" { noise_reduction_type: PBS_MS_REDUCTION_T, ) -> u64; } +unsafe extern "C" { + pub fn scratch_cuda_programmable_bootstrap_32_64( + stream: *mut ffi::c_void, + gpu_index: u32, + buffer: *mut *mut i8, + lwe_dimension: u32, + glwe_dimension: u32, + polynomial_size: u32, + level_count: u32, + input_lwe_ciphertext_count: u32, + allocate_gpu_memory: bool, + noise_reduction_type: PBS_MS_REDUCTION_T, + ) -> u64; +} unsafe extern "C" { pub fn scratch_cuda_programmable_bootstrap_128( stream: *mut ffi::c_void, @@ -3035,7 +3049,7 @@ unsafe extern "C" { ) -> u64; } unsafe extern "C" { - pub fn cuda_programmable_bootstrap_lwe_ciphertext_vector_32( + pub fn cuda_programmable_bootstrap_lwe_ciphertext_vector_64_64( stream: *mut ffi::c_void, gpu_index: u32, lwe_array_out: *mut ffi::c_void, @@ -3057,7 +3071,7 @@ unsafe extern "C" { ); } unsafe extern "C" { - pub fn 
cuda_programmable_bootstrap_lwe_ciphertext_vector_64( + pub fn cuda_programmable_bootstrap_lwe_ciphertext_vector_32_64( stream: *mut ffi::c_void, gpu_index: u32, lwe_array_out: *mut ffi::c_void, @@ -3110,6 +3124,7 @@ unsafe extern "C" { ); } unsafe extern "C" { + #[link_name = "\u{1}_Z55has_support_to_cuda_programmable_bootstrap_cg_multi_bitjjjjj"] pub fn has_support_to_cuda_programmable_bootstrap_cg_multi_bit( glwe_dimension: u32, polynomial_size: u32, @@ -3144,6 +3159,18 @@ unsafe extern "C" { grouping_factor: u32, ); } +unsafe extern "C" { + pub fn scratch_cuda_multi_bit_programmable_bootstrap_32_64( + stream: *mut ffi::c_void, + gpu_index: u32, + pbs_buffer: *mut *mut i8, + glwe_dimension: u32, + polynomial_size: u32, + level_count: u32, + input_lwe_ciphertext_count: u32, + allocate_gpu_memory: bool, + ) -> u64; +} unsafe extern "C" { pub fn scratch_cuda_multi_bit_programmable_bootstrap_64( stream: *mut ffi::c_void, @@ -3179,6 +3206,29 @@ unsafe extern "C" { lut_stride: u32, ); } +unsafe extern "C" { + pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_32_64( + stream: *mut ffi::c_void, + gpu_index: u32, + lwe_array_out: *mut ffi::c_void, + lwe_output_indexes: *const ffi::c_void, + lut_vector: *const ffi::c_void, + lut_vector_indexes: *const ffi::c_void, + lwe_array_in: *const ffi::c_void, + lwe_input_indexes: *const ffi::c_void, + bootstrapping_key: *const ffi::c_void, + buffer: *mut i8, + lwe_dimension: u32, + glwe_dimension: u32, + polynomial_size: u32, + grouping_factor: u32, + base_log: u32, + level_count: u32, + num_samples: u32, + num_many_lut: u32, + lut_stride: u32, + ); +} unsafe extern "C" { pub fn cleanup_cuda_multi_bit_programmable_bootstrap( stream: *mut ffi::c_void, diff --git a/tfhe-benchmark/benches/core_crypto/ks_bench.rs b/tfhe-benchmark/benches/core_crypto/ks_bench.rs index 285a2019a..0c416ebf2 100644 --- a/tfhe-benchmark/benches/core_crypto/ks_bench.rs +++ b/tfhe-benchmark/benches/core_crypto/ks_bench.rs @@ -415,7 +415,7 @@ mod cuda { let big_lwe_sk_64 = glwe_sk_64.into_lwe_secret_key(); let ciphertext_modulus_out = CiphertextModulus::::new_native(); - let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new() + let cpu_keys: CpuKeys = CpuKeysBuilder::new() .keyswitch_key(ksk_big_to_small) .build(); @@ -667,7 +667,8 @@ mod cuda { &mut encryption_generator, ); - let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new().packing_keyswitch_key(pksk).build(); + let cpu_keys: CpuKeys = + CpuKeysBuilder::new().packing_keyswitch_key(pksk).build(); let bench_id; match get_bench_type() { diff --git a/tfhe-benchmark/benches/core_crypto/ks_pbs_bench.rs b/tfhe-benchmark/benches/core_crypto/ks_pbs_bench.rs index f70d5b7b0..fc79f6984 100644 --- a/tfhe-benchmark/benches/core_crypto/ks_pbs_bench.rs +++ b/tfhe-benchmark/benches/core_crypto/ks_pbs_bench.rs @@ -511,12 +511,19 @@ mod cuda { }; use tfhe::core_crypto::prelude::*; - fn cuda_ks_pbs + CastFrom + Serialize>( + fn cuda_ks_pbs< + Scalar: UnsignedTorus + CastInto + CastFrom + Serialize, + KeyswitchScalar: UnsignedTorus + CastFrom + CastInto, + >( c: &mut Criterion, parameters: &[(String, CryptoParametersRecord)], ) { - let bench_name = "core_crypto::cuda::ks_pbs"; - let mut bench_group = c.benchmark_group(bench_name); + let bench_name = format!( + "core_crypto::cuda::ks_pbs::ks{}::pbs{}", + KeyswitchScalar::BITS, + Scalar::BITS + ); + let mut bench_group = c.benchmark_group(bench_name.clone()); bench_group .sample_size(10) .measurement_time(std::time::Duration::from_secs(30)); @@ -535,29 +542,57 @@ mod cuda { continue; } - // 
Create the LweSecretKey - let input_lwe_secret_key = allocate_and_generate_new_binary_lwe_secret_key( - params.lwe_dimension.unwrap(), - &mut secret_generator, - ); - let output_glwe_secret_key: GlweSecretKeyOwned = + let lwe_noise_distribution_ksk = match params.lwe_noise_distribution.unwrap() { + DynamicDistribution::Gaussian(gaussian_lwe_noise_distribution) => { + DynamicDistribution::::new_gaussian( + gaussian_lwe_noise_distribution.standard_dev(), + ) + } + DynamicDistribution::TUniform(uniform_lwe_noise_distribution) => { + DynamicDistribution::::new_t_uniform( + match KeyswitchScalar::BITS { + 32 => uniform_lwe_noise_distribution.bound_log2() - 32, + 64 => uniform_lwe_noise_distribution.bound_log2(), + _ => panic!("Unsupported Keyswitch scalar input dtype"), + }, + ) + } + }; + + let lwe_sk: LweSecretKeyOwned = + allocate_and_generate_new_binary_lwe_secret_key( + params.lwe_dimension.unwrap(), + &mut secret_generator, + ); + + let glwe_sk: GlweSecretKeyOwned = allocate_and_generate_new_binary_glwe_secret_key( params.glwe_dimension.unwrap(), params.polynomial_size.unwrap(), &mut secret_generator, ); - let output_lwe_secret_key = output_glwe_secret_key.into_lwe_secret_key(); + let big_lwe_sk = glwe_sk.into_lwe_secret_key(); let ksk_big_to_small = allocate_and_generate_new_lwe_keyswitch_key( - &output_lwe_secret_key, - &input_lwe_secret_key, + &big_lwe_sk, + &lwe_sk, params.ks_base_log.unwrap(), params.ks_level.unwrap(), - params.lwe_noise_distribution.unwrap(), + lwe_noise_distribution_ksk, CiphertextModulus::new_native(), &mut encryption_generator, ); + let glwe_sk_64: GlweSecretKeyOwned = + allocate_and_generate_new_binary_glwe_secret_key( + params.glwe_dimension.unwrap(), + params.polynomial_size.unwrap(), + &mut secret_generator, + ); + + let big_lwe_sk_64 = glwe_sk_64.into_lwe_secret_key(); + let ciphertext_modulus_out = CiphertextModulus::::new_native(); + let bsk = LweBootstrapKey::new( Scalar::ZERO, params.glwe_dimension.unwrap().to_glwe_size(), @@ -568,7 +603,7 @@ mod cuda { params.ciphertext_modulus.unwrap(), ); - let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new() + let cpu_keys: CpuKeys<_, _> = CpuKeysBuilder::new() .keyswitch_key(ksk_big_to_small) .bootstrap_key(bsk) .build(); @@ -582,7 +617,7 @@ mod cuda { // Allocate a new LweCiphertext and encrypt our plaintext let input_ks_ct = allocate_and_encrypt_new_lwe_ciphertext( - &output_lwe_secret_key, + &big_lwe_sk_64, Plaintext(Scalar::ZERO), params.lwe_noise_distribution.unwrap(), params.ciphertext_modulus.unwrap(), @@ -591,10 +626,10 @@ mod cuda { let input_ks_ct_gpu = CudaLweCiphertextList::from_lwe_ciphertext(&input_ks_ct, &streams); - let output_ks_ct: LweCiphertextOwned = LweCiphertext::new( - Scalar::ZERO, - input_lwe_secret_key.lwe_dimension().to_lwe_size(), - params.ciphertext_modulus.unwrap(), + let output_ks_ct: LweCiphertextOwned = LweCiphertext::new( + KeyswitchScalar::ZERO, + lwe_sk.lwe_dimension().to_lwe_size(), + ciphertext_modulus_out, ); let mut output_ks_ct_gpu = CudaLweCiphertextList::from_lwe_ciphertext(&output_ks_ct, &streams); @@ -611,7 +646,7 @@ mod cuda { // Allocate the LweCiphertext to store the result of the PBS let output_pbs_ct = LweCiphertext::new( Scalar::ZERO, - output_lwe_secret_key.lwe_dimension().to_lwe_size(), + big_lwe_sk_64.lwe_dimension().to_lwe_size(), params.ciphertext_modulus.unwrap(), ); let mut output_pbs_ct_gpu = @@ -671,12 +706,12 @@ mod cuda { .map(|i| { let mut input_ks_list = LweCiphertextList::new( Scalar::ZERO, - output_lwe_secret_key.lwe_dimension().to_lwe_size(), + 
big_lwe_sk_64.lwe_dimension().to_lwe_size(), LweCiphertextCount(elements_per_stream), params.ciphertext_modulus.unwrap(), ); encrypt_lwe_ciphertext_list( - &output_lwe_secret_key, + &big_lwe_sk_64, &mut input_ks_list, &plaintext_list, params.lwe_noise_distribution.unwrap(), @@ -692,10 +727,10 @@ mod cuda { let output_ks_cts = (0..gpu_count) .map(|i| { let output_ks_list = LweCiphertextList::new( - Scalar::ZERO, - input_lwe_secret_key.lwe_dimension().to_lwe_size(), + KeyswitchScalar::ZERO, + lwe_sk.lwe_dimension().to_lwe_size(), LweCiphertextCount(elements_per_stream), - params.ciphertext_modulus.unwrap(), + ciphertext_modulus_out, ); CudaLweCiphertextList::from_lwe_ciphertext_list( &output_ks_list, @@ -724,7 +759,7 @@ mod cuda { .map(|i| { let output_pbs_ct = LweCiphertextList::new( Scalar::ZERO, - output_lwe_secret_key.lwe_dimension().to_lwe_size(), + big_lwe_sk_64.lwe_dimension().to_lwe_size(), LweCiphertextCount(elements_per_stream), params.ciphertext_modulus.unwrap(), ); @@ -822,12 +857,18 @@ mod cuda { fn cuda_multi_bit_ks_pbs< Scalar: UnsignedTorus + CastInto + CastFrom + Default + Serialize + Sync, + KeyswitchScalar: UnsignedTorus + CastFrom + CastInto, >( c: &mut Criterion, parameters: &[(String, CryptoParametersRecord, LweBskGroupingFactor)], ) { - let bench_name = "core_crypto::cuda::multi_bit_ks_pbs"; - let mut bench_group = c.benchmark_group(bench_name); + let bench_name = format!( + "core_crypto::cuda::multi_bit_ks_pbs::ks_pbs::ks{}::pbs{}", + KeyswitchScalar::BITS, + Scalar::BITS + ); + + let mut bench_group = c.benchmark_group(bench_name.clone()); bench_group .sample_size(10) .measurement_time(std::time::Duration::from_secs(30)); @@ -846,29 +887,57 @@ mod cuda { continue; } - // Create the LweSecretKey - let input_lwe_secret_key = allocate_and_generate_new_binary_lwe_secret_key( - params.lwe_dimension.unwrap(), - &mut secret_generator, - ); - let output_glwe_secret_key: GlweSecretKeyOwned = + let lwe_noise_distribution_ksk = match params.lwe_noise_distribution.unwrap() { + DynamicDistribution::Gaussian(gaussian_lwe_noise_distribution) => { + DynamicDistribution::::new_gaussian( + gaussian_lwe_noise_distribution.standard_dev(), + ) + } + DynamicDistribution::TUniform(uniform_lwe_noise_distribution) => { + DynamicDistribution::::new_t_uniform( + match KeyswitchScalar::BITS { + 32 => uniform_lwe_noise_distribution.bound_log2() - 32, + 64 => uniform_lwe_noise_distribution.bound_log2(), + _ => panic!("Unsupported Keyswitch scalar input dtype"), + }, + ) + } + }; + + let lwe_sk: LweSecretKeyOwned = + allocate_and_generate_new_binary_lwe_secret_key( + params.lwe_dimension.unwrap(), + &mut secret_generator, + ); + + let glwe_sk: GlweSecretKeyOwned = allocate_and_generate_new_binary_glwe_secret_key( params.glwe_dimension.unwrap(), params.polynomial_size.unwrap(), &mut secret_generator, ); - let output_lwe_secret_key = output_glwe_secret_key.into_lwe_secret_key(); + let big_lwe_sk = glwe_sk.into_lwe_secret_key(); let ksk_big_to_small = allocate_and_generate_new_lwe_keyswitch_key( - &output_lwe_secret_key, - &input_lwe_secret_key, + &big_lwe_sk, + &lwe_sk, params.ks_base_log.unwrap(), params.ks_level.unwrap(), - params.lwe_noise_distribution.unwrap(), + lwe_noise_distribution_ksk, CiphertextModulus::new_native(), &mut encryption_generator, ); + let glwe_sk_64: GlweSecretKeyOwned = + allocate_and_generate_new_binary_glwe_secret_key( + params.glwe_dimension.unwrap(), + params.polynomial_size.unwrap(), + &mut secret_generator, + ); + + let big_lwe_sk_64 = glwe_sk_64.into_lwe_secret_key(); 
+ let ciphertext_modulus_out = CiphertextModulus::::new_native(); + let multi_bit_bsk = LweMultiBitBootstrapKey::new( Scalar::ZERO, params.glwe_dimension.unwrap().to_glwe_size(), @@ -880,7 +949,7 @@ mod cuda { params.ciphertext_modulus.unwrap(), ); - let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new() + let cpu_keys: CpuKeys<_, _> = CpuKeysBuilder::new() .keyswitch_key(ksk_big_to_small) .multi_bit_bootstrap_key(multi_bit_bsk) .build(); @@ -894,7 +963,7 @@ mod cuda { // Allocate a new LweCiphertext and encrypt our plaintext let input_ks_ct = allocate_and_encrypt_new_lwe_ciphertext( - &output_lwe_secret_key, + &big_lwe_sk_64, Plaintext(Scalar::ZERO), params.lwe_noise_distribution.unwrap(), params.ciphertext_modulus.unwrap(), @@ -903,10 +972,10 @@ mod cuda { let input_ks_ct_gpu = CudaLweCiphertextList::from_lwe_ciphertext(&input_ks_ct, &streams); - let output_ks_ct: LweCiphertextOwned = LweCiphertext::new( - Scalar::ZERO, - input_lwe_secret_key.lwe_dimension().to_lwe_size(), - params.ciphertext_modulus.unwrap(), + let output_ks_ct: LweCiphertextOwned = LweCiphertext::new( + KeyswitchScalar::ZERO, + lwe_sk.lwe_dimension().to_lwe_size(), + ciphertext_modulus_out, ); let mut output_ks_ct_gpu = CudaLweCiphertextList::from_lwe_ciphertext(&output_ks_ct, &streams); @@ -923,7 +992,7 @@ mod cuda { // Allocate the LweCiphertext to store the result of the PBS let output_pbs_ct = LweCiphertext::new( Scalar::ZERO, - output_lwe_secret_key.lwe_dimension().to_lwe_size(), + big_lwe_sk_64.lwe_dimension().to_lwe_size(), params.ciphertext_modulus.unwrap(), ); let mut output_pbs_ct_gpu = @@ -981,12 +1050,12 @@ mod cuda { .map(|i| { let mut input_ks_list = LweCiphertextList::new( Scalar::ZERO, - output_lwe_secret_key.lwe_dimension().to_lwe_size(), + big_lwe_sk_64.lwe_dimension().to_lwe_size(), LweCiphertextCount(elements_per_stream), params.ciphertext_modulus.unwrap(), ); encrypt_lwe_ciphertext_list( - &output_lwe_secret_key, + &big_lwe_sk_64, &mut input_ks_list, &plaintext_list, params.lwe_noise_distribution.unwrap(), @@ -1002,10 +1071,10 @@ mod cuda { let output_ks_cts = (0..gpu_count) .map(|i| { let output_ks_list = LweCiphertextList::new( - Scalar::ZERO, - input_lwe_secret_key.lwe_dimension().to_lwe_size(), + KeyswitchScalar::ZERO, + lwe_sk.lwe_dimension().to_lwe_size(), LweCiphertextCount(elements_per_stream), - params.ciphertext_modulus.unwrap(), + ciphertext_modulus_out, ); CudaLweCiphertextList::from_lwe_ciphertext_list( &output_ks_list, @@ -1034,7 +1103,7 @@ mod cuda { .map(|i| { let output_pbs_ct = LweCiphertextList::new( Scalar::ZERO, - output_lwe_secret_key.lwe_dimension().to_lwe_size(), + big_lwe_sk_64.lwe_dimension().to_lwe_size(), LweCiphertextCount(elements_per_stream), params.ciphertext_modulus.unwrap(), ); @@ -1132,12 +1201,17 @@ mod cuda { pub fn cuda_ks_pbs_group() { let mut criterion: Criterion<_> = (Criterion::default()).configure_from_args(); - cuda_ks_pbs(&mut criterion, &benchmark_parameters()); + cuda_ks_pbs::(&mut criterion, &benchmark_parameters()); + cuda_ks_pbs::(&mut criterion, &benchmark_parameters()); } pub fn cuda_multi_bit_ks_pbs_group() { let mut criterion: Criterion<_> = (Criterion::default()).configure_from_args(); - cuda_multi_bit_ks_pbs( + cuda_multi_bit_ks_pbs::( + &mut criterion, + &multi_bit_benchmark_parameters_with_grouping(), + ); + cuda_multi_bit_ks_pbs::( &mut criterion, &multi_bit_benchmark_parameters_with_grouping(), ); diff --git a/tfhe-benchmark/benches/core_crypto/pbs128_bench.rs b/tfhe-benchmark/benches/core_crypto/pbs128_bench.rs index 3c7503252..e0f6a9d7d 100644 
--- a/tfhe-benchmark/benches/core_crypto/pbs128_bench.rs +++ b/tfhe-benchmark/benches/core_crypto/pbs128_bench.rs @@ -250,7 +250,7 @@ mod cuda { } }; - let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new().bootstrap_key(bsk).build(); + let cpu_keys: CpuKeys<_, u64> = CpuKeysBuilder::new().bootstrap_key(bsk).build(); let message_modulus: u64 = 1 << 4; let input_message: u64 = 3; @@ -493,7 +493,7 @@ mod cuda { squash_params.ciphertext_modulus, ); - let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new() + let cpu_keys: CpuKeys<_, u64> = CpuKeysBuilder::new() .multi_bit_bootstrap_key(multi_bit_bsk) .build(); diff --git a/tfhe-benchmark/benches/core_crypto/pbs_bench.rs b/tfhe-benchmark/benches/core_crypto/pbs_bench.rs index 4e8044bd3..d00554e9e 100644 --- a/tfhe-benchmark/benches/core_crypto/pbs_bench.rs +++ b/tfhe-benchmark/benches/core_crypto/pbs_bench.rs @@ -978,7 +978,8 @@ mod cuda { params.ciphertext_modulus.unwrap(), ); - let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new().bootstrap_key(bsk).build(); + let cpu_keys: CpuKeys = + CpuKeysBuilder::new().bootstrap_key(bsk).build(); let bench_id; @@ -1236,7 +1237,7 @@ mod cuda { params.ciphertext_modulus.unwrap(), ); - let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new() + let cpu_keys: CpuKeys = CpuKeysBuilder::new() .multi_bit_bootstrap_key(multi_bit_bsk) .build(); diff --git a/tfhe-benchmark/src/utilities.rs b/tfhe-benchmark/src/utilities.rs index fc16e098c..5c71e69fe 100644 --- a/tfhe-benchmark/src/utilities.rs +++ b/tfhe-benchmark/src/utilities.rs @@ -575,28 +575,28 @@ mod cuda_utils { } /// Computing keys in their CPU flavor. - pub struct CpuKeys { - ksk: Option>, + pub struct CpuKeys { + ksk: Option>, pksk: Option>, bsk: Option>, multi_bit_bsk: Option>, } - impl CpuKeys { - pub fn builder() -> CpuKeysBuilder { + impl CpuKeys { + pub fn builder() -> CpuKeysBuilder { CpuKeysBuilder::new() } } - pub struct CpuKeysBuilder { - ksk: Option>, + pub struct CpuKeysBuilder { + ksk: Option>, pksk: Option>, bsk: Option>, multi_bit_bsk: Option>, } - impl CpuKeysBuilder { - pub fn new() -> CpuKeysBuilder { + impl CpuKeysBuilder { + pub fn new() -> CpuKeysBuilder { Self { ksk: None, pksk: None, @@ -605,7 +605,7 @@ mod cuda_utils { } } - pub fn keyswitch_key(mut self, ksk: LweKeyswitchKeyOwned) -> CpuKeysBuilder { + pub fn keyswitch_key(mut self, ksk: LweKeyswitchKeyOwned) -> CpuKeysBuilder { self.ksk = Some(ksk); self } @@ -613,12 +613,12 @@ mod cuda_utils { pub fn packing_keyswitch_key( mut self, pksk: LwePackingKeyswitchKeyOwned, - ) -> CpuKeysBuilder { + ) -> CpuKeysBuilder { self.pksk = Some(pksk); self } - pub fn bootstrap_key(mut self, bsk: LweBootstrapKeyOwned) -> CpuKeysBuilder { + pub fn bootstrap_key(mut self, bsk: LweBootstrapKeyOwned) -> CpuKeysBuilder { self.bsk = Some(bsk); self } @@ -626,12 +626,12 @@ mod cuda_utils { pub fn multi_bit_bootstrap_key( mut self, mb_bsk: LweMultiBitBootstrapKeyOwned, - ) -> CpuKeysBuilder { + ) -> CpuKeysBuilder { self.multi_bit_bsk = Some(mb_bsk); self } - pub fn build(self) -> CpuKeys { + pub fn build(self) -> CpuKeys { CpuKeys { ksk: self.ksk, pksk: self.pksk, @@ -640,7 +640,7 @@ mod cuda_utils { } } } - impl Default for CpuKeysBuilder { + impl Default for CpuKeysBuilder { fn default() -> Self { Self::new() } @@ -648,17 +648,17 @@ mod cuda_utils { /// Computing keys in their Cuda flavor. 
#[allow(dead_code)] - pub struct CudaLocalKeys { - pub ksk: Option>, + pub struct CudaLocalKeys { + pub ksk: Option>, pub pksk: Option>, pub bsk: Option, pub multi_bit_bsk: Option>, } #[allow(dead_code)] - impl CudaLocalKeys { + impl CudaLocalKeys { pub fn from_cpu_keys( - cpu_keys: &CpuKeys, + cpu_keys: &CpuKeys, ms_noise_reduction: Option, stream: &CudaStreams, ) -> Self { @@ -681,10 +681,10 @@ mod cuda_utils { } /// Instantiate Cuda computing keys to each available GPU. - pub fn cuda_local_keys_core( - cpu_keys: &CpuKeys, + pub fn cuda_local_keys_core( + cpu_keys: &CpuKeys, ms_noise_reduction: Option, - ) -> Vec> { + ) -> Vec> { let gpu_count = get_number_of_gpus() as usize; let mut gpu_keys_vec = Vec::with_capacity(gpu_count); for i in 0..gpu_count { diff --git a/tfhe/src/core_crypto/gpu/algorithms/lwe_multi_bit_programmable_bootstrapping.rs b/tfhe/src/core_crypto/gpu/algorithms/lwe_multi_bit_programmable_bootstrapping.rs index 326ad192c..1143a6140 100644 --- a/tfhe/src/core_crypto/gpu/algorithms/lwe_multi_bit_programmable_bootstrapping.rs +++ b/tfhe/src/core_crypto/gpu/algorithms/lwe_multi_bit_programmable_bootstrapping.rs @@ -10,8 +10,8 @@ use crate::core_crypto::prelude::{CastInto, UnsignedTorus}; /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must not /// be dropped until streams is synchronised #[allow(clippy::too_many_arguments)] -pub unsafe fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_async( - input: &CudaLweCiphertextList, +pub unsafe fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_async( + input: &CudaLweCiphertextList, output: &mut CudaLweCiphertextList, accumulator: &CudaGlweCiphertextList, lut_indexes: &CudaVec, @@ -20,6 +20,7 @@ pub unsafe fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_async multi_bit_bsk: &CudaLweMultiBitBootstrapKey, streams: &CudaStreams, ) where + InputScalar: UnsignedTorus + CastInto, // CastInto required for PBS modulus switch which returns a usize Scalar: UnsignedTorus + CastInto, { @@ -59,21 +60,22 @@ pub unsafe fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_async multi_bit_bsk.polynomial_size(), ); - assert_eq!( - input.ciphertext_modulus(), - output.ciphertext_modulus(), + assert!( + input.ciphertext_modulus().associated_scalar_bits() + <= output.ciphertext_modulus().associated_scalar_bits(), "Mismatched CiphertextModulus between input ({:?}) and output ({:?})", input.ciphertext_modulus(), output.ciphertext_modulus(), ); assert_eq!( - input.ciphertext_modulus(), + output.ciphertext_modulus(), accumulator.ciphertext_modulus(), - "Mismatched CiphertextModulus between input ({:?}) and accumulator ({:?})", + "Mismatched CiphertextModulus between output ({:?}) and accumulator ({:?})", input.ciphertext_modulus(), accumulator.ciphertext_modulus(), ); + assert_eq!( streams.gpu_indexes[0], multi_bit_bsk.d_vec.gpu_index(0), @@ -144,8 +146,8 @@ pub unsafe fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_async } #[allow(clippy::too_many_arguments)] -pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext( - input: &CudaLweCiphertextList, +pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext( + input: &CudaLweCiphertextList, output: &mut CudaLweCiphertextList, accumulator: &CudaGlweCiphertextList, lut_indexes: &CudaVec, @@ -154,6 +156,7 @@ pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext( multi_bit_bsk: &CudaLweMultiBitBootstrapKey, streams: &CudaStreams, ) where + InputScalar: UnsignedTorus + CastInto, // CastInto required for PBS modulus switch 
which returns a usize Scalar: UnsignedTorus + CastInto, { diff --git a/tfhe/src/core_crypto/gpu/algorithms/lwe_programmable_bootstrapping.rs b/tfhe/src/core_crypto/gpu/algorithms/lwe_programmable_bootstrapping.rs index 6406b544c..98842d21c 100644 --- a/tfhe/src/core_crypto/gpu/algorithms/lwe_programmable_bootstrapping.rs +++ b/tfhe/src/core_crypto/gpu/algorithms/lwe_programmable_bootstrapping.rs @@ -12,8 +12,8 @@ use crate::core_crypto::prelude::{CastInto, UnsignedTorus}; /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must not /// be dropped until streams is synchronised #[allow(clippy::too_many_arguments)] -pub unsafe fn cuda_programmable_bootstrap_lwe_ciphertext_async( - input: &CudaLweCiphertextList, +pub unsafe fn cuda_programmable_bootstrap_lwe_ciphertext_async( + input: &CudaLweCiphertextList, output: &mut CudaLweCiphertextList, accumulator: &CudaGlweCiphertextList, lut_indexes: &CudaVec, @@ -22,6 +22,7 @@ pub unsafe fn cuda_programmable_bootstrap_lwe_ciphertext_async( bsk: &CudaLweBootstrapKey, streams: &CudaStreams, ) where + InputScalar: UnsignedTorus + CastInto, // CastInto required for PBS modulus switch which returns a usize Scalar: UnsignedTorus + CastInto, { @@ -259,8 +260,8 @@ pub unsafe fn cuda_programmable_bootstrap_128_lwe_ciphertext_async( } #[allow(clippy::too_many_arguments)] -pub fn cuda_programmable_bootstrap_lwe_ciphertext( - input: &CudaLweCiphertextList, +pub fn cuda_programmable_bootstrap_lwe_ciphertext( + input: &CudaLweCiphertextList, output: &mut CudaLweCiphertextList, accumulator: &CudaGlweCiphertextList, lut_indexes: &CudaVec, @@ -269,6 +270,7 @@ pub fn cuda_programmable_bootstrap_lwe_ciphertext( bsk: &CudaLweBootstrapKey, streams: &CudaStreams, ) where + InputScalar: UnsignedTorus + CastInto, Scalar: UnsignedTorus + CastInto, { unsafe { diff --git a/tfhe/src/core_crypto/gpu/algorithms/test/lwe_programmable_bootstrapping.rs b/tfhe/src/core_crypto/gpu/algorithms/test/lwe_programmable_bootstrapping.rs index 2473aa3bd..11b368708 100644 --- a/tfhe/src/core_crypto/gpu/algorithms/test/lwe_programmable_bootstrapping.rs +++ b/tfhe/src/core_crypto/gpu/algorithms/test/lwe_programmable_bootstrapping.rs @@ -4,8 +4,11 @@ use crate::core_crypto::gpu::lwe_bootstrap_key::{ CudaLweBootstrapKey, CudaModulusSwitchNoiseReductionConfiguration, }; use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList; +use crate::core_crypto::gpu::lwe_keyswitch_key::CudaLweKeyswitchKey; use crate::core_crypto::gpu::vec::{CudaVec, GpuIndex}; -use crate::core_crypto::gpu::{cuda_programmable_bootstrap_lwe_ciphertext, CudaStreams}; +use crate::core_crypto::gpu::{ + cuda_keyswitch_lwe_ciphertext, cuda_programmable_bootstrap_lwe_ciphertext, CudaStreams, +}; use crate::core_crypto::prelude::misc::check_encrypted_content_respects_mod; use itertools::Itertools; @@ -174,6 +177,218 @@ fn lwe_encrypt_pbs_decrypt< } } +fn lwe_ks_encrypt_pbs_decrypt_compare_32_64< + Scalar: UnsignedTorus + Sync + Send + CastFrom + CastInto + CastFrom + CastInto, +>( + params: ClassicTestParams, +) { + assert!(Scalar::BITS <= 64); + + let input_lwe_dimension = params.lwe_dimension; + let lwe_noise_distribution = params.lwe_noise_distribution; + let glwe_noise_distribution = params.glwe_noise_distribution; + let ciphertext_modulus = params.ciphertext_modulus; + let message_modulus_log = params.message_modulus_log; + let msg_modulus = Scalar::ONE.shl(message_modulus_log.0); + let encoding_with_padding = get_encoding_with_padding(ciphertext_modulus); + let 
diff --git a/tfhe/src/core_crypto/gpu/algorithms/test/lwe_programmable_bootstrapping.rs b/tfhe/src/core_crypto/gpu/algorithms/test/lwe_programmable_bootstrapping.rs
index 2473aa3bd..11b368708 100644
--- a/tfhe/src/core_crypto/gpu/algorithms/test/lwe_programmable_bootstrapping.rs
+++ b/tfhe/src/core_crypto/gpu/algorithms/test/lwe_programmable_bootstrapping.rs
@@ -4,8 +4,11 @@ use crate::core_crypto::gpu::lwe_bootstrap_key::{
     CudaLweBootstrapKey, CudaModulusSwitchNoiseReductionConfiguration,
 };
 use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
+use crate::core_crypto::gpu::lwe_keyswitch_key::CudaLweKeyswitchKey;
 use crate::core_crypto::gpu::vec::{CudaVec, GpuIndex};
-use crate::core_crypto::gpu::{cuda_programmable_bootstrap_lwe_ciphertext, CudaStreams};
+use crate::core_crypto::gpu::{
+    cuda_keyswitch_lwe_ciphertext, cuda_programmable_bootstrap_lwe_ciphertext, CudaStreams,
+};
 use crate::core_crypto::prelude::misc::check_encrypted_content_respects_mod;
 use itertools::Itertools;
@@ -174,6 +177,218 @@ fn lwe_encrypt_pbs_decrypt<
     }
 }

+fn lwe_ks_encrypt_pbs_decrypt_compare_32_64<
+    Scalar: UnsignedTorus + Sync + Send + CastFrom<usize> + CastInto<usize> + CastFrom<u32> + CastInto<u32>,
+>(
+    params: ClassicTestParams<Scalar>,
+) {
+    assert!(Scalar::BITS <= 64);
+
+    let input_lwe_dimension = params.lwe_dimension;
+    let lwe_noise_distribution = params.lwe_noise_distribution;
+    let glwe_noise_distribution = params.glwe_noise_distribution;
+    let ciphertext_modulus = params.ciphertext_modulus;
+    let message_modulus_log = params.message_modulus_log;
+    let msg_modulus = Scalar::ONE.shl(message_modulus_log.0);
+    let encoding_with_padding = get_encoding_with_padding(ciphertext_modulus);
+    let glwe_dimension = params.glwe_dimension;
+    let polynomial_size = params.polynomial_size;
+    let decomp_base_log = params.pbs_base_log;
+    let decomp_level_count = params.pbs_level;
+
+    let gpu_index = 0;
+    let stream = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
+
+    let mut rsc = TestResources::new();
+
+    let f = |x: Scalar| {
+        x.wrapping_mul(Scalar::TWO)
+            .wrapping_sub(Scalar::ONE)
+            .wrapping_rem(msg_modulus)
+    };
+
+    let delta: Scalar = encoding_with_padding / msg_modulus;
+    let mut msg = msg_modulus;
+    const NB_TESTS: usize = 10;
+    let number_of_messages = 1;
+
+    let accumulator = generate_programmable_bootstrap_glwe_lut(
+        polynomial_size,
+        glwe_dimension.to_glwe_size(),
+        msg_modulus.cast_into(),
+        ciphertext_modulus,
+        delta,
+        f,
+    );
+
+    assert!(check_encrypted_content_respects_mod(
+        &accumulator,
+        ciphertext_modulus
+    ));
+
+    let lwe_noise_distribution_ksk_32 = match params.lwe_noise_distribution {
+        DynamicDistribution::Gaussian(gaussian_lwe_noise_distribution) => {
+            DynamicDistribution::<u32>::new_gaussian(gaussian_lwe_noise_distribution.standard_dev())
+        }
+        DynamicDistribution::TUniform(uniform_lwe_noise_distribution) => {
+            // A TUniform bound is a bit count over the integer torus: moving from a
+            // 64-bit to a 32-bit modulus drops the 32 least significant bits, so the
+            // equivalent bound shrinks by 32.
+            DynamicDistribution::<u32>::new_t_uniform(
+                uniform_lwe_noise_distribution.bound_log2() - 32,
+            )
+        }
+    };
+
+    // Create the LweSecretKey
+    let intermediate_lwe_secret_key_32: LweSecretKeyOwned<u32> =
+        allocate_and_generate_new_binary_lwe_secret_key(
+            input_lwe_dimension,
+            &mut rsc.secret_random_generator,
+        ); // lwe_sk
+
+    let glwe_sk_64: GlweSecretKeyOwned<Scalar> = allocate_and_generate_new_binary_glwe_secret_key(
+        params.glwe_dimension,
+        params.polynomial_size,
+        &mut rsc.secret_random_generator,
+    );
+
+    let big_lwe_sk_64 = glwe_sk_64.clone().into_lwe_secret_key();
+
+    let ksk_big_to_small = allocate_and_generate_new_lwe_keyswitch_key(
+        &big_lwe_sk_64,
+        &intermediate_lwe_secret_key_32,
+        params.ks_base_log,
+        params.ks_level,
+        lwe_noise_distribution_ksk_32,
+        CiphertextModulus::<u32>::new_native(),
+        &mut rsc.encryption_random_generator,
+    );
+    let d_ksk_big_to_small =
+        CudaLweKeyswitchKey::from_lwe_keyswitch_key(&ksk_big_to_small, &stream);
+
+    let ciphertext_modulus_ks32 = CiphertextModulus::<u32>::new_native();
+
+    let output_lwe_dimension = big_lwe_sk_64.lwe_dimension();
+
+    let mut bsk = LweBootstrapKey::new(
+        Scalar::ZERO,
+        glwe_dimension.to_glwe_size(),
+        polynomial_size,
+        decomp_base_log,
+        decomp_level_count,
+        input_lwe_dimension,
+        ciphertext_modulus,
+    );
+
+    par_generate_lwe_bootstrap_key(
+        &intermediate_lwe_secret_key_32,
+        &glwe_sk_64,
+        &mut bsk,
+        glwe_noise_distribution,
+        &mut rsc.encryption_random_generator,
+    );
+
+    assert!(check_encrypted_content_respects_mod(
+        &*bsk,
+        ciphertext_modulus
+    ));
+
+    let d_bsk = CudaLweBootstrapKey::from_lwe_bootstrap_key(&bsk, None, &stream);
+
+    while msg != Scalar::ZERO {
+        msg = msg.wrapping_sub(Scalar::ONE);
+        for _ in 0..NB_TESTS {
+            let plaintext = Plaintext(msg * delta);
+
+            let lwe_ciphertext_in = allocate_and_encrypt_new_lwe_ciphertext(
+                &big_lwe_sk_64,
+                plaintext,
+                lwe_noise_distribution,
+                ciphertext_modulus,
+                &mut rsc.encryption_random_generator,
+            );
+
+            assert!(check_encrypted_content_respects_mod(
+                &lwe_ciphertext_in,
+                ciphertext_modulus
+            ));
+
+            let d_lwe_ciphertext_in =
+                CudaLweCiphertextList::from_lwe_ciphertext(&lwe_ciphertext_in, &stream);
+
+            let output_ks_ct: LweCiphertextOwned<u32> = LweCiphertext::new(
+                0u32,
+                intermediate_lwe_secret_key_32.lwe_dimension().to_lwe_size(),
+                ciphertext_modulus_ks32,
+            );
+            let mut output_ks_ct_gpu =
+                CudaLweCiphertextList::from_lwe_ciphertext(&output_ks_ct, &stream);
+
+            let mut d_out_pbs_ct = CudaLweCiphertextList::new(
+                output_lwe_dimension,
+                LweCiphertextCount(1),
+                ciphertext_modulus,
+                &stream,
+            );
+
+            let d_accumulator = CudaGlweCiphertextList::from_glwe_ciphertext(&accumulator, &stream);
+
+            let test_vector_indexes: Vec<Scalar> = vec![Scalar::ZERO; number_of_messages];
+
+            let mut d_test_vector_indexes =
+                unsafe { CudaVec::<Scalar>::new_async(number_of_messages, &stream, 0) };
+            unsafe { d_test_vector_indexes.copy_from_cpu_async(&test_vector_indexes, &stream, 0) };
+
+            let num_blocks = d_lwe_ciphertext_in.0.lwe_ciphertext_count.0;
+            let lwe_indexes_usize: Vec<usize> = (0..num_blocks).collect_vec();
+            let lwe_indexes = lwe_indexes_usize
+                .iter()
+                .map(|&x| <usize as CastInto<Scalar>>::cast_into(x))
+                .collect_vec();
+            let mut d_output_indexes =
+                unsafe { CudaVec::<Scalar>::new_async(num_blocks, &stream, 0) };
+            let mut d_input_indexes =
+                unsafe { CudaVec::<Scalar>::new_async(num_blocks, &stream, 0) };
+            unsafe {
+                d_input_indexes.copy_from_cpu_async(&lwe_indexes, &stream, 0);
+                d_output_indexes.copy_from_cpu_async(&lwe_indexes, &stream, 0);
+            }
+
+            cuda_keyswitch_lwe_ciphertext(
+                &d_ksk_big_to_small,
+                &d_lwe_ciphertext_in,
+                &mut output_ks_ct_gpu,
+                &d_input_indexes,
+                &d_output_indexes,
+                true,
+                &stream,
+                false,
+            );
+
+            cuda_programmable_bootstrap_lwe_ciphertext(
+                &output_ks_ct_gpu,
+                &mut d_out_pbs_ct,
+                &d_accumulator,
+                &d_test_vector_indexes,
+                &d_output_indexes,
+                &d_input_indexes,
+                &d_bsk,
+                &stream,
+            );
+
+            let out_pbs_ct = d_out_pbs_ct.into_lwe_ciphertext(&stream);
+            assert!(check_encrypted_content_respects_mod(
+                &out_pbs_ct,
+                ciphertext_modulus
+            ));
+
+            let decrypted = decrypt_lwe_ciphertext(&big_lwe_sk_64, &out_pbs_ct);
+
+            let decoded = round_decode(decrypted.0, delta) % msg_modulus;
+
+            assert_eq!(decoded, f(msg));
+        }
+    }
+}
+
 fn lwe_encrypt_centered_ms_pbs_decrypt<
     Scalar: UnsignedTorus + Sync + Send + CastFrom<usize> + CastInto<usize>,
 >(
@@ -327,6 +542,6 @@ fn lwe_encrypt_centered_ms_pbs_decrypt<
         }
     }
 }
-
+create_gpu_parameterized_test!(lwe_ks_encrypt_pbs_decrypt_compare_32_64);
 create_gpu_parameterized_test!(lwe_encrypt_pbs_decrypt);
 create_gpu_parameterized_test!(lwe_encrypt_centered_ms_pbs_decrypt);
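One subtlety in the test above is the keyswitch key's noise distribution. A TUniform bound is expressed as a bit count relative to the integer representation of the torus, so re-expressing the same torus distribution over a 32-bit modulus instead of a 64-bit one shifts the bound down by 64 - 32 = 32 bits, which is the `bound_log2() - 32` adjustment; the Gaussian branch needs no adjustment, presumably because its standard deviation is already expressed relative to the torus. A toy check with a hypothetical bound:

```rust
fn main() {
    // Hypothetical TUniform bound for a 64-bit torus: noise uniform in
    // [-2^41, 2^41] out of 2^64, i.e. the top 64 - 41 = 23 bits are clean.
    let bound_log2_64: i64 = 41;

    // The same torus distribution over a 2^32 modulus keeps 23 clean top
    // bits, so the bound measured in integer bits drops by 64 - 32 = 32.
    let bound_log2_32 = bound_log2_64 - 32;

    assert_eq!(64 - bound_log2_64, 32 - bound_log2_32);
    assert_eq!(bound_log2_32, 9);
}
```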
diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs
index 9d8743cca..83bcde145 100644
--- a/tfhe/src/core_crypto/gpu/mod.rs
+++ b/tfhe/src/core_crypto/gpu/mod.rs
@@ -166,13 +166,13 @@ impl Drop for CudaStreams {
 /// [CudaStreams::synchronize] __must__ be called as soon as synchronization is
 /// required
 #[allow(clippy::too_many_arguments)]
-pub unsafe fn programmable_bootstrap_async<T: UnsignedInteger>(
+pub unsafe fn programmable_bootstrap_async<T: UnsignedInteger, KST: UnsignedInteger>(
     streams: &CudaStreams,
     lwe_array_out: &mut CudaVec<T>,
     lwe_out_indexes: &CudaVec<T>,
     test_vector: &CudaVec<T>,
     test_vector_indexes: &CudaVec<T>,
-    lwe_array_in: &CudaVec<T>,
+    lwe_array_in: &CudaVec<KST>,
     lwe_in_indexes: &CudaVec<T>,
     bootstrapping_key: &CudaVec<f64>,
     lwe_dimension: LweDimension,
@@ -193,45 +193,89 @@ pub unsafe fn programmable_bootstrap_async(
         PBSMSNoiseReductionType::Centered
     });

-    scratch_cuda_programmable_bootstrap_64(
-        streams.ptr[0],
-        streams.gpu_indexes[0].get(),
-        std::ptr::addr_of_mut!(pbs_buffer),
-        lwe_dimension.0 as u32,
-        glwe_dimension.0 as u32,
-        polynomial_size.0 as u32,
-        level.0 as u32,
-        num_samples,
-        true,
-        noise_reduction_type as u32,
-    );
+    if KST::BITS == 32 {
+        scratch_cuda_programmable_bootstrap_32_64(
+            streams.ptr[0],
+            streams.gpu_indexes[0].get(),
+            std::ptr::addr_of_mut!(pbs_buffer),
+            lwe_dimension.0 as u32,
+            glwe_dimension.0 as u32,
+            polynomial_size.0 as u32,
+            level.0 as u32,
+            num_samples,
+            true,
+            noise_reduction_type as u32,
+        );

-    cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
-        streams.ptr[0],
-        streams.gpu_indexes[0].get(),
-        lwe_array_out.as_mut_c_ptr(0),
-        lwe_out_indexes.as_c_ptr(0),
-        test_vector.as_c_ptr(0),
-        test_vector_indexes.as_c_ptr(0),
-        lwe_array_in.as_c_ptr(0),
-        lwe_in_indexes.as_c_ptr(0),
-        bootstrapping_key.as_c_ptr(0),
-        pbs_buffer,
-        lwe_dimension.0 as u32,
-        glwe_dimension.0 as u32,
-        polynomial_size.0 as u32,
-        base_log.0 as u32,
-        level.0 as u32,
-        num_samples,
-        num_many_lut,
-        lut_stride,
-    );
+        cuda_programmable_bootstrap_lwe_ciphertext_vector_32_64(
+            streams.ptr[0],
+            streams.gpu_indexes[0].get(),
+            lwe_array_out.as_mut_c_ptr(0),
+            lwe_out_indexes.as_c_ptr(0),
+            test_vector.as_c_ptr(0),
+            test_vector_indexes.as_c_ptr(0),
+            lwe_array_in.as_c_ptr(0),
+            lwe_in_indexes.as_c_ptr(0),
+            bootstrapping_key.as_c_ptr(0),
+            pbs_buffer,
+            lwe_dimension.0 as u32,
+            glwe_dimension.0 as u32,
+            polynomial_size.0 as u32,
+            base_log.0 as u32,
+            level.0 as u32,
+            num_samples,
+            num_many_lut,
+            lut_stride,
+        );

-    cleanup_cuda_programmable_bootstrap(
-        streams.ptr[0],
-        streams.gpu_indexes[0].get(),
-        std::ptr::addr_of_mut!(pbs_buffer),
-    );
+        cleanup_cuda_programmable_bootstrap(
+            streams.ptr[0],
+            streams.gpu_indexes[0].get(),
+            std::ptr::addr_of_mut!(pbs_buffer),
+        );
+    } else if KST::BITS == 64 {
+        scratch_cuda_programmable_bootstrap_64(
+            streams.ptr[0],
+            streams.gpu_indexes[0].get(),
+            std::ptr::addr_of_mut!(pbs_buffer),
+            lwe_dimension.0 as u32,
+            glwe_dimension.0 as u32,
+            polynomial_size.0 as u32,
+            level.0 as u32,
+            num_samples,
+            true,
+            noise_reduction_type as u32,
+        );
+
+        cuda_programmable_bootstrap_lwe_ciphertext_vector_64_64(
+            streams.ptr[0],
+            streams.gpu_indexes[0].get(),
+            lwe_array_out.as_mut_c_ptr(0),
+            lwe_out_indexes.as_c_ptr(0),
+            test_vector.as_c_ptr(0),
+            test_vector_indexes.as_c_ptr(0),
+            lwe_array_in.as_c_ptr(0),
+            lwe_in_indexes.as_c_ptr(0),
+            bootstrapping_key.as_c_ptr(0),
+            pbs_buffer,
+            lwe_dimension.0 as u32,
+            glwe_dimension.0 as u32,
+            polynomial_size.0 as u32,
+            base_log.0 as u32,
+            level.0 as u32,
+            num_samples,
+            num_many_lut,
+            lut_stride,
+        );
+
+        cleanup_cuda_programmable_bootstrap(
+            streams.ptr[0],
+            streams.gpu_indexes[0].get(),
+            std::ptr::addr_of_mut!(pbs_buffer),
+        );
+    } else {
+        panic!("Unsupported PBS input bitwidth {}", KST::BITS);
+    }
 }

 #[allow(clippy::too_many_arguments)]
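The dispatch above branches on `KST::BITS`, an associated constant, so the width of the keyswitched input selects the FFI entry point at monomorphization time and the untaken branch is dead code. A minimal stand-in for that pattern (the trait here is a stand-in for the crate's `UnsignedInteger`, and the returned strings simply echo the FFI symbols used above):

```rust
trait UnsignedIntegerLike {
    const BITS: u32;
}
impl UnsignedIntegerLike for u32 {
    const BITS: u32 = 32;
}
impl UnsignedIntegerLike for u64 {
    const BITS: u32 = 64;
}

// Same shape as the dispatch in programmable_bootstrap_async: the input
// width picks the entry point, everything else stays identical.
fn pbs_entry_point<KST: UnsignedIntegerLike>() -> &'static str {
    match KST::BITS {
        32 => "cuda_programmable_bootstrap_lwe_ciphertext_vector_32_64",
        64 => "cuda_programmable_bootstrap_lwe_ciphertext_vector_64_64",
        other => panic!("Unsupported PBS input bitwidth {other}"),
    }
}

fn main() {
    assert!(pbs_entry_point::<u32>().ends_with("_32_64"));
    assert!(pbs_entry_point::<u64>().ends_with("_64_64"));
}
```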
@@ -348,6 +392,7 @@ pub unsafe fn programmable_bootstrap_128_async(
 #[allow(clippy::too_many_arguments)]
 pub unsafe fn programmable_bootstrap_multi_bit_async<
     T: UnsignedInteger,
+    KST: UnsignedInteger,
     B: Any + UnsignedInteger,
 >(
     streams: &CudaStreams,
@@ -355,7 +400,7 @@ pub unsafe fn programmable_bootstrap_multi_bit_async<
     output_indexes: &CudaVec<T>,
     test_vector: &CudaVec<T>,
     test_vector_indexes: &CudaVec<T>,
-    lwe_array_in: &CudaVec<T>,
+    lwe_array_in: &CudaVec<KST>,
     input_indexes: &CudaVec<T>,
     bootstrapping_key: &CudaVec<B>,
     lwe_dimension: LweDimension,
@@ -406,42 +451,83 @@ pub unsafe fn programmable_bootstrap_multi_bit_async<
             std::ptr::addr_of_mut!(pbs_buffer),
         );
     } else if TypeId::of::<T>() == TypeId::of::<u64>() {
-        scratch_cuda_multi_bit_programmable_bootstrap_64(
-            streams.ptr[0],
-            streams.gpu_indexes[0].get(),
-            std::ptr::addr_of_mut!(pbs_buffer),
-            glwe_dimension.0 as u32,
-            polynomial_size.0 as u32,
-            level.0 as u32,
-            num_samples,
-            true,
-        );
-        cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
-            streams.ptr[0],
-            streams.gpu_indexes[0].get(),
-            lwe_array_out.as_mut_c_ptr(0),
-            output_indexes.as_c_ptr(0),
-            test_vector.as_c_ptr(0),
-            test_vector_indexes.as_c_ptr(0),
-            lwe_array_in.as_c_ptr(0),
-            input_indexes.as_c_ptr(0),
-            bootstrapping_key.as_c_ptr(0),
-            pbs_buffer,
-            lwe_dimension.0 as u32,
-            glwe_dimension.0 as u32,
-            polynomial_size.0 as u32,
-            grouping_factor.0 as u32,
-            base_log.0 as u32,
-            level.0 as u32,
-            num_samples,
-            num_many_lut,
-            lut_stride,
-        );
-        cleanup_cuda_multi_bit_programmable_bootstrap(
-            streams.ptr[0],
-            streams.gpu_indexes[0].get(),
-            std::ptr::addr_of_mut!(pbs_buffer),
-        );
+        if TypeId::of::<KST>() == TypeId::of::<u32>() {
+            scratch_cuda_multi_bit_programmable_bootstrap_32_64(
+                streams.ptr[0],
+                streams.gpu_indexes[0].get(),
+                std::ptr::addr_of_mut!(pbs_buffer),
+                glwe_dimension.0 as u32,
+                polynomial_size.0 as u32,
+                level.0 as u32,
+                num_samples,
+                true,
+            );
+            cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_32_64(
+                streams.ptr[0],
+                streams.gpu_indexes[0].get(),
+                lwe_array_out.as_mut_c_ptr(0),
+                output_indexes.as_c_ptr(0),
+                test_vector.as_c_ptr(0),
+                test_vector_indexes.as_c_ptr(0),
+                lwe_array_in.as_c_ptr(0),
+                input_indexes.as_c_ptr(0),
+                bootstrapping_key.as_c_ptr(0),
+                pbs_buffer,
+                lwe_dimension.0 as u32,
+                glwe_dimension.0 as u32,
+                polynomial_size.0 as u32,
+                grouping_factor.0 as u32,
+                base_log.0 as u32,
+                level.0 as u32,
+                num_samples,
+                num_many_lut,
+                lut_stride,
+            );
+            cleanup_cuda_multi_bit_programmable_bootstrap(
+                streams.ptr[0],
+                streams.gpu_indexes[0].get(),
+                std::ptr::addr_of_mut!(pbs_buffer),
+            );
+        } else if TypeId::of::<KST>() == TypeId::of::<u64>() {
+            scratch_cuda_multi_bit_programmable_bootstrap_64(
+                streams.ptr[0],
+                streams.gpu_indexes[0].get(),
+                std::ptr::addr_of_mut!(pbs_buffer),
+                glwe_dimension.0 as u32,
+                polynomial_size.0 as u32,
+                level.0 as u32,
+                num_samples,
+                true,
+            );
+            cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
+                streams.ptr[0],
+                streams.gpu_indexes[0].get(),
+                lwe_array_out.as_mut_c_ptr(0),
+                output_indexes.as_c_ptr(0),
+                test_vector.as_c_ptr(0),
+                test_vector_indexes.as_c_ptr(0),
+                lwe_array_in.as_c_ptr(0),
+                input_indexes.as_c_ptr(0),
+                bootstrapping_key.as_c_ptr(0),
+                pbs_buffer,
+                lwe_dimension.0 as u32,
+                glwe_dimension.0 as u32,
+                polynomial_size.0 as u32,
+                grouping_factor.0 as u32,
+                base_log.0 as u32,
+                level.0 as u32,
+                num_samples,
+                num_many_lut,
+                lut_stride,
+            );
+            cleanup_cuda_multi_bit_programmable_bootstrap(
+                streams.ptr[0],
+                streams.gpu_indexes[0].get(),
+                std::ptr::addr_of_mut!(pbs_buffer),
+            );
+        } else {
+            panic!("Unsupported MB PBS input torus size");
+        }
     } else {
         panic!("Unsupported torus size")
     }
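The multi-bit path reaches the same outcome with `TypeId` comparisons instead of an associated constant; both scalar types are inspected, with the output torus checked before the input torus. A self-contained model of that nested dispatch (the returned strings echo the FFI symbols above; the structure, not the strings, is the point):

```rust
use std::any::TypeId;

// Nested type dispatch, as in programmable_bootstrap_multi_bit_async:
// the output torus T is checked first, then the input torus KST.
fn multi_bit_entry_point<T: 'static, KST: 'static>() -> &'static str {
    if TypeId::of::<T>() == TypeId::of::<u64>() {
        if TypeId::of::<KST>() == TypeId::of::<u32>() {
            "cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_32_64"
        } else if TypeId::of::<KST>() == TypeId::of::<u64>() {
            "cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64"
        } else {
            panic!("Unsupported MB PBS input torus size")
        }
    } else {
        panic!("Unsupported torus size")
    }
}

fn main() {
    assert!(multi_bit_entry_point::<u64, u32>().ends_with("_32_64"));
    assert!(multi_bit_entry_point::<u64, u64>().ends_with("_64"));
}
```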