chore(gpu): internal renaming

Agnes Leroy, 2025-10-13 13:59:18 +02:00
committed by Agnès Leroy
parent 6347f25668
commit c3ed1a7558
70 changed files with 1920 additions and 1965 deletions
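
Almost every change below follows one mechanical pattern: the _kb suffix is dropped from the scratch/compute/cleanup entry points and host helpers, and the integer_radix_ infix is removed where the shorter name stays unambiguous (some symbols keep it, e.g. the radix multiplication and division entry points). A minimal sketch of that rule, with rename pairs taken verbatim from the hunks below; the helper itself is illustrative only, not part of the codebase:

#include <cassert>
#include <regex>
#include <string>

// Illustrative helper mirroring this commit's rename rule (not library code).
static std::string rename_symbol(std::string s) {
  s = std::regex_replace(s, std::regex("_kb"), "");
  s = std::regex_replace(s, std::regex("integer_radix_"), "");
  return s;
}

int main() {
  // Both pairs appear verbatim in the diff below.
  assert(rename_symbol("scratch_cuda_apply_univariate_lut_kb_64") ==
         "scratch_cuda_apply_univariate_lut_64");
  assert(rename_symbol("cuda_integer_radix_logical_scalar_shift_kb_64_inplace") ==
         "cuda_logical_scalar_shift_64_inplace");
  return 0;
}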

View File

@@ -106,7 +106,7 @@ typedef struct {
uint32_t polynomial_size;
} CudaPackedGlweCiphertextListFFI;
-uint64_t scratch_cuda_apply_univariate_lut_kb_64(
+uint64_t scratch_cuda_apply_univariate_lut_64(
CudaStreamsFFI streams, int8_t **mem_ptr, void const *input_lut,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
@@ -114,7 +114,7 @@ uint64_t scratch_cuda_apply_univariate_lut_kb_64(
uint32_t input_lwe_ciphertext_count, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, uint64_t lut_degree,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
-uint64_t scratch_cuda_apply_many_univariate_lut_kb_64(
+uint64_t scratch_cuda_apply_many_univariate_lut_64(
CudaStreamsFFI streams, int8_t **mem_ptr, void const *input_lut,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
@@ -122,15 +122,16 @@ uint64_t scratch_cuda_apply_many_univariate_lut_kb_64(
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
uint32_t num_many_lut, uint64_t lut_degree, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_apply_univariate_lut_kb_64(
-CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
-CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
-void *const *ksks, void *const *bsks);
+void cuda_apply_univariate_lut_64(CudaStreamsFFI streams,
+CudaRadixCiphertextFFI *output_radix_lwe,
+CudaRadixCiphertextFFI const *input_radix_lwe,
+int8_t *mem_ptr, void *const *ksks,
+void *const *bsks);
-void cleanup_cuda_apply_univariate_lut_kb_64(CudaStreamsFFI streams,
-int8_t **mem_ptr_void);
+void cleanup_cuda_apply_univariate_lut_64(CudaStreamsFFI streams,
+int8_t **mem_ptr_void);
-uint64_t scratch_cuda_apply_bivariate_lut_kb_64(
+uint64_t scratch_cuda_apply_bivariate_lut_64(
CudaStreamsFFI streams, int8_t **mem_ptr, void const *input_lut,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
@@ -139,17 +140,17 @@ uint64_t scratch_cuda_apply_bivariate_lut_kb_64(
uint32_t carry_modulus, PBS_TYPE pbs_type, uint64_t lut_degree,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_apply_bivariate_lut_kb_64(
+void cuda_apply_bivariate_lut_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI const *input_radix_lwe_1,
CudaRadixCiphertextFFI const *input_radix_lwe_2, int8_t *mem_ptr,
void *const *ksks, void *const *bsks, uint32_t num_radix_blocks,
uint32_t shift);
-void cleanup_cuda_apply_bivariate_lut_kb_64(CudaStreamsFFI streams,
-int8_t **mem_ptr_void);
+void cleanup_cuda_apply_bivariate_lut_64(CudaStreamsFFI streams,
+int8_t **mem_ptr_void);
-void cuda_apply_many_univariate_lut_kb_64(
+void cuda_apply_many_univariate_lut_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
void *const *ksks, void *const *bsks, uint32_t num_luts,
@@ -171,7 +172,7 @@ void cuda_full_propagation_64_inplace(CudaStreamsFFI streams,
void cleanup_cuda_full_propagation(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
-uint64_t scratch_cuda_integer_mult_radix_ciphertext_kb_64(
+uint64_t scratch_cuda_integer_mult_radix_ciphertext_64(
CudaStreamsFFI streams, int8_t **mem_ptr, bool const is_boolean_left,
bool const is_boolean_right, uint32_t message_modulus,
uint32_t carry_modulus, uint32_t glwe_dimension, uint32_t lwe_dimension,
@@ -180,7 +181,7 @@ uint64_t scratch_cuda_integer_mult_radix_ciphertext_kb_64(
uint32_t num_blocks, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_integer_mult_radix_ciphertext_kb_64(
+void cuda_integer_mult_radix_ciphertext_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI const *radix_lwe_left, bool const is_bool_left,
CudaRadixCiphertextFFI const *radix_lwe_right, bool const is_bool_right,
@@ -189,17 +190,18 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
void cleanup_cuda_integer_mult(CudaStreamsFFI streams, int8_t **mem_ptr_void);
-void cuda_negate_integer_radix_ciphertext_64(
-CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
-CudaRadixCiphertextFFI const *lwe_array_in, uint32_t message_modulus,
-uint32_t carry_modulus, uint32_t num_radix_blocks);
+void cuda_negate_ciphertext_64(CudaStreamsFFI streams,
+CudaRadixCiphertextFFI *lwe_array_out,
+CudaRadixCiphertextFFI const *lwe_array_in,
+uint32_t message_modulus, uint32_t carry_modulus,
+uint32_t num_radix_blocks);
-void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
+void cuda_scalar_addition_ciphertext_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
void const *scalar_input, void const *h_scalar_input, uint32_t num_scalars,
uint32_t message_modulus, uint32_t carry_modulus);
-uint64_t scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
+uint64_t scratch_cuda_logical_scalar_shift_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -208,11 +210,12 @@ uint64_t scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_integer_radix_logical_scalar_shift_kb_64_inplace(
-CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
-int8_t *mem_ptr, void *const *bsks, void *const *ksks);
+void cuda_logical_scalar_shift_64_inplace(CudaStreamsFFI streams,
+CudaRadixCiphertextFFI *lwe_array,
+uint32_t shift, int8_t *mem_ptr,
+void *const *bsks, void *const *ksks);
-uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
+uint64_t scratch_cuda_arithmetic_scalar_shift_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -221,17 +224,19 @@ uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace(
-CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
-int8_t *mem_ptr, void *const *bsks, void *const *ksks);
+void cuda_arithmetic_scalar_shift_64_inplace(CudaStreamsFFI streams,
+CudaRadixCiphertextFFI *lwe_array,
+uint32_t shift, int8_t *mem_ptr,
+void *const *bsks,
+void *const *ksks);
-void cleanup_cuda_integer_radix_logical_scalar_shift(CudaStreamsFFI streams,
-int8_t **mem_ptr_void);
+void cleanup_cuda_logical_scalar_shift(CudaStreamsFFI streams,
+int8_t **mem_ptr_void);
-void cleanup_cuda_integer_radix_arithmetic_scalar_shift(CudaStreamsFFI streams,
-int8_t **mem_ptr_void);
+void cleanup_cuda_arithmetic_scalar_shift(CudaStreamsFFI streams,
+int8_t **mem_ptr_void);
-uint64_t scratch_cuda_integer_radix_shift_and_rotate_kb_64(
+uint64_t scratch_cuda_shift_and_rotate_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -240,15 +245,16 @@ uint64_t scratch_cuda_integer_radix_shift_and_rotate_kb_64(
PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type, bool is_signed,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_integer_radix_shift_and_rotate_kb_64_inplace(
-CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
-CudaRadixCiphertextFFI const *lwe_shift, int8_t *mem_ptr, void *const *bsks,
-void *const *ksks);
+void cuda_shift_and_rotate_64_inplace(CudaStreamsFFI streams,
+CudaRadixCiphertextFFI *lwe_array,
+CudaRadixCiphertextFFI const *lwe_shift,
+int8_t *mem_ptr, void *const *bsks,
+void *const *ksks);
-void cleanup_cuda_integer_radix_shift_and_rotate(CudaStreamsFFI streams,
-int8_t **mem_ptr_void);
+void cleanup_cuda_shift_and_rotate(CudaStreamsFFI streams,
+int8_t **mem_ptr_void);
-uint64_t scratch_cuda_integer_radix_comparison_kb_64(
+uint64_t scratch_cuda_comparison_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -258,13 +264,14 @@ uint64_t scratch_cuda_integer_radix_comparison_kb_64(
bool is_signed, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_comparison_integer_radix_ciphertext_kb_64(
-CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
-CudaRadixCiphertextFFI const *lwe_array_1,
-CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
-void *const *bsks, void *const *ksks);
+void cuda_comparison_ciphertext_64(CudaStreamsFFI streams,
+CudaRadixCiphertextFFI *lwe_array_out,
+CudaRadixCiphertextFFI const *lwe_array_1,
+CudaRadixCiphertextFFI const *lwe_array_2,
+int8_t *mem_ptr, void *const *bsks,
+void *const *ksks);
-void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
+void cuda_scalar_comparison_ciphertext_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, void const *scalar_blocks,
void const *h_scalar_blocks, int8_t *mem_ptr, void *const *bsks,
@@ -273,7 +280,7 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
void cleanup_cuda_integer_comparison(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
-uint64_t scratch_cuda_integer_radix_bitop_kb_64(
+uint64_t scratch_cuda_bitop_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -282,13 +289,14 @@ uint64_t scratch_cuda_integer_radix_bitop_kb_64(
uint32_t carry_modulus, PBS_TYPE pbs_type, BITOP_TYPE op_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_bitop_integer_radix_ciphertext_kb_64(
-CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
-CudaRadixCiphertextFFI const *lwe_array_1,
-CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
-void *const *bsks, void *const *ksks);
+void cuda_bitop_ciphertext_64(CudaStreamsFFI streams,
+CudaRadixCiphertextFFI *lwe_array_out,
+CudaRadixCiphertextFFI const *lwe_array_1,
+CudaRadixCiphertextFFI const *lwe_array_2,
+int8_t *mem_ptr, void *const *bsks,
+void *const *ksks);
-void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
+void cuda_scalar_bitop_ciphertext_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_input, void const *clear_blocks,
void const *h_clear_blocks, uint32_t num_clear_blocks, int8_t *mem_ptr,
@@ -296,26 +304,28 @@ void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
void cleanup_cuda_integer_bitop(CudaStreamsFFI streams, int8_t **mem_ptr_void);
-uint64_t scratch_cuda_integer_radix_cmux_kb_64(
-CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-uint32_t polynomial_size, uint32_t big_lwe_dimension,
-uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
-PBS_MS_REDUCTION_T noise_reduction_type);
+uint64_t scratch_cuda_cmux_64(CudaStreamsFFI streams, int8_t **mem_ptr,
+uint32_t glwe_dimension, uint32_t polynomial_size,
+uint32_t big_lwe_dimension,
+uint32_t small_lwe_dimension, uint32_t ks_level,
+uint32_t ks_base_log, uint32_t pbs_level,
+uint32_t pbs_base_log, uint32_t grouping_factor,
+uint32_t lwe_ciphertext_count,
+uint32_t message_modulus, uint32_t carry_modulus,
+PBS_TYPE pbs_type, bool allocate_gpu_memory,
+PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_cmux_integer_radix_ciphertext_kb_64(
-CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
-CudaRadixCiphertextFFI const *lwe_condition,
-CudaRadixCiphertextFFI const *lwe_array_true,
-CudaRadixCiphertextFFI const *lwe_array_false, int8_t *mem_ptr,
-void *const *bsks, void *const *ksks);
+void cuda_cmux_ciphertext_64(CudaStreamsFFI streams,
+CudaRadixCiphertextFFI *lwe_array_out,
+CudaRadixCiphertextFFI const *lwe_condition,
+CudaRadixCiphertextFFI const *lwe_array_true,
+CudaRadixCiphertextFFI const *lwe_array_false,
+int8_t *mem_ptr, void *const *bsks,
+void *const *ksks);
-void cleanup_cuda_integer_radix_cmux(CudaStreamsFFI streams,
-int8_t **mem_ptr_void);
+void cleanup_cuda_cmux(CudaStreamsFFI streams, int8_t **mem_ptr_void);
-uint64_t scratch_cuda_integer_radix_scalar_rotate_kb_64(
+uint64_t scratch_cuda_scalar_rotate_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -324,14 +334,14 @@ uint64_t scratch_cuda_integer_radix_scalar_rotate_kb_64(
PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_integer_radix_scalar_rotate_kb_64_inplace(
-CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array, uint32_t n,
-int8_t *mem_ptr, void *const *bsks, void *const *ksks);
+void cuda_scalar_rotate_64_inplace(CudaStreamsFFI streams,
+CudaRadixCiphertextFFI *lwe_array,
+uint32_t n, int8_t *mem_ptr,
+void *const *bsks, void *const *ksks);
-void cleanup_cuda_integer_radix_scalar_rotate(CudaStreamsFFI streams,
-int8_t **mem_ptr_void);
+void cleanup_cuda_scalar_rotate(CudaStreamsFFI streams, int8_t **mem_ptr_void);
-uint64_t scratch_cuda_propagate_single_carry_kb_64_inplace(
+uint64_t scratch_cuda_propagate_single_carry_64_inplace(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -340,7 +350,7 @@ uint64_t scratch_cuda_propagate_single_carry_kb_64_inplace(
PBS_TYPE pbs_type, uint32_t requested_flag, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
-uint64_t scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
+uint64_t scratch_cuda_add_and_propagate_single_carry_64_inplace(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -349,13 +359,13 @@ uint64_t scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
PBS_TYPE pbs_type, uint32_t requested_flag, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_propagate_single_carry_kb_64_inplace(
+void cuda_propagate_single_carry_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
CudaRadixCiphertextFFI *carry_out, const CudaRadixCiphertextFFI *carry_in,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
uint32_t requested_flag, uint32_t uses_carry);
-void cuda_add_and_propagate_single_carry_kb_64_inplace(
+void cuda_add_and_propagate_single_carry_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lhs_array,
const CudaRadixCiphertextFFI *rhs_array, CudaRadixCiphertextFFI *carry_out,
const CudaRadixCiphertextFFI *carry_in, int8_t *mem_ptr, void *const *bsks,
@@ -367,7 +377,7 @@ void cleanup_cuda_propagate_single_carry(CudaStreamsFFI streams,
void cleanup_cuda_add_and_propagate_single_carry(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
-uint64_t scratch_cuda_integer_overflowing_sub_kb_64_inplace(
+uint64_t scratch_cuda_integer_overflowing_sub_64_inplace(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -376,7 +386,7 @@ uint64_t scratch_cuda_integer_overflowing_sub_kb_64_inplace(
PBS_TYPE pbs_type, uint32_t compute_overflow, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_integer_overflowing_sub_kb_64_inplace(
+void cuda_integer_overflowing_sub_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lhs_array,
const CudaRadixCiphertextFFI *rhs_array,
CudaRadixCiphertextFFI *overflow_block,
@@ -387,7 +397,7 @@ void cuda_integer_overflowing_sub_kb_64_inplace(
void cleanup_cuda_integer_overflowing_sub(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
-uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
+uint64_t scratch_cuda_partial_sum_ciphertexts_vec_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
@@ -397,15 +407,16 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
bool reduce_degrees_for_single_carry_propagation, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
-CudaStreamsFFI streams, CudaRadixCiphertextFFI *radix_lwe_out,
-CudaRadixCiphertextFFI *radix_lwe_vec, int8_t *mem_ptr, void *const *bsks,
-void *const *ksks);
+void cuda_partial_sum_ciphertexts_vec_64(CudaStreamsFFI streams,
+CudaRadixCiphertextFFI *radix_lwe_out,
+CudaRadixCiphertextFFI *radix_lwe_vec,
+int8_t *mem_ptr, void *const *bsks,
+void *const *ksks);
-void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
-CudaStreamsFFI streams, int8_t **mem_ptr_void);
+void cleanup_cuda_partial_sum_ciphertexts_vec(CudaStreamsFFI streams,
+int8_t **mem_ptr_void);
-uint64_t scratch_cuda_integer_scalar_mul_kb_64(
+uint64_t scratch_cuda_integer_scalar_mul_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
@@ -413,16 +424,15 @@ uint64_t scratch_cuda_integer_scalar_mul_kb_64(
uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t num_scalar_bits,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
+void cuda_scalar_multiplication_ciphertext_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
uint64_t const *decomposed_scalar, uint64_t const *has_at_least_one_set,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
uint32_t polynomial_size, uint32_t message_modulus, uint32_t num_scalars);
-void cleanup_cuda_integer_radix_scalar_mul(CudaStreamsFFI streams,
-int8_t **mem_ptr_void);
+void cleanup_cuda_scalar_mul(CudaStreamsFFI streams, int8_t **mem_ptr_void);
-uint64_t scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
+uint64_t scratch_cuda_integer_div_rem_radix_ciphertext_64(
CudaStreamsFFI streams, bool is_signed, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
@@ -431,7 +441,7 @@ uint64_t scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_integer_div_rem_radix_ciphertext_kb_64(
+void cuda_integer_div_rem_radix_ciphertext_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *quotient,
CudaRadixCiphertextFFI *remainder, CudaRadixCiphertextFFI const *numerator,
CudaRadixCiphertextFFI const *divisor, bool is_signed, int8_t *mem_ptr,
@@ -460,7 +470,7 @@ void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
void cuda_integer_reverse_blocks_64_inplace(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *lwe_array);
-uint64_t scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
+uint64_t scratch_cuda_integer_abs_inplace_radix_ciphertext_64(
CudaStreamsFFI streams, int8_t **mem_ptr, bool is_signed,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
@@ -469,14 +479,14 @@ uint64_t scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_integer_abs_inplace_radix_ciphertext_kb_64(
+void cuda_integer_abs_inplace_radix_ciphertext_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *ct, int8_t *mem_ptr,
bool is_signed, void *const *bsks, void *const *ksks);
void cleanup_cuda_integer_abs_inplace(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
-uint64_t scratch_cuda_integer_are_all_comparisons_block_true_kb_64(
+uint64_t scratch_cuda_integer_are_all_comparisons_block_true_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -485,7 +495,7 @@ uint64_t scratch_cuda_integer_are_all_comparisons_block_true_kb_64(
PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_integer_are_all_comparisons_block_true_kb_64(
+void cuda_integer_are_all_comparisons_block_true_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, int8_t *mem_ptr,
void *const *bsks, void *const *ksks, uint32_t num_radix_blocks);
@@ -493,7 +503,7 @@ void cuda_integer_are_all_comparisons_block_true_kb_64(
void cleanup_cuda_integer_are_all_comparisons_block_true(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
-uint64_t scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
+uint64_t scratch_cuda_integer_is_at_least_one_comparisons_block_true_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -502,7 +512,7 @@ uint64_t scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
+void cuda_integer_is_at_least_one_comparisons_block_true_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, int8_t *mem_ptr,
void *const *bsks, void *const *ksks, uint32_t num_radix_blocks);
@@ -518,7 +528,7 @@ void trim_radix_blocks_lsb_64(CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input,
CudaStreamsFFI streams);
-uint64_t scratch_cuda_apply_noise_squashing_kb(
+uint64_t scratch_cuda_apply_noise_squashing(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_glwe_dimension, uint32_t input_polynomial_size,
@@ -528,15 +538,16 @@ uint64_t scratch_cuda_apply_noise_squashing_kb(
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_apply_noise_squashing_kb(
-CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
-CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
-void *const *ksks, void *const *bsks);
+void cuda_apply_noise_squashing(CudaStreamsFFI streams,
+CudaRadixCiphertextFFI *output_radix_lwe,
+CudaRadixCiphertextFFI const *input_radix_lwe,
+int8_t *mem_ptr, void *const *ksks,
+void *const *bsks);
-void cleanup_cuda_apply_noise_squashing_kb(CudaStreamsFFI streams,
-int8_t **mem_ptr_void);
+void cleanup_cuda_apply_noise_squashing(CudaStreamsFFI streams,
+int8_t **mem_ptr_void);
-uint64_t scratch_cuda_sub_and_propagate_single_carry_kb_64_inplace(
+uint64_t scratch_cuda_sub_and_propagate_single_carry_64_inplace(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -545,7 +556,7 @@ uint64_t scratch_cuda_sub_and_propagate_single_carry_kb_64_inplace(
PBS_TYPE pbs_type, uint32_t requested_flag, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_sub_and_propagate_single_carry_kb_64_inplace(
+void cuda_sub_and_propagate_single_carry_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lhs_array,
const CudaRadixCiphertextFFI *rhs_array, CudaRadixCiphertextFFI *carry_out,
const CudaRadixCiphertextFFI *carry_in, int8_t *mem_ptr, void *const *bsks,
@@ -554,7 +565,7 @@ void cuda_sub_and_propagate_single_carry_kb_64_inplace(
void cleanup_cuda_sub_and_propagate_single_carry(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
-uint64_t scratch_cuda_integer_unsigned_scalar_div_radix_kb_64(
+uint64_t scratch_cuda_integer_unsigned_scalar_div_radix_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
@@ -563,13 +574,13 @@ uint64_t scratch_cuda_integer_unsigned_scalar_div_radix_kb_64(
const CudaScalarDivisorFFI *scalar_divisor_ffi, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_integer_unsigned_scalar_div_radix_kb_64(
+void cuda_integer_unsigned_scalar_div_radix_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *numerator_ct,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
const CudaScalarDivisorFFI *scalar_divisor_ffi);
-void cleanup_cuda_integer_unsigned_scalar_div_radix_kb_64(
-CudaStreamsFFI streams, int8_t **mem_ptr_void);
+void cleanup_cuda_integer_unsigned_scalar_div_radix_64(CudaStreamsFFI streams,
+int8_t **mem_ptr_void);
uint64_t scratch_cuda_extend_radix_with_sign_msb_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
@@ -590,7 +601,7 @@ void cuda_extend_radix_with_sign_msb_64(CudaStreamsFFI streams,
void cleanup_cuda_extend_radix_with_sign_msb_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
-uint64_t scratch_cuda_integer_signed_scalar_div_radix_kb_64(
+uint64_t scratch_cuda_integer_signed_scalar_div_radix_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
@@ -599,15 +610,15 @@ uint64_t scratch_cuda_integer_signed_scalar_div_radix_kb_64(
const CudaScalarDivisorFFI *scalar_divisor_ffi, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_integer_signed_scalar_div_radix_kb_64(
+void cuda_integer_signed_scalar_div_radix_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *numerator_ct,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
const CudaScalarDivisorFFI *scalar_divisor_ffi, uint32_t numerator_bits);
-void cleanup_cuda_integer_signed_scalar_div_radix_kb_64(CudaStreamsFFI streams,
-int8_t **mem_ptr_void);
+void cleanup_cuda_integer_signed_scalar_div_radix_64(CudaStreamsFFI streams,
+int8_t **mem_ptr_void);
-uint64_t scratch_integer_unsigned_scalar_div_rem_radix_kb_64(
+uint64_t scratch_integer_unsigned_scalar_div_rem_radix_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
@@ -617,7 +628,7 @@ uint64_t scratch_integer_unsigned_scalar_div_rem_radix_kb_64(
uint32_t const active_bits_divisor, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
+void cuda_integer_unsigned_scalar_div_rem_radix_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *quotient_ct,
CudaRadixCiphertextFFI *remainder_ct, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
@@ -626,10 +637,10 @@ void cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
void const *clear_blocks, void const *h_clear_blocks,
uint32_t num_clear_blocks);
-void cleanup_cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
+void cleanup_cuda_integer_unsigned_scalar_div_rem_radix_64(
CudaStreamsFFI streams, int8_t **mem_ptr_void);
-uint64_t scratch_integer_signed_scalar_div_rem_radix_kb_64(
+uint64_t scratch_integer_signed_scalar_div_rem_radix_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
@@ -639,7 +650,7 @@ uint64_t scratch_integer_signed_scalar_div_rem_radix_kb_64(
uint32_t const active_bits_divisor, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_integer_signed_scalar_div_rem_radix_kb_64(
+void cuda_integer_signed_scalar_div_rem_radix_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *quotient_ct,
CudaRadixCiphertextFFI *remainder_ct, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
@@ -647,10 +658,10 @@ void cuda_integer_signed_scalar_div_rem_radix_kb_64(
uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
uint32_t numerator_bits);
-void cleanup_cuda_integer_signed_scalar_div_rem_radix_kb_64(
-CudaStreamsFFI streams, int8_t **mem_ptr_void);
+void cleanup_cuda_integer_signed_scalar_div_rem_radix_64(CudaStreamsFFI streams,
+int8_t **mem_ptr_void);
-uint64_t scratch_integer_count_of_consecutive_bits_kb_64(
+uint64_t scratch_integer_count_of_consecutive_bits_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
@@ -659,13 +670,13 @@ uint64_t scratch_integer_count_of_consecutive_bits_kb_64(
Direction direction, BitValue bit_value, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_integer_count_of_consecutive_bits_kb_64(
+void cuda_integer_count_of_consecutive_bits_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_ct,
CudaRadixCiphertextFFI const *input_ct, int8_t *mem_ptr, void *const *bsks,
void *const *ksks);
-void cleanup_cuda_integer_count_of_consecutive_bits_kb_64(
-CudaStreamsFFI streams, int8_t **mem_ptr_void);
+void cleanup_cuda_integer_count_of_consecutive_bits_64(CudaStreamsFFI streams,
+int8_t **mem_ptr_void);
uint64_t scratch_cuda_integer_grouped_oprf_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
@@ -676,16 +687,16 @@ uint64_t scratch_cuda_integer_grouped_oprf_64(
bool allocate_gpu_memory, uint32_t message_bits_per_block,
uint32_t total_random_bits, PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_integer_grouped_oprf_async_64(CudaStreamsFFI streams,
-CudaRadixCiphertextFFI *radix_lwe_out,
-const void *seeded_lwe_input,
-uint32_t num_blocks_to_process,
-int8_t *mem, void *const *bsks);
+void cuda_integer_grouped_oprf_64(CudaStreamsFFI streams,
+CudaRadixCiphertextFFI *radix_lwe_out,
+const void *seeded_lwe_input,
+uint32_t num_blocks_to_process, int8_t *mem,
+void *const *bsks);
void cleanup_cuda_integer_grouped_oprf_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
-uint64_t scratch_integer_ilog2_kb_64(
+uint64_t scratch_integer_ilog2_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
@@ -694,7 +705,7 @@ uint64_t scratch_integer_ilog2_kb_64(
uint32_t num_bits_in_ciphertext, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_integer_ilog2_kb_64(
+void cuda_integer_ilog2_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_ct,
CudaRadixCiphertextFFI const *input_ct,
CudaRadixCiphertextFFI const *trivial_ct_neg_n,
@@ -702,8 +713,8 @@ void cuda_integer_ilog2_kb_64(
CudaRadixCiphertextFFI const *trivial_ct_m_minus_1_block, int8_t *mem_ptr,
void *const *bsks, void *const *ksks);
-void cleanup_cuda_integer_ilog2_kb_64(CudaStreamsFFI streams,
-int8_t **mem_ptr_void);
+void cleanup_cuda_integer_ilog2_64(CudaStreamsFFI streams,
+int8_t **mem_ptr_void);
} // extern C
#endif // CUDA_INTEGER_H
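
Every operation in the header above keeps the same three-call contract after the rename: a scratch_* function computes (and, when allocate_gpu_memory is set, allocates) a workspace behind an int8_t pointer and returns its size in bytes, the compute call consumes that workspace, and cleanup_* releases it and nulls the pointer. A hypothetical miniature of that contract; every name below is a stand-in, not the real FFI:

#include <cstdint>
#include <cstdlib>

// Stand-in for a scratch_* entry point: size the workspace, optionally
// allocate it, and return the byte count so callers can track memory use.
static uint64_t scratch_demo(int8_t **mem_ptr, uint32_t num_blocks,
                             bool allocate_gpu_memory) {
  uint64_t size_tracker = uint64_t(num_blocks) * 1024; // stand-in sizing
  *mem_ptr = allocate_gpu_memory
                 ? static_cast<int8_t *>(std::malloc(size_tracker))
                 : nullptr;
  return size_tracker;
}

// Stand-in for the compute call: it reuses the preallocated workspace.
static void compute_demo(int8_t *mem_ptr) { (void)mem_ptr; }

// Stand-in for a cleanup_* entry point: free and null the workspace.
static void cleanup_demo(int8_t **mem_ptr_void) {
  std::free(*mem_ptr_void);
  *mem_ptr_void = nullptr;
}

int main() {
  int8_t *mem = nullptr;
  scratch_demo(&mem, 8, /*allocate_gpu_memory=*/true);
  compute_demo(mem);
  cleanup_demo(&mem);
  return 0;
}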

View File

@@ -111,9 +111,9 @@ aes_flush_inplace(CudaStreams streams, CudaRadixCiphertextFFI *data,
int_aes_encrypt_buffer<Torus> *mem, void *const *bsks,
Torus *const *ksks) {
-integer_radix_apply_univariate_lookup_table_kb<Torus>(
-streams, data, data, bsks, ksks, mem->luts->flush_lut,
-data->num_radix_blocks);
+integer_radix_apply_univariate_lookup_table<Torus>(streams, data, data, bsks,
+ksks, mem->luts->flush_lut,
+data->num_radix_blocks);
}
/**
@@ -126,8 +126,8 @@ __host__ __forceinline__ void aes_scalar_add_one_flush_inplace(
CudaStreams streams, CudaRadixCiphertextFFI *data,
int_aes_encrypt_buffer<Torus> *mem, void *const *bsks, Torus *const *ksks) {
-host_integer_radix_add_scalar_one_inplace<Torus>(
-streams, data, mem->params.message_modulus, mem->params.carry_modulus);
+host_add_scalar_one_inplace<Torus>(streams, data, mem->params.message_modulus,
+mem->params.carry_modulus);
aes_flush_inplace(streams, data, mem, bsks, ksks);
}
@@ -167,7 +167,7 @@ batch_vec_flush_inplace(CudaStreams streams, CudaRadixCiphertextFFI **targets,
&dest_slice, targets[i]);
}
-integer_radix_apply_univariate_lookup_table_kb<Torus>(
+integer_radix_apply_univariate_lookup_table<Torus>(
streams, &batch_out, &batch_in, bsks, ksks, mem->luts->flush_lut,
batch_out.num_radix_blocks);
@@ -220,7 +220,7 @@ __host__ void batch_vec_and_inplace(CudaStreams streams,
&dest_rhs_slice, rhs[i]);
}
-integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+integer_radix_apply_bivariate_lookup_table<Torus>(
streams, &batch_out, &batch_lhs, &batch_rhs, bsks, ksks,
mem->luts->and_lut, batch_out.num_radix_blocks,
mem->params.message_modulus);
@@ -358,9 +358,9 @@ __host__ void vectorized_sbox_n_bytes(CudaStreams streams,
#define ADD_ONE(target) \
do { \
-host_integer_radix_add_scalar_one_inplace<Torus>( \
-streams, target, mem->params.message_modulus, \
-mem->params.carry_modulus); \
+host_add_scalar_one_inplace<Torus>(streams, target, \
+mem->params.message_modulus, \
+mem->params.carry_modulus); \
} while (0)
// Homomorphic S-Box Circuit Evaluation
@@ -1057,7 +1057,7 @@ __host__ void vectorized_aes_full_adder_inplace(
// The carry_lut applies the function f(x) = (x >> 1) & 1, which
// extracts the carry bit from the previous sum. The result is stored
// in carry_vec for the next iteration (i+1).
-integer_radix_apply_univariate_lookup_table_kb<Torus>(
+integer_radix_apply_univariate_lookup_table<Torus>(
streams, carry_vec, sum_plus_carry_vec, bsks, ksks,
mem->luts->carry_lut, num_aes_inputs);
@@ -1065,7 +1065,7 @@ __host__ void vectorized_aes_full_adder_inplace(
// The flush_lut applies the function f(x) = x & 1, which extracts
// the least significant bit of the sum. The result is written
// directly into the state buffer, updating the IV in-place.
-integer_radix_apply_univariate_lookup_table_kb<Torus>(
+integer_radix_apply_univariate_lookup_table<Torus>(
streams, &a_i_vec, sum_plus_carry_vec, bsks, ksks, mem->luts->flush_lut,
num_aes_inputs);
}
@@ -1221,9 +1221,9 @@ __host__ void host_integer_key_expansion(CudaStreams streams,
CudaRadixCiphertextFFI first_byte_bit_slice;
as_radix_ciphertext_slice<Torus>(&first_byte_bit_slice,
&rotated_word_buffer, bit, bit + 1);
-host_integer_radix_add_scalar_one_inplace<Torus>(
-streams, &first_byte_bit_slice, mem->params.message_modulus,
-mem->params.carry_modulus);
+host_add_scalar_one_inplace<Torus>(streams, &first_byte_bit_slice,
+mem->params.message_modulus,
+mem->params.carry_modulus);
}
}
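
The carry_lut and flush_lut comments in the full-adder hunk above describe an ordinary ripple-carry adder over bit-sized blocks. In the clear, the same computation is (plaintext sketch, not library code):

#include <cassert>
#include <cstdint>

// f(x) = (x >> 1) & 1: extracts the carry bit of a small partial sum.
static uint64_t carry_lut(uint64_t x) { return (x >> 1) & 1; }
// f(x) = x & 1: extracts the least significant bit of the partial sum.
static uint64_t flush_lut(uint64_t x) { return x & 1; }

int main() {
  uint8_t a = 0b1011, b = 0b0110; // 4-bit operands, one bit per block
  uint8_t carry = 0, out = 0;
  for (int i = 0; i < 4; ++i) {
    uint64_t sum_plus_carry = ((a >> i) & 1) + ((b >> i) & 1) + carry;
    carry = carry_lut(sum_plus_carry);     // carry for iteration i + 1
    out |= flush_lut(sum_plus_carry) << i; // result bit, written in place
  }
  assert(out == ((a + b) & 0xF)); // 11 + 6 = 17, truncated to 4 bits = 1
  return 0;
}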

View File

@@ -1,6 +1,6 @@
#include "integer/abs.cuh"
-uint64_t scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
+uint64_t scratch_cuda_integer_abs_inplace_radix_ciphertext_64(
CudaStreamsFFI streams, int8_t **mem_ptr, bool is_signed,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
@@ -14,19 +14,19 @@ uint64_t scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus, noise_reduction_type);
-return scratch_cuda_integer_abs_kb<uint64_t>(
+return scratch_cuda_integer_abs<uint64_t>(
CudaStreams(streams), (int_abs_buffer<uint64_t> **)mem_ptr, is_signed,
num_blocks, params, allocate_gpu_memory);
}
-void cuda_integer_abs_inplace_radix_ciphertext_kb_64(
+void cuda_integer_abs_inplace_radix_ciphertext_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *ct, int8_t *mem_ptr,
bool is_signed, void *const *bsks, void *const *ksks) {
auto mem = (int_abs_buffer<uint64_t> *)mem_ptr;
-host_integer_abs_kb<uint64_t>(CudaStreams(streams), ct, bsks,
-(uint64_t **)(ksks), mem, is_signed);
+host_integer_abs<uint64_t>(CudaStreams(streams), ct, bsks,
+(uint64_t **)(ksks), mem, is_signed);
}
void cleanup_cuda_integer_abs_inplace(CudaStreamsFFI streams,

View File

@@ -10,9 +10,11 @@
#include "radix_ciphertext.cuh"
template <typename Torus>
-__host__ uint64_t scratch_cuda_integer_abs_kb(
-CudaStreams streams, int_abs_buffer<Torus> **mem_ptr, bool is_signed,
-uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory) {
+__host__ uint64_t scratch_cuda_integer_abs(CudaStreams streams,
+int_abs_buffer<Torus> **mem_ptr,
+bool is_signed, uint32_t num_blocks,
+int_radix_params params,
+bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
if (is_signed) {
@@ -23,10 +25,10 @@ __host__ uint64_t scratch_cuda_integer_abs_kb(
}
template <typename Torus>
-__host__ void
-host_integer_abs_kb(CudaStreams streams, CudaRadixCiphertextFFI *ct,
-void *const *bsks, uint64_t *const *ksks,
-int_abs_buffer<uint64_t> *mem_ptr, bool is_signed) {
+__host__ void host_integer_abs(CudaStreams streams, CudaRadixCiphertextFFI *ct,
+void *const *bsks, uint64_t *const *ksks,
+int_abs_buffer<uint64_t> *mem_ptr,
+bool is_signed) {
if (!is_signed)
return;
@@ -39,7 +41,7 @@ host_integer_abs_kb(CudaStreams streams, CudaRadixCiphertextFFI *ct,
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
mask, ct);
-host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
+host_arithmetic_scalar_shift_inplace<Torus>(
streams, mask, num_bits_in_ciphertext - 1,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks);
host_addition<Torus>(streams.stream(0), streams.gpu_index(0), ct, mask, ct,
@@ -52,8 +54,7 @@ host_integer_abs_kb(CudaStreams streams, CudaRadixCiphertextFFI *ct,
mem_ptr->scp_mem, bsks, ksks,
requested_flag, uses_carry);
-host_integer_radix_bitop_kb<Torus>(streams, ct, mask, ct, mem_ptr->bitxor_mem,
-bsks, ksks);
+host_bitop<Torus>(streams, ct, mask, ct, mem_ptr->bitxor_mem, bsks, ksks);
}
#endif // TFHE_RS_ABS_CUH
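
host_integer_abs above is the homomorphic form of the classic branchless absolute value: an arithmetic shift spreads the sign bit into a mask, the mask is added, carries are propagated, and the mask is XORed back in. The plaintext identity it relies on (sketch; right-shifting a negative value is arithmetic on all mainstream compilers, and only guaranteed by the standard from C++20 on):

#include <cassert>
#include <cstdint>

static int64_t abs_branchless(int64_t x) {
  int64_t mask = x >> 63;   // all ones if x < 0, else zero
  return (x + mask) ^ mask; // x - 1 then bitwise NOT when x < 0
}

int main() {
  assert(abs_branchless(-42) == 42);
  assert(abs_branchless(7) == 7);
  assert(abs_branchless(0) == 0);
  return 0;
}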

View File

@@ -1,6 +1,6 @@
#include "integer/bitwise_ops.cuh"
-uint64_t scratch_cuda_integer_radix_bitop_kb_64(
+uint64_t scratch_cuda_bitop_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -14,20 +14,21 @@ uint64_t scratch_cuda_integer_radix_bitop_kb_64(
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus, noise_reduction_type);
-return scratch_cuda_integer_radix_bitop_kb<uint64_t>(
+return scratch_cuda_bitop<uint64_t>(
CudaStreams(streams), (int_bitop_buffer<uint64_t> **)mem_ptr,
lwe_ciphertext_count, params, op_type, allocate_gpu_memory);
}
-void cuda_bitop_integer_radix_ciphertext_kb_64(
-CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
-CudaRadixCiphertextFFI const *lwe_array_1,
-CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
-void *const *bsks, void *const *ksks) {
+void cuda_bitop_ciphertext_64(CudaStreamsFFI streams,
+CudaRadixCiphertextFFI *lwe_array_out,
+CudaRadixCiphertextFFI const *lwe_array_1,
+CudaRadixCiphertextFFI const *lwe_array_2,
+int8_t *mem_ptr, void *const *bsks,
+void *const *ksks) {
-host_integer_radix_bitop_kb<uint64_t>(
-CudaStreams(streams), lwe_array_out, lwe_array_1, lwe_array_2,
-(int_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks));
+host_bitop<uint64_t>(CudaStreams(streams), lwe_array_out, lwe_array_1,
+lwe_array_2, (int_bitop_buffer<uint64_t> *)mem_ptr, bsks,
+(uint64_t **)(ksks));
}
void cleanup_cuda_integer_bitop(CudaStreamsFFI streams, int8_t **mem_ptr_void) {

View File

@@ -10,11 +10,12 @@
#include "pbs/programmable_bootstrap_multibit.cuh"
template <typename Torus>
-__host__ void host_integer_radix_bitop_kb(
-CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
-CudaRadixCiphertextFFI const *lwe_array_1,
-CudaRadixCiphertextFFI const *lwe_array_2, int_bitop_buffer<Torus> *mem_ptr,
-void *const *bsks, Torus *const *ksks) {
+__host__ void host_bitop(CudaStreams streams,
+CudaRadixCiphertextFFI *lwe_array_out,
+CudaRadixCiphertextFFI const *lwe_array_1,
+CudaRadixCiphertextFFI const *lwe_array_2,
+int_bitop_buffer<Torus> *mem_ptr, void *const *bsks,
+Torus *const *ksks) {
PANIC_IF_FALSE(
lwe_array_out->num_radix_blocks == lwe_array_1->num_radix_blocks &&
@@ -41,7 +42,7 @@ __host__ void host_integer_radix_bitop_kb(
lwe_array_1->num_radix_blocks);
}
-integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+integer_radix_apply_bivariate_lookup_table<Torus>(
streams, lwe_array_out, lwe_array_1, lwe_array_2, bsks, ksks, lut,
lwe_array_out->num_radix_blocks, lut->params.message_modulus);
@@ -50,10 +51,11 @@ __host__ void host_integer_radix_bitop_kb(
}
template <typename Torus>
-__host__ uint64_t scratch_cuda_integer_radix_bitop_kb(
-CudaStreams streams, int_bitop_buffer<Torus> **mem_ptr,
-uint32_t num_radix_blocks, int_radix_params params, BITOP_TYPE op,
-bool allocate_gpu_memory) {
+__host__ uint64_t scratch_cuda_bitop(CudaStreams streams,
+int_bitop_buffer<Torus> **mem_ptr,
+uint32_t num_radix_blocks,
+int_radix_params params, BITOP_TYPE op,
+bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_bitop_buffer<Torus>(streams, op, params, num_radix_blocks,
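
host_bitop above evaluates a bivariate lookup table: the two operand blocks are packed into a single block as lhs * message_modulus + rhs (the message_modulus passed on the last line of the apply call is the packing shift), and one table lookup over the packed value yields the per-block result. A plaintext sketch of that packing, assuming 2-bit message blocks:

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  const uint64_t message_modulus = 4; // 2-bit message blocks
  // Build the AND table over packed block pairs, mirroring a bitop LUT.
  std::vector<uint64_t> and_lut(message_modulus * message_modulus);
  for (uint64_t lhs = 0; lhs < message_modulus; ++lhs)
    for (uint64_t rhs = 0; rhs < message_modulus; ++rhs)
      and_lut[lhs * message_modulus + rhs] = lhs & rhs;

  uint64_t lhs = 0b10, rhs = 0b11;
  assert(and_lut[lhs * message_modulus + rhs] == (lhs & rhs));
  return 0;
}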

View File

@@ -78,8 +78,8 @@ __host__ void host_extend_radix_with_sign_msb(
streams.stream(0), streams.gpu_index(0), mem_ptr->last_block, 0, 1, input,
input_blocks - 1, input_blocks);
-host_apply_univariate_lut_kb(streams, mem_ptr->padding_block,
-mem_ptr->last_block, mem_ptr->lut, ksks, bsks);
+host_apply_univariate_lut(streams, mem_ptr->padding_block,
+mem_ptr->last_block, mem_ptr->lut, ksks, bsks);
for (uint32_t i = 0; i < num_additional_blocks; ++i) {
uint32_t dst_block_idx = input_blocks + i;
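
host_extend_radix_with_sign_msb above derives one padding block from the most significant block through a univariate LUT, and the loop then replicates it num_additional_blocks times. In the clear, the block-wise sign extension looks like this (sketch, assuming 2-bit blocks):

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  const uint64_t message_modulus = 4;       // 2-bit blocks
  std::vector<uint64_t> blocks = {2, 1, 3}; // little-endian; last block holds the sign
  // Padding LUT: an all-ones block when the sign bit of the last block is set.
  uint64_t sign_bit = blocks.back() >> 1;
  uint64_t padding_block = sign_bit ? message_modulus - 1 : 0;
  const uint32_t num_additional_blocks = 2;
  for (uint32_t i = 0; i < num_additional_blocks; ++i)
    blocks.push_back(padding_block); // the copy loop in the hunk above
  assert(blocks.size() == 5 && blocks[3] == 3 && blocks[4] == 3);
  return 0;
}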

View File

@@ -1,13 +1,15 @@
#include "integer/cmux.cuh"
-uint64_t scratch_cuda_integer_radix_cmux_kb_64(
-CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-uint32_t polynomial_size, uint32_t big_lwe_dimension,
-uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
-PBS_MS_REDUCTION_T noise_reduction_type) {
+uint64_t scratch_cuda_cmux_64(CudaStreamsFFI streams, int8_t **mem_ptr,
+uint32_t glwe_dimension, uint32_t polynomial_size,
+uint32_t big_lwe_dimension,
+uint32_t small_lwe_dimension, uint32_t ks_level,
+uint32_t ks_base_log, uint32_t pbs_level,
+uint32_t pbs_base_log, uint32_t grouping_factor,
+uint32_t lwe_ciphertext_count,
+uint32_t message_modulus, uint32_t carry_modulus,
+PBS_TYPE pbs_type, bool allocate_gpu_memory,
+PBS_MS_REDUCTION_T noise_reduction_type) {
PUSH_RANGE("scratch cmux")
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -17,29 +19,29 @@ uint64_t scratch_cuda_integer_radix_cmux_kb_64(
std::function<uint64_t(uint64_t)> predicate_lut_f =
[](uint64_t x) -> uint64_t { return x == 1; };
-uint64_t ret = scratch_cuda_integer_radix_cmux_kb<uint64_t>(
+uint64_t ret = scratch_cuda_cmux<uint64_t>(
CudaStreams(streams), (int_cmux_buffer<uint64_t> **)mem_ptr,
predicate_lut_f, lwe_ciphertext_count, params, allocate_gpu_memory);
POP_RANGE()
return ret;
}
-void cuda_cmux_integer_radix_ciphertext_kb_64(
-CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
-CudaRadixCiphertextFFI const *lwe_condition,
-CudaRadixCiphertextFFI const *lwe_array_true,
-CudaRadixCiphertextFFI const *lwe_array_false, int8_t *mem_ptr,
-void *const *bsks, void *const *ksks) {
+void cuda_cmux_ciphertext_64(CudaStreamsFFI streams,
+CudaRadixCiphertextFFI *lwe_array_out,
+CudaRadixCiphertextFFI const *lwe_condition,
+CudaRadixCiphertextFFI const *lwe_array_true,
+CudaRadixCiphertextFFI const *lwe_array_false,
+int8_t *mem_ptr, void *const *bsks,
+void *const *ksks) {
PUSH_RANGE("cmux")
-host_integer_radix_cmux_kb<uint64_t>(
-CudaStreams(streams), lwe_array_out, lwe_condition, lwe_array_true,
-lwe_array_false, (int_cmux_buffer<uint64_t> *)mem_ptr, bsks,
-(uint64_t **)(ksks));
+host_cmux<uint64_t>(CudaStreams(streams), lwe_array_out, lwe_condition,
+lwe_array_true, lwe_array_false,
+(int_cmux_buffer<uint64_t> *)mem_ptr, bsks,
+(uint64_t **)(ksks));
POP_RANGE()
}
-void cleanup_cuda_integer_radix_cmux(CudaStreamsFFI streams,
-int8_t **mem_ptr_void) {
+void cleanup_cuda_cmux(CudaStreamsFFI streams, int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup cmux")
int_cmux_buffer<uint64_t> *mem_ptr =
(int_cmux_buffer<uint64_t> *)(*mem_ptr_void);

View File

@@ -28,7 +28,7 @@ __host__ void zero_out_if(CudaStreams streams,
cuda_set_device(streams.gpu_index(0));
auto params = mem_ptr->params;
-// We can't use integer_radix_apply_bivariate_lookup_table_kb since the
+// We can't use integer_radix_apply_bivariate_lookup_table since the
// second operand is not an array
auto tmp_lwe_array_input = mem_ptr->tmp;
host_pack_bivariate_blocks_with_single_block<Torus>(
@@ -36,18 +36,19 @@ __host__ void zero_out_if(CudaStreams streams,
lwe_condition, predicate->lwe_indexes_in, params.message_modulus,
num_radix_blocks);
-integer_radix_apply_univariate_lookup_table_kb<Torus>(
+integer_radix_apply_univariate_lookup_table<Torus>(
streams, lwe_array_out, tmp_lwe_array_input, bsks, ksks, predicate,
num_radix_blocks);
}
template <typename Torus>
-__host__ void host_integer_radix_cmux_kb(
-CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
-CudaRadixCiphertextFFI const *lwe_condition,
-CudaRadixCiphertextFFI const *lwe_array_true,
-CudaRadixCiphertextFFI const *lwe_array_false,
-int_cmux_buffer<Torus> *mem_ptr, void *const *bsks, Torus *const *ksks) {
+__host__ void host_cmux(CudaStreams streams,
+CudaRadixCiphertextFFI *lwe_array_out,
+CudaRadixCiphertextFFI const *lwe_condition,
+CudaRadixCiphertextFFI const *lwe_array_true,
+CudaRadixCiphertextFFI const *lwe_array_false,
+int_cmux_buffer<Torus> *mem_ptr, void *const *bsks,
+Torus *const *ksks) {
if (lwe_array_out->num_radix_blocks != lwe_array_true->num_radix_blocks)
PANIC("Cuda error: input and output num radix blocks must be the same")
@@ -69,7 +70,7 @@ __host__ void host_integer_radix_cmux_kb(
streams.stream(0), streams.gpu_index(0), mem_ptr->condition_array, i,
i + 1, lwe_condition, 0, 1);
}
-integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+integer_radix_apply_bivariate_lookup_table<Torus>(
streams, mem_ptr->buffer_out, mem_ptr->buffer_in,
mem_ptr->condition_array, bsks, ksks, mem_ptr->predicate_lut,
2 * num_radix_blocks, params.message_modulus);
@@ -88,16 +89,18 @@ __host__ void host_integer_radix_cmux_kb(
&mem_true, &mem_false, num_radix_blocks,
params.message_modulus, params.carry_modulus);
-integer_radix_apply_univariate_lookup_table_kb<Torus>(
+integer_radix_apply_univariate_lookup_table<Torus>(
streams, lwe_array_out, &mem_true, bsks, ksks,
mem_ptr->message_extract_lut, num_radix_blocks);
}
template <typename Torus>
-__host__ uint64_t scratch_cuda_integer_radix_cmux_kb(
-CudaStreams streams, int_cmux_buffer<Torus> **mem_ptr,
-std::function<Torus(Torus)> predicate_lut_f, uint32_t num_radix_blocks,
-int_radix_params params, bool allocate_gpu_memory) {
+__host__ uint64_t scratch_cuda_cmux(CudaStreams streams,
+int_cmux_buffer<Torus> **mem_ptr,
+std::function<Torus(Torus)> predicate_lut_f,
+uint32_t num_radix_blocks,
+int_radix_params params,
+bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_cmux_buffer<Torus>(streams, predicate_lut_f, params,
num_radix_blocks, allocate_gpu_memory,
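
host_cmux above has the classic selector shape: the condition block, normalized to 0 or 1 by predicate_lut_f (which maps x to x == 1), zeroes out the branch that is not taken, the two branches are combined, and a final message-extract LUT cleans the result. Plaintext shape of the selection (sketch, not library code):

#include <cassert>
#include <cstdint>

// Mirrors zero_out_if above: keep the block only when the predicate holds.
static uint64_t zero_out_if_clear(uint64_t block, bool keep) {
  return keep ? block : 0;
}

static uint64_t cmux_clear(bool condition, uint64_t ct_true, uint64_t ct_false) {
  return zero_out_if_clear(ct_true, condition) +
         zero_out_if_clear(ct_false, !condition);
}

int main() {
  assert(cmux_clear(true, 5, 9) == 5);
  assert(cmux_clear(false, 5, 9) == 9);
  return 0;
}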

View File

@@ -1,6 +1,6 @@
#include "integer/comparison.cuh"
-uint64_t scratch_cuda_integer_radix_comparison_kb_64(
+uint64_t scratch_cuda_comparison_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -18,7 +18,7 @@ uint64_t scratch_cuda_integer_radix_comparison_kb_64(
switch (op_type) {
case EQ:
case NE:
-size_tracker += scratch_cuda_integer_radix_comparison_check_kb<uint64_t>(
+size_tracker += scratch_cuda_comparison_check<uint64_t>(
CudaStreams(streams), (int_comparison_buffer<uint64_t> **)mem_ptr,
num_radix_blocks, params, op_type, false, allocate_gpu_memory);
break;
@@ -28,7 +28,7 @@ uint64_t scratch_cuda_integer_radix_comparison_kb_64(
case LE:
case MAX:
case MIN:
-size_tracker += scratch_cuda_integer_radix_comparison_check_kb<uint64_t>(
+size_tracker += scratch_cuda_comparison_check<uint64_t>(
CudaStreams(streams), (int_comparison_buffer<uint64_t> **)mem_ptr,
num_radix_blocks, params, op_type, is_signed, allocate_gpu_memory);
break;
@@ -37,11 +37,12 @@ uint64_t scratch_cuda_integer_radix_comparison_kb_64(
return size_tracker;
}
-void cuda_comparison_integer_radix_ciphertext_kb_64(
-CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
-CudaRadixCiphertextFFI const *lwe_array_1,
-CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
-void *const *bsks, void *const *ksks) {
+void cuda_comparison_ciphertext_64(CudaStreamsFFI streams,
+CudaRadixCiphertextFFI *lwe_array_out,
+CudaRadixCiphertextFFI const *lwe_array_1,
+CudaRadixCiphertextFFI const *lwe_array_2,
+int8_t *mem_ptr, void *const *bsks,
+void *const *ksks) {
PUSH_RANGE("comparison")
if (lwe_array_1->num_radix_blocks != lwe_array_2->num_radix_blocks)
PANIC("Cuda error: input num radix blocks must be the same")
@@ -54,9 +55,9 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
switch (buffer->op) {
case EQ:
case NE:
-host_integer_radix_equality_check_kb<uint64_t>(
-CudaStreams(streams), lwe_array_out, lwe_array_1, lwe_array_2, buffer,
-bsks, (uint64_t **)(ksks), num_radix_blocks);
+host_equality_check<uint64_t>(CudaStreams(streams), lwe_array_out,
+lwe_array_1, lwe_array_2, buffer, bsks,
+(uint64_t **)(ksks), num_radix_blocks);
break;
case GT:
case GE:
@@ -65,18 +66,18 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
if (num_radix_blocks % 2 != 0)
PANIC("Cuda error (comparisons): the number of radix blocks has to be "
"even.")
-host_integer_radix_difference_check_kb<uint64_t>(
-CudaStreams(streams), lwe_array_out, lwe_array_1, lwe_array_2, buffer,
-buffer->diff_buffer->operator_f, bsks, (uint64_t **)(ksks),
-num_radix_blocks);
+host_difference_check<uint64_t>(CudaStreams(streams), lwe_array_out,
+lwe_array_1, lwe_array_2, buffer,
+buffer->diff_buffer->operator_f, bsks,
+(uint64_t **)(ksks), num_radix_blocks);
break;
case MAX:
case MIN:
if (num_radix_blocks % 2 != 0)
PANIC("Cuda error (max/min): the number of radix blocks has to be even.")
-host_integer_radix_maxmin_kb<uint64_t>(
-CudaStreams(streams), lwe_array_out, lwe_array_1, lwe_array_2, buffer,
-bsks, (uint64_t **)(ksks), num_radix_blocks);
+host_maxmin<uint64_t>(CudaStreams(streams), lwe_array_out, lwe_array_1,
+lwe_array_2, buffer, bsks, (uint64_t **)(ksks),
+num_radix_blocks);
break;
default:
PANIC("Cuda error: integer operation not supported")
@@ -95,7 +96,7 @@ void cleanup_cuda_integer_comparison(CudaStreamsFFI streams,
POP_RANGE()
}
-uint64_t scratch_cuda_integer_are_all_comparisons_block_true_kb_64(
+uint64_t scratch_cuda_integer_are_all_comparisons_block_true_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -109,12 +110,12 @@ uint64_t scratch_cuda_integer_are_all_comparisons_block_true_kb_64(
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus, noise_reduction_type);
-return scratch_cuda_integer_radix_comparison_check_kb<uint64_t>(
+return scratch_cuda_comparison_check<uint64_t>(
CudaStreams(streams), (int_comparison_buffer<uint64_t> **)mem_ptr,
num_radix_blocks, params, EQ, false, allocate_gpu_memory);
}
-void cuda_integer_are_all_comparisons_block_true_kb_64(
+void cuda_integer_are_all_comparisons_block_true_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, int8_t *mem_ptr,
void *const *bsks, void *const *ksks, uint32_t num_radix_blocks) {
@@ -122,7 +123,7 @@ void cuda_integer_are_all_comparisons_block_true_kb_64(
int_comparison_buffer<uint64_t> *buffer =
(int_comparison_buffer<uint64_t> *)mem_ptr;
-host_integer_are_all_comparisons_block_true_kb<uint64_t>(
+host_integer_are_all_comparisons_block_true<uint64_t>(
CudaStreams(streams), lwe_array_out, lwe_array_in, buffer, bsks,
(uint64_t **)(ksks), num_radix_blocks);
}
@@ -137,7 +138,7 @@ void cleanup_cuda_integer_are_all_comparisons_block_true(
*mem_ptr_void = nullptr;
}
-uint64_t scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
+uint64_t scratch_cuda_integer_is_at_least_one_comparisons_block_true_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -151,12 +152,12 @@ uint64_t scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus, noise_reduction_type);
-return scratch_cuda_integer_radix_comparison_check_kb<uint64_t>(
+return scratch_cuda_comparison_check<uint64_t>(
CudaStreams(streams), (int_comparison_buffer<uint64_t> **)mem_ptr,
num_radix_blocks, params, EQ, false, allocate_gpu_memory);
}
-void cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
+void cuda_integer_is_at_least_one_comparisons_block_true_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, int8_t *mem_ptr,
void *const *bsks, void *const *ksks, uint32_t num_radix_blocks) {
@@ -164,7 +165,7 @@ void cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
int_comparison_buffer<uint64_t> *buffer =
(int_comparison_buffer<uint64_t> *)mem_ptr;
-host_integer_is_at_least_one_comparisons_block_true_kb<uint64_t>(
+host_integer_is_at_least_one_comparisons_block_true<uint64_t>(
CudaStreams(streams), lwe_array_out, lwe_array_in, buffer, bsks,
(uint64_t **)(ksks), num_radix_blocks);
}

View File

@@ -155,7 +155,7 @@ __host__ void are_all_comparisons_block_true(
// Applies the LUT
if (remaining_blocks == 1) {
// In the last iteration we copy the output to the final address
-integer_radix_apply_univariate_lookup_table_kb<Torus>(
+integer_radix_apply_univariate_lookup_table<Torus>(
streams, lwe_array_out, accumulator, bsks, ksks, lut, 1);
// Reset max_value_lut_indexes before returning, otherwise if the lut is
// reused the lut indexes will be wrong
@@ -172,7 +172,7 @@ __host__ void are_all_comparisons_block_true(
reset_radix_ciphertext_blocks(lwe_array_out, 1);
return;
} else {
-integer_radix_apply_univariate_lookup_table_kb<Torus>(
+integer_radix_apply_univariate_lookup_table<Torus>(
streams, tmp_out, accumulator, bsks, ksks, lut, num_chunks);
}
}
@@ -241,12 +241,12 @@ __host__ void is_at_least_one_comparisons_block_true(
// Applies the LUT
if (remaining_blocks == 1) {
// In the last iteration we copy the output to the final address
-integer_radix_apply_univariate_lookup_table_kb<Torus>(
+integer_radix_apply_univariate_lookup_table<Torus>(
streams, lwe_array_out, buffer->tmp_block_accumulated, bsks, ksks,
lut, 1);
return;
} else {
-integer_radix_apply_univariate_lookup_table_kb<Torus>(
+integer_radix_apply_univariate_lookup_table<Torus>(
streams, mem_ptr->tmp_lwe_array_out, buffer->tmp_block_accumulated,
bsks, ksks, lut, num_chunks);
}
@@ -314,19 +314,19 @@ __host__ void host_compare_blocks_with_zero(
}
}
-integer_radix_apply_univariate_lookup_table_kb<Torus>(
+integer_radix_apply_univariate_lookup_table<Torus>(
streams, lwe_array_out, sum, bsks, ksks, zero_comparison, num_sum_blocks);
reset_radix_ciphertext_blocks(lwe_array_out, num_sum_blocks);
}
template <typename Torus>
-__host__ void host_integer_radix_equality_check_kb(
-CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
-CudaRadixCiphertextFFI const *lwe_array_1,
-CudaRadixCiphertextFFI const *lwe_array_2,
-int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
-Torus *const *ksks, uint32_t num_radix_blocks) {
+__host__ void
+host_equality_check(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
+CudaRadixCiphertextFFI const *lwe_array_1,
+CudaRadixCiphertextFFI const *lwe_array_2,
+int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
+Torus *const *ksks, uint32_t num_radix_blocks) {
if (lwe_array_out->lwe_dimension != lwe_array_1->lwe_dimension ||
lwe_array_out->lwe_dimension != lwe_array_2->lwe_dimension)
@@ -335,7 +335,7 @@ __host__ void host_integer_radix_equality_check_kb(
// Applies the LUT for the comparison operation
auto comparisons = mem_ptr->tmp_block_comparisons;
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
integer_radix_apply_bivariate_lookup_table<Torus>(
streams, comparisons, lwe_array_1, lwe_array_2, bsks, ksks,
eq_buffer->operator_lut, num_radix_blocks,
eq_buffer->operator_lut->params.message_modulus);
@@ -349,12 +349,12 @@ __host__ void host_integer_radix_equality_check_kb(
}
template <typename Torus>
__host__ void compare_radix_blocks_kb(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_left,
CudaRadixCiphertextFFI const *lwe_array_right,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks, uint32_t num_radix_blocks) {
__host__ void
compare_radix_blocks(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_left,
CudaRadixCiphertextFFI const *lwe_array_right,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks, uint32_t num_radix_blocks) {
if (lwe_array_out->lwe_dimension != lwe_array_left->lwe_dimension ||
lwe_array_out->lwe_dimension != lwe_array_right->lwe_dimension)
@@ -386,15 +386,15 @@ __host__ void compare_radix_blocks_kb(
// Apply LUT to compare to 0
auto is_non_zero_lut = mem_ptr->eq_buffer->is_non_zero_lut;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, lwe_array_out, lwe_array_out, bsks, ksks, is_non_zero_lut,
num_radix_blocks);
// Add one
// Here Lhs can have the following values: (-1) % (message modulus * carry
// modulus), 0, or 1, so the output values after the addition will be 0, 1, or 2
host_integer_radix_add_scalar_one_inplace<Torus>(
streams, lwe_array_out, message_modulus, carry_modulus);
host_add_scalar_one_inplace<Torus>(streams, lwe_array_out, message_modulus,
carry_modulus);
}
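
The wrap-around arithmetic from the comment above, checked in plaintext: a block difference of -1, 0, or 1, reduced modulo the plaintext space and incremented by one, yields the 0/1/2 ordering encoding. A sketch assuming message_modulus * carry_modulus = 16:

#include <cstdint>
#include <cstdio>
#include <initializer_list>

int main() {
  const int64_t modulus = 16; // assumption: message_modulus * carry_modulus
  for (int64_t diff : {-1, 0, 1}) {
    int64_t lhs = ((diff % modulus) + modulus) % modulus; // (-1) wraps to 15
    int64_t encoded = (lhs + 1) % modulus;                // the "add one"
    std::printf("diff=%2lld -> encoded=%lld\n", (long long)diff,
                (long long)encoded); // prints 0, 1, 2
  }
}
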
// Reduces a vec containing shortint blocks that encrypts a sign
@@ -439,7 +439,7 @@ tree_sign_reduction(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
pack_blocks<Torus>(streams.stream(0), streams.gpu_index(0), y, x,
partial_block_count, message_modulus);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, x, y, bsks, ksks, inner_tree_leaf, partial_block_count >> 1);
if ((partial_block_count % 2) != 0) {
@@ -485,12 +485,12 @@ tree_sign_reduction(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
last_lut->broadcast_lut(active_streams);
// Last leaf
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, lwe_array_out, y, bsks, ksks, last_lut, 1);
integer_radix_apply_univariate_lookup_table<Torus>(streams, lwe_array_out, y,
bsks, ksks, last_lut, 1);
}
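
A plaintext sketch of the tree reduction itself, under the assumptions that message_modulus is 4 and that a sign block encodes 0 (LT), 1 (EQ), or 2 (GT): adjacent blocks are packed as hi * message_modulus + lo so a single LUT can fold each pair, halving the count per level, with an odd leftover carried up as in the branch above.

#include <cstdint>
#include <vector>

// assumption: the more significant sign wins unless it is EQ
static uint64_t fold_pair(uint64_t packed) {
  uint64_t hi = packed / 4, lo = packed % 4;
  return hi == 1 ? lo : hi;
}

int main() {
  std::vector<uint64_t> signs = {1, 2, 1, 1, 0}; // lsb-first per-block signs
  while (signs.size() > 1) {
    std::vector<uint64_t> next;
    for (size_t i = 0; i + 1 < signs.size(); i += 2)
      next.push_back(fold_pair(signs[i + 1] * 4 + signs[i]));
    if (signs.size() % 2 != 0)
      next.push_back(signs.back()); // odd leftover, kept for the next level
    signs.swap(next);
  }
  return (int)signs[0]; // 0, 1, or 2
}
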
template <typename Torus>
__host__ void host_integer_radix_difference_check_kb(
__host__ void host_difference_check(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_left,
CudaRadixCiphertextFFI const *lwe_array_right,
@@ -534,7 +534,7 @@ __host__ void host_integer_radix_difference_check_kb(
// Clean noise
auto identity_lut = mem_ptr->identity_lut;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, diff_buffer->tmp_packed, diff_buffer->tmp_packed, bsks, ksks,
identity_lut, 2 * packed_num_radix_blocks);
} else {
@@ -553,15 +553,15 @@ __host__ void host_integer_radix_difference_check_kb(
if (!mem_ptr->is_signed) {
// Compare packed blocks, or simply the total number of radix blocks in the
// inputs
compare_radix_blocks_kb<Torus>(streams, comparisons, &lhs, &rhs, mem_ptr,
bsks, ksks, packed_num_radix_blocks);
compare_radix_blocks<Torus>(streams, comparisons, &lhs, &rhs, mem_ptr, bsks,
ksks, packed_num_radix_blocks);
num_comparisons = packed_num_radix_blocks;
} else {
// Packing is possible
if (carry_modulus >= message_modulus) {
// Compare (num_radix_blocks - 2) / 2 packed blocks
compare_radix_blocks_kb<Torus>(streams, comparisons, &lhs, &rhs, mem_ptr,
bsks, ksks, packed_num_radix_blocks);
compare_radix_blocks<Torus>(streams, comparisons, &lhs, &rhs, mem_ptr,
bsks, ksks, packed_num_radix_blocks);
// Compare the last block before the sign block separately
auto identity_lut = mem_ptr->identity_lut;
@@ -573,7 +573,7 @@ __host__ void host_integer_radix_difference_check_kb(
as_radix_ciphertext_slice<Torus>(&shifted_lwe_array_left, lwe_array_left,
num_radix_blocks - 2,
num_radix_blocks - 1);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, &last_left_block_before_sign_block, &shifted_lwe_array_left,
bsks, ksks, identity_lut, 1);
@@ -586,7 +586,7 @@ __host__ void host_integer_radix_difference_check_kb(
as_radix_ciphertext_slice<Torus>(&shifted_lwe_array_right,
lwe_array_right, num_radix_blocks - 2,
num_radix_blocks - 1);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, &last_right_block_before_sign_block,
&shifted_lwe_array_right, bsks, ksks, identity_lut, 1);
@@ -594,7 +594,7 @@ __host__ void host_integer_radix_difference_check_kb(
as_radix_ciphertext_slice<Torus>(&shifted_comparisons, comparisons,
packed_num_radix_blocks,
packed_num_radix_blocks + 1);
compare_radix_blocks_kb<Torus>(
compare_radix_blocks<Torus>(
streams, &shifted_comparisons, &last_left_block_before_sign_block,
&last_right_block_before_sign_block, mem_ptr, bsks, ksks, 1);
@@ -608,16 +608,16 @@ __host__ void host_integer_radix_difference_check_kb(
CudaRadixCiphertextFFI last_right_block;
as_radix_ciphertext_slice<Torus>(&last_right_block, lwe_array_right,
num_radix_blocks - 1, num_radix_blocks);
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
integer_radix_apply_bivariate_lookup_table<Torus>(
streams, &shifted_comparisons, &last_left_block, &last_right_block,
bsks, ksks, mem_ptr->signed_lut, 1,
mem_ptr->signed_lut->params.message_modulus);
num_comparisons = packed_num_radix_blocks + 2;
} else {
compare_radix_blocks_kb<Torus>(streams, comparisons, lwe_array_left,
lwe_array_right, mem_ptr, bsks, ksks,
num_radix_blocks - 1);
compare_radix_blocks<Torus>(streams, comparisons, lwe_array_left,
lwe_array_right, mem_ptr, bsks, ksks,
num_radix_blocks - 1);
// Compare the sign block separately
CudaRadixCiphertextFFI shifted_comparisons;
as_radix_ciphertext_slice<Torus>(&shifted_comparisons, comparisons,
@@ -628,7 +628,7 @@ __host__ void host_integer_radix_difference_check_kb(
CudaRadixCiphertextFFI last_right_block;
as_radix_ciphertext_slice<Torus>(&last_right_block, lwe_array_right,
num_radix_blocks - 1, num_radix_blocks);
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
integer_radix_apply_bivariate_lookup_table<Torus>(
streams, &shifted_comparisons, &last_left_block, &last_right_block,
bsks, ksks, mem_ptr->signed_lut, 1,
mem_ptr->signed_lut->params.message_modulus);
@@ -645,7 +645,7 @@ __host__ void host_integer_radix_difference_check_kb(
}
template <typename Torus>
__host__ uint64_t scratch_cuda_integer_radix_comparison_check_kb(
__host__ uint64_t scratch_cuda_comparison_check(
CudaStreams streams, int_comparison_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params, COMPARISON_TYPE op,
bool is_signed, bool allocate_gpu_memory) {
@@ -658,12 +658,12 @@ __host__ uint64_t scratch_cuda_integer_radix_comparison_check_kb(
}
template <typename Torus>
__host__ void host_integer_radix_maxmin_kb(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_left,
CudaRadixCiphertextFFI const *lwe_array_right,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks, uint32_t num_radix_blocks) {
__host__ void
host_maxmin(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_left,
CudaRadixCiphertextFFI const *lwe_array_right,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks, uint32_t num_radix_blocks) {
if (lwe_array_out->lwe_dimension != lwe_array_left->lwe_dimension ||
lwe_array_out->lwe_dimension != lwe_array_right->lwe_dimension)
@@ -675,18 +675,18 @@ __host__ void host_integer_radix_maxmin_kb(
"than the number of blocks to operate on")
// Compute the sign
host_integer_radix_difference_check_kb<Torus>(
host_difference_check<Torus>(
streams, mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right,
mem_ptr, mem_ptr->identity_lut_f, bsks, ksks, num_radix_blocks);
// Selector
host_integer_radix_cmux_kb<Torus>(
streams, lwe_array_out, mem_ptr->tmp_lwe_array_out, lwe_array_left,
lwe_array_right, mem_ptr->cmux_buffer, bsks, ksks);
host_cmux<Torus>(streams, lwe_array_out, mem_ptr->tmp_lwe_array_out,
lwe_array_left, lwe_array_right, mem_ptr->cmux_buffer, bsks,
ksks);
}
template <typename Torus>
__host__ void host_integer_are_all_comparisons_block_true_kb(
__host__ void host_integer_are_all_comparisons_block_true(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
@@ -699,7 +699,7 @@ __host__ void host_integer_are_all_comparisons_block_true_kb(
}
template <typename Torus>
__host__ void host_integer_is_at_least_one_comparisons_block_true_kb(
__host__ void host_integer_is_at_least_one_comparisons_block_true(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,

View File

@@ -13,7 +13,7 @@ uint64_t scratch_cuda_integer_compress_radix_ciphertext_64(
lwe_dimension, ks_level, ks_base_log, 0, 0, 0, message_modulus,
carry_modulus, PBS_MS_REDUCTION_T::NO_REDUCTION);
return scratch_cuda_compress_integer_radix_ciphertext<uint64_t>(
return scratch_cuda_compress_ciphertext<uint64_t>(
CudaStreams(streams), (int_compression<uint64_t> **)mem_ptr,
num_radix_blocks, compression_params, lwe_per_glwe, allocate_gpu_memory);
}
@@ -93,7 +93,7 @@ uint64_t scratch_cuda_integer_compress_radix_ciphertext_128(
lwe_dimension, ks_level, ks_base_log, 0, 0, 0, message_modulus,
carry_modulus, PBS_MS_REDUCTION_T::NO_REDUCTION);
return scratch_cuda_compress_integer_radix_ciphertext<__uint128_t>(
return scratch_cuda_compress_ciphertext<__uint128_t>(
CudaStreams(streams), (int_compression<__uint128_t> **)mem_ptr,
num_radix_blocks, compression_params, lwe_per_glwe, allocate_gpu_memory);
}

View File

@@ -401,7 +401,7 @@ host_integer_decompress(CudaStreams streams,
}
template <typename Torus>
__host__ uint64_t scratch_cuda_compress_integer_radix_ciphertext(
__host__ uint64_t scratch_cuda_compress_ciphertext(
CudaStreams streams, int_compression<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params compression_params,
uint32_t lwe_per_glwe, bool allocate_gpu_memory) {

View File

@@ -1,6 +1,6 @@
#include "integer/div_rem.cuh"
uint64_t scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
uint64_t scratch_cuda_integer_div_rem_radix_ciphertext_64(
CudaStreamsFFI streams, bool is_signed, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
@@ -14,13 +14,13 @@ uint64_t scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_integer_div_rem_kb<uint64_t>(
return scratch_cuda_integer_div_rem<uint64_t>(
CudaStreams(streams), is_signed, (int_div_rem_memory<uint64_t> **)mem_ptr,
num_blocks, params, allocate_gpu_memory);
POP_RANGE()
}
void cuda_integer_div_rem_radix_ciphertext_kb_64(
void cuda_integer_div_rem_radix_ciphertext_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *quotient,
CudaRadixCiphertextFFI *remainder, CudaRadixCiphertextFFI const *numerator,
CudaRadixCiphertextFFI const *divisor, bool is_signed, int8_t *mem_ptr,
@@ -28,9 +28,9 @@ void cuda_integer_div_rem_radix_ciphertext_kb_64(
PUSH_RANGE("div")
auto mem = (int_div_rem_memory<uint64_t> *)mem_ptr;
host_integer_div_rem_kb<uint64_t>(CudaStreams(streams), quotient, remainder,
numerator, divisor, is_signed, bsks,
(uint64_t **)(ksks), mem);
host_integer_div_rem<uint64_t>(CudaStreams(streams), quotient, remainder,
numerator, divisor, is_signed, bsks,
(uint64_t **)(ksks), mem);
POP_RANGE()
}

View File

@@ -14,7 +14,7 @@
#include <fstream>
template <typename Torus>
__host__ uint64_t scratch_cuda_integer_div_rem_kb(
__host__ uint64_t scratch_cuda_integer_div_rem(
CudaStreams streams, bool is_signed, int_div_rem_memory<Torus> **mem_ptr,
uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory) {
@@ -26,7 +26,7 @@ __host__ uint64_t scratch_cuda_integer_div_rem_kb(
}
template <typename Torus>
__host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
__host__ void host_unsigned_integer_div_rem_block_by_block_2_2(
CudaStreams streams, CudaRadixCiphertextFFI *quotient,
CudaRadixCiphertextFFI *remainder, CudaRadixCiphertextFFI const *numerator,
CudaRadixCiphertextFFI const *divisor, void *const *bsks,
@@ -85,7 +85,7 @@ __host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
// Computes 2*d by extending and shifting on gpu[1]
host_extend_radix_with_trivial_zero_blocks_msb<Torus>(
mem_ptr->d2, divisor_gpu_1, streams.get_ith(1));
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
host_logical_scalar_shift_inplace<Torus>(
streams.get_ith(1), mem_ptr->d2, 1, mem_ptr->shift_mem, &bsks[1],
&ksks[1], mem_ptr->d2->num_radix_blocks);
@@ -250,14 +250,14 @@ __host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
auto o3 = mem_ptr->sub_1_overflowed;
// used as a bitor
host_integer_radix_bitop_kb(streams.get_ith(0), o3, o3, mem_ptr->cmp_1,
mem_ptr->bitor_mem_1, &bsks[0], &ksks[0]);
host_bitop(streams.get_ith(0), o3, o3, mem_ptr->cmp_1, mem_ptr->bitor_mem_1,
&bsks[0], &ksks[0]);
// used as a bitor
host_integer_radix_bitop_kb(streams.get_ith(1), o2, o2, mem_ptr->cmp_2,
mem_ptr->bitor_mem_2, &bsks[1], &ksks[1]);
host_bitop(streams.get_ith(1), o2, o2, mem_ptr->cmp_2, mem_ptr->bitor_mem_2,
&bsks[1], &ksks[1]);
// used as a bitor
host_integer_radix_bitop_kb(streams.get_ith(2), o1, o1, mem_ptr->cmp_3,
mem_ptr->bitor_mem_3, &bsks[2], &ksks[2]);
host_bitop(streams.get_ith(2), o1, o1, mem_ptr->cmp_3, mem_ptr->bitor_mem_3,
&bsks[2], &ksks[2]);
// cmp_1, cmp_2, cmp_3 are not needed anymore, so we can reuse them as c3,
// c2, c1. c0 is allocated on gpu[3]; we take it from mem_ptr.
@@ -337,7 +337,7 @@ __host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
host_add_the_same_block_to_all_blocks<Torus>(streams.stream(gpu_index),
streams.gpu_index(gpu_index),
rx, rx, cx, 4, 4);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams.get_ith(gpu_index), rx, rx, &bsks[gpu_index],
&ksks[gpu_index], lut, rx->num_radix_blocks);
};
@@ -355,15 +355,15 @@ __host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
mem_ptr->zero_out_if_not_1_lut_2, 2);
// calculate quotient bits GPU[2]
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
mem_ptr->sub_streams_1.get_ith(2), mem_ptr->q1, c1, &bsks[2], &ksks[2],
mem_ptr->quotient_lut_1, 1);
// calculate quotient bits GPU[1]
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
mem_ptr->sub_streams_1.get_ith(1), mem_ptr->q2, c2, &bsks[1], &ksks[1],
mem_ptr->quotient_lut_2, 1);
// calculate quotient bits GPU[0]
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
mem_ptr->sub_streams_1.get_ith(0), mem_ptr->q3, c3, &bsks[0], &ksks[0],
mem_ptr->quotient_lut_3, 1);
@@ -427,10 +427,10 @@ __host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
streams.synchronize();
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, rem_gpu_0, rem_gpu_0, bsks, ksks,
mem_ptr->message_extract_lut_1, rem_gpu_0->num_radix_blocks);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
mem_ptr->sub_streams_1, q3_gpu_0, q3_gpu_0, bsks, ksks,
mem_ptr->message_extract_lut_2, 1);
streams.synchronize();
@@ -469,7 +469,7 @@ __host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
}
template <typename Torus>
__host__ void host_unsigned_integer_div_rem_kb(
__host__ void host_unsigned_integer_div_rem(
CudaStreams streams, CudaRadixCiphertextFFI *quotient,
CudaRadixCiphertextFFI *remainder, CudaRadixCiphertextFFI const *numerator,
CudaRadixCiphertextFFI const *divisor, void *const *bsks,
@@ -486,7 +486,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
if (mem_ptr->params.message_modulus == 4 &&
mem_ptr->params.carry_modulus == 4 && streams.count() >= 4) {
host_unsigned_integer_div_rem_kb_block_by_block_2_2<Torus>(
host_unsigned_integer_div_rem_block_by_block_2_2<Torus>(
streams, quotient, remainder, numerator, divisor, bsks, ksks,
mem_ptr->div_rem_2_2_mem);
return;
@@ -587,7 +587,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
&last_interesting_divisor_block, interesting_divisor,
interesting_divisor->num_radix_blocks - 1,
interesting_divisor->num_radix_blocks);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, &last_interesting_divisor_block,
&last_interesting_divisor_block, bsks, ksks,
mem_ptr->masking_luts_1[shifted_mask], 1);
@@ -614,7 +614,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
// the estimated degree of the output is < msg_modulus
shifted_mask = shifted_mask & full_message_mask;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, divisor_ms_blocks, divisor_ms_blocks, bsks, ksks,
mem_ptr->masking_luts_2[shifted_mask], 1);
}; // trim_first_divisor_ms_bits
@@ -636,7 +636,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
streams.stream(0), streams.gpu_index(0), mem_ptr->numerator_block_1,
interesting_remainder1, 0);
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
host_logical_scalar_shift_inplace<Torus>(
streams, interesting_remainder1, 1, mem_ptr->shift_mem_1, bsks, ksks,
interesting_remainder1->num_radix_blocks);
@@ -665,7 +665,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
}; // left_shift_interesting_remainder1
auto left_shift_interesting_remainder2 = [&](CudaStreams streams) {
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
host_logical_scalar_shift_inplace<Torus>(
streams, interesting_remainder2, 1, mem_ptr->shift_mem_2, bsks, ksks,
interesting_remainder2->num_radix_blocks);
}; // left_shift_interesting_remainder2
@@ -773,7 +773,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
// fills:
// `cleaned_merged_interesting_remainder` - radix ciphertext
auto create_clean_version_of_merged_remainder = [&](CudaStreams streams) {
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, cleaned_merged_interesting_remainder,
cleaned_merged_interesting_remainder, bsks, ksks,
mem_ptr->message_extract_lut_1,
@@ -811,7 +811,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
auto conditionally_zero_out_merged_interesting_remainder =
[&](CudaStreams streams) {
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
integer_radix_apply_bivariate_lookup_table<Torus>(
streams, cleaned_merged_interesting_remainder,
cleaned_merged_interesting_remainder, overflow_sum_radix, bsks,
ksks, mem_ptr->zero_out_if_overflow_did_not_happen[factor_lut_id],
@@ -820,7 +820,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
auto conditionally_zero_out_merged_new_remainder =
[&](CudaStreams streams) {
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
integer_radix_apply_bivariate_lookup_table<Torus>(
streams, new_remainder, new_remainder, overflow_sum_radix, bsks,
ksks, mem_ptr->zero_out_if_overflow_happened[factor_lut_id],
new_remainder->num_radix_blocks, factor);
@@ -828,7 +828,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
auto set_quotient_bit = [&](CudaStreams streams) {
uint32_t block_of_bit = i / num_bits_in_message;
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
integer_radix_apply_bivariate_lookup_table<Torus>(
streams, mem_ptr->did_not_overflow, subtraction_overflowed,
at_least_one_upper_block_is_non_zero, bsks, ksks,
mem_ptr->merge_overflow_flags_luts[pos_in_block], 1,
@@ -887,10 +887,10 @@ __host__ void host_unsigned_integer_div_rem_kb(
streams.synchronize();
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
mem_ptr->sub_streams_1, remainder, remainder, bsks, ksks,
mem_ptr->message_extract_lut_1, num_blocks);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
mem_ptr->sub_streams_2, quotient, quotient, bsks, ksks,
mem_ptr->message_extract_lut_2, num_blocks);
@@ -899,7 +899,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
}
template <typename Torus>
__host__ void host_integer_div_rem_kb(
__host__ void host_integer_div_rem(
CudaStreams streams, CudaRadixCiphertextFFI *quotient,
CudaRadixCiphertextFFI *remainder, CudaRadixCiphertextFFI const *numerator,
CudaRadixCiphertextFFI const *divisor, bool is_signed, void *const *bsks,
@@ -927,15 +927,15 @@ __host__ void host_integer_div_rem_kb(
streams.synchronize();
host_integer_abs_kb<Torus>(int_mem_ptr->sub_streams_1, positive_numerator,
bsks, ksks, int_mem_ptr->abs_mem_1, true);
host_integer_abs_kb<Torus>(int_mem_ptr->sub_streams_2, positive_divisor,
bsks, ksks, int_mem_ptr->abs_mem_2, true);
host_integer_abs<Torus>(int_mem_ptr->sub_streams_1, positive_numerator,
bsks, ksks, int_mem_ptr->abs_mem_1, true);
host_integer_abs<Torus>(int_mem_ptr->sub_streams_2, positive_divisor, bsks,
ksks, int_mem_ptr->abs_mem_2, true);
int_mem_ptr->sub_streams_1.synchronize();
int_mem_ptr->sub_streams_2.synchronize();
host_unsigned_integer_div_rem_kb<Torus>(
host_unsigned_integer_div_rem<Torus>(
int_mem_ptr->sub_streams_1, quotient, remainder, positive_numerator,
positive_divisor, bsks, ksks, int_mem_ptr->unsigned_mem);
@@ -945,7 +945,7 @@ __host__ void host_integer_div_rem_kb(
CudaRadixCiphertextFFI divisor_sign;
as_radix_ciphertext_slice<Torus>(&divisor_sign, divisor, num_blocks - 1,
num_blocks);
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
integer_radix_apply_bivariate_lookup_table<Torus>(
int_mem_ptr->sub_streams_2, int_mem_ptr->sign_bits_are_different,
&numerator_sign, &divisor_sign, bsks, ksks,
int_mem_ptr->compare_signed_bits_lut, 1,
@@ -954,7 +954,7 @@ __host__ void host_integer_div_rem_kb(
int_mem_ptr->sub_streams_1.synchronize();
int_mem_ptr->sub_streams_2.synchronize();
host_integer_radix_negation<Torus>(
host_negation<Torus>(
int_mem_ptr->sub_streams_1, int_mem_ptr->negated_quotient, quotient,
radix_params.message_modulus, radix_params.carry_modulus, num_blocks);
@@ -965,7 +965,7 @@ __host__ void host_integer_div_rem_kb(
nullptr, int_mem_ptr->scp_mem_1, bsks,
ksks, requested_flag, uses_carry);
host_integer_radix_negation<Torus>(
host_negation<Torus>(
int_mem_ptr->sub_streams_2, int_mem_ptr->negated_remainder, remainder,
radix_params.message_modulus, radix_params.carry_modulus, num_blocks);
@@ -974,22 +974,21 @@ __host__ void host_integer_div_rem_kb(
nullptr, int_mem_ptr->scp_mem_2, bsks,
ksks, requested_flag, uses_carry);
host_integer_radix_cmux_kb<Torus>(
int_mem_ptr->sub_streams_1, quotient,
int_mem_ptr->sign_bits_are_different, int_mem_ptr->negated_quotient,
quotient, int_mem_ptr->cmux_quotient_mem, bsks, ksks);
host_cmux<Torus>(int_mem_ptr->sub_streams_1, quotient,
int_mem_ptr->sign_bits_are_different,
int_mem_ptr->negated_quotient, quotient,
int_mem_ptr->cmux_quotient_mem, bsks, ksks);
host_integer_radix_cmux_kb<Torus>(
int_mem_ptr->sub_streams_2, remainder, &numerator_sign,
int_mem_ptr->negated_remainder, remainder,
int_mem_ptr->cmux_remainder_mem, bsks, ksks);
host_cmux<Torus>(int_mem_ptr->sub_streams_2, remainder, &numerator_sign,
int_mem_ptr->negated_remainder, remainder,
int_mem_ptr->cmux_remainder_mem, bsks, ksks);
int_mem_ptr->sub_streams_1.synchronize();
int_mem_ptr->sub_streams_2.synchronize();
} else {
host_unsigned_integer_div_rem_kb<Torus>(streams, quotient, remainder,
numerator, divisor, bsks, ksks,
int_mem_ptr->unsigned_mem);
host_unsigned_integer_div_rem<Torus>(streams, quotient, remainder,
numerator, divisor, bsks, ksks,
int_mem_ptr->unsigned_mem);
}
}

View File

@@ -1,6 +1,6 @@
#include "ilog2.cuh"
uint64_t scratch_integer_count_of_consecutive_bits_kb_64(
uint64_t scratch_integer_count_of_consecutive_bits_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
@@ -26,7 +26,7 @@ uint64_t scratch_integer_count_of_consecutive_bits_kb_64(
// the leading or trailing end of an encrypted integer. The final count is
// stored in the output ciphertext.
//
void cuda_integer_count_of_consecutive_bits_kb_64(
void cuda_integer_count_of_consecutive_bits_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_ct,
CudaRadixCiphertextFFI const *input_ct, int8_t *mem_ptr, void *const *bsks,
void *const *ksks) {
@@ -37,8 +37,8 @@ void cuda_integer_count_of_consecutive_bits_kb_64(
(uint64_t **)ksks);
}
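
A plaintext reference for the operation these entry points expose, assuming we count leading zeros of a 16-bit value (the bit value and direction are parameters of the real buffer):

#include <cstdint>
#include <cstdio>

static uint32_t count_leading_zeros_16(uint16_t x) {
  uint32_t count = 0;
  for (int bit = 15; bit >= 0 && ((x >> bit) & 1) == 0; --bit)
    ++count; // stop at the first set bit
  return count;
}

int main() {
  std::printf("%u %u %u\n", count_leading_zeros_16(0), // 16
              count_leading_zeros_16(1),               // 15
              count_leading_zeros_16(0x0800));         // 4
}
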
void cleanup_cuda_integer_count_of_consecutive_bits_kb_64(
CudaStreamsFFI streams, int8_t **mem_ptr_void) {
void cleanup_cuda_integer_count_of_consecutive_bits_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_count_of_consecutive_bits_buffer<uint64_t> *mem_ptr =
(int_count_of_consecutive_bits_buffer<uint64_t> *)(*mem_ptr_void);
@@ -49,7 +49,7 @@ void cleanup_cuda_integer_count_of_consecutive_bits_kb_64(
*mem_ptr_void = nullptr;
}
uint64_t scratch_integer_ilog2_kb_64(
uint64_t scratch_integer_ilog2_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
@@ -74,7 +74,7 @@ uint64_t scratch_integer_ilog2_kb_64(
// This is equivalent to finding the position of the most significant bit.
// The result is stored in the output ciphertext.
//
void cuda_integer_ilog2_kb_64(
void cuda_integer_ilog2_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_ct,
CudaRadixCiphertextFFI const *input_ct,
CudaRadixCiphertextFFI const *trivial_ct_neg_n,
@@ -88,8 +88,8 @@ void cuda_integer_ilog2_kb_64(
(uint64_t **)ksks);
}
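
A plaintext reference matching the documented semantics above: ilog2(x) is the position of the most significant set bit, i.e. floor(log2(x)) for x > 0.

#include <cstdint>
#include <cstdio>

static uint32_t ilog2_ref(uint64_t x) {
  uint32_t pos = 0;
  while (x >>= 1)
    ++pos; // one shift per bit position above the lowest
  return pos;
}

int main() {
  std::printf("%u %u %u\n", ilog2_ref(1), ilog2_ref(8), ilog2_ref(1000)); // 0 3 9
}
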
void cleanup_cuda_integer_ilog2_kb_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
void cleanup_cuda_integer_ilog2_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_ilog2_buffer<uint64_t> *mem_ptr =
(int_ilog2_buffer<uint64_t> *)(*mem_ptr_void);

View File

@@ -14,8 +14,8 @@ __host__ void host_integer_prepare_count_of_consecutive_bits(
auto tmp = mem_ptr->tmp_ct;
host_apply_univariate_lut_kb<Torus>(streams, tmp, ciphertext,
mem_ptr->univ_lut_mem, ksks, bsks);
host_apply_univariate_lut<Torus>(streams, tmp, ciphertext,
mem_ptr->univ_lut_mem, ksks, bsks);
if (mem_ptr->direction == Leading) {
host_radix_blocks_reverse_inplace<Torus>(streams, tmp);
@@ -72,7 +72,7 @@ __host__ void host_integer_count_of_consecutive_bits(
output_start_index + 1, ct_prepared, i, i + 1);
}
host_integer_partial_sum_ciphertexts_vec_kb<Torus>(
host_integer_partial_sum_ciphertexts_vec<Torus>(
streams, output_ct, cts, bsks, ksks, mem_ptr->sum_mem, counter_num_blocks,
ct_prepared->num_radix_blocks);
@@ -141,19 +141,19 @@ host_integer_ilog2(CudaStreams streams, CudaRadixCiphertextFFI *output_ct,
// Perform a partial sum of all the elements without carry propagation.
//
host_integer_partial_sum_ciphertexts_vec_kb<Torus>(
host_integer_partial_sum_ciphertexts_vec<Torus>(
streams, mem_ptr->sum_output_not_propagated, mem_ptr->sum_input_cts, bsks,
ksks, mem_ptr->sum_mem, mem_ptr->counter_num_blocks,
mem_ptr->input_num_blocks + 1);
// Apply luts to the partial sum.
//
host_apply_univariate_lut_kb<Torus>(streams, mem_ptr->message_blocks_not,
mem_ptr->sum_output_not_propagated,
mem_ptr->lut_message_not, ksks, bsks);
host_apply_univariate_lut_kb<Torus>(streams, mem_ptr->carry_blocks_not,
mem_ptr->sum_output_not_propagated,
mem_ptr->lut_carry_not, ksks, bsks);
host_apply_univariate_lut<Torus>(streams, mem_ptr->message_blocks_not,
mem_ptr->sum_output_not_propagated,
mem_ptr->lut_message_not, ksks, bsks);
host_apply_univariate_lut<Torus>(streams, mem_ptr->carry_blocks_not,
mem_ptr->sum_output_not_propagated,
mem_ptr->lut_carry_not, ksks, bsks);
// Left-shift the bitwise-negated carry blocks by one position.
//
@@ -190,7 +190,7 @@ host_integer_ilog2(CudaStreams streams, CudaRadixCiphertextFFI *output_ct,
2 * mem_ptr->counter_num_blocks, 3 * mem_ptr->counter_num_blocks,
trivial_ct_2, 0, mem_ptr->counter_num_blocks);
host_integer_partial_sum_ciphertexts_vec_kb<Torus>(
host_integer_partial_sum_ciphertexts_vec<Torus>(
streams, output_ct, mem_ptr->sum_input_cts, bsks, ksks, mem_ptr->sum_mem,
mem_ptr->counter_num_blocks, 3);

View File

@@ -43,7 +43,7 @@ void cleanup_cuda_full_propagation(CudaStreamsFFI streams,
*mem_ptr_void = nullptr;
}
uint64_t scratch_cuda_propagate_single_carry_kb_64_inplace(
uint64_t scratch_cuda_propagate_single_carry_64_inplace(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -56,12 +56,12 @@ uint64_t scratch_cuda_propagate_single_carry_kb_64_inplace(
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_propagate_single_carry_kb_inplace<uint64_t>(
return scratch_cuda_propagate_single_carry_inplace<uint64_t>(
CudaStreams(streams), (int_sc_prop_memory<uint64_t> **)mem_ptr,
num_blocks, params, requested_flag, allocate_gpu_memory);
}
uint64_t scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
uint64_t scratch_cuda_add_and_propagate_single_carry_64_inplace(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -74,12 +74,12 @@ uint64_t scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_propagate_single_carry_kb_inplace<uint64_t>(
return scratch_cuda_propagate_single_carry_inplace<uint64_t>(
CudaStreams(streams), (int_sc_prop_memory<uint64_t> **)mem_ptr,
num_blocks, params, requested_flag, allocate_gpu_memory);
}
uint64_t scratch_cuda_integer_overflowing_sub_kb_64_inplace(
uint64_t scratch_cuda_integer_overflowing_sub_64_inplace(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -97,7 +97,7 @@ uint64_t scratch_cuda_integer_overflowing_sub_kb_64_inplace(
num_blocks, params, compute_overflow, allocate_gpu_memory);
}
void cuda_propagate_single_carry_kb_64_inplace(
void cuda_propagate_single_carry_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
CudaRadixCiphertextFFI *carry_out, const CudaRadixCiphertextFFI *carry_in,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
@@ -109,7 +109,7 @@ void cuda_propagate_single_carry_kb_64_inplace(
requested_flag, uses_carry);
}
void cuda_add_and_propagate_single_carry_kb_64_inplace(
void cuda_add_and_propagate_single_carry_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lhs_array,
const CudaRadixCiphertextFFI *rhs_array, CudaRadixCiphertextFFI *carry_out,
const CudaRadixCiphertextFFI *carry_in, int8_t *mem_ptr, void *const *bsks,
@@ -121,7 +121,7 @@ void cuda_add_and_propagate_single_carry_kb_64_inplace(
requested_flag, uses_carry);
}
void cuda_integer_overflowing_sub_kb_64_inplace(
void cuda_integer_overflowing_sub_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lhs_array,
const CudaRadixCiphertextFFI *rhs_array,
CudaRadixCiphertextFFI *overflow_block,
@@ -168,7 +168,7 @@ void cleanup_cuda_integer_overflowing_sub(CudaStreamsFFI streams,
POP_RANGE()
}
uint64_t scratch_cuda_apply_univariate_lut_kb_64(
uint64_t scratch_cuda_apply_univariate_lut_64(
CudaStreamsFFI streams, int8_t **mem_ptr, void const *input_lut,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
@@ -182,13 +182,13 @@ uint64_t scratch_cuda_apply_univariate_lut_kb_64(
grouping_factor, message_modulus, carry_modulus,
noise_reduction_type);
return scratch_cuda_apply_univariate_lut_kb<uint64_t>(
return scratch_cuda_apply_univariate_lut<uint64_t>(
CudaStreams(streams), (int_radix_lut<uint64_t> **)mem_ptr,
static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
lut_degree, allocate_gpu_memory);
}
uint64_t scratch_cuda_apply_many_univariate_lut_kb_64(
uint64_t scratch_cuda_apply_many_univariate_lut_64(
CudaStreamsFFI streams, int8_t **mem_ptr, void const *input_lut,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
@@ -202,24 +202,25 @@ uint64_t scratch_cuda_apply_many_univariate_lut_kb_64(
grouping_factor, message_modulus, carry_modulus,
noise_reduction_type);
return scratch_cuda_apply_many_univariate_lut_kb<uint64_t>(
return scratch_cuda_apply_many_univariate_lut<uint64_t>(
CudaStreams(streams), (int_radix_lut<uint64_t> **)mem_ptr,
static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
num_many_lut, lut_degree, allocate_gpu_memory);
}
void cuda_apply_univariate_lut_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
void *const *ksks, void *const *bsks) {
void cuda_apply_univariate_lut_64(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI const *input_radix_lwe,
int8_t *mem_ptr, void *const *ksks,
void *const *bsks) {
host_apply_univariate_lut_kb<uint64_t>(
host_apply_univariate_lut<uint64_t>(
CudaStreams(streams), output_radix_lwe, input_radix_lwe,
(int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks), bsks);
}
void cleanup_cuda_apply_univariate_lut_kb_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
void cleanup_cuda_apply_univariate_lut_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup univar lut")
int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
@@ -228,19 +229,19 @@ void cleanup_cuda_apply_univariate_lut_kb_64(CudaStreamsFFI streams,
POP_RANGE()
}
void cuda_apply_many_univariate_lut_kb_64(
void cuda_apply_many_univariate_lut_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
void *const *ksks, void *const *bsks, uint32_t num_many_lut,
uint32_t lut_stride) {
host_apply_many_univariate_lut_kb<uint64_t>(
host_apply_many_univariate_lut<uint64_t>(
CudaStreams(streams), output_radix_lwe, input_radix_lwe,
(int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks), bsks,
num_many_lut, lut_stride);
}
uint64_t scratch_cuda_apply_bivariate_lut_kb_64(
uint64_t scratch_cuda_apply_bivariate_lut_64(
CudaStreamsFFI streams, int8_t **mem_ptr, void const *input_lut,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
@@ -254,27 +255,27 @@ uint64_t scratch_cuda_apply_bivariate_lut_kb_64(
grouping_factor, message_modulus, carry_modulus,
noise_reduction_type);
return scratch_cuda_apply_bivariate_lut_kb<uint64_t>(
return scratch_cuda_apply_bivariate_lut<uint64_t>(
CudaStreams(streams), (int_radix_lut<uint64_t> **)mem_ptr,
static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
lut_degree, allocate_gpu_memory);
}
void cuda_apply_bivariate_lut_kb_64(
void cuda_apply_bivariate_lut_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI const *input_radix_lwe_1,
CudaRadixCiphertextFFI const *input_radix_lwe_2, int8_t *mem_ptr,
void *const *ksks, void *const *bsks, uint32_t num_radix_blocks,
uint32_t shift) {
host_apply_bivariate_lut_kb<uint64_t>(
host_apply_bivariate_lut<uint64_t>(
CudaStreams(streams), output_radix_lwe, input_radix_lwe_1,
input_radix_lwe_2, (int_radix_lut<uint64_t> *)mem_ptr,
(uint64_t **)(ksks), bsks, num_radix_blocks, shift);
}
void cleanup_cuda_apply_bivariate_lut_kb_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
void cleanup_cuda_apply_bivariate_lut_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup bivar lut")
int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
@@ -298,7 +299,7 @@ uint64_t scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
grouping_factor, message_modulus, carry_modulus,
noise_reduction_type);
return scratch_cuda_apply_bivariate_lut_kb<uint64_t>(
return scratch_cuda_apply_bivariate_lut<uint64_t>(
CudaStreams(streams), (int_radix_lut<uint64_t> **)mem_ptr,
static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
lut_degree, allocate_gpu_memory);
@@ -360,7 +361,7 @@ uint64_t scratch_cuda_apply_noise_squashing_mem(
return size_tracker;
}
uint64_t scratch_cuda_apply_noise_squashing_kb(
uint64_t scratch_cuda_apply_noise_squashing(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_glwe_dimension, uint32_t input_polynomial_size,
@@ -381,20 +382,21 @@ uint64_t scratch_cuda_apply_noise_squashing_kb(
original_num_blocks, allocate_gpu_memory);
}
void cuda_apply_noise_squashing_kb(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
void *const *ksks, void *const *bsks) {
void cuda_apply_noise_squashing(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI const *input_radix_lwe,
int8_t *mem_ptr, void *const *ksks,
void *const *bsks) {
PUSH_RANGE("apply noise squashing")
integer_radix_apply_noise_squashing_kb<uint64_t>(
integer_radix_apply_noise_squashing<uint64_t>(
CudaStreams(streams), output_radix_lwe, input_radix_lwe,
(int_noise_squashing_lut<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks);
POP_RANGE()
}
void cleanup_cuda_apply_noise_squashing_kb(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
void cleanup_cuda_apply_noise_squashing(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup noise squashing")
int_noise_squashing_lut<uint64_t> *mem_ptr =
(int_noise_squashing_lut<uint64_t> *)(*mem_ptr_void);

View File

@@ -503,7 +503,7 @@ __host__ void host_pack_bivariate_blocks_with_single_block(
/// LUT. In scalar bitops we use a number of blocks that may be lower than or
/// equal to the input and output numbers of blocks
template <typename Torus>
__host__ void integer_radix_apply_univariate_lookup_table_kb(
__host__ void integer_radix_apply_univariate_lookup_table(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, void *const *bsks,
Torus *const *ksks, int_radix_lut<Torus> *lut, uint32_t num_radix_blocks) {
@@ -607,7 +607,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
}
template <typename Torus>
__host__ void integer_radix_apply_many_univariate_lookup_table_kb(
__host__ void integer_radix_apply_many_univariate_lookup_table(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, void *const *bsks,
Torus *const *ksks, int_radix_lut<Torus> *lut, uint32_t num_many_lut,
@@ -710,7 +710,7 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
}
template <typename Torus>
__host__ void integer_radix_apply_bivariate_lookup_table_kb(
__host__ void integer_radix_apply_bivariate_lookup_table(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_1,
CudaRadixCiphertextFFI const *lwe_array_2, void *const *bsks,
@@ -1279,7 +1279,7 @@ void host_compute_shifted_blocks_and_states(
auto shifted_blocks_and_states = mem->shifted_blocks_and_states;
auto luts_array_first_step = mem->luts_array_first_step;
integer_radix_apply_many_univariate_lookup_table_kb<Torus>(
integer_radix_apply_many_univariate_lookup_table<Torus>(
streams, shifted_blocks_and_states, lwe_array, bsks, ksks,
luts_array_first_step, num_many_lut, lut_stride);
@@ -1347,7 +1347,7 @@ void host_resolve_group_carries_sequentially(
as_radix_ciphertext_slice<Torus>(&shifted_group_resolved_carries,
group_resolved_carries, 1,
blocks_to_solve + 1);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, &shifted_group_resolved_carries,
&shifted_group_resolved_carries, bsks, ksks, luts_sequential,
blocks_to_solve);
@@ -1388,7 +1388,7 @@ void host_compute_prefix_sum_hillis_steele(
auto prev_blocks = generates_or_propagates;
int cur_total_blocks = num_radix_blocks - space;
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
integer_radix_apply_bivariate_lookup_table<Torus>(
streams, &cur_blocks, &cur_blocks, prev_blocks, bsks, ksks, luts,
cur_total_blocks, luts->params.message_modulus);
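
A plaintext sketch of the Hillis-Steele scan driving this loop: each round, every position combines with the position `space` slots behind it, `space` doubles, and n blocks resolve in ceil(log2(n)) rounds. Plain addition stands in for the generate/propagate combine the bivariate LUT performs on encrypted blocks:

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  std::vector<uint64_t> blocks = {1, 0, 2, 1, 0, 3, 1, 1};
  for (size_t space = 1; space < blocks.size(); space *= 2)
    // walk downwards so every read still sees the previous round's value
    for (size_t i = blocks.size() - 1; i >= space; --i)
      blocks[i] = blocks[i] + blocks[i - space]; // the bivariate combine
  for (uint64_t b : blocks)
    std::printf("%llu ", (unsigned long long)b); // inclusive prefix sums
  std::printf("\n");
}
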
@@ -1426,11 +1426,11 @@ void host_compute_propagation_simulators_and_group_carries(
block_states, num_radix_blocks, group_size);
auto luts_array_second_step = mem->luts_array_second_step;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, propagation_cum_sums, propagation_cum_sums, bsks, ksks,
luts_array_second_step, num_radix_blocks);
host_integer_radix_scalar_addition_inplace<Torus>(
host_scalar_addition_inplace<Torus>(
streams, propagation_cum_sums, mem->scalar_array_cum_sum,
mem->h_scalar_array_cum_sum, num_radix_blocks, message_modulus,
carry_modulus);
@@ -1478,7 +1478,7 @@ void host_compute_shifted_blocks_and_borrow_states(
auto shifted_blocks_and_borrow_states = mem->shifted_blocks_and_borrow_states;
auto luts_array_first_step = mem->luts_array_first_step;
integer_radix_apply_many_univariate_lookup_table_kb<Torus>(
integer_radix_apply_many_univariate_lookup_table<Torus>(
streams, shifted_blocks_and_borrow_states, lwe_array, bsks, ksks,
luts_array_first_step, num_many_lut, lut_stride);
@@ -1682,7 +1682,7 @@ extract_n_bits(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
num_radix_blocks);
}
}
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, lwe_array_out, lwe_array_out, bsks, ksks, bit_extract->lut,
effective_num_radix_blocks);
}
@@ -1738,7 +1738,7 @@ reduce_signs(CudaStreams streams, CudaRadixCiphertextFFI *signs_array_out,
while (num_sign_blocks > 2) {
pack_blocks<Torus>(streams.stream(0), streams.gpu_index(0), signs_b,
signs_a, num_sign_blocks, message_modulus);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, signs_a, signs_b, bsks, ksks, lut, num_sign_blocks / 2);
if (num_sign_blocks % 2 == 1)
@@ -1768,7 +1768,7 @@ reduce_signs(CudaStreams streams, CudaRadixCiphertextFFI *signs_array_out,
pack_blocks<Torus>(streams.stream(0), streams.gpu_index(0), signs_b,
signs_a, num_sign_blocks, message_modulus);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, signs_array_out, signs_b, bsks, ksks, lut, 1);
} else {
@@ -1786,13 +1786,13 @@ reduce_signs(CudaStreams streams, CudaRadixCiphertextFFI *signs_array_out,
diff_buffer->preallocated_h_lut2);
lut->broadcast_lut(lut->active_streams);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, signs_array_out, signs_a, bsks, ksks, lut, 1);
}
}
template <typename Torus>
uint64_t scratch_cuda_apply_univariate_lut_kb(
uint64_t scratch_cuda_apply_univariate_lut(
CudaStreams streams, int_radix_lut<Torus> **mem_ptr, Torus const *input_lut,
uint32_t num_radix_blocks, int_radix_params params, uint64_t lut_degree,
bool allocate_gpu_memory) {
@@ -1814,19 +1814,19 @@ uint64_t scratch_cuda_apply_univariate_lut_kb(
}
template <typename Torus>
void host_apply_univariate_lut_kb(CudaStreams streams,
CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI const *radix_lwe_in,
int_radix_lut<Torus> *mem, Torus *const *ksks,
void *const *bsks) {
void host_apply_univariate_lut(CudaStreams streams,
CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI const *radix_lwe_in,
int_radix_lut<Torus> *mem, Torus *const *ksks,
void *const *bsks) {
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, radix_lwe_out, radix_lwe_in, bsks, ksks, mem,
radix_lwe_out->num_radix_blocks);
}
template <typename Torus>
uint64_t scratch_cuda_apply_many_univariate_lut_kb(
uint64_t scratch_cuda_apply_many_univariate_lut(
CudaStreams streams, int_radix_lut<Torus> **mem_ptr, Torus const *input_lut,
uint32_t num_radix_blocks, int_radix_params params, uint32_t num_many_lut,
uint64_t lut_degree, bool allocate_gpu_memory) {
@@ -1849,19 +1849,21 @@ uint64_t scratch_cuda_apply_many_univariate_lut_kb(
}
template <typename Torus>
void host_apply_many_univariate_lut_kb(
CudaStreams streams, CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI const *radix_lwe_in, int_radix_lut<Torus> *mem,
Torus *const *ksks, void *const *bsks, uint32_t num_many_lut,
uint32_t lut_stride) {
void host_apply_many_univariate_lut(CudaStreams streams,
CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI const *radix_lwe_in,
int_radix_lut<Torus> *mem,
Torus *const *ksks, void *const *bsks,
uint32_t num_many_lut,
uint32_t lut_stride) {
integer_radix_apply_many_univariate_lookup_table_kb<Torus>(
integer_radix_apply_many_univariate_lookup_table<Torus>(
streams, radix_lwe_out, radix_lwe_in, bsks, ksks, mem, num_many_lut,
lut_stride);
}
template <typename Torus>
uint64_t scratch_cuda_apply_bivariate_lut_kb(
uint64_t scratch_cuda_apply_bivariate_lut(
CudaStreams streams, int_radix_lut<Torus> **mem_ptr, Torus const *input_lut,
uint32_t num_radix_blocks, int_radix_params params, uint64_t lut_degree,
bool allocate_gpu_memory) {
@@ -1883,21 +1885,21 @@ uint64_t scratch_cuda_apply_bivariate_lut_kb(
}
template <typename Torus>
void host_apply_bivariate_lut_kb(CudaStreams streams,
CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI const *radix_lwe_in_1,
CudaRadixCiphertextFFI const *radix_lwe_in_2,
int_radix_lut<Torus> *mem, Torus *const *ksks,
void *const *bsks, uint32_t num_radix_blocks,
uint32_t shift) {
void host_apply_bivariate_lut(CudaStreams streams,
CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI const *radix_lwe_in_1,
CudaRadixCiphertextFFI const *radix_lwe_in_2,
int_radix_lut<Torus> *mem, Torus *const *ksks,
void *const *bsks, uint32_t num_radix_blocks,
uint32_t shift) {
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
integer_radix_apply_bivariate_lookup_table<Torus>(
streams, radix_lwe_out, radix_lwe_in_1, radix_lwe_in_2, bsks, ksks, mem,
num_radix_blocks, shift);
}
template <typename Torus>
uint64_t scratch_cuda_propagate_single_carry_kb_inplace(
uint64_t scratch_cuda_propagate_single_carry_inplace(
CudaStreams streams, int_sc_prop_memory<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params, uint32_t requested_flag,
bool allocate_gpu_memory) {
@@ -1992,7 +1994,7 @@ void host_propagate_single_carry(CudaStreams streams,
copy_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), prepared_blocks,
num_radix_blocks, num_radix_blocks + 1, &output_flag, 0, 1);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, mem->output_flag, prepared_blocks, bsks, ksks,
mem->lut_message_extract, num_radix_blocks + 1);
@@ -2004,7 +2006,7 @@ void host_propagate_single_carry(CudaStreams streams,
mem->output_flag, num_radix_blocks, num_radix_blocks + 1);
} else {
auto message_extract = mem->lut_message_extract;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, lwe_array, prepared_blocks, bsks, ksks, message_extract,
num_radix_blocks);
}
@@ -2077,7 +2079,7 @@ void host_add_and_propagate_single_carry(
auto block_states = mem->shifted_blocks_state_mem->block_states;
if (requested_flag == outputFlag::FLAG_OVERFLOW) {
auto lut_overflow_prep = mem->lut_overflow_flag_prep;
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
integer_radix_apply_bivariate_lookup_table<Torus>(
streams, &output_flag, mem->last_lhs, mem->last_rhs, bsks, ksks,
lut_overflow_prep, 1, lut_overflow_prep->params.message_modulus);
} else if (requested_flag == outputFlag::FLAG_CARRY) {
@@ -2140,7 +2142,7 @@ void host_add_and_propagate_single_carry(
copy_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), prepared_blocks,
num_radix_blocks, num_radix_blocks + 1, &output_flag, 0, 1);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, mem->output_flag, prepared_blocks, bsks, ksks,
mem->lut_message_extract, num_radix_blocks + 1);
@@ -2152,7 +2154,7 @@ void host_add_and_propagate_single_carry(
streams.stream(0), streams.gpu_index(0), carry_out, 0, 1,
mem->output_flag, num_radix_blocks, num_radix_blocks + 1);
} else {
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, lhs_array, prepared_blocks, bsks, ksks,
mem->lut_message_extract, num_radix_blocks);
}
@@ -2227,8 +2229,8 @@ void host_single_borrow_propagate(CudaStreams streams,
(Torus *)prepared_blocks->ptr, shifted_blocks,
simulators, big_lwe_dimension, num_radix_blocks);
host_integer_radix_add_scalar_one_inplace<Torus>(
streams, prepared_blocks, message_modulus, carry_modulus);
host_add_scalar_one_inplace<Torus>(streams, prepared_blocks, message_modulus,
carry_modulus);
if (compute_overflow == outputFlag::FLAG_OVERFLOW) {
CudaRadixCiphertextFFI shifted_simulators;
@@ -2268,7 +2270,7 @@ void host_single_borrow_propagate(CudaStreams streams,
if (compute_overflow == outputFlag::FLAG_OVERFLOW) {
auto borrow_flag = mem->lut_borrow_flag;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
mem->sub_streams_1, overflow_block, mem->overflow_block, bsks, ksks,
borrow_flag, 1);
}
@@ -2290,7 +2292,7 @@ void host_single_borrow_propagate(CudaStreams streams,
mem->group_size);
auto message_extract = mem->lut_message_extract;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
mem->sub_streams_2, lwe_array, prepared_blocks, bsks, ksks,
message_extract, num_radix_blocks);
@@ -2308,7 +2310,7 @@ void host_single_borrow_propagate(CudaStreams streams,
/// LUT. In scalar bitops we use a number of blocks that may be lower than or
/// equal to the input and output numbers of blocks
template <typename InputTorus>
__host__ void integer_radix_apply_noise_squashing_kb(
__host__ void integer_radix_apply_noise_squashing(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in,
int_noise_squashing_lut<InputTorus> *lut, void *const *bsks,

View File

@@ -65,7 +65,7 @@ void generate_ids_update_degrees(uint64_t *terms_degree, size_t *h_lwe_idx_in,
* This scratch function allocates the necessary amount of data on the GPU for
* the integer radix multiplication in keyswitch->bootstrap order.
*/
uint64_t scratch_cuda_integer_mult_radix_ciphertext_kb_64(
uint64_t scratch_cuda_integer_mult_radix_ciphertext_64(
CudaStreamsFFI streams, int8_t **mem_ptr, bool const is_boolean_left,
bool const is_boolean_right, uint32_t message_modulus,
uint32_t carry_modulus, uint32_t glwe_dimension, uint32_t lwe_dimension,
@@ -87,7 +87,7 @@ uint64_t scratch_cuda_integer_mult_radix_ciphertext_kb_64(
case 4096:
case 8192:
case 16384:
return scratch_cuda_integer_mult_radix_ciphertext_kb<uint64_t>(
return scratch_cuda_integer_mult_radix_ciphertext<uint64_t>(
CudaStreams(streams), (int_mul_memory<uint64_t> **)mem_ptr,
is_boolean_left, is_boolean_right, num_radix_blocks, params,
allocate_gpu_memory);
@@ -124,7 +124,7 @@ uint64_t scratch_cuda_integer_mult_radix_ciphertext_kb_64(
* ciphertext
* - 'pbs_type' selects which PBS implementation should be used
*/
void cuda_integer_mult_radix_ciphertext_kb_64(
void cuda_integer_mult_radix_ciphertext_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI const *radix_lwe_left, bool const is_bool_left,
CudaRadixCiphertextFFI const *radix_lwe_right, bool const is_bool_right,
@@ -133,43 +133,43 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
PUSH_RANGE("mul")
switch (polynomial_size) {
case 256:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<256>>(
host_integer_mult_radix<uint64_t, AmortizedDegree<256>>(
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
break;
case 512:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<512>>(
host_integer_mult_radix<uint64_t, AmortizedDegree<512>>(
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
break;
case 1024:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<1024>>(
host_integer_mult_radix<uint64_t, AmortizedDegree<1024>>(
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
break;
case 2048:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<2048>>(
host_integer_mult_radix<uint64_t, AmortizedDegree<2048>>(
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
break;
case 4096:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<4096>>(
host_integer_mult_radix<uint64_t, AmortizedDegree<4096>>(
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
break;
case 8192:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<8192>>(
host_integer_mult_radix<uint64_t, AmortizedDegree<8192>>(
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
break;
case 16384:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<16384>>(
host_integer_mult_radix<uint64_t, AmortizedDegree<16384>>(
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
@@ -192,7 +192,7 @@ void cleanup_cuda_integer_mult(CudaStreamsFFI streams, int8_t **mem_ptr_void) {
POP_RANGE()
}
uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
uint64_t scratch_cuda_partial_sum_ciphertexts_vec_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
@@ -207,30 +207,31 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
ks_level, ks_base_log, pbs_level, pbs_base_log,
grouping_factor, message_modulus, carry_modulus,
noise_reduction_type);
return scratch_cuda_integer_partial_sum_ciphertexts_vec_kb<uint64_t>(
return scratch_cuda_integer_partial_sum_ciphertexts_vec<uint64_t>(
CudaStreams(streams),
(int_sum_ciphertexts_vec_memory<uint64_t> **)mem_ptr, num_blocks_in_radix,
max_num_radix_in_vec, reduce_degrees_for_single_carry_propagation, params,
allocate_gpu_memory);
}
void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI *radix_lwe_vec, int8_t *mem_ptr, void *const *bsks,
void *const *ksks) {
void cuda_partial_sum_ciphertexts_vec_64(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI *radix_lwe_vec,
int8_t *mem_ptr, void *const *bsks,
void *const *ksks) {
auto mem = (int_sum_ciphertexts_vec_memory<uint64_t> *)mem_ptr;
if (radix_lwe_vec->num_radix_blocks % radix_lwe_out->num_radix_blocks != 0)
PANIC("Cuda error: input vector length should be a multiple of the "
"output's number of radix blocks")
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t>(
host_integer_partial_sum_ciphertexts_vec<uint64_t>(
CudaStreams(streams), radix_lwe_out, radix_lwe_vec, bsks,
(uint64_t **)(ksks), mem, radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
}
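
A minimal sketch of the shape contract this entry point enforces (hypothetical standalone C++, not part of the change): the input vector packs several radix integers back to back, so its block count must be a whole multiple of the output's.

#include <cassert>
#include <cstdint>

int main() {
  // Hypothetical sizes: 8 input blocks holding two 4-block radix integers.
  uint32_t out_blocks = 4;
  uint32_t vec_blocks = 8;
  assert(vec_blocks % out_blocks == 0);                // mirrors the PANIC above
  uint32_t num_radix_in_vec = vec_blocks / out_blocks; // radix integers to sum
  assert(num_radix_in_vec == 2);
  return 0;
}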
void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
CudaStreamsFFI streams, int8_t **mem_ptr_void) {
void cleanup_cuda_partial_sum_ciphertexts_vec(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr =
(int_sum_ciphertexts_vec_memory<uint64_t> *)(*mem_ptr_void);


@@ -268,7 +268,7 @@ __global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
}
template <typename Torus>
__host__ uint64_t scratch_cuda_integer_partial_sum_ciphertexts_vec_kb(
__host__ uint64_t scratch_cuda_integer_partial_sum_ciphertexts_vec(
CudaStreams streams, int_sum_ciphertexts_vec_memory<Torus> **mem_ptr,
uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
bool reduce_degrees_for_single_carry_propagation, int_radix_params params,
@@ -283,7 +283,7 @@ __host__ uint64_t scratch_cuda_integer_partial_sum_ciphertexts_vec_kb(
}
template <typename Torus>
__host__ void host_integer_partial_sum_ciphertexts_vec_kb(
__host__ void host_integer_partial_sum_ciphertexts_vec(
CudaStreams streams, CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI *terms, void *const *bsks, uint64_t *const *ksks,
int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr,
@@ -412,7 +412,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
luts_message_carry->broadcast_lut(active_streams, false);
luts_message_carry->using_trivial_lwe_indexes = false;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, current_blocks, current_blocks, bsks, ksks,
luts_message_carry, total_ciphertexts);
}
@@ -463,7 +463,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
luts_message_carry->broadcast_lut(active_streams, false);
luts_message_carry->using_trivial_lwe_indexes = false;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
active_streams, current_blocks, radix_lwe_out, bsks, ksks,
luts_message_carry, num_blocks_in_apply_lut);
}
@@ -483,7 +483,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
}
template <typename Torus, class params>
__host__ void host_integer_mult_radix_kb(
__host__ void host_integer_mult_radix(
CudaStreams streams, CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI const *radix_lwe_left, bool const is_bool_left,
CudaRadixCiphertextFFI const *radix_lwe_right, bool const is_bool_right,
@@ -580,7 +580,7 @@ __host__ void host_integer_mult_radix_kb(
(Torus *)vector_lsb_rhs->ptr, (Torus *)vector_msb_rhs.ptr, num_blocks);
check_cuda_error(cudaGetLastError());
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
integer_radix_apply_bivariate_lookup_table<Torus>(
streams, block_mul_res, block_mul_res, vector_result_sb, bsks, ksks,
luts_array, total_block_count, luts_array->params.message_modulus);
@@ -608,7 +608,7 @@ __host__ void host_integer_mult_radix_kb(
size_t b_id = i % num_blocks;
terms_degree_msb[i] = (b_id > r_id) ? message_modulus - 2 : 0;
}
host_integer_partial_sum_ciphertexts_vec_kb<Torus>(
host_integer_partial_sum_ciphertexts_vec<Torus>(
streams, radix_lwe_out, vector_result_sb, bsks, ksks,
mem_ptr->sum_ciphertexts_mem, num_blocks, 2 * num_blocks);
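
The lsb/msb term vectors summed here mirror clear-radix schoolbook multiplication, where each block product splits into a low part kept at position i+j and a high part at position i+j+1. A hypothetical clear-integer sketch, with base B standing in for message_modulus:

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  const uint64_t B = 4;                             // message_modulus
  std::vector<uint64_t> lhs = {3, 1}, rhs = {2, 3}; // 7 and 14 in base 4, LSB first
  std::vector<uint64_t> acc(4, 0);
  for (size_t i = 0; i < lhs.size(); i++)
    for (size_t j = 0; j < rhs.size(); j++) {
      uint64_t p = lhs[i] * rhs[j];
      acc[i + j] += p % B;     // lsb partial product
      acc[i + j + 1] += p / B; // msb partial product
    }
  for (size_t k = 0; k + 1 < acc.size(); k++) { // carry propagation
    acc[k + 1] += acc[k] / B;
    acc[k] %= B;
  }
  uint64_t val = 0;
  for (size_t k = acc.size(); k-- > 0;)
    val = val * B + acc[k];
  assert(val == 7 * 14);
  return 0;
}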
@@ -621,7 +621,7 @@ __host__ void host_integer_mult_radix_kb(
}
template <typename Torus>
__host__ uint64_t scratch_cuda_integer_mult_radix_ciphertext_kb(
__host__ uint64_t scratch_cuda_integer_mult_radix_ciphertext(
CudaStreams streams, int_mul_memory<Torus> **mem_ptr,
bool const is_boolean_left, bool const is_boolean_right,
uint32_t num_radix_blocks, int_radix_params params,


@@ -1,11 +1,11 @@
#include "integer/negation.cuh"
void cuda_negate_integer_radix_ciphertext_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, uint32_t message_modulus,
uint32_t carry_modulus, uint32_t num_radix_blocks) {
void cuda_negate_ciphertext_64(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in,
uint32_t message_modulus, uint32_t carry_modulus,
uint32_t num_radix_blocks) {
host_integer_radix_negation<uint64_t>(CudaStreams(streams), lwe_array_out,
lwe_array_in, message_modulus,
carry_modulus, num_radix_blocks);
host_negation<uint64_t>(CudaStreams(streams), lwe_array_out, lwe_array_in,
message_modulus, carry_modulus, num_radix_blocks);
}


@@ -17,10 +17,9 @@
#include <vector>
template <typename Torus>
__global__ void
device_integer_radix_negation(Torus *output, Torus const *input,
int32_t num_blocks, uint64_t lwe_dimension,
uint64_t message_modulus, uint64_t delta) {
__global__ void device_negation(Torus *output, Torus const *input,
int32_t num_blocks, uint64_t lwe_dimension,
uint64_t message_modulus, uint64_t delta) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < lwe_dimension + 1) {
bool is_body = (tid == lwe_dimension);
@@ -49,10 +48,11 @@ device_integer_radix_negation(Torus *output, Torus const *input,
}
template <typename Torus>
__host__ void host_integer_radix_negation(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, uint64_t message_modulus,
uint64_t carry_modulus, uint32_t num_radix_blocks) {
__host__ void host_negation(CudaStreams streams,
CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in,
uint64_t message_modulus, uint64_t carry_modulus,
uint32_t num_radix_blocks) {
cuda_set_device(streams.gpu_index(0));
if (lwe_array_out->num_radix_blocks < num_radix_blocks ||
@@ -80,7 +80,7 @@ __host__ void host_integer_radix_negation(
// this
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
device_integer_radix_negation<Torus><<<grid, thds, 0, streams.stream(0)>>>(
device_negation<Torus><<<grid, thds, 0, streams.stream(0)>>>(
static_cast<Torus *>(lwe_array_out->ptr),
static_cast<Torus *>(lwe_array_in->ptr), num_radix_blocks, lwe_dimension,
message_modulus, delta);
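
The delta used above is the usual plaintext scaling factor; a worked example as hypothetical standalone code, assuming a 64-bit torus with one padding bit and 2_2 parameters (message_modulus = 4, carry_modulus = 4):

#include <cassert>
#include <cstdint>

int main() {
  uint64_t message_modulus = 4, carry_modulus = 4;
  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
  assert(delta == (uint64_t)1 << 59); // 2^63 / 16
  uint64_t m = 3;
  uint64_t plaintext = m * delta; // message sits in the top bits, below padding
  assert(plaintext >> 59 == 3);
  return 0;
}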


@@ -21,11 +21,11 @@ uint64_t scratch_cuda_integer_grouped_oprf_64(
allocate_gpu_memory);
}
void cuda_integer_grouped_oprf_async_64(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *radix_lwe_out,
const void *seeded_lwe_input,
uint32_t num_blocks_to_process,
int8_t *mem, void *const *bsks) {
void cuda_integer_grouped_oprf_64(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *radix_lwe_out,
const void *seeded_lwe_input,
uint32_t num_blocks_to_process, int8_t *mem,
void *const *bsks) {
host_integer_grouped_oprf<uint64_t>(
CudaStreams(streams), radix_lwe_out, (const uint64_t *)seeded_lwe_input,


@@ -1,11 +1,11 @@
#include "integer/scalar_addition.cuh"
void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
void cuda_scalar_addition_ciphertext_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
void const *scalar_input, void const *h_scalar_input, uint32_t num_scalars,
uint32_t message_modulus, uint32_t carry_modulus) {
host_integer_radix_scalar_addition_inplace<uint64_t>(
host_scalar_addition_inplace<uint64_t>(
CudaStreams(streams), lwe_array,
static_cast<const uint64_t *>(scalar_input),
static_cast<const uint64_t *>(h_scalar_input), num_scalars,


@@ -12,9 +12,10 @@
#include <stdio.h>
template <typename Torus>
__global__ void device_integer_radix_scalar_addition_inplace(
Torus *lwe_array, Torus const *scalar_input, int32_t num_blocks,
uint32_t lwe_dimension, uint64_t delta) {
__global__ void
device_scalar_addition_inplace(Torus *lwe_array, Torus const *scalar_input,
int32_t num_blocks, uint32_t lwe_dimension,
uint64_t delta) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < num_blocks) {
@@ -24,7 +25,7 @@ __global__ void device_integer_radix_scalar_addition_inplace(
}
template <typename Torus>
__host__ void host_integer_radix_scalar_addition_inplace(
__host__ void host_scalar_addition_inplace(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array,
Torus const *scalar_input, Torus const *h_scalar_input,
uint32_t num_scalars, uint32_t message_modulus, uint32_t carry_modulus) {
@@ -45,10 +46,9 @@ __host__ void host_integer_radix_scalar_addition_inplace(
// this
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
device_integer_radix_scalar_addition_inplace<Torus>
<<<grid, thds, 0, streams.stream(0)>>>((Torus *)lwe_array->ptr,
scalar_input, num_scalars,
lwe_array->lwe_dimension, delta);
device_scalar_addition_inplace<Torus><<<grid, thds, 0, streams.stream(0)>>>(
(Torus *)lwe_array->ptr, scalar_input, num_scalars,
lwe_array->lwe_dimension, delta);
check_cuda_error(cudaGetLastError());
for (uint i = 0; i < num_scalars; i++) {
lwe_array->degrees[i] = lwe_array->degrees[i] + h_scalar_input[i];
@@ -56,9 +56,9 @@ __host__ void host_integer_radix_scalar_addition_inplace(
}
template <typename Torus>
__global__ void device_integer_radix_add_scalar_one_inplace(
Torus *lwe_array, int32_t num_blocks, uint32_t lwe_dimension,
uint64_t delta) {
__global__ void
device_add_scalar_one_inplace(Torus *lwe_array, int32_t num_blocks,
uint32_t lwe_dimension, uint64_t delta) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < num_blocks) {
@@ -68,9 +68,10 @@ __global__ void device_integer_radix_add_scalar_one_inplace(
}
template <typename Torus>
__host__ void host_integer_radix_add_scalar_one_inplace(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array,
uint32_t message_modulus, uint32_t carry_modulus) {
__host__ void host_add_scalar_one_inplace(CudaStreams streams,
CudaRadixCiphertextFFI *lwe_array,
uint32_t message_modulus,
uint32_t carry_modulus) {
cuda_set_device(streams.gpu_index(0));
// Create a 1-dimensional grid of threads
@@ -85,10 +86,9 @@ __host__ void host_integer_radix_add_scalar_one_inplace(
// this
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
device_integer_radix_add_scalar_one_inplace<Torus>
<<<grid, thds, 0, streams.stream(0)>>>((Torus *)lwe_array->ptr,
lwe_array->num_radix_blocks,
lwe_array->lwe_dimension, delta);
device_add_scalar_one_inplace<Torus><<<grid, thds, 0, streams.stream(0)>>>(
(Torus *)lwe_array->ptr, lwe_array->num_radix_blocks,
lwe_array->lwe_dimension, delta);
check_cuda_error(cudaGetLastError());
for (uint i = 0; i < lwe_array->num_radix_blocks; i++) {
lwe_array->degrees[i] = lwe_array->degrees[i] + 1;
@@ -96,9 +96,10 @@ __host__ void host_integer_radix_add_scalar_one_inplace(
}
template <typename Torus>
__global__ void device_integer_radix_scalar_subtraction_inplace(
Torus *lwe_array, Torus *scalar_input, int32_t num_blocks,
uint32_t lwe_dimension, uint64_t delta) {
__global__ void
device_scalar_subtraction_inplace(Torus *lwe_array, Torus *scalar_input,
int32_t num_blocks, uint32_t lwe_dimension,
uint64_t delta) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < num_blocks) {
@@ -110,7 +111,7 @@ __global__ void device_integer_radix_scalar_subtraction_inplace(
}
template <typename Torus>
__host__ void host_integer_radix_scalar_subtraction_inplace(
__host__ void host_scalar_subtraction_inplace(
CudaStreams streams, Torus *lwe_array, Torus *scalar_input,
uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus) {
@@ -128,7 +129,7 @@ __host__ void host_integer_radix_scalar_subtraction_inplace(
// this
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
device_integer_radix_scalar_subtraction_inplace<Torus>
device_scalar_subtraction_inplace<Torus>
<<<grid, thds, 0, streams.stream(0)>>>(lwe_array, scalar_input,
input_lwe_ciphertext_count,
lwe_dimension, delta);

View File

@@ -1,12 +1,12 @@
#include "integer/scalar_bitops.cuh"
void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
void cuda_scalar_bitop_ciphertext_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_input, void const *clear_blocks,
void const *h_clear_blocks, uint32_t num_clear_blocks, int8_t *mem_ptr,
void *const *bsks, void *const *ksks) {
host_integer_radix_scalar_bitop_kb<uint64_t>(
host_scalar_bitop<uint64_t>(
CudaStreams(streams), lwe_array_out, lwe_array_input,
static_cast<const uint64_t *>(clear_blocks),
static_cast<const uint64_t *>(h_clear_blocks), num_clear_blocks,


@@ -4,11 +4,12 @@
#include "integer/bitwise_ops.cuh"
template <typename Torus>
__host__ void host_integer_radix_scalar_bitop_kb(
CudaStreams streams, CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input, Torus const *clear_blocks,
Torus const *h_clear_blocks, uint32_t num_clear_blocks,
int_bitop_buffer<Torus> *mem_ptr, void *const *bsks, Torus *const *ksks) {
__host__ void
host_scalar_bitop(CudaStreams streams, CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input,
Torus const *clear_blocks, Torus const *h_clear_blocks,
uint32_t num_clear_blocks, int_bitop_buffer<Torus> *mem_ptr,
void *const *bsks, Torus *const *ksks) {
if (output->num_radix_blocks != input->num_radix_blocks)
PANIC("Cuda error: input and output num radix blocks must be equal")
@@ -47,7 +48,7 @@ __host__ void host_integer_radix_scalar_bitop_kb(
auto active_streams = streams.active_gpu_subset(num_clear_blocks);
lut->broadcast_lut(active_streams, false);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, output, input, bsks, ksks, lut, num_clear_blocks);
memcpy(output->degrees, degrees, num_clear_blocks * sizeof(uint64_t));


@@ -31,7 +31,7 @@ std::pair<bool, bool> get_invert_flags(COMPARISON_TYPE compare) {
return {invert_operands, invert_subtraction_result};
}
void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
void cuda_scalar_comparison_ciphertext_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, void const *scalar_blocks,
void const *h_scalar_blocks, int8_t *mem_ptr, void *const *bsks,
@@ -46,7 +46,7 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
switch (buffer->op) {
case EQ:
case NE:
host_integer_radix_scalar_equality_check_kb<uint64_t>(
host_scalar_equality_check<uint64_t>(
CudaStreams(streams), lwe_array_out, lwe_array_in,
static_cast<const uint64_t *>(scalar_blocks), buffer, bsks,
(uint64_t **)(ksks), num_radix_blocks, num_scalar_blocks);
@@ -58,7 +58,7 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
if (num_radix_blocks % 2 != 0 && num_radix_blocks != 1)
PANIC("Cuda error (scalar comparisons): the number of radix blocks has "
"to be even or equal to 1.")
host_integer_radix_scalar_difference_check_kb<uint64_t>(
host_scalar_difference_check<uint64_t>(
CudaStreams(streams), lwe_array_out, lwe_array_in,
static_cast<const uint64_t *>(scalar_blocks),
static_cast<const uint64_t *>(h_scalar_blocks), buffer,
@@ -70,7 +70,7 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
if (lwe_array_in->num_radix_blocks % 2 != 0)
PANIC("Cuda error (scalar max/min): the number of radix blocks has to be "
"even.")
host_integer_radix_scalar_maxmin_kb<uint64_t>(
host_scalar_maxmin<uint64_t>(
CudaStreams(streams), lwe_array_out, lwe_array_in,
static_cast<const uint64_t *>(scalar_blocks),
static_cast<const uint64_t *>(h_scalar_blocks), buffer, bsks,


@@ -25,11 +25,13 @@ Torus is_x_less_than_y_given_input_borrow(Torus last_x_block,
}
template <typename Torus>
__host__ void scalar_compare_radix_blocks_kb(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI *lwe_array_in, Torus *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks, uint32_t num_radix_blocks) {
__host__ void scalar_compare_radix_blocks(CudaStreams streams,
CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI *lwe_array_in,
Torus *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr,
void *const *bsks, Torus *const *ksks,
uint32_t num_radix_blocks) {
if (num_radix_blocks == 0)
return;
@@ -62,13 +64,13 @@ __host__ void scalar_compare_radix_blocks_kb(
subtracted_blocks, lwe_array_in);
// Subtract
// Here we need the true lwe sub, not the one that comes from shortint.
host_integer_radix_scalar_subtraction_inplace<Torus>(
host_scalar_subtraction_inplace<Torus>(
streams, (Torus *)subtracted_blocks->ptr, scalar_blocks,
big_lwe_dimension, num_radix_blocks, message_modulus, carry_modulus);
// Apply LUT to compare to 0
auto sign_lut = mem_ptr->eq_buffer->is_non_zero_lut;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, lwe_array_out, subtracted_blocks, bsks, ksks, sign_lut,
num_radix_blocks);
@@ -78,12 +80,12 @@ __host__ void scalar_compare_radix_blocks_kb(
// Add one
// Here Lhs can have the following values: (-1) % (message modulus * carry
// modulus), 0, 1. So the output values after the addition will be: 0, 1, 2
host_integer_radix_add_scalar_one_inplace<Torus>(
streams, lwe_array_out, message_modulus, carry_modulus);
host_add_scalar_one_inplace<Torus>(streams, lwe_array_out, message_modulus,
carry_modulus);
}
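
A hypothetical clear-value sketch of the encoding described in the comments above, assuming message_modulus = carry_modulus = 4 so block values live mod 16:

#include <cassert>
#include <cstdint>

int main() {
  const int64_t mod = 4 * 4;                   // message_modulus * carry_modulus
  for (int64_t cmp : {-1, 0, 1}) {             // per-block comparison result
    int64_t block = ((cmp % mod) + mod) % mod; // -1 wraps to 15
    int64_t out = (block + 1) % mod;           // "add scalar one"
    assert(out == cmp + 1);                    // {15, 0, 1} -> {0, 1, 2}
  }
  return 0;
}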
template <typename Torus>
__host__ void integer_radix_unsigned_scalar_difference_check_kb(
__host__ void integer_radix_unsigned_scalar_difference_check(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
Torus const *h_scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
@@ -148,7 +150,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
auto active_streams = streams.active_gpu_subset(1);
lut->broadcast_lut(active_streams);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, lwe_array_out, mem_ptr->tmp_lwe_array_out, bsks, ksks, lut, 1);
} else if (num_scalar_blocks < num_radix_blocks) {
@@ -199,7 +201,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
// - 2 if lhs > rhs
auto comparisons = mem_ptr->tmp_block_comparisons;
scalar_compare_radix_blocks_kb<Torus>(
scalar_compare_radix_blocks<Torus>(
lsb_streams, comparisons, diff_buffer->tmp_packed, (Torus *)rhs.ptr,
mem_ptr, bsks, ksks, num_lsb_radix_blocks);
@@ -242,7 +244,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
auto active_streams = streams.active_gpu_subset(1);
lut->broadcast_lut(active_streams);
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
integer_radix_apply_bivariate_lookup_table<Torus>(
streams, lwe_array_out, lwe_array_lsb_out, &lwe_array_msb_out, bsks,
ksks, lut, 1, lut->params.message_modulus);
@@ -276,7 +278,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
auto active_streams = streams.active_gpu_subset(1);
one_block_lut->broadcast_lut(active_streams);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, lwe_array_out, lwe_array_in, bsks, ksks, one_block_lut, 1);
one_block_lut->release(streams);
delete one_block_lut;
@@ -305,7 +307,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
// - 1 if lhs == rhs
// - 2 if lhs > rhs
auto comparisons = mem_ptr->tmp_lwe_array_out;
scalar_compare_radix_blocks_kb<Torus>(
scalar_compare_radix_blocks<Torus>(
streams, comparisons, diff_buffer->tmp_packed, (Torus *)rhs.ptr,
mem_ptr, bsks, ksks, num_lsb_radix_blocks);
@@ -321,7 +323,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
}
template <typename Torus>
__host__ void integer_radix_signed_scalar_difference_check_kb(
__host__ void integer_radix_signed_scalar_difference_check(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
Torus const *h_scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
@@ -420,7 +422,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
auto active_streams = streams.active_gpu_subset(1);
lut->broadcast_lut(active_streams);
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
integer_radix_apply_bivariate_lookup_table<Torus>(
streams, lwe_array_out, are_all_msb_zeros, &sign_block, bsks, ksks, lut,
1, lut->params.message_modulus);
@@ -466,7 +468,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
// - 2 if lhs > rhs
auto comparisons = mem_ptr->tmp_block_comparisons;
scalar_compare_radix_blocks_kb<Torus>(
scalar_compare_radix_blocks<Torus>(
lsb_streams, comparisons, diff_buffer->tmp_packed, (Torus *)rhs.ptr,
mem_ptr, bsks, ksks, num_lsb_radix_blocks);
@@ -525,7 +527,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
CudaRadixCiphertextFFI sign_block;
as_radix_ciphertext_slice<Torus>(
&sign_block, &msb, num_msb_radix_blocks - 1, num_msb_radix_blocks);
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
integer_radix_apply_bivariate_lookup_table<Torus>(
msb_streams, &lwe_array_msb_out, &sign_block, &are_all_msb_zeros, bsks,
ksks, signed_msb_lut, 1, signed_msb_lut->params.message_modulus);
lsb_streams.synchronize();
@@ -568,7 +570,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
auto active_streams = streams.active_gpu_subset(1);
one_block_lut->broadcast_lut(active_streams);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, lwe_array_out, lwe_array_in, bsks, ksks, one_block_lut, 1);
one_block_lut->release(streams);
delete one_block_lut;
@@ -606,7 +608,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
// - 0 if lhs < rhs
// - 1 if lhs == rhs
// - 2 if lhs > rhs
scalar_compare_radix_blocks_kb<Torus>(
scalar_compare_radix_blocks<Torus>(
lsb_streams, lwe_array_ct_out, diff_buffer->tmp_packed,
(Torus *)rhs.ptr, mem_ptr, bsks, ksks, num_lsb_radix_blocks);
CudaRadixCiphertextFFI encrypted_sign_block;
@@ -622,7 +624,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
scalar_sign_block, h_scalar_sign_block, 1, message_modulus,
carry_modulus);
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
integer_radix_apply_bivariate_lookup_table<Torus>(
msb_streams, &lwe_array_sign_out, &encrypted_sign_block,
trivial_sign_block, bsks, ksks, mem_ptr->signed_lut, 1,
mem_ptr->signed_lut->params.message_modulus);
@@ -639,7 +641,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
}
template <typename Torus>
__host__ void host_integer_radix_scalar_difference_check_kb(
__host__ void host_scalar_difference_check(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
Torus const *h_scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
@@ -654,12 +656,12 @@ __host__ void host_integer_radix_scalar_difference_check_kb(
if (mem_ptr->is_signed) {
// is signed and scalar is positive
integer_radix_signed_scalar_difference_check_kb<Torus>(
integer_radix_signed_scalar_difference_check<Torus>(
streams, lwe_array_out, lwe_array_in, scalar_blocks, h_scalar_blocks,
mem_ptr, sign_handler_f, bsks, ksks, num_radix_blocks,
num_scalar_blocks);
} else {
integer_radix_unsigned_scalar_difference_check_kb<Torus>(
integer_radix_unsigned_scalar_difference_check<Torus>(
streams, lwe_array_out, lwe_array_in, scalar_blocks, h_scalar_blocks,
mem_ptr, sign_handler_f, bsks, ksks, num_radix_blocks,
num_scalar_blocks);
@@ -667,12 +669,13 @@ __host__ void host_integer_radix_scalar_difference_check_kb(
}
template <typename Torus>
__host__ void host_integer_radix_scalar_maxmin_kb(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
Torus const *h_scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks,
uint32_t num_scalar_blocks) {
__host__ void
host_scalar_maxmin(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in,
Torus const *scalar_blocks, Torus const *h_scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks, uint32_t num_radix_blocks,
uint32_t num_scalar_blocks) {
if (lwe_array_out->lwe_dimension != lwe_array_in->lwe_dimension)
PANIC("Cuda error: input and output lwe dimensions must be the same")
@@ -688,7 +691,7 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
// - 1 if lhs == rhs
// - 2 if lhs > rhs
auto sign = mem_ptr->tmp_lwe_array_out;
host_integer_radix_scalar_difference_check_kb<Torus>(
host_scalar_difference_check<Torus>(
streams, sign, lwe_array_in, scalar_blocks, h_scalar_blocks, mem_ptr,
mem_ptr->identity_lut_f, bsks, ksks, num_radix_blocks, num_scalar_blocks);
@@ -704,13 +707,13 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
// Selector
// CMUX for Max or Min
host_integer_radix_cmux_kb<Torus>(
streams, lwe_array_out, mem_ptr->tmp_lwe_array_out, lwe_array_left,
lwe_array_right, mem_ptr->cmux_buffer, bsks, ksks);
host_cmux<Torus>(streams, lwe_array_out, mem_ptr->tmp_lwe_array_out,
lwe_array_left, lwe_array_right, mem_ptr->cmux_buffer, bsks,
ksks);
}
template <typename Torus>
__host__ void host_integer_radix_scalar_equality_check_kb(
__host__ void host_scalar_equality_check(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
@@ -785,7 +788,7 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
// We use false because we will only broadcast the indexes
scalar_comparison_luts->broadcast_lut(active_streams, false);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
lsb_streams, mem_ptr->tmp_lwe_array_out, mem_ptr->tmp_packed_input,
bsks, ksks, scalar_comparison_luts, num_halved_lsb_radix_blocks);
}


@@ -1,6 +1,6 @@
#include "scalar_div.cuh"
uint64_t scratch_cuda_integer_unsigned_scalar_div_radix_kb_64(
uint64_t scratch_cuda_integer_unsigned_scalar_div_radix_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
@@ -21,7 +21,7 @@ uint64_t scratch_cuda_integer_unsigned_scalar_div_radix_kb_64(
scalar_divisor_ffi, allocate_gpu_memory);
}
void cuda_integer_unsigned_scalar_div_radix_kb_64(
void cuda_integer_unsigned_scalar_div_radix_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *numerator_ct,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
const CudaScalarDivisorFFI *scalar_divisor_ffi) {
@@ -32,8 +32,8 @@ void cuda_integer_unsigned_scalar_div_radix_kb_64(
scalar_divisor_ffi);
}
void cleanup_cuda_integer_unsigned_scalar_div_radix_kb_64(
CudaStreamsFFI streams, int8_t **mem_ptr_void) {
void cleanup_cuda_integer_unsigned_scalar_div_radix_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_unsigned_scalar_div_mem<uint64_t> *mem_ptr =
(int_unsigned_scalar_div_mem<uint64_t> *)(*mem_ptr_void);
@@ -44,7 +44,7 @@ void cleanup_cuda_integer_unsigned_scalar_div_radix_kb_64(
*mem_ptr_void = nullptr;
}
uint64_t scratch_cuda_integer_signed_scalar_div_radix_kb_64(
uint64_t scratch_cuda_integer_signed_scalar_div_radix_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
@@ -59,25 +59,25 @@ uint64_t scratch_cuda_integer_signed_scalar_div_radix_kb_64(
grouping_factor, message_modulus, carry_modulus,
noise_reduction_type);
return scratch_integer_signed_scalar_div_radix_kb<uint64_t>(
return scratch_integer_signed_scalar_div_radix<uint64_t>(
CudaStreams(streams), params,
(int_signed_scalar_div_mem<uint64_t> **)mem_ptr, num_blocks,
scalar_divisor_ffi, allocate_gpu_memory);
}
void cuda_integer_signed_scalar_div_radix_kb_64(
void cuda_integer_signed_scalar_div_radix_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *numerator_ct,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
const CudaScalarDivisorFFI *scalar_divisor_ffi, uint32_t numerator_bits) {
host_integer_signed_scalar_div_radix_kb<uint64_t>(
host_integer_signed_scalar_div_radix<uint64_t>(
CudaStreams(streams), numerator_ct,
(int_signed_scalar_div_mem<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks,
scalar_divisor_ffi, numerator_bits);
}
void cleanup_cuda_integer_signed_scalar_div_radix_kb_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
void cleanup_cuda_integer_signed_scalar_div_radix_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_signed_scalar_div_mem<uint64_t> *mem_ptr =
(int_signed_scalar_div_mem<uint64_t> *)(*mem_ptr_void);
@@ -88,7 +88,7 @@ void cleanup_cuda_integer_signed_scalar_div_radix_kb_64(CudaStreamsFFI streams,
*mem_ptr_void = nullptr;
}
uint64_t scratch_integer_unsigned_scalar_div_rem_radix_kb_64(
uint64_t scratch_integer_unsigned_scalar_div_rem_radix_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
@@ -110,7 +110,7 @@ uint64_t scratch_integer_unsigned_scalar_div_rem_radix_kb_64(
scalar_divisor_ffi, active_bits_divisor, allocate_gpu_memory);
}
void cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
void cuda_integer_unsigned_scalar_div_rem_radix_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *quotient_ct,
CudaRadixCiphertextFFI *remainder_ct, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
@@ -127,7 +127,7 @@ void cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
(uint64_t *)h_clear_blocks, num_clear_blocks);
}
void cleanup_cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
void cleanup_cuda_integer_unsigned_scalar_div_rem_radix_64(
CudaStreamsFFI streams, int8_t **mem_ptr_void) {
int_unsigned_scalar_div_rem_buffer<uint64_t> *mem_ptr =
@@ -139,7 +139,7 @@ void cleanup_cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
*mem_ptr_void = nullptr;
}
uint64_t scratch_integer_signed_scalar_div_rem_radix_kb_64(
uint64_t scratch_integer_signed_scalar_div_rem_radix_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
@@ -161,7 +161,7 @@ uint64_t scratch_integer_signed_scalar_div_rem_radix_kb_64(
scalar_divisor_ffi, active_bits_divisor, allocate_gpu_memory);
}
void cuda_integer_signed_scalar_div_rem_radix_kb_64(
void cuda_integer_signed_scalar_div_rem_radix_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *quotient_ct,
CudaRadixCiphertextFFI *remainder_ct, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
@@ -176,7 +176,7 @@ void cuda_integer_signed_scalar_div_rem_radix_kb_64(
decomposed_divisor, num_scalars_divisor, numerator_bits);
}
void cleanup_cuda_integer_signed_scalar_div_rem_radix_kb_64(
void cleanup_cuda_integer_signed_scalar_div_rem_radix_64(
CudaStreamsFFI streams, int8_t **mem_ptr_void) {
int_signed_scalar_div_rem_buffer<uint64_t> *mem_ptr =


@@ -35,7 +35,7 @@ __host__ void host_integer_unsigned_scalar_div_radix(
}
if (scalar_divisor_ffi->is_divisor_pow2) {
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
host_logical_scalar_shift_inplace<Torus>(
streams, numerator_ct, scalar_divisor_ffi->ilog2_divisor,
mem_ptr->logical_scalar_shift_mem, bsks, ksks,
numerator_ct->num_radix_blocks);
@@ -63,15 +63,15 @@ __host__ void host_integer_unsigned_scalar_div_radix(
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
numerator_cpy, numerator_ct);
host_integer_radix_scalar_mul_high_kb<Torus>(
streams, numerator_cpy, mem_ptr->scalar_mul_high_mem, ksks, bsks,
scalar_divisor_ffi);
host_scalar_mul_high<Torus>(streams, numerator_cpy,
mem_ptr->scalar_mul_high_mem, ksks, bsks,
scalar_divisor_ffi);
host_sub_and_propagate_single_carry<Torus>(
streams, numerator_ct, numerator_cpy, nullptr, nullptr,
mem_ptr->sub_and_propagate_mem, bsks, ksks, FLAG_NONE, (uint32_t)0);
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
host_logical_scalar_shift_inplace<Torus>(
streams, numerator_ct, (uint32_t)1, mem_ptr->logical_scalar_shift_mem,
bsks, ksks, numerator_ct->num_radix_blocks);
@@ -79,7 +79,7 @@ __host__ void host_integer_unsigned_scalar_div_radix(
streams, numerator_ct, numerator_cpy, nullptr, nullptr,
mem_ptr->scp_mem, bsks, ksks, FLAG_NONE, (uint32_t)0);
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
host_logical_scalar_shift_inplace<Torus>(
streams, numerator_ct, scalar_divisor_ffi->shift_post - (uint32_t)1,
mem_ptr->logical_scalar_shift_mem, bsks, ksks,
numerator_ct->num_radix_blocks);
@@ -87,23 +87,23 @@ __host__ void host_integer_unsigned_scalar_div_radix(
return;
}
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
host_logical_scalar_shift_inplace<Torus>(
streams, numerator_ct, scalar_divisor_ffi->shift_pre,
mem_ptr->logical_scalar_shift_mem, bsks, ksks,
numerator_ct->num_radix_blocks);
host_integer_radix_scalar_mul_high_kb<Torus>(streams, numerator_ct,
mem_ptr->scalar_mul_high_mem,
ksks, bsks, scalar_divisor_ffi);
host_scalar_mul_high<Torus>(streams, numerator_ct,
mem_ptr->scalar_mul_high_mem, ksks, bsks,
scalar_divisor_ffi);
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
host_logical_scalar_shift_inplace<Torus>(
streams, numerator_ct, scalar_divisor_ffi->shift_post,
mem_ptr->logical_scalar_shift_mem, bsks, ksks,
numerator_ct->num_radix_blocks);
}
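
The shift_pre / mul-high / subtract / shift_post sequence above follows the classic Granlund-Montgomery division-by-invariant-integer scheme. A hypothetical clear 32-bit analogue for d = 7, with the multiplier and shifts precomputed the way scalar_divisor_ffi would carry them:

#include <cassert>
#include <cstdint>

// Divide by the constant 7 without a division instruction.
// M = ceil(2^35 / 7) - 2^32 = 0x24924925 plays the "chosen multiplier" role.
uint32_t div7(uint32_t n) {
  uint32_t hi = (uint32_t)(((uint64_t)0x24924925 * n) >> 32); // scalar_mul_high
  uint32_t t = (n - hi) >> 1; // subtract, then logical shift by 1
  return (hi + t) >> 2;       // add back, then shift by shift_post - 1
}

int main() {
  for (uint32_t n : {0u, 6u, 7u, 13u, 14u, 98u, 4294967295u})
    assert(div7(n) == n / 7);
  return 0;
}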
template <typename Torus>
__host__ uint64_t scratch_integer_signed_scalar_div_radix_kb(
__host__ uint64_t scratch_integer_signed_scalar_div_radix(
CudaStreams streams, int_radix_params params,
int_signed_scalar_div_mem<Torus> **mem_ptr, uint32_t num_radix_blocks,
const CudaScalarDivisorFFI *scalar_divisor_ffi,
@@ -119,7 +119,7 @@ __host__ uint64_t scratch_integer_signed_scalar_div_radix_kb(
}
template <typename Torus>
__host__ void host_integer_signed_scalar_div_radix_kb(
__host__ void host_integer_signed_scalar_div_radix(
CudaStreams streams, CudaRadixCiphertextFFI *numerator_ct,
int_signed_scalar_div_mem<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
@@ -129,7 +129,7 @@ __host__ void host_integer_signed_scalar_div_radix_kb(
if (scalar_divisor_ffi->is_divisor_negative) {
CudaRadixCiphertextFFI *tmp = mem_ptr->tmp_ffi;
host_integer_radix_negation<Torus>(
host_negation<Torus>(
streams, tmp, numerator_ct, mem_ptr->params.message_modulus,
mem_ptr->params.carry_modulus, numerator_ct->num_radix_blocks);
@@ -152,11 +152,11 @@ __host__ void host_integer_signed_scalar_div_radix_kb(
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
tmp, numerator_ct);
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
host_arithmetic_scalar_shift_inplace<Torus>(
streams, tmp, scalar_divisor_ffi->chosen_multiplier_num_bits - 1,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks);
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
host_logical_scalar_shift_inplace<Torus>(
streams, tmp,
numerator_bits - scalar_divisor_ffi->chosen_multiplier_num_bits,
mem_ptr->logical_scalar_shift_mem, bsks, ksks, tmp->num_radix_blocks);
@@ -165,7 +165,7 @@ __host__ void host_integer_signed_scalar_div_radix_kb(
streams, tmp, numerator_ct, nullptr, nullptr, mem_ptr->scp_mem, bsks,
ksks, FLAG_NONE, (uint32_t)0);
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
host_arithmetic_scalar_shift_inplace<Torus>(
streams, tmp, scalar_divisor_ffi->chosen_multiplier_num_bits,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks);
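
For the power-of-two path, the arithmetic-then-logical shift pair above computes the usual round-toward-zero correction for signed division. A hypothetical clear 32-bit sketch (assumes arithmetic right shift on signed values, k >= 1):

#include <cassert>
#include <cstdint>

int32_t sdiv_pow2(int32_t n, uint32_t k) {    // divide by 2^k, truncating
  int32_t sign = n >> 31;                     // arithmetic shift: 0 or -1
  uint32_t bias = (uint32_t)sign >> (32 - k); // logical shift: 0 or 2^k - 1
  return (n + (int32_t)bias) >> k;            // arithmetic shift by k
}

int main() {
  assert(sdiv_pow2(-7, 1) == -3 && sdiv_pow2(7, 1) == 3);
  assert(sdiv_pow2(-8, 2) == -2 && sdiv_pow2(5, 2) == 1);
  return 0;
}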
@@ -173,11 +173,11 @@ __host__ void host_integer_signed_scalar_div_radix_kb(
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
tmp, numerator_ct);
host_integer_radix_signed_scalar_mul_high_kb<Torus>(
streams, tmp, mem_ptr->scalar_mul_high_mem, ksks, scalar_divisor_ffi,
bsks);
host_signed_scalar_mul_high<Torus>(streams, tmp,
mem_ptr->scalar_mul_high_mem, ksks,
scalar_divisor_ffi, bsks);
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
host_arithmetic_scalar_shift_inplace<Torus>(
streams, tmp, scalar_divisor_ffi->shift_post,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks);
@@ -185,7 +185,7 @@ __host__ void host_integer_signed_scalar_div_radix_kb(
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
xsign, numerator_ct);
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
host_arithmetic_scalar_shift_inplace<Torus>(
streams, xsign, numerator_bits - 1,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks);
@@ -198,15 +198,15 @@ __host__ void host_integer_signed_scalar_div_radix_kb(
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
tmp, numerator_ct);
host_integer_radix_signed_scalar_mul_high_kb<Torus>(
streams, tmp, mem_ptr->scalar_mul_high_mem, ksks, scalar_divisor_ffi,
bsks);
host_signed_scalar_mul_high<Torus>(streams, tmp,
mem_ptr->scalar_mul_high_mem, ksks,
scalar_divisor_ffi, bsks);
host_add_and_propagate_single_carry<Torus>(
streams, tmp, numerator_ct, nullptr, nullptr, mem_ptr->scp_mem, bsks,
ksks, FLAG_NONE, (uint32_t)0);
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
host_arithmetic_scalar_shift_inplace<Torus>(
streams, tmp, scalar_divisor_ffi->shift_post,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks);
@@ -214,7 +214,7 @@ __host__ void host_integer_signed_scalar_div_radix_kb(
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
xsign, numerator_ct);
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
host_arithmetic_scalar_shift_inplace<Torus>(
streams, xsign, numerator_bits - 1,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks);
@@ -224,7 +224,7 @@ __host__ void host_integer_signed_scalar_div_radix_kb(
}
if (scalar_divisor_ffi->is_divisor_negative) {
host_integer_radix_negation<Torus>(
host_negation<Torus>(
streams, numerator_ct, tmp, mem_ptr->params.message_modulus,
mem_ptr->params.carry_modulus, numerator_ct->num_radix_blocks);
} else {
@@ -270,9 +270,9 @@ __host__ void host_integer_unsigned_scalar_div_rem_radix(
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
remainder_ct, numerator_ct);
host_integer_radix_scalar_bitop_kb(
streams, remainder_ct, remainder_ct, clear_blocks, h_clear_blocks,
num_clear_blocks, mem_ptr->bitop_mem, bsks, ksks);
host_scalar_bitop(streams, remainder_ct, remainder_ct, clear_blocks,
h_clear_blocks, num_clear_blocks, mem_ptr->bitop_mem,
bsks, ksks);
} else {
if (!scalar_divisor_ffi->is_divisor_zero) {
@@ -328,9 +328,9 @@ __host__ void host_integer_signed_scalar_div_rem_radix(
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
numerator_ct, quotient_ct);
host_integer_signed_scalar_div_radix_kb(streams, quotient_ct,
mem_ptr->signed_div_mem, bsks, ksks,
scalar_divisor_ffi, numerator_bits);
host_integer_signed_scalar_div_radix(streams, quotient_ct,
mem_ptr->signed_div_mem, bsks, ksks,
scalar_divisor_ffi, numerator_bits);
host_propagate_single_carry<Torus>(streams, quotient_ct, nullptr, nullptr,
mem_ptr->scp_mem, bsks, ksks, FLAG_NONE,
@@ -341,10 +341,10 @@ __host__ void host_integer_signed_scalar_div_rem_radix(
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
remainder_ct, quotient_ct);
host_integer_radix_logical_scalar_shift_kb_inplace(
streams, remainder_ct, scalar_divisor_ffi->ilog2_divisor,
mem_ptr->logical_scalar_shift_mem, bsks, ksks,
remainder_ct->num_radix_blocks);
host_logical_scalar_shift_inplace(streams, remainder_ct,
scalar_divisor_ffi->ilog2_divisor,
mem_ptr->logical_scalar_shift_mem, bsks,
ksks, remainder_ct->num_radix_blocks);
} else if (!scalar_divisor_ffi->is_divisor_zero) {
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),


@@ -1,6 +1,6 @@
#include "integer/scalar_mul.cuh"
uint64_t scratch_cuda_integer_scalar_mul_kb_64(
uint64_t scratch_cuda_integer_scalar_mul_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
@@ -14,12 +14,12 @@ uint64_t scratch_cuda_integer_scalar_mul_kb_64(
grouping_factor, message_modulus, carry_modulus,
noise_reduction_type);
return scratch_cuda_integer_radix_scalar_mul_kb<uint64_t>(
return scratch_cuda_scalar_mul<uint64_t>(
CudaStreams(streams), (int_scalar_mul_buffer<uint64_t> **)mem_ptr,
num_blocks, params, num_scalar_bits, allocate_gpu_memory);
}
void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
void cuda_scalar_multiplication_ciphertext_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
uint64_t const *decomposed_scalar, uint64_t const *has_at_least_one_set,
int8_t *mem, void *const *bsks, void *const *ksks, uint32_t polynomial_size,
@@ -31,8 +31,7 @@ void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
(uint64_t **)(ksks), message_modulus, num_scalars);
}
void cleanup_cuda_integer_radix_scalar_mul(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
void cleanup_cuda_scalar_mul(CudaStreamsFFI streams, int8_t **mem_ptr_void) {
int_scalar_mul_buffer<uint64_t> *mem_ptr =
(int_scalar_mul_buffer<uint64_t> *)(*mem_ptr_void);


@@ -30,10 +30,12 @@ __global__ void device_small_scalar_radix_multiplication(T *output_lwe_array,
}
template <typename T>
__host__ uint64_t scratch_cuda_integer_radix_scalar_mul_kb(
CudaStreams streams, int_scalar_mul_buffer<T> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
uint32_t num_scalar_bits, bool allocate_gpu_memory) {
__host__ uint64_t scratch_cuda_scalar_mul(CudaStreams streams,
int_scalar_mul_buffer<T> **mem_ptr,
uint32_t num_radix_blocks,
int_radix_params params,
uint32_t num_scalar_bits,
bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_scalar_mul_buffer<T>(streams, params, num_radix_blocks,
@@ -67,9 +69,9 @@ __host__ void host_integer_scalar_mul_radix(
copy_radix_ciphertext_slice_async<T>(
streams.stream(0), streams.gpu_index(0), &shift_input, 0,
num_radix_blocks, lwe_array, 0, num_radix_blocks);
host_integer_radix_logical_scalar_shift_kb_inplace<T>(
streams, &shift_input, shift_amount, mem->logical_scalar_shift_buffer,
bsks, ksks, num_radix_blocks);
host_logical_scalar_shift_inplace<T>(streams, &shift_input, shift_amount,
mem->logical_scalar_shift_buffer,
bsks, ksks, num_radix_blocks);
} else {
// create trivial assign for value = 0
set_zero_radix_ciphertext_slice_async<T>(
@@ -111,7 +113,7 @@ __host__ void host_integer_scalar_mul_radix(
streams.gpu_index(0), lwe_array, 0,
num_radix_blocks);
} else {
host_integer_partial_sum_ciphertexts_vec_kb<T>(
host_integer_partial_sum_ciphertexts_vec<T>(
streams, lwe_array, all_shifted_buffer, bsks, ksks,
mem->sum_ciphertexts_vec_mem, num_radix_blocks, j);
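
The loop above is a double-and-add: each set bit of the scalar contributes a shifted copy of the input, and the copies are reduced by the partial-sum routine. A hypothetical clear-integer equivalent:

#include <cassert>
#include <cstdint>

uint64_t scalar_mul(uint64_t x, uint64_t scalar) {
  uint64_t acc = 0;
  for (uint32_t bit = 0; bit < 64; bit++)
    if ((scalar >> bit) & 1)
      acc += x << bit; // "logical scalar shift", then summed
  return acc;
}

int main() {
  assert(scalar_mul(7, 13) == 91);
  return 0;
}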
@@ -166,10 +168,11 @@ __host__ void host_integer_small_scalar_mul_radix(
}
template <typename Torus>
__host__ void host_integer_radix_scalar_mul_high_kb(
CudaStreams streams, CudaRadixCiphertextFFI *ct,
int_scalar_mul_high_buffer<Torus> *mem_ptr, Torus *const *ksks,
void *const *bsks, const CudaScalarDivisorFFI *scalar_divisor_ffi) {
__host__ void
host_scalar_mul_high(CudaStreams streams, CudaRadixCiphertextFFI *ct,
int_scalar_mul_high_buffer<Torus> *mem_ptr,
Torus *const *ksks, void *const *bsks,
const CudaScalarDivisorFFI *scalar_divisor_ffi) {
if (scalar_divisor_ffi->is_chosen_multiplier_zero) {
set_zero_radix_ciphertext_slice_async<Torus>(
@@ -186,7 +189,7 @@ __host__ void host_integer_radix_scalar_mul_high_kb(
tmp_ffi->num_radix_blocks != 0) {
if (scalar_divisor_ffi->is_chosen_multiplier_pow2) {
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
host_logical_scalar_shift_inplace<Torus>(
streams, tmp_ffi, scalar_divisor_ffi->ilog2_chosen_multiplier,
mem_ptr->logical_scalar_shift_mem, bsks, (uint64_t **)ksks,
tmp_ffi->num_radix_blocks);
@@ -205,7 +208,7 @@ __host__ void host_integer_radix_scalar_mul_high_kb(
}
template <typename Torus>
__host__ void host_integer_radix_signed_scalar_mul_high_kb(
__host__ void host_signed_scalar_mul_high(
CudaStreams streams, CudaRadixCiphertextFFI *ct,
int_signed_scalar_mul_high_buffer<Torus> *mem_ptr, Torus *const *ksks,
const CudaScalarDivisorFFI *scalar_divisor_ffi, void *const *bsks) {
@@ -227,7 +230,7 @@ __host__ void host_integer_radix_signed_scalar_mul_high_kb(
tmp_ffi->num_radix_blocks != 0) {
if (scalar_divisor_ffi->is_chosen_multiplier_pow2) {
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
host_logical_scalar_shift_inplace<Torus>(
streams, tmp_ffi, scalar_divisor_ffi->ilog2_chosen_multiplier,
mem_ptr->logical_scalar_shift_mem, bsks, (uint64_t **)ksks,
tmp_ffi->num_radix_blocks);


@@ -1,6 +1,6 @@
#include "scalar_rotate.cuh"
uint64_t scratch_cuda_integer_radix_scalar_rotate_kb_64(
uint64_t scratch_cuda_scalar_rotate_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -14,24 +14,24 @@ uint64_t scratch_cuda_integer_radix_scalar_rotate_kb_64(
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_integer_radix_scalar_rotate_kb<uint64_t>(
return scratch_cuda_scalar_rotate<uint64_t>(
CudaStreams(streams),
(int_logical_scalar_shift_buffer<uint64_t> **)mem_ptr, num_blocks, params,
shift_type, allocate_gpu_memory);
}
void cuda_integer_radix_scalar_rotate_kb_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array, uint32_t n,
int8_t *mem_ptr, void *const *bsks, void *const *ksks) {
void cuda_scalar_rotate_64_inplace(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *lwe_array,
uint32_t n, int8_t *mem_ptr,
void *const *bsks, void *const *ksks) {
host_integer_radix_scalar_rotate_kb_inplace<uint64_t>(
host_scalar_rotate_inplace<uint64_t>(
CudaStreams(streams), lwe_array, n,
(int_logical_scalar_shift_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks));
}
void cleanup_cuda_integer_radix_scalar_rotate(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
void cleanup_cuda_scalar_rotate(CudaStreamsFFI streams, int8_t **mem_ptr_void) {
int_logical_scalar_shift_buffer<uint64_t> *mem_ptr =
(int_logical_scalar_shift_buffer<uint64_t> *)(*mem_ptr_void);


@@ -9,7 +9,7 @@
#include "pbs/programmable_bootstrap_multibit.cuh"
template <typename Torus>
__host__ uint64_t scratch_cuda_integer_radix_scalar_rotate_kb(
__host__ uint64_t scratch_cuda_scalar_rotate(
CudaStreams streams, int_logical_scalar_shift_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory) {
@@ -22,10 +22,11 @@ __host__ uint64_t scratch_cuda_integer_radix_scalar_rotate_kb(
}
template <typename Torus>
__host__ void host_integer_radix_scalar_rotate_kb_inplace(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array, uint32_t n,
int_logical_scalar_shift_buffer<Torus> *mem, void *const *bsks,
Torus *const *ksks) {
__host__ void
host_scalar_rotate_inplace(CudaStreams streams,
CudaRadixCiphertextFFI *lwe_array, uint32_t n,
int_logical_scalar_shift_buffer<Torus> *mem,
void *const *bsks, Torus *const *ksks) {
auto num_blocks = lwe_array->num_radix_blocks;
auto params = mem->params;
@@ -68,7 +69,7 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
integer_radix_apply_bivariate_lookup_table<Torus>(
streams, lwe_array, receiver_blocks, giver_blocks, bsks, ksks,
lut_bivariate, num_blocks, lut_bivariate->params.message_modulus);
@@ -92,7 +93,7 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
integer_radix_apply_bivariate_lookup_table<Torus>(
streams, lwe_array, receiver_blocks, giver_blocks, bsks, ksks,
lut_bivariate, num_blocks, lut_bivariate->params.message_modulus);
}


@@ -1,6 +1,6 @@
#include "scalar_shifts.cuh"
uint64_t scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
uint64_t scratch_cuda_logical_scalar_shift_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -14,7 +14,7 @@ uint64_t scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_integer_radix_logical_scalar_shift_kb<uint64_t>(
return scratch_cuda_logical_scalar_shift<uint64_t>(
CudaStreams(streams),
(int_logical_scalar_shift_buffer<uint64_t> **)mem_ptr, num_blocks, params,
shift_type, allocate_gpu_memory);
@@ -24,17 +24,19 @@ uint64_t scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
/// for the left scalar shift. It consists of a rotation, followed by
/// the application of a PBS onto the rotated blocks up to num_blocks -
/// rotations - 1. The remaining blocks are padded with zeros.
void cuda_integer_radix_logical_scalar_shift_kb_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
int8_t *mem_ptr, void *const *bsks, void *const *ksks) {
void cuda_logical_scalar_shift_64_inplace(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *lwe_array,
uint32_t shift, int8_t *mem_ptr,
void *const *bsks,
void *const *ksks) {
host_integer_radix_logical_scalar_shift_kb_inplace<uint64_t>(
host_logical_scalar_shift_inplace<uint64_t>(
CudaStreams(streams), lwe_array, shift,
(int_logical_scalar_shift_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks), lwe_array->num_radix_blocks);
}
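
A hypothetical clear sketch of the decomposition the doc comment describes, assuming 2-bit blocks (message_modulus = 4): whole-block rotations first, then a sub-block shift combining each block with its neighbour, with zero padding for the vacated blocks.

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  const uint32_t bits = 2;                     // log2(message_modulus)
  std::vector<uint32_t> blocks = {3, 1, 2, 0}; // LSB first: value 0b00100111
  uint32_t shift = 3;
  uint32_t rotations = shift / bits; // 1 whole block
  uint32_t sub = shift % bits;       // 1 bit within a block
  std::vector<uint32_t> out(blocks.size(), 0);
  for (size_t i = rotations; i < blocks.size(); i++) {
    uint32_t cur = blocks[i - rotations];
    uint32_t prev = (i > rotations) ? blocks[i - rotations - 1] : 0;
    // stands in for the bivariate PBS combining (cur, prev)
    out[i] = ((cur << sub) | (prev >> (bits - sub))) & 3;
  }
  // blocks [0, rotations) stay zero, like the trivially padded blocks
  assert(out == std::vector<uint32_t>({0, 2, 3, 0})); // 39 << 3, kept mod 2^8
  return 0;
}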
uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
uint64_t scratch_cuda_arithmetic_scalar_shift_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -48,7 +50,7 @@ uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_integer_radix_arithmetic_scalar_shift_kb<uint64_t>(
return scratch_cuda_arithmetic_scalar_shift<uint64_t>(
CudaStreams(streams),
(int_arithmetic_scalar_shift_buffer<uint64_t> **)mem_ptr, num_blocks,
params, shift_type, allocate_gpu_memory);
@@ -61,18 +63,20 @@ uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
/// sign block, and a second PBS is also applied to it to compute the padding
/// block, which is copied onto all remaining blocks instead of padding with
/// zeros as would be done in the logical shift.
void cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
int8_t *mem_ptr, void *const *bsks, void *const *ksks) {
void cuda_arithmetic_scalar_shift_64_inplace(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *lwe_array,
uint32_t shift, int8_t *mem_ptr,
void *const *bsks,
void *const *ksks) {
host_integer_radix_arithmetic_scalar_shift_kb_inplace<uint64_t>(
host_arithmetic_scalar_shift_inplace<uint64_t>(
CudaStreams(streams), lwe_array, shift,
(int_arithmetic_scalar_shift_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks));
}
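
The difference from the logical shift is only in what fills the vacated high blocks; a hypothetical clear 8-bit illustration (assumes arithmetic right shift on signed values):

#include <cassert>
#include <cstdint>

int main() {
  int8_t x = -96;                   // 0b10100000
  int8_t logical = (uint8_t)x >> 3; // 0b00010100 = 20: zeros shifted in
  int8_t arithmetic = x >> 3;       // 0b11110100 = -12: sign "padding block"
  assert(logical == 20 && arithmetic == -12);
  return 0;
}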
void cleanup_cuda_integer_radix_logical_scalar_shift(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
void cleanup_cuda_logical_scalar_shift(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_logical_scalar_shift_buffer<uint64_t> *mem_ptr =
(int_logical_scalar_shift_buffer<uint64_t> *)(*mem_ptr_void);
@@ -82,8 +86,8 @@ void cleanup_cuda_integer_radix_logical_scalar_shift(CudaStreamsFFI streams,
*mem_ptr_void = nullptr;
}
void cleanup_cuda_integer_radix_arithmetic_scalar_shift(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
void cleanup_cuda_arithmetic_scalar_shift(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_arithmetic_scalar_shift_buffer<uint64_t> *mem_ptr =
(int_arithmetic_scalar_shift_buffer<uint64_t> *)(*mem_ptr_void);


@@ -10,7 +10,7 @@
#include "pbs/programmable_bootstrap_multibit.cuh"
template <typename Torus>
__host__ uint64_t scratch_cuda_integer_radix_logical_scalar_shift_kb(
__host__ uint64_t scratch_cuda_logical_scalar_shift(
CudaStreams streams, int_logical_scalar_shift_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory) {
@@ -23,7 +23,7 @@ __host__ uint64_t scratch_cuda_integer_radix_logical_scalar_shift_kb(
}
template <typename Torus>
__host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
__host__ void host_logical_scalar_shift_inplace(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
int_logical_scalar_shift_buffer<Torus> *mem, void *const *bsks,
Torus *const *ksks, uint32_t num_blocks) {
@@ -75,7 +75,7 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
size_t partial_block_count = num_blocks - rotations;
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
integer_radix_apply_bivariate_lookup_table<Torus>(
streams, &partial_current_blocks, &partial_current_blocks,
&partial_previous_blocks, bsks, ksks, lut_bivariate,
partial_block_count, lut_bivariate->params.message_modulus);
@@ -106,7 +106,7 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
size_t partial_block_count = num_blocks - rotations;
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
integer_radix_apply_bivariate_lookup_table<Torus>(
streams, partial_current_blocks, partial_current_blocks,
&partial_next_blocks, bsks, ksks, lut_bivariate, partial_block_count,
lut_bivariate->params.message_modulus);
@@ -114,7 +114,7 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
}
template <typename Torus>
__host__ uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb(
__host__ uint64_t scratch_cuda_arithmetic_scalar_shift(
CudaStreams streams, int_arithmetic_scalar_shift_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory) {
@@ -127,7 +127,7 @@ __host__ uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb(
}
template <typename Torus>
__host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
__host__ void host_arithmetic_scalar_shift_inplace(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
int_arithmetic_scalar_shift_buffer<Torus> *mem, void *const *bsks,
Torus *const *ksks) {
@@ -197,7 +197,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
size_t partial_block_count = num_blocks - rotations;
auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
integer_radix_apply_bivariate_lookup_table<Torus>(
streams, partial_current_blocks, partial_current_blocks,
&partial_next_blocks, bsks, ksks, lut_bivariate,
partial_block_count, lut_bivariate->params.message_modulus);
@@ -207,7 +207,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
streams.synchronize();
auto lut_univariate_padding_block =
mem->lut_buffers_univariate[num_bits_in_block - 1];
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
mem->local_streams_1, &padding_block, &last_block_copy, bsks, ksks,
lut_univariate_padding_block, 1);
// Replace blocks 'pulled' from the left with the correct padding
@@ -221,7 +221,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
if (shift_within_block != 0) {
auto lut_univariate_shift_last_block =
mem->lut_buffers_univariate[shift_within_block - 1];
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
mem->local_streams_2, &last_block, &last_block_copy, bsks, ksks,
lut_univariate_shift_last_block, 1);
}


@@ -1,6 +1,6 @@
#include "shift_and_rotate.cuh"
uint64_t scratch_cuda_integer_radix_shift_and_rotate_kb_64(
uint64_t scratch_cuda_shift_and_rotate_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -14,24 +14,25 @@ uint64_t scratch_cuda_integer_radix_shift_and_rotate_kb_64(
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_integer_radix_shift_and_rotate_kb<uint64_t>(
return scratch_cuda_shift_and_rotate<uint64_t>(
CudaStreams(streams), (int_shift_and_rotate_buffer<uint64_t> **)mem_ptr,
num_blocks, params, shift_type, is_signed, allocate_gpu_memory);
}
void cuda_integer_radix_shift_and_rotate_kb_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
CudaRadixCiphertextFFI const *lwe_shift, int8_t *mem_ptr, void *const *bsks,
void *const *ksks) {
void cuda_shift_and_rotate_64_inplace(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *lwe_array,
CudaRadixCiphertextFFI const *lwe_shift,
int8_t *mem_ptr, void *const *bsks,
void *const *ksks) {
host_integer_radix_shift_and_rotate_kb_inplace<uint64_t>(
host_shift_and_rotate_inplace<uint64_t>(
CudaStreams(streams), lwe_array, lwe_shift,
(int_shift_and_rotate_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks));
}
void cleanup_cuda_integer_radix_shift_and_rotate(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
void cleanup_cuda_shift_and_rotate(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_shift_and_rotate_buffer<uint64_t> *mem_ptr =
(int_shift_and_rotate_buffer<uint64_t> *)(*mem_ptr_void);
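
Note: the renamed C API keeps the usual three-phase contract: a scratch_* call that allocates the buffer behind mem_ptr, the in-place compute call, and a cleanup_* call. A hedged Rust-side sketch of that lifecycle, assuming the generated bindings and valid device handles are in scope (the scratch call's full parameter list is elided in this diff, so it is only named in a comment, and this snippet is not standalone-runnable):

unsafe {
    let mut mem: *mut i8 = std::ptr::null_mut();
    // scratch_cuda_shift_and_rotate_64(streams, &mut mem, /* dimensions, levels, ... */)
    // allocates the temporaries behind `mem` and reports their size.
    cuda_shift_and_rotate_64_inplace(streams, lwe_array, lwe_shift, mem, bsks, ksks);
    cleanup_cuda_shift_and_rotate(streams, &mut mem);
}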

View File

@@ -11,7 +11,7 @@
#include "scalar_mul.cuh"
template <typename Torus>
__host__ uint64_t scratch_cuda_integer_radix_shift_and_rotate_kb(
__host__ uint64_t scratch_cuda_shift_and_rotate(
CudaStreams streams, int_shift_and_rotate_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
SHIFT_OR_ROTATE_TYPE shift_type, bool is_signed, bool allocate_gpu_memory) {
@@ -23,11 +23,12 @@ __host__ uint64_t scratch_cuda_integer_radix_shift_and_rotate_kb(
}
template <typename Torus>
__host__ void host_integer_radix_shift_and_rotate_kb_inplace(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array,
CudaRadixCiphertextFFI const *lwe_shift,
int_shift_and_rotate_buffer<Torus> *mem, void *const *bsks,
Torus *const *ksks) {
__host__ void
host_shift_and_rotate_inplace(CudaStreams streams,
CudaRadixCiphertextFFI *lwe_array,
CudaRadixCiphertextFFI const *lwe_shift,
int_shift_and_rotate_buffer<Torus> *mem,
void *const *bsks, Torus *const *ksks) {
cuda_set_device(streams.gpu_index(0));
if (lwe_array->num_radix_blocks != lwe_shift->num_radix_blocks)
@@ -158,7 +159,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
// we have
// control_bit|b|a
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, input_bits_a, mux_inputs, bsks, ksks, mux_lut, total_nb_bits);
}
@@ -190,7 +191,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
// To give back a clean ciphertext
auto cleaning_lut = mem->cleaning_lut;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, lwe_array, lwe_array, bsks, ksks, cleaning_lut,
num_radix_blocks);
}
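
Note: the mux step above packs three bits per LUT input, control_bit|b|a, and selects b when the control bit is set. A plain-integer model of that table (illustrative only):

fn mux_lut(packed: u8) -> u8 {
    let a = packed & 1;
    let b = (packed >> 1) & 1;
    let control = (packed >> 2) & 1;
    if control == 1 { b } else { a }
}

fn main() {
    assert_eq!(mux_lut(0b101), 0); // control=1 selects b=0
    assert_eq!(mux_lut(0b110), 1); // control=1 selects b=1
    assert_eq!(mux_lut(0b001), 1); // control=0 selects a=1
}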

View File

@@ -1,6 +1,6 @@
#include "subtraction.cuh"
uint64_t scratch_cuda_sub_and_propagate_single_carry_kb_64_inplace(
uint64_t scratch_cuda_sub_and_propagate_single_carry_64_inplace(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -19,7 +19,7 @@ uint64_t scratch_cuda_sub_and_propagate_single_carry_kb_64_inplace(
num_blocks, params, requested_flag, allocate_gpu_memory);
}
void cuda_sub_and_propagate_single_carry_kb_64_inplace(
void cuda_sub_and_propagate_single_carry_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lhs_array,
const CudaRadixCiphertextFFI *rhs_array, CudaRadixCiphertextFFI *carry_out,
const CudaRadixCiphertextFFI *carry_in, int8_t *mem_ptr, void *const *bsks,

View File

@@ -36,9 +36,9 @@ void host_sub_and_propagate_single_carry(
int_sub_and_propagate<Torus> *mem, void *const *bsks, Torus *const *ksks,
uint32_t requested_flag, uint32_t uses_carry) {
host_integer_radix_negation<Torus>(
streams, mem->neg_rhs_array, rhs_array, mem->params.message_modulus,
mem->params.carry_modulus, mem->neg_rhs_array->num_radix_blocks);
host_negation<Torus>(streams, mem->neg_rhs_array, rhs_array,
mem->params.message_modulus, mem->params.carry_modulus,
mem->neg_rhs_array->num_radix_blocks);
host_add_and_propagate_single_carry<Torus>(
streams, lhs_array, mem->neg_rhs_array, carry_out, input_carries,
@@ -46,11 +46,12 @@ void host_sub_and_propagate_single_carry(
}
template <typename Torus>
__host__ void host_integer_radix_subtraction(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in_1,
CudaRadixCiphertextFFI const *lwe_array_in_2, uint64_t message_modulus,
uint64_t carry_modulus, uint32_t num_radix_blocks) {
__host__ void host_subtraction(CudaStreams streams,
CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in_1,
CudaRadixCiphertextFFI const *lwe_array_in_2,
uint64_t message_modulus, uint64_t carry_modulus,
uint32_t num_radix_blocks) {
cuda_set_device(streams.gpu_index(0));
if (lwe_array_out->num_radix_blocks < num_radix_blocks ||
@@ -64,16 +65,15 @@ __host__ void host_integer_radix_subtraction(
PANIC("Cuda error: lwe_array_in and lwe_array_out lwe_dimension must be "
"the same")
host_integer_radix_negation<Torus>(streams, lwe_array_out, lwe_array_in_2,
message_modulus, carry_modulus,
num_radix_blocks);
host_negation<Torus>(streams, lwe_array_out, lwe_array_in_2, message_modulus,
carry_modulus, num_radix_blocks);
host_addition<Torus>(streams.stream(0), streams.gpu_index(0), lwe_array_out,
lwe_array_out, lwe_array_in_1, num_radix_blocks,
message_modulus, carry_modulus);
}
template <typename Torus>
__host__ uint64_t scratch_cuda_integer_overflowing_sub_kb(
__host__ uint64_t scratch_cuda_integer_overflowing_sub(
CudaStreams streams, int_overflowing_sub_memory<Torus> **mem_ptr,
uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
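
Note: both subtraction paths above reduce to the same identity: negate the right operand, then add. A plain modular-arithmetic model (no FHE, illustrative only):

fn negate(x: u64, m: u64) -> u64 {
    (m - x % m) % m
}

fn subtract(a: u64, b: u64, m: u64) -> u64 {
    (a % m + negate(b, m)) % m // a - b ≡ a + (-b) (mod m)
}

fn main() {
    let m = 1 << 4;
    assert_eq!(subtract(9, 13, m), 12); // 9 - 13 ≡ -4 ≡ 12 (mod 16)
}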

View File

@@ -94,7 +94,7 @@ __host__ void host_expand_without_verification(
into_radix_ciphertext(output, lwe_array_out, 2 * num_lwes, lwe_dimension);
auto input = new CudaRadixCiphertextFFI;
into_radix_ciphertext(input, lwe_array_input, 2 * num_lwes, lwe_dimension);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, output, input, bsks, ksks, message_and_carry_extract_luts,
2 * num_lwes);
}
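
Note: host_expand_without_verification doubles the LWE count because each expanded block goes through two tables, one extracting the message and one the carry. In shortint encoding terms (a fact about the encoding, not something introduced by this commit):

fn message(x: u64, msg_mod: u64) -> u64 { x % msg_mod }
fn carry(x: u64, msg_mod: u64) -> u64 { x / msg_mod }

fn main() {
    let msg_mod = 4; // 2-bit message space
    assert_eq!(message(13, msg_mod), 1);
    assert_eq!(carry(13, msg_mod), 3);
}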

View File

@@ -302,7 +302,7 @@ const _: () = {
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, polynomial_size) - 24usize];
};
unsafe extern "C" {
pub fn scratch_cuda_apply_univariate_lut_kb_64(
pub fn scratch_cuda_apply_univariate_lut_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
input_lut: *const ffi::c_void,
@@ -324,7 +324,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn scratch_cuda_apply_many_univariate_lut_kb_64(
pub fn scratch_cuda_apply_many_univariate_lut_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
input_lut: *const ffi::c_void,
@@ -347,7 +347,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_apply_univariate_lut_kb_64(
pub fn cuda_apply_univariate_lut_64(
streams: CudaStreamsFFI,
output_radix_lwe: *mut CudaRadixCiphertextFFI,
input_radix_lwe: *const CudaRadixCiphertextFFI,
@@ -357,13 +357,13 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn cleanup_cuda_apply_univariate_lut_kb_64(
pub fn cleanup_cuda_apply_univariate_lut_64(
streams: CudaStreamsFFI,
mem_ptr_void: *mut *mut i8,
);
}
unsafe extern "C" {
pub fn scratch_cuda_apply_bivariate_lut_kb_64(
pub fn scratch_cuda_apply_bivariate_lut_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
input_lut: *const ffi::c_void,
@@ -385,7 +385,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_apply_bivariate_lut_kb_64(
pub fn cuda_apply_bivariate_lut_64(
streams: CudaStreamsFFI,
output_radix_lwe: *mut CudaRadixCiphertextFFI,
input_radix_lwe_1: *const CudaRadixCiphertextFFI,
@@ -398,13 +398,10 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn cleanup_cuda_apply_bivariate_lut_kb_64(
streams: CudaStreamsFFI,
mem_ptr_void: *mut *mut i8,
);
pub fn cleanup_cuda_apply_bivariate_lut_64(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
}
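
Note: the pattern across all of these bindings is mechanical: the _kb infix (historically marking the keyswitch-then-bootstrap variant) is dropped while the _64 width suffix stays. A self-checking sample of the mapping, with names taken from this diff:

const RENAMES: &[(&str, &str)] = &[
    ("scratch_cuda_apply_univariate_lut_kb_64", "scratch_cuda_apply_univariate_lut_64"),
    ("cuda_apply_bivariate_lut_kb_64", "cuda_apply_bivariate_lut_64"),
    ("cleanup_cuda_apply_bivariate_lut_kb_64", "cleanup_cuda_apply_bivariate_lut_64"),
];

fn main() {
    for (old, new) in RENAMES {
        assert_eq!(old.replace("_kb_64", "_64"), *new); // drop the `_kb` infix
    }
}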
unsafe extern "C" {
pub fn cuda_apply_many_univariate_lut_kb_64(
pub fn cuda_apply_many_univariate_lut_64(
streams: CudaStreamsFFI,
output_radix_lwe: *mut CudaRadixCiphertextFFI,
input_radix_lwe: *const CudaRadixCiphertextFFI,
@@ -448,7 +445,7 @@ unsafe extern "C" {
pub fn cleanup_cuda_full_propagation(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
}
unsafe extern "C" {
pub fn scratch_cuda_integer_mult_radix_ciphertext_kb_64(
pub fn scratch_cuda_integer_mult_radix_ciphertext_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
is_boolean_left: bool,
@@ -470,7 +467,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_mult_radix_ciphertext_kb_64(
pub fn cuda_integer_mult_radix_ciphertext_64(
streams: CudaStreamsFFI,
radix_lwe_out: *mut CudaRadixCiphertextFFI,
radix_lwe_left: *const CudaRadixCiphertextFFI,
@@ -488,7 +485,7 @@ unsafe extern "C" {
pub fn cleanup_cuda_integer_mult(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
}
unsafe extern "C" {
pub fn cuda_negate_integer_radix_ciphertext_64(
pub fn cuda_negate_ciphertext_64(
streams: CudaStreamsFFI,
lwe_array_out: *mut CudaRadixCiphertextFFI,
lwe_array_in: *const CudaRadixCiphertextFFI,
@@ -498,7 +495,7 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
pub fn cuda_scalar_addition_ciphertext_64_inplace(
streams: CudaStreamsFFI,
lwe_array: *mut CudaRadixCiphertextFFI,
scalar_input: *const ffi::c_void,
@@ -509,7 +506,7 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
pub fn scratch_cuda_logical_scalar_shift_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
@@ -531,7 +528,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_radix_logical_scalar_shift_kb_64_inplace(
pub fn cuda_logical_scalar_shift_64_inplace(
streams: CudaStreamsFFI,
lwe_array: *mut CudaRadixCiphertextFFI,
shift: u32,
@@ -541,7 +538,7 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
pub fn scratch_cuda_arithmetic_scalar_shift_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
@@ -563,7 +560,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace(
pub fn cuda_arithmetic_scalar_shift_64_inplace(
streams: CudaStreamsFFI,
lwe_array: *mut CudaRadixCiphertextFFI,
shift: u32,
@@ -573,19 +570,16 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn cleanup_cuda_integer_radix_logical_scalar_shift(
pub fn cleanup_cuda_logical_scalar_shift(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
}
unsafe extern "C" {
pub fn cleanup_cuda_arithmetic_scalar_shift(
streams: CudaStreamsFFI,
mem_ptr_void: *mut *mut i8,
);
}
unsafe extern "C" {
pub fn cleanup_cuda_integer_radix_arithmetic_scalar_shift(
streams: CudaStreamsFFI,
mem_ptr_void: *mut *mut i8,
);
}
unsafe extern "C" {
pub fn scratch_cuda_integer_radix_shift_and_rotate_kb_64(
pub fn scratch_cuda_shift_and_rotate_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
@@ -608,7 +602,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_radix_shift_and_rotate_kb_64_inplace(
pub fn cuda_shift_and_rotate_64_inplace(
streams: CudaStreamsFFI,
lwe_array: *mut CudaRadixCiphertextFFI,
lwe_shift: *const CudaRadixCiphertextFFI,
@@ -618,13 +612,10 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn cleanup_cuda_integer_radix_shift_and_rotate(
streams: CudaStreamsFFI,
mem_ptr_void: *mut *mut i8,
);
pub fn cleanup_cuda_shift_and_rotate(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
}
unsafe extern "C" {
pub fn scratch_cuda_integer_radix_comparison_kb_64(
pub fn scratch_cuda_comparison_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
@@ -647,7 +638,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_comparison_integer_radix_ciphertext_kb_64(
pub fn cuda_comparison_ciphertext_64(
streams: CudaStreamsFFI,
lwe_array_out: *mut CudaRadixCiphertextFFI,
lwe_array_1: *const CudaRadixCiphertextFFI,
@@ -658,7 +649,7 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
pub fn cuda_scalar_comparison_ciphertext_64(
streams: CudaStreamsFFI,
lwe_array_out: *mut CudaRadixCiphertextFFI,
lwe_array_in: *const CudaRadixCiphertextFFI,
@@ -674,7 +665,7 @@ unsafe extern "C" {
pub fn cleanup_cuda_integer_comparison(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
}
unsafe extern "C" {
pub fn scratch_cuda_integer_radix_bitop_kb_64(
pub fn scratch_cuda_bitop_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
@@ -696,7 +687,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_bitop_integer_radix_ciphertext_kb_64(
pub fn cuda_bitop_ciphertext_64(
streams: CudaStreamsFFI,
lwe_array_out: *mut CudaRadixCiphertextFFI,
lwe_array_1: *const CudaRadixCiphertextFFI,
@@ -707,7 +698,7 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
pub fn cuda_scalar_bitop_ciphertext_64(
streams: CudaStreamsFFI,
lwe_array_out: *mut CudaRadixCiphertextFFI,
lwe_array_input: *const CudaRadixCiphertextFFI,
@@ -723,7 +714,7 @@ unsafe extern "C" {
pub fn cleanup_cuda_integer_bitop(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
}
unsafe extern "C" {
pub fn scratch_cuda_integer_radix_cmux_kb_64(
pub fn scratch_cuda_cmux_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
@@ -744,7 +735,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_cmux_integer_radix_ciphertext_kb_64(
pub fn cuda_cmux_ciphertext_64(
streams: CudaStreamsFFI,
lwe_array_out: *mut CudaRadixCiphertextFFI,
lwe_condition: *const CudaRadixCiphertextFFI,
@@ -756,10 +747,10 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn cleanup_cuda_integer_radix_cmux(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
pub fn cleanup_cuda_cmux(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
}
unsafe extern "C" {
pub fn scratch_cuda_integer_radix_scalar_rotate_kb_64(
pub fn scratch_cuda_scalar_rotate_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
@@ -781,7 +772,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_radix_scalar_rotate_kb_64_inplace(
pub fn cuda_scalar_rotate_64_inplace(
streams: CudaStreamsFFI,
lwe_array: *mut CudaRadixCiphertextFFI,
n: u32,
@@ -791,13 +782,10 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn cleanup_cuda_integer_radix_scalar_rotate(
streams: CudaStreamsFFI,
mem_ptr_void: *mut *mut i8,
);
pub fn cleanup_cuda_scalar_rotate(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
}
unsafe extern "C" {
pub fn scratch_cuda_propagate_single_carry_kb_64_inplace(
pub fn scratch_cuda_propagate_single_carry_64_inplace(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
@@ -819,7 +807,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
pub fn scratch_cuda_add_and_propagate_single_carry_64_inplace(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
@@ -841,7 +829,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_propagate_single_carry_kb_64_inplace(
pub fn cuda_propagate_single_carry_64_inplace(
streams: CudaStreamsFFI,
lwe_array: *mut CudaRadixCiphertextFFI,
carry_out: *mut CudaRadixCiphertextFFI,
@@ -854,7 +842,7 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn cuda_add_and_propagate_single_carry_kb_64_inplace(
pub fn cuda_add_and_propagate_single_carry_64_inplace(
streams: CudaStreamsFFI,
lhs_array: *mut CudaRadixCiphertextFFI,
rhs_array: *const CudaRadixCiphertextFFI,
@@ -877,7 +865,7 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn scratch_cuda_integer_overflowing_sub_kb_64_inplace(
pub fn scratch_cuda_integer_overflowing_sub_64_inplace(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
@@ -899,7 +887,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_overflowing_sub_kb_64_inplace(
pub fn cuda_integer_overflowing_sub_64_inplace(
streams: CudaStreamsFFI,
lhs_array: *mut CudaRadixCiphertextFFI,
rhs_array: *const CudaRadixCiphertextFFI,
@@ -919,7 +907,7 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
pub fn scratch_cuda_partial_sum_ciphertexts_vec_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
@@ -941,7 +929,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
pub fn cuda_partial_sum_ciphertexts_vec_64(
streams: CudaStreamsFFI,
radix_lwe_out: *mut CudaRadixCiphertextFFI,
radix_lwe_vec: *mut CudaRadixCiphertextFFI,
@@ -951,13 +939,13 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
pub fn cleanup_cuda_partial_sum_ciphertexts_vec(
streams: CudaStreamsFFI,
mem_ptr_void: *mut *mut i8,
);
}
unsafe extern "C" {
pub fn scratch_cuda_integer_scalar_mul_kb_64(
pub fn scratch_cuda_integer_scalar_mul_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
@@ -978,7 +966,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
pub fn cuda_scalar_multiplication_ciphertext_64_inplace(
streams: CudaStreamsFFI,
lwe_array: *mut CudaRadixCiphertextFFI,
decomposed_scalar: *const u64,
@@ -992,13 +980,10 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn cleanup_cuda_integer_radix_scalar_mul(
streams: CudaStreamsFFI,
mem_ptr_void: *mut *mut i8,
);
pub fn cleanup_cuda_scalar_mul(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
}
unsafe extern "C" {
pub fn scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
pub fn scratch_cuda_integer_div_rem_radix_ciphertext_64(
streams: CudaStreamsFFI,
is_signed: bool,
mem_ptr: *mut *mut i8,
@@ -1020,7 +1005,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_div_rem_radix_ciphertext_kb_64(
pub fn cuda_integer_div_rem_radix_ciphertext_64(
streams: CudaStreamsFFI,
quotient: *mut CudaRadixCiphertextFFI,
remainder: *mut CudaRadixCiphertextFFI,
@@ -1081,7 +1066,7 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
pub fn scratch_cuda_integer_abs_inplace_radix_ciphertext_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
is_signed: bool,
@@ -1103,7 +1088,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_abs_inplace_radix_ciphertext_kb_64(
pub fn cuda_integer_abs_inplace_radix_ciphertext_64(
streams: CudaStreamsFFI,
ct: *mut CudaRadixCiphertextFFI,
mem_ptr: *mut i8,
@@ -1116,7 +1101,7 @@ unsafe extern "C" {
pub fn cleanup_cuda_integer_abs_inplace(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
}
unsafe extern "C" {
pub fn scratch_cuda_integer_are_all_comparisons_block_true_kb_64(
pub fn scratch_cuda_integer_are_all_comparisons_block_true_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
@@ -1137,7 +1122,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_are_all_comparisons_block_true_kb_64(
pub fn cuda_integer_are_all_comparisons_block_true_64(
streams: CudaStreamsFFI,
lwe_array_out: *mut CudaRadixCiphertextFFI,
lwe_array_in: *const CudaRadixCiphertextFFI,
@@ -1154,7 +1139,7 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
pub fn scratch_cuda_integer_is_at_least_one_comparisons_block_true_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
@@ -1175,7 +1160,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
pub fn cuda_integer_is_at_least_one_comparisons_block_true_64(
streams: CudaStreamsFFI,
lwe_array_out: *mut CudaRadixCiphertextFFI,
lwe_array_in: *const CudaRadixCiphertextFFI,
@@ -1206,7 +1191,7 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn scratch_cuda_apply_noise_squashing_kb(
pub fn scratch_cuda_apply_noise_squashing(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
lwe_dimension: u32,
@@ -1229,7 +1214,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_apply_noise_squashing_kb(
pub fn cuda_apply_noise_squashing(
streams: CudaStreamsFFI,
output_radix_lwe: *mut CudaRadixCiphertextFFI,
input_radix_lwe: *const CudaRadixCiphertextFFI,
@@ -1239,13 +1224,10 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn cleanup_cuda_apply_noise_squashing_kb(
streams: CudaStreamsFFI,
mem_ptr_void: *mut *mut i8,
);
pub fn cleanup_cuda_apply_noise_squashing(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
}
unsafe extern "C" {
pub fn scratch_cuda_sub_and_propagate_single_carry_kb_64_inplace(
pub fn scratch_cuda_sub_and_propagate_single_carry_64_inplace(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
@@ -1267,7 +1249,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_sub_and_propagate_single_carry_kb_64_inplace(
pub fn cuda_sub_and_propagate_single_carry_64_inplace(
streams: CudaStreamsFFI,
lhs_array: *mut CudaRadixCiphertextFFI,
rhs_array: *const CudaRadixCiphertextFFI,
@@ -1287,7 +1269,7 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn scratch_cuda_integer_unsigned_scalar_div_radix_kb_64(
pub fn scratch_cuda_integer_unsigned_scalar_div_radix_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
@@ -1308,7 +1290,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_unsigned_scalar_div_radix_kb_64(
pub fn cuda_integer_unsigned_scalar_div_radix_64(
streams: CudaStreamsFFI,
numerator_ct: *mut CudaRadixCiphertextFFI,
mem_ptr: *mut i8,
@@ -1318,7 +1300,7 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn cleanup_cuda_integer_unsigned_scalar_div_radix_kb_64(
pub fn cleanup_cuda_integer_unsigned_scalar_div_radix_64(
streams: CudaStreamsFFI,
mem_ptr_void: *mut *mut i8,
);
@@ -1362,7 +1344,7 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn scratch_cuda_integer_signed_scalar_div_radix_kb_64(
pub fn scratch_cuda_integer_signed_scalar_div_radix_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
@@ -1383,7 +1365,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_signed_scalar_div_radix_kb_64(
pub fn cuda_integer_signed_scalar_div_radix_64(
streams: CudaStreamsFFI,
numerator_ct: *mut CudaRadixCiphertextFFI,
mem_ptr: *mut i8,
@@ -1394,13 +1376,13 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn cleanup_cuda_integer_signed_scalar_div_radix_kb_64(
pub fn cleanup_cuda_integer_signed_scalar_div_radix_64(
streams: CudaStreamsFFI,
mem_ptr_void: *mut *mut i8,
);
}
unsafe extern "C" {
pub fn scratch_integer_unsigned_scalar_div_rem_radix_kb_64(
pub fn scratch_integer_unsigned_scalar_div_rem_radix_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
@@ -1422,7 +1404,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
pub fn cuda_integer_unsigned_scalar_div_rem_radix_64(
streams: CudaStreamsFFI,
quotient_ct: *mut CudaRadixCiphertextFFI,
remainder_ct: *mut CudaRadixCiphertextFFI,
@@ -1439,13 +1421,13 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn cleanup_cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
pub fn cleanup_cuda_integer_unsigned_scalar_div_rem_radix_64(
streams: CudaStreamsFFI,
mem_ptr_void: *mut *mut i8,
);
}
unsafe extern "C" {
pub fn scratch_integer_signed_scalar_div_rem_radix_kb_64(
pub fn scratch_integer_signed_scalar_div_rem_radix_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
@@ -1467,7 +1449,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_signed_scalar_div_rem_radix_kb_64(
pub fn cuda_integer_signed_scalar_div_rem_radix_64(
streams: CudaStreamsFFI,
quotient_ct: *mut CudaRadixCiphertextFFI,
remainder_ct: *mut CudaRadixCiphertextFFI,
@@ -1482,13 +1464,13 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn cleanup_cuda_integer_signed_scalar_div_rem_radix_kb_64(
pub fn cleanup_cuda_integer_signed_scalar_div_rem_radix_64(
streams: CudaStreamsFFI,
mem_ptr_void: *mut *mut i8,
);
}
unsafe extern "C" {
pub fn scratch_integer_count_of_consecutive_bits_kb_64(
pub fn scratch_integer_count_of_consecutive_bits_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
@@ -1511,7 +1493,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_count_of_consecutive_bits_kb_64(
pub fn cuda_integer_count_of_consecutive_bits_64(
streams: CudaStreamsFFI,
output_ct: *mut CudaRadixCiphertextFFI,
input_ct: *const CudaRadixCiphertextFFI,
@@ -1521,7 +1503,7 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn cleanup_cuda_integer_count_of_consecutive_bits_kb_64(
pub fn cleanup_cuda_integer_count_of_consecutive_bits_64(
streams: CudaStreamsFFI,
mem_ptr_void: *mut *mut i8,
);
@@ -1549,7 +1531,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_grouped_oprf_async_64(
pub fn cuda_integer_grouped_oprf_64(
streams: CudaStreamsFFI,
radix_lwe_out: *mut CudaRadixCiphertextFFI,
seeded_lwe_input: *const ffi::c_void,
@@ -1565,7 +1547,7 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn scratch_integer_ilog2_kb_64(
pub fn scratch_integer_ilog2_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
@@ -1587,7 +1569,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_ilog2_kb_64(
pub fn cuda_integer_ilog2_64(
streams: CudaStreamsFFI,
output_ct: *mut CudaRadixCiphertextFFI,
input_ct: *const CudaRadixCiphertextFFI,
@@ -1600,7 +1582,7 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn cleanup_cuda_integer_ilog2_kb_64(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
pub fn cleanup_cuda_integer_ilog2_64(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
}
unsafe extern "C" {
pub fn scratch_cuda_integer_compress_radix_ciphertext_64(

View File

@@ -11,7 +11,7 @@ use crate::integer::gpu::ciphertext::info::{CudaBlockInfo, CudaRadixCiphertextIn
use crate::integer::gpu::ciphertext::{CudaRadixCiphertext, CudaVec, KsType, LweDimension};
use crate::integer::gpu::key_switching_key::CudaKeySwitchingKey;
use crate::integer::gpu::server_key::CudaBootstrappingKey;
use crate::integer::gpu::{expand_async, PBSType};
use crate::integer::gpu::{cuda_backend_expand, PBSType};
use crate::shortint::ciphertext::CompactCiphertextList;
use crate::shortint::parameters::{
CompactCiphertextListExpansionKind, Degree, LweBskGroupingFactor, NoiseLevel,
@@ -409,7 +409,7 @@ impl CudaFlattenedVecCompactCiphertextList {
match &sks.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
expand_async(
cuda_backend_expand(
streams,
&mut d_output,
d_input,
@@ -444,7 +444,7 @@ impl CudaFlattenedVecCompactCiphertextList {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
expand_async(
cuda_backend_expand(
streams,
&mut d_output,
d_input,
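
Note: every call site in this family follows the same shape: match on the bootstrapping-key flavour, then route both arms to the one renamed cuda_backend_* function with flavour-specific key material. A minimal model of that dispatch (types and payloads are illustrative, not the crate's):

enum Bsk { Classic(Vec<u64>), MultiBit(Vec<u64>) }

fn cuda_backend_op(key: &[u64], multibit: bool) {
    println!("dispatched with {} key words (multibit: {multibit})", key.len());
}

fn run(bsk: &Bsk) {
    match bsk {
        Bsk::Classic(k) => cuda_backend_op(k, false),
        Bsk::MultiBit(k) => cuda_backend_op(k, true),
    }
}

fn main() {
    run(&Bsk::Classic(vec![0; 4]));
}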

View File

@@ -12,7 +12,7 @@ use crate::integer::gpu::ciphertext::squashed_noise::{
CudaSquashedNoiseBooleanBlock, CudaSquashedNoiseRadixCiphertext,
CudaSquashedNoiseSignedRadixCiphertext,
};
use crate::integer::gpu::decompress_integer_radix_async_128;
use crate::integer::gpu::cuda_backend_decompress_128;
use crate::integer::gpu::list_compression::server_keys::{
CudaNoiseSquashingCompressionKey, CudaPackedGlweCiphertextList,
};
@@ -345,7 +345,7 @@ impl CudaCompressedSquashedNoiseCiphertextList {
);
unsafe {
decompress_integer_radix_async_128(
cuda_backend_decompress_128(
streams,
&mut output_lwe,
&self.packed_list,

View File

@@ -15,8 +15,8 @@ use crate::integer::gpu::ciphertext::squashed_noise::CudaSquashedNoiseRadixCiphe
use crate::integer::gpu::ciphertext::CudaRadixCiphertext;
use crate::integer::gpu::server_key::CudaBootstrappingKey;
use crate::integer::gpu::{
compress_integer_radix_async, cuda_memcpy_async_gpu_to_gpu, decompress_integer_radix_async_64,
get_compression_size_on_gpu, get_decompression_size_on_gpu,
cuda_backend_compress, cuda_backend_decompress, cuda_backend_get_compression_size_on_gpu,
cuda_backend_get_decompression_size_on_gpu, cuda_memcpy_async_gpu_to_gpu,
};
use crate::prelude::CastInto;
use crate::shortint::ciphertext::{
@@ -322,7 +322,7 @@ impl CudaCompressionKey {
unsafe {
let input_lwes = Self::flatten_async(ciphertexts, streams);
compress_integer_radix_async(
cuda_backend_compress(
streams,
&mut glwe_array_out,
&input_lwes,
@@ -355,7 +355,7 @@ impl CudaCompressionKey {
let compressed_polynomial_size = lwe_pksk.output_polynomial_size();
let compressed_glwe_size = lwe_pksk.output_glwe_size();
get_compression_size_on_gpu(
cuda_backend_get_compression_size_on_gpu(
streams,
message_modulus,
carry_modulus,
@@ -430,7 +430,7 @@ impl CudaDecompressionKey {
);
unsafe {
decompress_integer_radix_async_64(
cuda_backend_decompress(
streams,
&mut output_lwe,
packed_list,
@@ -515,7 +515,7 @@ impl CudaDecompressionKey {
);
let lwe_dimension = bsk.output_lwe_dimension();
get_decompression_size_on_gpu(
cuda_backend_get_decompression_size_on_gpu(
streams,
message_modulus,
carry_modulus,
@@ -570,7 +570,7 @@ impl CudaDecompressionKey {
);
let lwe_dimension = bsk.output_lwe_dimension();
get_decompression_size_on_gpu(
cuda_backend_get_decompression_size_on_gpu(
streams,
message_modulus,
carry_modulus,
@@ -712,7 +712,7 @@ impl CudaNoiseSquashingCompressionKey {
unsafe {
let input_lwes = Self::flatten_async(ciphertexts, streams);
compress_integer_radix_async(
cuda_backend_compress(
streams,
&mut glwe_array_out,
&input_lwes,

File diff suppressed because it is too large

View File

@@ -2,7 +2,7 @@ use crate::core_crypto::gpu::CudaStreams;
use crate::core_crypto::prelude::LweBskGroupingFactor;
use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext;
use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey};
use crate::integer::gpu::{unchecked_signed_abs_radix_kb_assign_async, PBSType};
use crate::integer::gpu::{cuda_backend_unchecked_signed_abs_assign, PBSType};
impl CudaServerKey {
/// # Safety
@@ -18,7 +18,7 @@ impl CudaServerKey {
unsafe {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_signed_abs_radix_kb_assign_async(
cuda_backend_unchecked_signed_abs_assign(
streams,
ct.as_mut(),
&d_bsk.d_vec,
@@ -44,7 +44,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_signed_abs_radix_kb_assign_async(
cuda_backend_unchecked_signed_abs_assign(
streams,
ct.as_mut(),
&d_multibit_bsk.d_vec,

View File

@@ -7,10 +7,10 @@ use crate::integer::gpu::ciphertext::{
};
use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey};
use crate::integer::gpu::{
add_and_propagate_single_carry_assign_async,
get_add_and_propagate_single_carry_assign_async_size_on_gpu,
get_full_propagate_assign_size_on_gpu, unchecked_add_integer_radix_assign_async,
unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async, PBSType,
cuda_backend_add_and_propagate_single_carry_assign,
cuda_backend_get_add_and_propagate_single_carry_assign_size_on_gpu,
cuda_backend_get_full_propagate_assign_size_on_gpu, cuda_backend_unchecked_add_assign,
cuda_backend_unchecked_partial_sum_ciphertexts_assign, PBSType,
};
use crate::integer::server_key::radix_parallel::OutputFlag;
use crate::shortint::ciphertext::NoiseLevel;
@@ -153,23 +153,25 @@ impl CudaServerKey {
ct_right.as_ref().d_blocks.lwe_ciphertext_count().0
);
let full_prop_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::Classic(d_bsk) => {
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_full_propagate_assign_size_on_gpu(
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_multibit_bsk.input_lwe_dimension(),
d_multibit_bsk.glwe_dimension(),
@@ -199,7 +201,7 @@ impl CudaServerKey {
let num_blocks = ct_left.as_ref().d_blocks.lwe_ciphertext_count().0 as u32;
let add_assign_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
get_add_and_propagate_single_carry_assign_async_size_on_gpu(
cuda_backend_get_add_and_propagate_single_carry_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
@@ -218,7 +220,7 @@ impl CudaServerKey {
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_add_and_propagate_single_carry_assign_async_size_on_gpu(
cuda_backend_get_add_and_propagate_single_carry_assign_size_on_gpu(
streams,
d_multibit_bsk.input_lwe_dimension(),
d_multibit_bsk.glwe_dimension(),
@@ -313,7 +315,7 @@ impl CudaServerKey {
);
unsafe {
unchecked_add_integer_radix_assign_async(streams, ciphertext_left, ciphertext_right);
cuda_backend_unchecked_add_assign(streams, ciphertext_left, ciphertext_right);
}
}
@@ -373,7 +375,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async(
cuda_backend_unchecked_partial_sum_ciphertexts_assign(
streams,
result.as_mut(),
&mut terms,
@@ -399,7 +401,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async(
cuda_backend_unchecked_partial_sum_ciphertexts_assign(
streams,
result.as_mut(),
&mut terms,
@@ -833,7 +835,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
add_and_propagate_single_carry_assign_async(
cuda_backend_add_and_propagate_single_carry_assign(
streams,
lhs.as_mut(),
rhs.as_ref(),
@@ -859,7 +861,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
add_and_propagate_single_carry_assign_async(
cuda_backend_add_and_propagate_single_carry_assign(
streams,
lhs.as_mut(),
rhs.as_ref(),

View File

@@ -6,9 +6,9 @@ use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey};
use crate::core_crypto::prelude::LweBskGroupingFactor;
use crate::integer::gpu::{
get_aes_ctr_encrypt_integer_radix_size_on_gpu, get_key_expansion_integer_radix_size_on_gpu,
unchecked_aes_ctr_encrypt_integer_radix_kb_assign_async,
unchecked_key_expansion_integer_radix_kb_assign_async, PBSType,
cuda_backend_aes_key_expansion, cuda_backend_get_aes_ctr_encrypt_size_on_gpu,
cuda_backend_get_aes_key_expansion_size_on_gpu, cuda_backend_unchecked_aes_ctr_encrypt,
PBSType,
};
use crate::integer::{RadixCiphertext, RadixClientKey};
use crate::shortint::Ciphertext;
@@ -231,7 +231,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_aes_ctr_encrypt_integer_radix_kb_assign_async(
cuda_backend_unchecked_aes_ctr_encrypt(
streams,
result.as_mut(),
iv.as_ref(),
@@ -256,7 +256,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_aes_ctr_encrypt_integer_radix_kb_assign_async(
cuda_backend_unchecked_aes_ctr_encrypt(
streams,
result.as_mut(),
iv.as_ref(),
@@ -308,7 +308,7 @@ impl CudaServerKey {
streams: &CudaStreams,
) -> u64 {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_aes_ctr_encrypt_integer_radix_size_on_gpu(
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_aes_ctr_encrypt_size_on_gpu(
streams,
num_aes_inputs as u32,
sbox_parallelism as u32,
@@ -326,7 +326,7 @@ impl CudaServerKey {
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_aes_ctr_encrypt_integer_radix_size_on_gpu(
cuda_backend_get_aes_ctr_encrypt_size_on_gpu(
streams,
num_aes_inputs as u32,
sbox_parallelism as u32,
@@ -371,7 +371,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_key_expansion_integer_radix_kb_assign_async(
cuda_backend_aes_key_expansion(
streams,
expanded_keys.as_mut(),
key.as_ref(),
@@ -392,7 +392,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_key_expansion_integer_radix_kb_assign_async(
cuda_backend_aes_key_expansion(
streams,
expanded_keys.as_mut(),
key.as_ref(),
@@ -428,7 +428,7 @@ impl CudaServerKey {
/// synchronization is required
unsafe fn get_key_expansion_size_on_gpu_async(&self, streams: &CudaStreams) -> u64 {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_key_expansion_integer_radix_size_on_gpu(
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_aes_key_expansion_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
@@ -444,7 +444,7 @@ impl CudaServerKey {
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_key_expansion_integer_radix_size_on_gpu(
cuda_backend_get_aes_key_expansion_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,

View File

@@ -9,8 +9,8 @@ use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext;
use crate::integer::gpu::server_key::CudaBootstrappingKey;
use crate::integer::gpu::{
get_bitop_integer_radix_kb_size_on_gpu, get_full_propagate_assign_size_on_gpu,
unchecked_bitop_integer_radix_kb_assign_async, BitOpType, CudaServerKey, PBSType,
cuda_backend_get_bitop_size_on_gpu, cuda_backend_get_full_propagate_assign_size_on_gpu,
cuda_backend_unchecked_bitop_assign, BitOpType, CudaServerKey, PBSType,
};
impl CudaServerKey {
@@ -209,7 +209,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_bitop_integer_radix_kb_assign_async(
cuda_backend_unchecked_bitop_assign(
streams,
ct_left.as_mut(),
ct_right.as_ref(),
@@ -237,7 +237,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_bitop_integer_radix_kb_assign_async(
cuda_backend_unchecked_bitop_assign(
streams,
ct_left.as_mut(),
ct_right.as_ref(),
@@ -283,23 +283,25 @@ impl CudaServerKey {
ct_right.as_ref().d_blocks.lwe_ciphertext_count()
);
let full_prop_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::Classic(d_bsk) => {
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_full_propagate_assign_size_on_gpu(
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_multibit_bsk.input_lwe_dimension(),
d_multibit_bsk.glwe_dimension(),
@@ -329,7 +331,7 @@ impl CudaServerKey {
let lwe_ciphertext_count = ct_left.as_ref().d_blocks.lwe_ciphertext_count();
let bitop_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_bitop_integer_radix_kb_size_on_gpu(
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_bitop_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
@@ -351,30 +353,28 @@ impl CudaServerKey {
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_bitop_integer_radix_kb_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
d_multibit_bsk.glwe_dimension,
d_multibit_bsk.polynomial_size,
self.key_switching_key
.input_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key
.output_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_multibit_bsk.decomp_level_count,
d_multibit_bsk.decomp_base_log,
op,
lwe_ciphertext_count.0 as u32,
PBSType::MultiBit,
d_multibit_bsk.grouping_factor,
None,
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => cuda_backend_get_bitop_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
d_multibit_bsk.glwe_dimension,
d_multibit_bsk.polynomial_size,
self.key_switching_key
.input_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key
.output_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_multibit_bsk.decomp_level_count,
d_multibit_bsk.decomp_base_log,
op,
lwe_ciphertext_count.0 as u32,
PBSType::MultiBit,
d_multibit_bsk.grouping_factor,
None,
),
};
actual_full_prop_mem.max(bitop_mem)
}
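
Note: the size estimate returned above is the max, not the sum, of the full-propagation scratch and the operation's own scratch, presumably because the two phases run sequentially over the same allocation. The pattern, reduced to its arithmetic:

fn size_on_gpu(full_prop_mem: u64, op_mem: u64) -> u64 {
    full_prop_mem.max(op_mem) // sequential phases: peak usage, not total
}

fn main() {
    assert_eq!(size_on_gpu(1 << 20, 3 << 20), 3 << 20);
}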
@@ -938,23 +938,25 @@ impl CudaServerKey {
0
} else {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::Classic(d_bsk) => {
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_full_propagate_assign_size_on_gpu(
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_multibit_bsk.input_lwe_dimension(),
d_multibit_bsk.glwe_dimension(),

View File

@@ -4,8 +4,8 @@ use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext;
use crate::integer::gpu::server_key::CudaBootstrappingKey;
use crate::integer::gpu::{
get_cmux_integer_radix_kb_size_on_gpu, get_full_propagate_assign_size_on_gpu,
unchecked_cmux_integer_radix_kb_async, CudaServerKey, PBSType,
cuda_backend_get_cmux_size_on_gpu, cuda_backend_get_full_propagate_assign_size_on_gpu,
cuda_backend_unchecked_cmux, CudaServerKey, PBSType,
};
impl CudaServerKey {
@@ -27,7 +27,7 @@ impl CudaServerKey {
unsafe {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_cmux_integer_radix_kb_async(
cuda_backend_unchecked_cmux(
stream,
result.as_mut(),
condition,
@@ -56,7 +56,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_cmux_integer_radix_kb_async(
cuda_backend_unchecked_cmux(
stream,
result.as_mut(),
condition,
@@ -150,23 +150,25 @@ impl CudaServerKey {
false_ct.as_ref().d_blocks.lwe_ciphertext_count()
);
let full_prop_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::Classic(d_bsk) => {
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_full_propagate_assign_size_on_gpu(
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_multibit_bsk.input_lwe_dimension(),
d_multibit_bsk.glwe_dimension(),
@@ -196,7 +198,7 @@ impl CudaServerKey {
let lwe_ciphertext_count = true_ct.as_ref().d_blocks.lwe_ciphertext_count();
let cmux_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_cmux_integer_radix_kb_size_on_gpu(
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_cmux_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
@@ -217,29 +219,27 @@ impl CudaServerKey {
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_cmux_integer_radix_kb_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
d_multibit_bsk.glwe_dimension,
d_multibit_bsk.polynomial_size,
self.key_switching_key
.input_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key
.output_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_multibit_bsk.decomp_level_count,
d_multibit_bsk.decomp_base_log,
lwe_ciphertext_count.0 as u32,
PBSType::MultiBit,
d_multibit_bsk.grouping_factor,
None,
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => cuda_backend_get_cmux_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
d_multibit_bsk.glwe_dimension,
d_multibit_bsk.polynomial_size,
self.key_switching_key
.input_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key
.output_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_multibit_bsk.decomp_level_count,
d_multibit_bsk.decomp_base_log,
lwe_ciphertext_count.0 as u32,
PBSType::MultiBit,
d_multibit_bsk.grouping_factor,
None,
),
};
actual_full_prop_mem.max(cmux_mem)
}
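
Note: behind the renamed entry points, cmux keeps its usual semantics: select true_ct or false_ct under a boolean condition. Arithmetically (plain integers, illustrative only):

fn cmux(condition: u64, ct_true: u64, ct_false: u64) -> u64 {
    // condition ∈ {0, 1}: out = false + condition * (true - false)
    ct_false.wrapping_add(condition.wrapping_mul(ct_true.wrapping_sub(ct_false)))
}

fn main() {
    assert_eq!(cmux(1, 7, 3), 7);
    assert_eq!(cmux(0, 7, 3), 3);
}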

View File

@@ -6,8 +6,8 @@ use crate::integer::gpu::ciphertext::info::CudaRadixCiphertextInfo;
use crate::integer::gpu::ciphertext::{CudaIntegerRadixCiphertext, CudaRadixCiphertext};
use crate::integer::gpu::server_key::CudaBootstrappingKey;
use crate::integer::gpu::{
get_comparison_integer_radix_kb_size_on_gpu, get_full_propagate_assign_size_on_gpu,
unchecked_comparison_integer_radix_kb_async, ComparisonType, CudaServerKey, PBSType,
cuda_backend_get_comparison_size_on_gpu, cuda_backend_get_full_propagate_assign_size_on_gpu,
cuda_backend_unchecked_comparison, ComparisonType, CudaServerKey, PBSType,
};
use crate::shortint::ciphertext::Degree;
@@ -51,7 +51,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_comparison_integer_radix_kb_async(
cuda_backend_unchecked_comparison(
streams,
result.as_mut().as_mut(),
ct_left.as_ref(),
@@ -80,7 +80,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_comparison_integer_radix_kb_async(
cuda_backend_unchecked_comparison(
streams,
result.as_mut().as_mut(),
ct_left.as_ref(),
@@ -365,23 +365,25 @@ impl CudaServerKey {
ct_right.as_ref().d_blocks.lwe_ciphertext_count()
);
let full_prop_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::Classic(d_bsk) => {
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_full_propagate_assign_size_on_gpu(
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_multibit_bsk.input_lwe_dimension(),
d_multibit_bsk.glwe_dimension(),
@@ -411,7 +413,7 @@ impl CudaServerKey {
let lwe_ciphertext_count = ct_left.as_ref().d_blocks.lwe_ciphertext_count();
let comparison_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_comparison_integer_radix_kb_size_on_gpu(
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_comparison_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
@@ -435,7 +437,7 @@ impl CudaServerKey {
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_comparison_integer_radix_kb_size_on_gpu(
cuda_backend_get_comparison_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
@@ -1131,7 +1133,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_comparison_integer_radix_kb_async(
cuda_backend_unchecked_comparison(
streams,
result.as_mut(),
ct_left.as_ref(),
@@ -1160,7 +1162,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_comparison_integer_radix_kb_async(
cuda_backend_unchecked_comparison(
streams,
result.as_mut(),
ct_left.as_ref(),
@@ -1227,7 +1229,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_comparison_integer_radix_kb_async(
cuda_backend_unchecked_comparison(
streams,
result.as_mut(),
ct_left.as_ref(),
@@ -1256,7 +1258,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_comparison_integer_radix_kb_async(
cuda_backend_unchecked_comparison(
streams,
result.as_mut(),
ct_left.as_ref(),

View File

@@ -3,8 +3,8 @@ use crate::core_crypto::prelude::LweBskGroupingFactor;
use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext;
use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey};
use crate::integer::gpu::{
get_div_rem_integer_radix_kb_size_on_gpu, get_full_propagate_assign_size_on_gpu,
unchecked_div_rem_integer_radix_kb_assign_async, PBSType,
cuda_backend_get_div_rem_size_on_gpu, cuda_backend_get_full_propagate_assign_size_on_gpu,
cuda_backend_unchecked_div_rem_assign, PBSType,
};
impl CudaServerKey {
@@ -26,7 +26,7 @@ impl CudaServerKey {
let num_blocks = divisor.as_ref().d_blocks.lwe_ciphertext_count().0 as u32;
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_div_rem_integer_radix_kb_assign_async(
cuda_backend_unchecked_div_rem_assign(
streams,
quotient.as_mut(),
remainder.as_mut(),
@@ -56,7 +56,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_div_rem_integer_radix_kb_assign_async(
cuda_backend_unchecked_div_rem_assign(
streams,
quotient.as_mut(),
remainder.as_mut(),
@@ -258,23 +258,25 @@ impl CudaServerKey {
divisor.as_ref().d_blocks.lwe_ciphertext_count()
);
let full_prop_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::Classic(d_bsk) => {
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_full_propagate_assign_size_on_gpu(
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_multibit_bsk.input_lwe_dimension(),
d_multibit_bsk.glwe_dimension(),
@@ -304,7 +306,7 @@ impl CudaServerKey {
let lwe_ciphertext_count = numerator.as_ref().d_blocks.lwe_ciphertext_count();
let mul_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_div_rem_integer_radix_kb_size_on_gpu(
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_div_rem_size_on_gpu(
streams,
T::IS_SIGNED,
self.message_modulus,
@@ -326,30 +328,28 @@ impl CudaServerKey {
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_div_rem_integer_radix_kb_size_on_gpu(
streams,
T::IS_SIGNED,
self.message_modulus,
self.carry_modulus,
d_multibit_bsk.glwe_dimension,
d_multibit_bsk.polynomial_size,
self.key_switching_key
.input_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key
.output_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_multibit_bsk.decomp_level_count,
d_multibit_bsk.decomp_base_log,
lwe_ciphertext_count.0 as u32,
PBSType::MultiBit,
d_multibit_bsk.grouping_factor,
None,
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => cuda_backend_get_div_rem_size_on_gpu(
streams,
T::IS_SIGNED,
self.message_modulus,
self.carry_modulus,
d_multibit_bsk.glwe_dimension,
d_multibit_bsk.polynomial_size,
self.key_switching_key
.input_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key
.output_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_multibit_bsk.decomp_level_count,
d_multibit_bsk.decomp_base_log,
lwe_ciphertext_count.0 as u32,
PBSType::MultiBit,
d_multibit_bsk.grouping_factor,
None,
),
};
actual_full_prop_mem.max(mul_mem)
}

View File

@@ -5,7 +5,7 @@ use crate::integer::gpu::ciphertext::{
CudaIntegerRadixCiphertext, CudaSignedRadixCiphertext, CudaUnsignedRadixCiphertext,
};
use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey};
use crate::integer::gpu::{count_of_consecutive_bits_async, ilog2_async, PBSType};
use crate::integer::gpu::{cuda_backend_count_of_consecutive_bits, cuda_backend_ilog2, PBSType};
use crate::integer::server_key::radix_parallel::ilog2::{BitValue, Direction};
impl CudaServerKey {
@@ -40,7 +40,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
count_of_consecutive_bits_async(
cuda_backend_count_of_consecutive_bits(
streams,
result.as_mut(),
ct.as_ref(),
@@ -63,7 +63,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
count_of_consecutive_bits_async(
cuda_backend_count_of_consecutive_bits(
streams,
result.as_mut(),
ct.as_ref(),
@@ -279,7 +279,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
ilog2_async(
cuda_backend_ilog2(
streams,
result.as_mut(),
ct.as_ref(),
@@ -306,7 +306,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
ilog2_async(
cuda_backend_ilog2(
streams,
result.as_mut(),
ct.as_ref(),

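For orientation, a clear-value sketch of the relationship the two renamed entry points (`cuda_backend_count_of_consecutive_bits`, `cuda_backend_ilog2`) compute homomorphically; the helper names below are illustrative, not part of the backend API.

// ilog2 of a nonzero x on `total_bits`-bit values can be derived from the
// count of consecutive leading zero bits: ilog2(x) = total_bits - 1 - clz(x).
fn count_leading_zeros(x: u32, total_bits: u32) -> u32 {
    x.leading_zeros() - (32 - total_bits)
}

fn ilog2(x: u32, total_bits: u32) -> u32 {
    debug_assert!(x != 0, "ilog2 is undefined for 0");
    total_bits - 1 - count_leading_zeros(x, total_bits)
}

fn main() {
    // an 8-bit value holding 20 (0b0001_0100):
    assert_eq!(count_leading_zeros(20, 8), 3);
    assert_eq!(ilog2(20, 8), 4); // floor(log2(20))
}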
View File

@@ -16,11 +16,12 @@ use crate::integer::gpu::ciphertext::{
use crate::integer::gpu::noise_squashing::keys::CudaNoiseSquashingKey;
use crate::integer::gpu::server_key::CudaBootstrappingKey;
use crate::integer::gpu::{
apply_bivariate_lut_kb_async, apply_many_univariate_lut_kb_async,
apply_univariate_lut_kb_async, compute_prefix_sum_hillis_steele_async,
extend_radix_with_sign_msb_async, extend_radix_with_trivial_zero_blocks_msb_async,
full_propagate_assign_async, noise_squashing_async, propagate_single_carry_assign_async,
trim_radix_blocks_lsb_async, CudaServerKey, PBSType,
cuda_backend_apply_bivariate_lut, cuda_backend_apply_many_univariate_lut,
cuda_backend_apply_univariate_lut, cuda_backend_compute_prefix_sum_hillis_steele,
cuda_backend_extend_radix_with_sign_msb,
cuda_backend_extend_radix_with_trivial_zero_blocks_msb, cuda_backend_full_propagate_assign,
cuda_backend_noise_squashing, cuda_backend_propagate_single_carry_assign,
cuda_backend_trim_radix_blocks_lsb, CudaServerKey, PBSType,
};
use crate::integer::server_key::radix_parallel::OutputFlag;
use crate::shortint::ciphertext::{Degree, NoiseLevel};
@@ -239,7 +240,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
propagate_single_carry_assign_async(
cuda_backend_propagate_single_carry_assign(
streams,
ciphertext,
carry_out.as_mut(),
@@ -264,7 +265,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
propagate_single_carry_assign_async(
cuda_backend_propagate_single_carry_assign(
streams,
ciphertext,
carry_out.as_mut(),
@@ -302,7 +303,7 @@ impl CudaServerKey {
unsafe {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
full_propagate_assign_async(
cuda_backend_full_propagate_assign(
streams,
ciphertext,
&d_bsk.d_vec,
@@ -323,7 +324,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
full_propagate_assign_async(
cuda_backend_full_propagate_assign(
streams,
ciphertext,
&d_multibit_bsk.d_vec,
@@ -507,7 +508,11 @@ impl CudaServerKey {
};
unsafe {
extend_radix_with_trivial_zero_blocks_msb_async(output.as_mut(), ct.as_ref(), streams);
cuda_backend_extend_radix_with_trivial_zero_blocks_msb(
output.as_mut(),
ct.as_ref(),
streams,
);
}
output
}
@@ -581,7 +586,7 @@ impl CudaServerKey {
unsafe { self.create_trivial_zero_radix_async(output_num_blocks, streams) };
unsafe {
trim_radix_blocks_lsb_async(output.as_mut(), ct.as_ref(), streams);
cuda_backend_trim_radix_blocks_lsb(output.as_mut(), ct.as_ref(), streams);
}
output
@@ -791,7 +796,7 @@ impl CudaServerKey {
unsafe {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
apply_univariate_lut_kb_async(
cuda_backend_apply_univariate_lut(
streams,
&mut output_slice,
&mut output_degrees,
@@ -819,7 +824,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
apply_univariate_lut_kb_async(
cuda_backend_apply_univariate_lut(
streams,
&mut output_slice,
&mut output_degrees,
@@ -909,7 +914,7 @@ impl CudaServerKey {
unsafe {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
apply_bivariate_lut_kb_async(
cuda_backend_apply_bivariate_lut(
streams,
&mut output_slice,
&mut output_degrees,
@@ -939,7 +944,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
apply_bivariate_lut_kb_async(
cuda_backend_apply_bivariate_lut(
streams,
&mut output_slice,
&mut output_degrees,
@@ -1088,7 +1093,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
apply_many_univariate_lut_kb_async(
cuda_backend_apply_many_univariate_lut(
streams,
&mut output_slice,
&mut output_degrees,
@@ -1118,7 +1123,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
apply_many_univariate_lut_kb_async(
cuda_backend_apply_many_univariate_lut(
streams,
&mut output_slice,
&mut output_degrees,
@@ -1229,7 +1234,7 @@ impl CudaServerKey {
unsafe {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
compute_prefix_sum_hillis_steele_async(
cuda_backend_compute_prefix_sum_hillis_steele(
streams,
&mut output_slice,
&mut output_degrees,
@@ -1259,7 +1264,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
compute_prefix_sum_hillis_steele_async(
cuda_backend_compute_prefix_sum_hillis_steele(
streams,
&mut output_slice,
&mut output_degrees,
@@ -1324,7 +1329,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
extend_radix_with_sign_msb_async(
cuda_backend_extend_radix_with_sign_msb(
streams,
output.as_mut(),
ct.as_ref(),
@@ -1346,7 +1351,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
extend_radix_with_sign_msb_async(
cuda_backend_extend_radix_with_sign_msb(
streams,
output.as_mut(),
ct.as_ref(),
@@ -1638,7 +1643,7 @@ impl CudaServerKey {
unsafe {
match &d_bootstrapping_key {
CudaBootstrappingKey::Classic(bsk) => {
noise_squashing_async(
cuda_backend_noise_squashing(
streams,
&mut output_slice,
&mut output_degrees,
@@ -1667,7 +1672,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(mb_bsk) => {
noise_squashing_async(
cuda_backend_noise_squashing(
streams,
&mut output_slice,
&mut output_degrees,

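As a reference for the prefix-sum entry point renamed above, a plain-integer sketch of the Hillis-Steele scan it is named after; the GPU version interleaves the same passes with keyswitches and PBSes over radix blocks, whereas this sketch assumes simple u64 addition.

// Hillis-Steele inclusive scan: at the step with stride `offset`, every
// element at index i >= offset adds the element `offset` positions behind
// it. O(n log n) additions but only O(log n) dependent steps, which is
// what makes the pattern attractive on GPU.
fn hillis_steele_inclusive_scan(input: &[u64]) -> Vec<u64> {
    let mut cur = input.to_vec();
    let mut offset = 1;
    while offset < cur.len() {
        let prev = cur.clone(); // stands in for the double-buffering on device
        for i in offset..cur.len() {
            cur[i] = prev[i] + prev[i - offset];
        }
        offset *= 2;
    }
    cur
}

fn main() {
    assert_eq!(hillis_steele_inclusive_scan(&[1, 2, 3, 4]), vec![1, 3, 6, 10]);
}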
View File

@@ -3,8 +3,8 @@ use crate::core_crypto::prelude::LweBskGroupingFactor;
use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext;
use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey};
use crate::integer::gpu::{
get_full_propagate_assign_size_on_gpu, get_mul_integer_radix_kb_size_on_gpu,
unchecked_mul_integer_radix_kb_assign_async, PBSType,
cuda_backend_get_full_propagate_assign_size_on_gpu, cuda_backend_get_mul_size_on_gpu,
cuda_backend_unchecked_mul_assign, PBSType,
};
impl CudaServerKey {
@@ -80,7 +80,7 @@ impl CudaServerKey {
let is_boolean_right = ct_right.holds_boolean_value();
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_mul_integer_radix_kb_assign_async(
cuda_backend_unchecked_mul_assign(
streams,
ct_left.as_mut(),
is_boolean_left,
@@ -104,7 +104,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_mul_integer_radix_kb_assign_async(
cuda_backend_unchecked_mul_assign(
streams,
ct_left.as_mut(),
is_boolean_left,
@@ -264,23 +264,25 @@ impl CudaServerKey {
ct_right.as_ref().d_blocks.lwe_ciphertext_count()
);
let full_prop_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::Classic(d_bsk) => {
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_full_propagate_assign_size_on_gpu(
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_multibit_bsk.input_lwe_dimension(),
d_multibit_bsk.glwe_dimension(),
@@ -312,7 +314,7 @@ impl CudaServerKey {
let is_boolean_right = ct_right.holds_boolean_value();
let mul_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_mul_integer_radix_kb_size_on_gpu(
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_mul_size_on_gpu(
streams,
is_boolean_left,
is_boolean_right,
@@ -332,7 +334,7 @@ impl CudaServerKey {
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => get_mul_integer_radix_kb_size_on_gpu(
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => cuda_backend_get_mul_size_on_gpu(
streams,
is_boolean_left,
is_boolean_right,

View File

@@ -3,8 +3,8 @@ use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
use crate::integer::gpu::ciphertext::{
CudaIntegerRadixCiphertext, CudaSignedRadixCiphertext, CudaUnsignedRadixCiphertext,
};
use crate::integer::gpu::cuda_backend_unchecked_negate;
use crate::integer::gpu::server_key::CudaServerKey;
use crate::integer::gpu::unchecked_negate_integer_radix_async;
use crate::integer::server_key::radix_parallel::OutputFlag;
impl CudaServerKey {
@@ -70,7 +70,7 @@ impl CudaServerKey {
let info = ctxt.as_ref().info.blocks.first().unwrap();
unchecked_negate_integer_radix_async(
cuda_backend_unchecked_negate(
streams,
ciphertext_out.as_mut(),
ctxt.as_ref(),

View File

@@ -12,7 +12,9 @@ use crate::shortint::oprf::{create_random_from_seed_modulus_switched, raw_seeded
pub use tfhe_csprng::seeders::{Seed, Seeder};
use crate::integer::gpu::{get_grouped_oprf_size_on_gpu, grouped_oprf_async, CudaVec, PBSType};
use crate::integer::gpu::{
cuda_backend_get_grouped_oprf_size_on_gpu, cuda_backend_grouped_oprf, CudaVec, PBSType,
};
impl CudaServerKey {
/// Generates an encrypted `num_block` blocks unsigned integer
@@ -372,7 +374,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
grouped_oprf_async(
cuda_backend_grouped_oprf(
streams,
result,
&d_seeded_lwe_input,
@@ -395,7 +397,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_bsk) => {
grouped_oprf_async(
cuda_backend_grouped_oprf(
streams,
result,
&d_seeded_lwe_input,
@@ -429,7 +431,7 @@ impl CudaServerKey {
let message_bits = self.message_modulus.0.ilog2();
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_grouped_oprf_size_on_gpu(
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_grouped_oprf_size_on_gpu(
streams,
1,
d_bsk.input_lwe_dimension,
@@ -447,7 +449,7 @@ impl CudaServerKey {
message_bits,
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::MultiBit(d_bsk) => get_grouped_oprf_size_on_gpu(
CudaBootstrappingKey::MultiBit(d_bsk) => cuda_backend_get_grouped_oprf_size_on_gpu(
streams,
1,
d_bsk.input_lwe_dimension,

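A rough clear-value sketch of the block layout behind the OPRF entry points above, assuming each radix block carries `message_modulus.ilog2()` bits of randomness; the generator below is a deliberate stand-in, not the seeded CSPRNG and modulus-switched generation the library actually uses.

// Split pseudo-random bits across radix blocks: num_blocks blocks of
// `message_bits` bits each hold num_blocks * message_bits random bits.
fn split_random_bits(seed: u64, message_bits: u32, num_blocks: usize) -> Vec<u64> {
    let mut state = seed;
    (0..num_blocks)
        .map(|_| {
            // xorshift64 as a placeholder generator only
            state ^= state << 13;
            state ^= state >> 7;
            state ^= state << 17;
            state & ((1u64 << message_bits) - 1)
        })
        .collect()
}

fn main() {
    // 2-bit message blocks (message_modulus = 4), 4 blocks => 8 random bits
    let blocks = split_random_bits(0xdead_beef, 2, 4);
    assert_eq!(blocks.len(), 4);
    assert!(blocks.iter().all(|&b| b < 4));
}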
View File

@@ -3,10 +3,9 @@ use crate::core_crypto::prelude::LweBskGroupingFactor;
use crate::integer::gpu::ciphertext::{CudaIntegerRadixCiphertext, CudaUnsignedRadixCiphertext};
use crate::integer::gpu::server_key::CudaBootstrappingKey;
use crate::integer::gpu::{
get_full_propagate_assign_size_on_gpu, get_rotate_left_integer_radix_kb_size_on_gpu,
get_rotate_right_integer_radix_kb_size_on_gpu,
unchecked_rotate_left_integer_radix_kb_assign_async,
unchecked_rotate_right_integer_radix_kb_assign_async, CudaServerKey, PBSType,
cuda_backend_get_full_propagate_assign_size_on_gpu, cuda_backend_get_rotate_left_size_on_gpu,
cuda_backend_get_rotate_right_size_on_gpu, cuda_backend_unchecked_rotate_left_assign,
cuda_backend_unchecked_rotate_right_assign, CudaServerKey, PBSType,
};
impl CudaServerKey {
@@ -27,7 +26,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_rotate_right_integer_radix_kb_assign_async(
cuda_backend_unchecked_rotate_right_assign(
streams,
ct.as_mut(),
rotate.as_ref(),
@@ -55,7 +54,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_rotate_right_integer_radix_kb_assign_async(
cuda_backend_unchecked_rotate_right_assign(
streams,
ct.as_mut(),
rotate.as_ref(),
@@ -148,7 +147,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_rotate_left_integer_radix_kb_assign_async(
cuda_backend_unchecked_rotate_left_assign(
streams,
ct.as_mut(),
rotate.as_ref(),
@@ -176,7 +175,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_rotate_left_integer_radix_kb_assign_async(
cuda_backend_unchecked_rotate_left_assign(
streams,
ct.as_mut(),
rotate.as_ref(),
@@ -574,23 +573,25 @@ impl CudaServerKey {
ct_right.as_ref().d_blocks.lwe_ciphertext_count()
);
let full_prop_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::Classic(d_bsk) => {
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_full_propagate_assign_size_on_gpu(
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_multibit_bsk.input_lwe_dimension(),
d_multibit_bsk.glwe_dimension(),
@@ -620,7 +621,7 @@ impl CudaServerKey {
let lwe_ciphertext_count = ct_left.as_ref().d_blocks.lwe_ciphertext_count();
let rotate_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_rotate_left_integer_radix_kb_size_on_gpu(
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_rotate_left_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
@@ -643,7 +644,7 @@ impl CudaServerKey {
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_rotate_left_integer_radix_kb_size_on_gpu(
cuda_backend_get_rotate_left_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
@@ -685,23 +686,25 @@ impl CudaServerKey {
ct_right.as_ref().d_blocks.lwe_ciphertext_count()
);
let full_prop_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::Classic(d_bsk) => {
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_full_propagate_assign_size_on_gpu(
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_multibit_bsk.input_lwe_dimension(),
d_multibit_bsk.glwe_dimension(),
@@ -731,7 +734,7 @@ impl CudaServerKey {
let lwe_ciphertext_count = ct_left.as_ref().d_blocks.lwe_ciphertext_count();
let rotate_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_rotate_right_integer_radix_kb_size_on_gpu(
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_rotate_right_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
@@ -754,7 +757,7 @@ impl CudaServerKey {
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_rotate_right_integer_radix_kb_size_on_gpu(
cuda_backend_get_rotate_right_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,

View File

@@ -8,8 +8,9 @@ use crate::integer::gpu::ciphertext::{
};
use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey};
use crate::integer::gpu::{
get_full_propagate_assign_size_on_gpu, get_propagate_single_carry_assign_async_size_on_gpu,
scalar_addition_integer_radix_assign_async, PBSType,
cuda_backend_get_full_propagate_assign_size_on_gpu,
cuda_backend_get_propagate_single_carry_assign_size_on_gpu,
cuda_backend_scalar_addition_assign, PBSType,
};
use crate::integer::server_key::radix_parallel::OutputFlag;
use crate::prelude::CastInto;
@@ -97,7 +98,7 @@ impl CudaServerKey {
// If the scalar is decomposed using less than the number of blocks our ciphertext
// has, we just don't touch ciphertext's last blocks
scalar_addition_integer_radix_assign_async(
cuda_backend_scalar_addition_assign(
streams,
ct.as_mut(),
&d_decomposed_scalar,
@@ -208,23 +209,25 @@ impl CudaServerKey {
0
} else {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::Classic(d_bsk) => {
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_full_propagate_assign_size_on_gpu(
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_multibit_bsk.input_lwe_dimension(),
d_multibit_bsk.glwe_dimension(),
@@ -246,7 +249,7 @@ impl CudaServerKey {
let num_blocks = ct.as_ref().d_blocks.lwe_ciphertext_count().0 as u32;
let single_carry_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
get_propagate_single_carry_assign_async_size_on_gpu(
cuda_backend_get_propagate_single_carry_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
@@ -265,7 +268,7 @@ impl CudaServerKey {
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_propagate_single_carry_assign_async_size_on_gpu(
cuda_backend_get_propagate_single_carry_assign_size_on_gpu(
streams,
d_multibit_bsk.input_lwe_dimension(),
d_multibit_bsk.glwe_dimension(),

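The comment above about leaving the ciphertext's last blocks untouched is easiest to see on clear values; a sketch, with illustrative names, of the little-endian base-`message_modulus` decomposition involved.

// Decompose a scalar into base-message_modulus digits, least significant
// first. If this yields fewer digits than the ciphertext has blocks, the
// remaining blocks would receive an implicit 0, so they are left untouched.
fn decompose_scalar(mut scalar: u64, message_modulus: u64, num_blocks: usize) -> Vec<u64> {
    let mut digits = Vec::with_capacity(num_blocks);
    while scalar != 0 && digits.len() < num_blocks {
        digits.push(scalar % message_modulus);
        scalar /= message_modulus;
    }
    digits
}

fn main() {
    // 13 = 0b1101 with 2-bit blocks (message_modulus = 4): digits [1, 3],
    // so only the first two of, say, four blocks are modified.
    assert_eq!(decompose_scalar(13, 4, 4), vec![1, 3]);
}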
View File

@@ -5,8 +5,8 @@ use crate::integer::block_decomposition::{BlockDecomposer, DecomposableInto};
use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext;
use crate::integer::gpu::server_key::CudaBootstrappingKey;
use crate::integer::gpu::{
get_full_propagate_assign_size_on_gpu, get_scalar_bitop_integer_radix_kb_size_on_gpu,
unchecked_scalar_bitop_integer_radix_kb_assign_async, BitOpType, CudaServerKey, PBSType,
cuda_backend_get_full_propagate_assign_size_on_gpu, cuda_backend_get_scalar_bitop_size_on_gpu,
cuda_backend_unchecked_scalar_bitop_assign, BitOpType, CudaServerKey, PBSType,
};
impl CudaServerKey {
@@ -36,7 +36,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_scalar_bitop_integer_radix_kb_assign_async(
cuda_backend_unchecked_scalar_bitop_assign(
streams,
ct.as_mut(),
&clear_blocks,
@@ -65,7 +65,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_scalar_bitop_integer_radix_kb_assign_async(
cuda_backend_unchecked_scalar_bitop_assign(
streams,
ct.as_mut(),
&clear_blocks,
@@ -315,23 +315,25 @@ impl CudaServerKey {
0
} else {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::Classic(d_bsk) => {
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_full_propagate_assign_size_on_gpu(
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_multibit_bsk.input_lwe_dimension(),
d_multibit_bsk.glwe_dimension(),
@@ -352,7 +354,7 @@ impl CudaServerKey {
let clear_blocks_mem = (lwe_ciphertext_count.0 * size_of::<u64>()) as u64;
let scalar_bitop_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_scalar_bitop_integer_radix_kb_size_on_gpu(
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_scalar_bitop_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
@@ -375,7 +377,7 @@ impl CudaServerKey {
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_scalar_bitop_integer_radix_kb_size_on_gpu(
cuda_backend_get_scalar_bitop_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,

View File

@@ -8,9 +8,9 @@ use crate::integer::gpu::ciphertext::info::CudaRadixCiphertextInfo;
use crate::integer::gpu::ciphertext::{CudaIntegerRadixCiphertext, CudaRadixCiphertext};
use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey};
use crate::integer::gpu::{
unchecked_are_all_comparisons_block_true_integer_radix_kb_async,
unchecked_is_at_least_one_comparisons_block_true_integer_radix_kb_async,
unchecked_scalar_comparison_integer_radix_kb_async, ComparisonType, PBSType,
cuda_backend_unchecked_are_all_comparisons_block_true,
cuda_backend_unchecked_is_at_least_one_comparisons_block_true,
cuda_backend_unchecked_scalar_comparison, ComparisonType, PBSType,
};
use crate::shortint::ciphertext::Degree;
@@ -124,7 +124,7 @@ impl CudaServerKey {
ComparisonType::GT | ComparisonType::GE | ComparisonType::NE => 1,
_ => 0,
};
let ct_res: T = self.create_trivial_radix(value, 1, streams);
let ct_res: T = self.create_trivial_radix_async(value, 1, streams);
return CudaBooleanBlock::from_cuda_radix_ciphertext(ct_res.into_inner());
}
@@ -146,7 +146,7 @@ impl CudaServerKey {
ComparisonType::LT | ComparisonType::LE | ComparisonType::NE => 1,
_ => 0,
};
let ct_res: T = self.create_trivial_radix(value, 1, streams);
let ct_res: T = self.create_trivial_radix_async(value, 1, streams);
return CudaBooleanBlock::from_cuda_radix_ciphertext(ct_res.into_inner());
}
@@ -173,7 +173,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_scalar_comparison_integer_radix_kb_async(
cuda_backend_unchecked_scalar_comparison(
streams,
result.as_mut().as_mut(),
ct.as_ref(),
@@ -204,7 +204,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_scalar_comparison_integer_radix_kb_async(
cuda_backend_unchecked_scalar_comparison(
streams,
result.as_mut().as_mut(),
ct.as_ref(),
@@ -261,9 +261,9 @@ impl CudaServerKey {
// Scalar is greater than the bounds, so ciphertext is smaller
let result: T = match op {
ComparisonType::LT | ComparisonType::LE => {
self.create_trivial_radix(1, num_blocks, streams)
self.create_trivial_radix_async(1, num_blocks, streams)
}
_ => self.create_trivial_radix(
_ => self.create_trivial_radix_async(
0,
ct.as_ref().d_blocks.lwe_ciphertext_count().0,
streams,
@@ -275,9 +275,9 @@ impl CudaServerKey {
// Scalar is smaller than the bounds, so ciphertext is bigger
let result: T = match op {
ComparisonType::GT | ComparisonType::GE => {
self.create_trivial_radix(1, num_blocks, streams)
self.create_trivial_radix_async(1, num_blocks, streams)
}
_ => self.create_trivial_radix(
_ => self.create_trivial_radix_async(
0,
ct.as_ref().d_blocks.lwe_ciphertext_count().0,
streams,
@@ -296,7 +296,8 @@ impl CudaServerKey {
ct, scalar, op, true, streams,
)
} else {
let scalar_as_trivial = self.create_trivial_radix(scalar, num_blocks, streams);
let scalar_as_trivial =
self.create_trivial_radix_async(scalar, num_blocks, streams);
self.unchecked_comparison_async(ct, &scalar_as_trivial, op, streams)
}
} else {
@@ -334,7 +335,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_scalar_comparison_integer_radix_kb_async(
cuda_backend_unchecked_scalar_comparison(
streams,
result.as_mut(),
ct.as_ref(),
@@ -365,7 +366,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_scalar_comparison_integer_radix_kb_async(
cuda_backend_unchecked_scalar_comparison(
streams,
result.as_mut(),
ct.as_ref(),
@@ -412,7 +413,7 @@ impl CudaServerKey {
unsafe {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_are_all_comparisons_block_true_integer_radix_kb_async(
cuda_backend_unchecked_are_all_comparisons_block_true(
streams,
boolean_res.as_mut().as_mut(),
ct.as_ref(),
@@ -438,7 +439,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_are_all_comparisons_block_true_integer_radix_kb_async(
cuda_backend_unchecked_are_all_comparisons_block_true(
streams,
boolean_res.as_mut().as_mut(),
ct.as_ref(),
@@ -482,7 +483,7 @@ impl CudaServerKey {
unsafe {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_is_at_least_one_comparisons_block_true_integer_radix_kb_async(
cuda_backend_unchecked_is_at_least_one_comparisons_block_true(
streams,
boolean_res.as_mut().as_mut(),
ct.as_ref(),
@@ -508,7 +509,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_is_at_least_one_comparisons_block_true_integer_radix_kb_async(
cuda_backend_unchecked_is_at_least_one_comparisons_block_true(
streams,
boolean_res.as_mut().as_mut(),
ct.as_ref(),

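The early exits in this file (scalar outside the ciphertext's representable range) follow a small truth table; a clear-value sketch below, with an `Op` enum mirroring the `ComparisonType` variants matched above.

#[derive(Clone, Copy)]
enum Op { Gt, Ge, Lt, Le, Ne, Eq }

// Scalar above the representable maximum: the ciphertext is necessarily
// smaller, so only <, <= and != hold.
fn result_when_scalar_above_max(op: Op) -> u64 {
    match op {
        Op::Lt | Op::Le | Op::Ne => 1,
        _ => 0,
    }
}

// Scalar below the representable minimum: the ciphertext is necessarily
// bigger, so only >, >= and != hold.
fn result_when_scalar_below_min(op: Op) -> u64 {
    match op {
        Op::Gt | Op::Ge | Op::Ne => 1,
        _ => 0,
    }
}

fn main() {
    assert_eq!(result_when_scalar_above_max(Op::Lt), 1); // ct < huge scalar
    assert_eq!(result_when_scalar_below_min(Op::Ge), 1); // ct >= tiny scalar
    assert_eq!(result_when_scalar_above_max(Op::Eq), 0);
}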
View File

@@ -6,14 +6,14 @@ use crate::integer::gpu::ciphertext::{
};
use crate::integer::gpu::server_key::CudaBootstrappingKey;
use crate::integer::gpu::{
get_full_propagate_assign_size_on_gpu, get_scalar_div_integer_radix_kb_size_on_gpu,
get_scalar_div_rem_integer_radix_kb_size_on_gpu,
get_signed_scalar_div_integer_radix_kb_size_on_gpu,
get_signed_scalar_div_rem_integer_radix_kb_size_on_gpu,
unchecked_signed_scalar_div_integer_radix_kb_assign_async,
unchecked_signed_scalar_div_rem_integer_radix_kb_assign_async,
unchecked_unsigned_scalar_div_integer_radix_kb_assign_async,
unchecked_unsigned_scalar_div_rem_integer_radix_kb_assign_async, CudaServerKey, PBSType,
cuda_backend_get_full_propagate_assign_size_on_gpu,
cuda_backend_get_scalar_div_rem_size_on_gpu, cuda_backend_get_scalar_div_size_on_gpu,
cuda_backend_get_signed_scalar_div_rem_size_on_gpu,
cuda_backend_get_signed_scalar_div_size_on_gpu,
cuda_backend_unchecked_signed_scalar_div_assign,
cuda_backend_unchecked_signed_scalar_div_rem_assign,
cuda_backend_unchecked_unsigned_scalar_div_assign,
cuda_backend_unchecked_unsigned_scalar_div_rem, CudaServerKey, PBSType,
};
use crate::integer::server_key::radix_parallel::scalar_div_mod::SignedReciprocable;
use crate::integer::server_key::radix_parallel::OutputFlag;
@@ -106,7 +106,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_unsigned_scalar_div_integer_radix_kb_assign_async(
cuda_backend_unchecked_unsigned_scalar_div_assign(
streams,
quotient.as_mut(),
divisor,
@@ -127,7 +127,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_unsigned_scalar_div_integer_radix_kb_assign_async(
cuda_backend_unchecked_unsigned_scalar_div_assign(
streams,
quotient.as_mut(),
divisor,
@@ -281,7 +281,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_unsigned_scalar_div_rem_integer_radix_kb_assign_async(
cuda_backend_unchecked_unsigned_scalar_div_rem(
streams,
quotient.as_mut(),
remainder.as_mut(),
@@ -303,7 +303,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_unsigned_scalar_div_rem_integer_radix_kb_assign_async(
cuda_backend_unchecked_unsigned_scalar_div_rem(
streams,
quotient.as_mut(),
remainder.as_mut(),
@@ -549,11 +549,11 @@ impl CudaServerKey {
>= to the number of bits encrypted in the ciphertext"
);
let mut quotient: CudaSignedRadixCiphertext = numerator.duplicate_async(streams);
let mut quotient: CudaSignedRadixCiphertext = numerator.duplicate(streams);
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_signed_scalar_div_integer_radix_kb_assign_async(
cuda_backend_unchecked_signed_scalar_div_assign(
streams,
quotient.as_mut(),
divisor,
@@ -574,7 +574,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_signed_scalar_div_integer_radix_kb_assign_async(
cuda_backend_unchecked_signed_scalar_div_assign(
streams,
quotient.as_mut(),
divisor,
@@ -729,7 +729,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_signed_scalar_div_rem_integer_radix_kb_assign_async(
cuda_backend_unchecked_signed_scalar_div_rem_assign(
streams,
quotient.as_mut(),
remainder.as_mut(),
@@ -751,7 +751,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_signed_scalar_div_rem_integer_radix_kb_assign_async(
cuda_backend_unchecked_signed_scalar_div_rem_assign(
streams,
quotient.as_mut(),
remainder.as_mut(),
@@ -885,8 +885,7 @@ impl CudaServerKey {
Scalar: SignedReciprocable + ScalarMultiplier + DecomposableInto<u8> + CastInto<u64>,
<<Scalar as SignedReciprocable>::Unsigned as Reciprocable>::DoublePrecision: Send,
{
let (_, remainder) =
self.unchecked_signed_scalar_div_rem_async(numerator, divisor, streams);
let (_, remainder) = self.unchecked_signed_scalar_div_rem(numerator, divisor, streams);
remainder
}
@@ -992,23 +991,25 @@ encrypted bits: {numerator_bits}, scalar bits: {}
0
} else {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::Classic(d_bsk) => {
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_full_propagate_assign_size_on_gpu(
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_multibit_bsk.input_lwe_dimension(),
d_multibit_bsk.glwe_dimension(),
@@ -1028,7 +1029,7 @@ encrypted bits: {numerator_bits}, scalar bits: {}
};
let scalar_div_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_scalar_div_integer_radix_kb_size_on_gpu(
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_scalar_div_size_on_gpu(
streams,
divisor,
self.message_modulus,
@@ -1046,7 +1047,7 @@ encrypted bits: {numerator_bits}, scalar bits: {}
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_scalar_div_integer_radix_kb_size_on_gpu(
cuda_backend_get_scalar_div_size_on_gpu(
streams,
divisor,
self.message_modulus,
@@ -1092,46 +1093,42 @@ encrypted bits: {numerator_bits}, scalar bits: {}
Scalar::BITS
);
unsafe {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
get_scalar_div_rem_integer_radix_kb_size_on_gpu(
streams,
divisor,
self.message_modulus,
self.carry_modulus,
d_bsk.glwe_dimension,
d_bsk.polynomial_size,
d_bsk.input_lwe_dimension,
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count,
d_bsk.decomp_base_log,
LweBskGroupingFactor(0),
num_blocks,
PBSType::Classical,
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_scalar_div_rem_integer_radix_kb_size_on_gpu(
streams,
divisor,
self.message_modulus,
self.carry_modulus,
d_multibit_bsk.glwe_dimension,
d_multibit_bsk.polynomial_size,
d_multibit_bsk.input_lwe_dimension,
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_multibit_bsk.decomp_level_count,
d_multibit_bsk.decomp_base_log,
d_multibit_bsk.grouping_factor,
num_blocks,
PBSType::MultiBit,
None,
)
}
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_scalar_div_rem_size_on_gpu(
streams,
divisor,
self.message_modulus,
self.carry_modulus,
d_bsk.glwe_dimension,
d_bsk.polynomial_size,
d_bsk.input_lwe_dimension,
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count,
d_bsk.decomp_base_log,
LweBskGroupingFactor(0),
num_blocks,
PBSType::Classical,
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
cuda_backend_get_scalar_div_rem_size_on_gpu(
streams,
divisor,
self.message_modulus,
self.carry_modulus,
d_multibit_bsk.glwe_dimension,
d_multibit_bsk.polynomial_size,
d_multibit_bsk.input_lwe_dimension,
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_multibit_bsk.decomp_level_count,
d_multibit_bsk.decomp_base_log,
d_multibit_bsk.grouping_factor,
num_blocks,
PBSType::MultiBit,
None,
)
}
}
}
@@ -1174,27 +1171,25 @@ encrypted bits: {numerator_bits}, scalar bits: {}
);
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
get_signed_scalar_div_integer_radix_kb_size_on_gpu(
streams,
divisor,
self.message_modulus,
self.carry_modulus,
d_bsk.glwe_dimension,
d_bsk.polynomial_size,
d_bsk.input_lwe_dimension,
d_bsk.decomp_base_log,
d_bsk.decomp_level_count,
self.key_switching_key.decomposition_base_log(),
self.key_switching_key.decomposition_level_count(),
LweBskGroupingFactor(0),
num_blocks,
PBSType::Classical,
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_signed_scalar_div_size_on_gpu(
streams,
divisor,
self.message_modulus,
self.carry_modulus,
d_bsk.glwe_dimension,
d_bsk.polynomial_size,
d_bsk.input_lwe_dimension,
d_bsk.decomp_base_log,
d_bsk.decomp_level_count,
self.key_switching_key.decomposition_base_log(),
self.key_switching_key.decomposition_level_count(),
LweBskGroupingFactor(0),
num_blocks,
PBSType::Classical,
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_signed_scalar_div_integer_radix_kb_size_on_gpu(
cuda_backend_get_signed_scalar_div_size_on_gpu(
streams,
divisor,
self.message_modulus,
@@ -1236,46 +1231,44 @@ encrypted bits: {numerator_bits}, scalar bits: {}
>= to the number of bits encrypted in the ciphertext"
);
unsafe {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
get_signed_scalar_div_rem_integer_radix_kb_size_on_gpu(
streams,
divisor,
self.message_modulus,
self.carry_modulus,
d_bsk.glwe_dimension,
d_bsk.polynomial_size,
d_bsk.input_lwe_dimension,
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count,
d_bsk.decomp_base_log,
LweBskGroupingFactor(0),
num_blocks,
PBSType::Classical,
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_signed_scalar_div_rem_integer_radix_kb_size_on_gpu(
streams,
divisor,
self.message_modulus,
self.carry_modulus,
d_multibit_bsk.glwe_dimension,
d_multibit_bsk.polynomial_size,
d_multibit_bsk.input_lwe_dimension,
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_multibit_bsk.decomp_level_count,
d_multibit_bsk.decomp_base_log,
d_multibit_bsk.grouping_factor,
num_blocks,
PBSType::MultiBit,
None,
)
}
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
cuda_backend_get_signed_scalar_div_rem_size_on_gpu(
streams,
divisor,
self.message_modulus,
self.carry_modulus,
d_bsk.glwe_dimension,
d_bsk.polynomial_size,
d_bsk.input_lwe_dimension,
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count,
d_bsk.decomp_base_log,
LweBskGroupingFactor(0),
num_blocks,
PBSType::Classical,
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
cuda_backend_get_signed_scalar_div_rem_size_on_gpu(
streams,
divisor,
self.message_modulus,
self.carry_modulus,
d_multibit_bsk.glwe_dimension,
d_multibit_bsk.polynomial_size,
d_multibit_bsk.input_lwe_dimension,
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_multibit_bsk.decomp_level_count,
d_multibit_bsk.decomp_base_log,
d_multibit_bsk.grouping_factor,
num_blocks,
PBSType::MultiBit,
None,
)
}
}
}
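
A toy sketch of the constant-division trick the scalar division entry points above build on; the library derives its parameters through the `Reciprocable` machinery and its `DoublePrecision` type, whereas the constant below is a simplified choice that is exact for u32 numerators.

// Replace n / d by a double-width multiplication with an approximate
// reciprocal m = floor(2^64 / d) + 1; for 1 < d < 2^32 the result is exact
// for every 32-bit numerator, since the error term e*n stays below 2^64.
fn div_by_const_via_reciprocal(numerator: u32, divisor: u32) -> u32 {
    assert!(divisor > 1);
    let m = (u64::MAX / divisor as u64) + 1;
    ((m as u128 * numerator as u128) >> 64) as u32
}

fn main() {
    for n in [0u32, 1, 6, 7, 8, 1_000_000, u32::MAX] {
        assert_eq!(div_by_const_via_reciprocal(n, 7), n / 7);
    }
}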

View File

@@ -4,8 +4,8 @@ use crate::integer::block_decomposition::{BlockDecomposer, DecomposableInto};
use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext;
use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey};
use crate::integer::gpu::{
get_full_propagate_assign_size_on_gpu, get_scalar_mul_integer_radix_kb_size_on_gpu,
unchecked_scalar_mul_integer_radix_kb_async, PBSType,
cuda_backend_get_full_propagate_assign_size_on_gpu, cuda_backend_get_scalar_mul_size_on_gpu,
cuda_backend_unchecked_scalar_mul, PBSType,
};
use crate::integer::server_key::ScalarMultiplier;
use crate::prelude::CastInto;
@@ -114,7 +114,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_scalar_mul_integer_radix_kb_async(
cuda_backend_unchecked_scalar_mul(
streams,
ct.as_mut(),
decomposed_scalar.as_slice(),
@@ -139,7 +139,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_scalar_mul_integer_radix_kb_async(
cuda_backend_unchecked_scalar_mul(
streams,
ct.as_mut(),
decomposed_scalar.as_slice(),
@@ -286,23 +286,25 @@ impl CudaServerKey {
0
} else {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::Classic(d_bsk) => {
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_full_propagate_assign_size_on_gpu(
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_multibit_bsk.input_lwe_dimension(),
d_multibit_bsk.glwe_dimension(),
@@ -328,7 +330,7 @@ impl CudaServerKey {
return 0;
}
let scalar_mul_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_scalar_mul_integer_radix_kb_size_on_gpu(
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_scalar_mul_size_on_gpu(
streams,
decomposed_scalar.as_slice(),
self.message_modulus,
@@ -348,7 +350,7 @@ impl CudaServerKey {
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_scalar_mul_integer_radix_kb_size_on_gpu(
cuda_backend_get_scalar_mul_size_on_gpu(
streams,
decomposed_scalar.as_slice(),
self.message_modulus,

View File

@@ -3,10 +3,11 @@ use crate::core_crypto::prelude::{CastFrom, LweBskGroupingFactor};
use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext;
use crate::integer::gpu::server_key::CudaBootstrappingKey;
use crate::integer::gpu::{
get_full_propagate_assign_size_on_gpu, get_scalar_rotate_left_integer_radix_kb_size_on_gpu,
get_scalar_rotate_right_integer_radix_kb_size_on_gpu,
unchecked_scalar_rotate_left_integer_radix_kb_assign_async,
unchecked_scalar_rotate_right_integer_radix_kb_assign_async, CudaServerKey, PBSType,
cuda_backend_get_full_propagate_assign_size_on_gpu,
cuda_backend_get_scalar_rotate_left_size_on_gpu,
cuda_backend_unchecked_scalar_rotate_left_assign,
cuda_backend_unchecked_scalar_rotate_right_assign, get_scalar_rotate_right_size_on_gpu,
CudaServerKey, PBSType,
};
impl CudaServerKey {
@@ -47,7 +48,7 @@ impl CudaServerKey {
let lwe_ciphertext_count = ct.as_ref().d_blocks.lwe_ciphertext_count();
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_scalar_rotate_left_integer_radix_kb_assign_async(
cuda_backend_unchecked_scalar_rotate_left_assign(
stream,
ct.as_mut(),
u32::cast_from(n),
@@ -74,7 +75,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_scalar_rotate_left_integer_radix_kb_assign_async(
cuda_backend_unchecked_scalar_rotate_left_assign(
stream,
ct.as_mut(),
u32::cast_from(n),
@@ -156,7 +157,7 @@ impl CudaServerKey {
let lwe_ciphertext_count = ct.as_ref().d_blocks.lwe_ciphertext_count();
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_scalar_rotate_right_integer_radix_kb_assign_async(
cuda_backend_unchecked_scalar_rotate_right_assign(
stream,
ct.as_mut(),
u32::cast_from(n),
@@ -183,7 +184,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_scalar_rotate_right_integer_radix_kb_assign_async(
cuda_backend_unchecked_scalar_rotate_right_assign(
stream,
ct.as_mut(),
u32::cast_from(n),
@@ -287,23 +288,25 @@ impl CudaServerKey {
0
} else {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::Classic(d_bsk) => {
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_full_propagate_assign_size_on_gpu(
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_multibit_bsk.input_lwe_dimension(),
d_multibit_bsk.glwe_dimension(),
@@ -323,7 +326,7 @@ impl CudaServerKey {
};
let scalar_shift_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
get_scalar_rotate_left_integer_radix_kb_size_on_gpu(
cuda_backend_get_scalar_rotate_left_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
@@ -346,7 +349,7 @@ impl CudaServerKey {
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_scalar_rotate_left_integer_radix_kb_size_on_gpu(
cuda_backend_get_scalar_rotate_left_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
@@ -382,23 +385,25 @@ impl CudaServerKey {
0
} else {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::Classic(d_bsk) => {
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_full_propagate_assign_size_on_gpu(
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_multibit_bsk.input_lwe_dimension(),
d_multibit_bsk.glwe_dimension(),
@@ -417,52 +422,48 @@ impl CudaServerKey {
}
};
let scalar_shift_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
get_scalar_rotate_right_integer_radix_kb_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
d_bsk.glwe_dimension,
d_bsk.polynomial_size,
self.key_switching_key
.input_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key
.output_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count,
d_bsk.decomp_base_log,
lwe_ciphertext_count.0 as u32,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_scalar_rotate_right_integer_radix_kb_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
d_multibit_bsk.glwe_dimension,
d_multibit_bsk.polynomial_size,
self.key_switching_key
.input_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key
.output_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_multibit_bsk.decomp_level_count,
d_multibit_bsk.decomp_base_log,
lwe_ciphertext_count.0 as u32,
PBSType::MultiBit,
d_multibit_bsk.grouping_factor,
None,
)
}
CudaBootstrappingKey::Classic(d_bsk) => get_scalar_rotate_right_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
d_bsk.glwe_dimension,
d_bsk.polynomial_size,
self.key_switching_key
.input_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key
.output_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count,
d_bsk.decomp_base_log,
lwe_ciphertext_count.0 as u32,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => get_scalar_rotate_right_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
d_multibit_bsk.glwe_dimension,
d_multibit_bsk.polynomial_size,
self.key_switching_key
.input_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key
.output_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_multibit_bsk.decomp_level_count,
d_multibit_bsk.decomp_base_log,
lwe_ciphertext_count.0 as u32,
PBSType::MultiBit,
d_multibit_bsk.grouping_factor,
None,
),
};
full_prop_mem.max(scalar_shift_mem)
}

View File

@@ -3,13 +3,13 @@ use crate::core_crypto::prelude::{CastFrom, LweBskGroupingFactor};
use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext;
use crate::integer::gpu::server_key::CudaBootstrappingKey;
use crate::integer::gpu::{
get_full_propagate_assign_size_on_gpu,
get_scalar_arithmetic_right_shift_integer_radix_kb_size_on_gpu,
get_scalar_left_shift_integer_radix_kb_size_on_gpu,
get_scalar_logical_right_shift_integer_radix_kb_size_on_gpu,
unchecked_scalar_arithmetic_right_shift_integer_radix_kb_assign_async,
unchecked_scalar_left_shift_integer_radix_kb_assign_async,
unchecked_scalar_logical_right_shift_integer_radix_kb_assign_async, CudaServerKey, PBSType,
cuda_backend_get_full_propagate_assign_size_on_gpu,
cuda_backend_get_scalar_arithmetic_right_shift_size_on_gpu,
cuda_backend_get_scalar_left_shift_size_on_gpu,
cuda_backend_get_scalar_logical_right_shift_size_on_gpu,
cuda_backend_unchecked_scalar_arithmetic_right_shift_assign,
cuda_backend_unchecked_scalar_left_shift_assign,
cuda_backend_unchecked_scalar_logical_right_shift_assign, CudaServerKey, PBSType,
};
impl CudaServerKey {
@@ -51,7 +51,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_scalar_left_shift_integer_radix_kb_assign_async(
cuda_backend_unchecked_scalar_left_shift_assign(
streams,
ct.as_mut(),
u32::cast_from(shift),
@@ -78,7 +78,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_scalar_left_shift_integer_radix_kb_assign_async(
cuda_backend_unchecked_scalar_left_shift_assign(
streams,
ct.as_mut(),
u32::cast_from(shift),
@@ -198,7 +198,7 @@ impl CudaServerKey {
if T::IS_SIGNED {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_scalar_arithmetic_right_shift_integer_radix_kb_assign_async(
cuda_backend_unchecked_scalar_arithmetic_right_shift_assign(
streams,
ct.as_mut(),
u32::cast_from(shift),
@@ -224,7 +224,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_scalar_arithmetic_right_shift_integer_radix_kb_assign_async(
cuda_backend_unchecked_scalar_arithmetic_right_shift_assign(
streams,
ct.as_mut(),
u32::cast_from(shift),
@@ -253,7 +253,7 @@ impl CudaServerKey {
} else {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_scalar_logical_right_shift_integer_radix_kb_assign_async(
cuda_backend_unchecked_scalar_logical_right_shift_assign(
streams,
ct.as_mut(),
u32::cast_from(shift),
@@ -280,7 +280,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_scalar_logical_right_shift_integer_radix_kb_assign_async(
cuda_backend_unchecked_scalar_logical_right_shift_assign(
streams,
ct.as_mut(),
u32::cast_from(shift),
@@ -596,7 +596,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_scalar_logical_right_shift_integer_radix_kb_assign_async(
cuda_backend_unchecked_scalar_logical_right_shift_assign(
streams,
ct.as_mut(),
u32::cast_from(shift),
@@ -623,7 +623,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_scalar_logical_right_shift_integer_radix_kb_assign_async(
cuda_backend_unchecked_scalar_logical_right_shift_assign(
streams,
ct.as_mut(),
u32::cast_from(shift),
@@ -662,23 +662,25 @@ impl CudaServerKey {
0
} else {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::Classic(d_bsk) => {
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_full_propagate_assign_size_on_gpu(
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_multibit_bsk.input_lwe_dimension(),
d_multibit_bsk.glwe_dimension(),
@@ -697,31 +699,29 @@ impl CudaServerKey {
}
};
let scalar_shift_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
get_scalar_left_shift_integer_radix_kb_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
d_bsk.glwe_dimension,
d_bsk.polynomial_size,
self.key_switching_key
.input_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key
.output_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count,
d_bsk.decomp_base_log,
lwe_ciphertext_count.0 as u32,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_scalar_left_shift_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
d_bsk.glwe_dimension,
d_bsk.polynomial_size,
self.key_switching_key
.input_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key
.output_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count,
d_bsk.decomp_base_log,
lwe_ciphertext_count.0 as u32,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_scalar_left_shift_integer_radix_kb_size_on_gpu(
cuda_backend_get_scalar_left_shift_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
@@ -757,23 +757,25 @@ impl CudaServerKey {
0
} else {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::Classic(d_bsk) => {
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_full_propagate_assign_size_on_gpu(
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_multibit_bsk.input_lwe_dimension(),
d_multibit_bsk.glwe_dimension(),
@@ -794,7 +796,7 @@ impl CudaServerKey {
let scalar_shift_mem = if T::IS_SIGNED {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
get_scalar_arithmetic_right_shift_integer_radix_kb_size_on_gpu(
cuda_backend_get_scalar_arithmetic_right_shift_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
@@ -817,7 +819,7 @@ impl CudaServerKey {
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_scalar_arithmetic_right_shift_integer_radix_kb_size_on_gpu(
cuda_backend_get_scalar_arithmetic_right_shift_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
@@ -843,7 +845,7 @@ impl CudaServerKey {
} else {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
get_scalar_logical_right_shift_integer_radix_kb_size_on_gpu(
cuda_backend_get_scalar_logical_right_shift_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
@@ -866,7 +868,7 @@ impl CudaServerKey {
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_scalar_logical_right_shift_integer_radix_kb_size_on_gpu(
cuda_backend_get_scalar_logical_right_shift_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,

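Note on the hunks above: every size query follows one dispatch shape. A match on the bootstrapping key passes PBSType::Classical together with LweBskGroupingFactor(0) for classic keys, and PBSType::MultiBit together with the key's own grouping factor for multi-bit keys. The sketch below mirrors that shape with simplified stand-in types; it is not the tfhe-rs API, only an illustration of the pattern the renaming preserves.

    // Simplified, hypothetical stand-ins for the real CUDA key types.
    enum PbsType {
        Classical,
        MultiBit,
    }

    struct ClassicBsk;
    struct MultiBitBsk {
        grouping_factor: u32,
    }

    enum BootstrappingKey {
        Classic(ClassicBsk),
        MultiBit(MultiBitBsk),
    }

    // Hypothetical stand-in for the cuda_backend_get_*_size_on_gpu family.
    fn backend_size_query(pbs_type: PbsType, grouping_factor: u32) -> u64 {
        match pbs_type {
            PbsType::Classical => 1 << 20,
            // Multi-bit PBS scratch grows with the grouping factor here.
            PbsType::MultiBit => u64::from(grouping_factor) * (1 << 20),
        }
    }

    fn size_on_gpu(key: &BootstrappingKey) -> u64 {
        match key {
            // Classic keys always report a grouping factor of 0.
            BootstrappingKey::Classic(_) => backend_size_query(PbsType::Classical, 0),
            // Multi-bit keys forward their own grouping factor.
            BootstrappingKey::MultiBit(k) => {
                backend_size_query(PbsType::MultiBit, k.grouping_factor)
            }
        }
    }

    fn main() {
        let key = BootstrappingKey::MultiBit(MultiBitBsk { grouping_factor: 3 });
        println!("scratch bytes: {}", size_on_gpu(&key));
    }
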
View File

@@ -3,10 +3,9 @@ use crate::core_crypto::prelude::LweBskGroupingFactor;
use crate::integer::gpu::ciphertext::{CudaIntegerRadixCiphertext, CudaUnsignedRadixCiphertext};
use crate::integer::gpu::server_key::CudaBootstrappingKey;
use crate::integer::gpu::{
get_full_propagate_assign_size_on_gpu, get_left_shift_integer_radix_kb_size_on_gpu,
get_right_shift_integer_radix_kb_size_on_gpu,
unchecked_left_shift_integer_radix_kb_assign_async,
unchecked_right_shift_integer_radix_kb_assign_async, CudaServerKey, PBSType,
cuda_backend_get_full_propagate_assign_size_on_gpu, cuda_backend_get_left_shift_size_on_gpu,
cuda_backend_get_right_shift_size_on_gpu, cuda_backend_unchecked_left_shift_assign,
cuda_backend_unchecked_right_shift_assign, CudaServerKey, PBSType,
};
impl CudaServerKey {
@@ -27,7 +26,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_right_shift_integer_radix_kb_assign_async(
cuda_backend_unchecked_right_shift_assign(
streams,
ct.as_mut(),
shift.as_ref(),
@@ -55,7 +54,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_right_shift_integer_radix_kb_assign_async(
cuda_backend_unchecked_right_shift_assign(
streams,
ct.as_mut(),
shift.as_ref(),
@@ -146,7 +145,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_left_shift_integer_radix_kb_assign_async(
cuda_backend_unchecked_left_shift_assign(
streams,
ct.as_mut(),
shift.as_ref(),
@@ -174,7 +173,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_left_shift_integer_radix_kb_assign_async(
cuda_backend_unchecked_left_shift_assign(
streams,
ct.as_mut(),
shift.as_ref(),
@@ -569,23 +568,25 @@ impl CudaServerKey {
ct_right.as_ref().d_blocks.lwe_ciphertext_count()
);
let full_prop_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::Classic(d_bsk) => {
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_full_propagate_assign_size_on_gpu(
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_multibit_bsk.input_lwe_dimension(),
d_multibit_bsk.glwe_dimension(),
@@ -615,7 +616,7 @@ impl CudaServerKey {
let lwe_ciphertext_count = ct_left.as_ref().d_blocks.lwe_ciphertext_count();
let shift_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_left_shift_integer_radix_kb_size_on_gpu(
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_left_shift_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
@@ -638,7 +639,7 @@ impl CudaServerKey {
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_left_shift_integer_radix_kb_size_on_gpu(
cuda_backend_get_left_shift_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
@@ -680,23 +681,25 @@ impl CudaServerKey {
ct_right.as_ref().d_blocks.lwe_ciphertext_count()
);
let full_prop_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::Classic(d_bsk) => {
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_full_propagate_assign_size_on_gpu(
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_multibit_bsk.input_lwe_dimension(),
d_multibit_bsk.glwe_dimension(),
@@ -726,7 +729,7 @@ impl CudaServerKey {
let lwe_ciphertext_count = ct_left.as_ref().d_blocks.lwe_ciphertext_count();
let shift_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_right_shift_integer_radix_kb_size_on_gpu(
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_right_shift_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
@@ -749,7 +752,7 @@ impl CudaServerKey {
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_right_shift_integer_radix_kb_size_on_gpu(
cuda_backend_get_right_shift_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,

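The cuda_backend_get_*_size_on_gpu entry points in the file above let a caller estimate scratch memory before scheduling a shift. Below is a hedged caller-side sketch of that check; query_shift_scratch_size and free_device_memory are hypothetical stand-ins, not tfhe-rs or CUDA functions.

    // Hypothetical stand-in for cuda_backend_get_left_shift_size_on_gpu:
    // scratch grows with the number of radix blocks.
    fn query_shift_scratch_size(num_blocks: u32) -> u64 {
        u64::from(num_blocks) * (1 << 16)
    }

    // Hypothetical stand-in for a cudaMemGetInfo-style free-memory query.
    fn free_device_memory() -> u64 {
        8 * (1 << 30)
    }

    // Fail fast instead of letting the kernel launch run out of memory.
    fn can_run_shift(num_blocks: u32) -> bool {
        query_shift_scratch_size(num_blocks) <= free_device_memory()
    }

    fn main() {
        assert!(can_run_shift(32));
        println!("a 32-block shift fits in device memory");
    }
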
View File

@@ -8,8 +8,8 @@ use crate::integer::gpu::server_key::CudaServerKey;
use crate::integer::gpu::server_key::CudaBootstrappingKey;
use crate::integer::gpu::{
sub_and_propagate_single_carry_assign_async,
unchecked_unsigned_overflowing_sub_integer_radix_kb_assign_async, PBSType,
cuda_backend_sub_and_propagate_single_carry_assign,
cuda_backend_unchecked_unsigned_overflowing_sub_assign, PBSType,
};
use crate::integer::server_key::radix_parallel::OutputFlag;
use crate::shortint::parameters::LweBskGroupingFactor;
@@ -264,7 +264,7 @@ impl CudaServerKey {
) {
(true, true) => (ct_left, ct_right),
(true, false) => {
tmp_rhs = ct_right.duplicate_async(streams);
tmp_rhs = ct_right.duplicate(streams);
self.full_propagate_assign_async(&mut tmp_rhs, streams);
(ct_left, &tmp_rhs)
}
@@ -273,7 +273,7 @@ impl CudaServerKey {
(ct_left, ct_right)
}
(false, false) => {
tmp_rhs = ct_right.duplicate_async(streams);
tmp_rhs = ct_right.duplicate(streams);
self.full_propagate_assign_async(ct_left, streams);
self.full_propagate_assign_async(&mut tmp_rhs, streams);
@@ -281,13 +281,8 @@ impl CudaServerKey {
}
};
let _carry = self.sub_and_propagate_single_carry_assign_async(
lhs,
rhs,
streams,
None,
OutputFlag::None,
);
let _carry =
self.sub_and_propagate_single_carry_assign(lhs, rhs, streams, None, OutputFlag::None);
}
pub fn get_sub_assign_size_on_gpu<T: CudaIntegerRadixCiphertext>(
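Both subtraction paths above prepare operands the same way: an operand whose carries are already propagated is used as-is; otherwise it is duplicated, so the caller's ciphertext survives, and carry-propagated in place. A minimal sketch of that four-way match on plain values, assuming simplified types (full_propagate stands in for full_propagate_assign_async):

    #[derive(Clone)]
    struct Operand {
        blocks: Vec<u8>, // stand-in for the radix blocks
        carries_empty: bool,
    }

    // Stand-in for full_propagate_assign_async: fold carries into messages.
    fn full_propagate(op: &mut Operand) {
        op.carries_empty = true;
    }

    fn prepare<'a>(
        lhs: &'a Operand,
        rhs: &'a Operand,
        tmp_lhs: &'a mut Operand,
        tmp_rhs: &'a mut Operand,
    ) -> (&'a Operand, &'a Operand) {
        match (lhs.carries_empty, rhs.carries_empty) {
            (true, true) => (lhs, rhs),
            (true, false) => {
                // Clone the rhs so the caller's value is untouched.
                *tmp_rhs = rhs.clone();
                full_propagate(tmp_rhs);
                (lhs, tmp_rhs)
            }
            (false, true) => {
                *tmp_lhs = lhs.clone();
                full_propagate(tmp_lhs);
                (tmp_lhs, rhs)
            }
            (false, false) => {
                *tmp_lhs = lhs.clone();
                *tmp_rhs = rhs.clone();
                full_propagate(tmp_lhs);
                full_propagate(tmp_rhs);
                (tmp_lhs, tmp_rhs)
            }
        }
    }

    fn main() {
        let a = Operand { blocks: vec![1, 2], carries_empty: false };
        let b = Operand { blocks: vec![3, 4], carries_empty: true };
        let (mut ta, mut tb) = (b.clone(), b.clone());
        let (lhs, rhs) = prepare(&a, &b, &mut ta, &mut tb);
        assert!(lhs.carries_empty && rhs.carries_empty);
    }
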
@@ -314,22 +309,22 @@ impl CudaServerKey {
(true, true) => (ct_left, ct_right),
(true, false) => {
unsafe {
tmp_rhs = ct_right.duplicate_async(stream);
tmp_rhs = ct_right.duplicate(stream);
self.full_propagate_assign_async(&mut tmp_rhs, stream);
}
(ct_left, &tmp_rhs)
}
(false, true) => {
unsafe {
tmp_lhs = ct_left.duplicate_async(stream);
tmp_lhs = ct_left.duplicate(stream);
self.full_propagate_assign_async(&mut tmp_lhs, stream);
}
(&tmp_lhs, ct_right)
}
(false, false) => {
unsafe {
tmp_lhs = ct_left.duplicate_async(stream);
tmp_rhs = ct_right.duplicate_async(stream);
tmp_lhs = ct_left.duplicate(stream);
tmp_rhs = ct_right.duplicate(stream);
self.full_propagate_assign_async(&mut tmp_lhs, stream);
self.full_propagate_assign_async(&mut tmp_rhs, stream);
@@ -383,17 +378,18 @@ impl CudaServerKey {
const INPUT_BORROW: Option<&CudaBooleanBlock> = None;
let mut overflow_block: CudaUnsignedRadixCiphertext =
self.create_trivial_zero_radix(1, stream);
self.create_trivial_zero_radix_async(1, stream);
let ciphertext = ct_res.as_mut();
let uses_input_borrow = INPUT_BORROW.map_or(0u32, |_block| 1u32);
let aux_block: CudaUnsignedRadixCiphertext = self.create_trivial_zero_radix(1, stream);
let aux_block: CudaUnsignedRadixCiphertext =
self.create_trivial_zero_radix_async(1, stream);
let in_carry_dvec =
INPUT_BORROW.map_or_else(|| aux_block.as_ref(), |block| block.as_ref().as_ref());
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_unsigned_overflowing_sub_integer_radix_kb_assign_async(
cuda_backend_unchecked_unsigned_overflowing_sub_assign(
stream,
ciphertext,
rhs.as_ref(),
@@ -418,7 +414,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_unsigned_overflowing_sub_integer_radix_kb_assign_async(
cuda_backend_unchecked_unsigned_overflowing_sub_assign(
stream,
ciphertext,
rhs.as_ref(),
@@ -452,7 +448,7 @@ impl CudaServerKey {
///
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronized
pub(crate) unsafe fn sub_and_propagate_single_carry_assign_async<T>(
pub(crate) unsafe fn sub_and_propagate_single_carry_assign<T>(
&self,
lhs: &mut T,
rhs: &T,
@@ -463,17 +459,17 @@ impl CudaServerKey {
where
T: CudaIntegerRadixCiphertext,
{
let mut carry_out: T = self.create_trivial_zero_radix(1, streams);
let mut carry_out: T = self.create_trivial_zero_radix_async(1, streams);
let num_blocks = lhs.as_mut().d_blocks.lwe_ciphertext_count().0 as u32;
let uses_carry = input_carry.map_or(0u32, |_block| 1u32);
let aux_block: T = self.create_trivial_zero_radix(1, streams);
let aux_block: T = self.create_trivial_zero_radix_async(1, streams);
let in_carry: &CudaRadixCiphertext =
input_carry.map_or_else(|| aux_block.as_ref(), |block| block.0.as_ref());
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
sub_and_propagate_single_carry_assign_async(
cuda_backend_sub_and_propagate_single_carry_assign(
streams,
lhs.as_mut(),
rhs.as_ref(),
@@ -499,7 +495,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
sub_and_propagate_single_carry_assign_async(
cuda_backend_sub_and_propagate_single_carry_assign(
streams,
lhs.as_mut(),
rhs.as_ref(),
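A final note on the optional borrow plumbing above: the backend entry point always receives a carry buffer plus a uses_carry flag, so a None input carry is replaced by a trivially encrypted zero block (the aux_block) and the flag tells the kernel to ignore it. The sketch below reproduces that plumbing on plain integers; every name in it is a simplified stand-in, not the tfhe-rs API.

    struct Block(u64);

    // Stand-in for create_trivial_zero_radix_async(1, streams).
    fn trivial_zero_block() -> Block {
        Block(0)
    }

    // Stand-in for the backend sub entry point: the kernel only reads the
    // carry buffer when the flag is set.
    fn backend_sub_with_carry(lhs: &mut u64, rhs: u64, carry_in: &Block, uses_carry: u32) {
        let c = if uses_carry == 1 { carry_in.0 } else { 0 };
        *lhs = lhs.wrapping_sub(rhs).wrapping_sub(c);
    }

    fn sub_assign(lhs: &mut u64, rhs: u64, input_carry: Option<&Block>) {
        let aux_block = trivial_zero_block();
        let uses_carry = input_carry.map_or(0u32, |_| 1u32);
        // Mirrors input_carry.map_or_else(|| aux_block.as_ref(), ...).
        let in_carry = input_carry.unwrap_or(&aux_block);
        backend_sub_with_carry(lhs, rhs, in_carry, uses_carry);
    }

    fn main() {
        let mut x = 10u64;
        sub_assign(&mut x, 3, None);
        assert_eq!(x, 7);
        let borrow = Block(1);
        sub_assign(&mut x, 3, Some(&borrow));
        assert_eq!(x, 3);
    }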