Mirror of https://github.com/zama-ai/tfhe-rs.git, synced 2026-01-11 15:48:20 -05:00
Compare commits: bb/fix/sum ... al/debug_l (1 commit)
Commit: de710cb2fb
@@ -400,8 +400,7 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI *radix_lwe_vec, bool reduce_degrees_for_single_carry_propagation,
int8_t *mem_ptr, void *const *bsks,
CudaRadixCiphertextFFI *radix_lwe_vec, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);

@@ -1116,116 +1116,18 @@ template <typename Torus> struct int_overflowing_sub_memory {
};

template <typename Torus> struct int_sum_ciphertexts_vec_memory {
int_radix_params params;
size_t max_total_blocks_in_vec;
uint32_t num_blocks_in_radix;
uint32_t max_num_radix_in_vec;
uint64_t *size_tracker;
bool gpu_memory_allocated;

// temporary buffers
CudaRadixCiphertextFFI *current_blocks;
CudaRadixCiphertextFFI *new_blocks;
CudaRadixCiphertextFFI *new_blocks_copy;
CudaRadixCiphertextFFI *old_blocks;
CudaRadixCiphertextFFI *small_lwe_vector;
int_radix_params params;

uint32_t *d_columns_data;
uint32_t *d_columns_counter;
uint32_t **d_columns;

uint32_t *d_new_columns_data;
uint32_t *d_new_columns_counter;
uint32_t **d_new_columns;

uint64_t *d_degrees;
uint32_t *d_pbs_counters;

// lookup table for extracting message and carry
int_radix_lut<Torus> *luts_message_carry;
int32_t *d_smart_copy_in;
int32_t *d_smart_copy_out;

bool mem_reuse = false;
bool gpu_memory_allocated;

void setup_index_buffers(cudaStream_t const *streams,
uint32_t const *gpu_indexes) {

d_degrees = (uint64_t *)cuda_malloc_with_size_tracking_async(
max_total_blocks_in_vec * sizeof(uint64_t), streams[0], gpu_indexes[0],
size_tracker, gpu_memory_allocated);

d_pbs_counters = (uint32_t *)cuda_malloc_with_size_tracking_async(
3 * sizeof(uint32_t), streams[0], gpu_indexes[0], size_tracker,
gpu_memory_allocated);

auto num_blocks_in_radix = this->num_blocks_in_radix;
auto max_num_radix_in_vec = this->max_num_radix_in_vec;
auto setup_columns =
[num_blocks_in_radix, max_num_radix_in_vec, streams,
gpu_indexes](uint32_t **&columns, uint32_t *&columns_data,
uint32_t *&columns_counter, uint64_t *size_tracker,
bool gpu_memory_allocated) {
columns_data = (uint32_t *)cuda_malloc_with_size_tracking_async(
num_blocks_in_radix * max_num_radix_in_vec * sizeof(uint32_t),
streams[0], gpu_indexes[0], size_tracker, gpu_memory_allocated);
columns_counter = (uint32_t *)cuda_malloc_with_size_tracking_async(
num_blocks_in_radix * sizeof(uint32_t), streams[0],
gpu_indexes[0], size_tracker, gpu_memory_allocated);
cuda_memset_with_size_tracking_async(
columns_counter, 0, num_blocks_in_radix * sizeof(uint32_t),
streams[0], gpu_indexes[0], gpu_memory_allocated);
uint32_t **h_columns = new uint32_t *[num_blocks_in_radix];
for (int i = 0; i < num_blocks_in_radix; ++i) {
h_columns[i] = columns_data + i * max_num_radix_in_vec;
}
columns = (uint32_t **)cuda_malloc_with_size_tracking_async(
num_blocks_in_radix * sizeof(uint32_t *), streams[0],
gpu_indexes[0], size_tracker, gpu_memory_allocated);
cuda_memcpy_async_to_gpu(columns, h_columns,
num_blocks_in_radix * sizeof(uint32_t *),
streams[0], gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
delete[] h_columns;
};

setup_columns(d_columns, d_columns_data, d_columns_counter, size_tracker,
gpu_memory_allocated);
setup_columns(d_new_columns, d_new_columns_data, d_new_columns_counter,
size_tracker, gpu_memory_allocated);
}
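For orientation, here is a minimal host-side sketch (an assumption for illustration, not part of the diff) of the column layout that setup_columns allocates: columns_data is a row-major num_blocks_in_radix x max_num_radix_in_vec array, columns[j] points at row j, and radix_vec_to_columns later fills row j with the indexes of the non-zero blocks belonging to column j.

// Illustrative sketch only; the type and field names are assumptions.
#include <cstdint>
#include <vector>

struct HostColumns {
  std::vector<uint32_t> data;     // num_blocks_in_radix * max_num_radix_in_vec entries
  std::vector<uint32_t *> rows;   // rows[j] == &data[j * max_num_radix_in_vec]
  std::vector<uint32_t> counter;  // counter[j] == number of live blocks in column j

  HostColumns(uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec)
      : data(size_t(num_blocks_in_radix) * max_num_radix_in_vec, 0),
        rows(num_blocks_in_radix), counter(num_blocks_in_radix, 0) {
    // Mirrors h_columns[i] = columns_data + i * max_num_radix_in_vec above.
    for (uint32_t j = 0; j < num_blocks_in_radix; ++j)
      rows[j] = data.data() + size_t(j) * max_num_radix_in_vec;
  }
};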
void setup_lookup_tables(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count) {
uint32_t message_modulus = params.message_modulus;

if (!mem_reuse) {
luts_message_carry = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, params, 2, max_total_blocks_in_vec,
gpu_memory_allocated, size_tracker);
}
auto message_acc = luts_message_carry->get_lut(0, 0);
auto carry_acc = luts_message_carry->get_lut(0, 1);

// define functions for each accumulator
auto lut_f_message = [message_modulus](Torus x) -> Torus {
return x % message_modulus;
};
auto lut_f_carry = [message_modulus](Torus x) -> Torus {
return x / message_modulus;
};

// generate accumulators
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], message_acc,
luts_message_carry->get_degree(0),
luts_message_carry->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, message_modulus, params.carry_modulus,
lut_f_message, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], carry_acc,
luts_message_carry->get_degree(1),
luts_message_carry->get_max_degree(1), params.glwe_dimension,
params.polynomial_size, message_modulus, params.carry_modulus,
lut_f_carry, gpu_memory_allocated);
luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
}
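As a quick sanity check of the two accumulators above (message = x % message_modulus, carry = x / message_modulus), a small compile-time example; the parameter value is an assumption (2_2-style parameters with message_modulus = 4) and is illustrative only.

// Illustrative example (assumption: message_modulus == 4).
// A block holding the clear value 7 after a sum splits into message 3 and carry 1.
constexpr uint64_t example_message_modulus = 4;
constexpr uint64_t example_block_value = 7;
static_assert(example_block_value % example_message_modulus == 3, "message part");
static_assert(example_block_value / example_message_modulus == 1, "carry part");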
int_sum_ciphertexts_vec_memory(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params params,
@@ -1234,84 +1136,103 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
bool allocate_gpu_memory,
uint64_t *size_tracker) {
this->params = params;
this->mem_reuse = false;
this->max_total_blocks_in_vec = num_blocks_in_radix * max_num_radix_in_vec;
this->num_blocks_in_radix = num_blocks_in_radix;
this->max_num_radix_in_vec = max_num_radix_in_vec;
this->gpu_memory_allocated = allocate_gpu_memory;
this->size_tracker = size_tracker;
gpu_memory_allocated = allocate_gpu_memory;

setup_index_buffers(streams, gpu_indexes);
setup_lookup_tables(streams, gpu_indexes, gpu_count);
int max_pbs_count = num_blocks_in_radix * max_num_radix_in_vec;

// create and allocate intermediate buffers
current_blocks = new CudaRadixCiphertextFFI;
// allocate gpu memory for intermediate buffers
new_blocks = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], current_blocks, max_total_blocks_in_vec,
streams[0], gpu_indexes[0], new_blocks, max_pbs_count,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
new_blocks_copy = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], new_blocks_copy, max_pbs_count,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
old_blocks = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], old_blocks, max_pbs_count,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
small_lwe_vector = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], small_lwe_vector, max_total_blocks_in_vec,
streams[0], gpu_indexes[0], small_lwe_vector, max_pbs_count,
params.small_lwe_dimension, size_tracker, allocate_gpu_memory);

d_smart_copy_in = (int32_t *)cuda_malloc_with_size_tracking_async(
max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0],
size_tracker, allocate_gpu_memory);
d_smart_copy_out = (int32_t *)cuda_malloc_with_size_tracking_async(
max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0],
size_tracker, allocate_gpu_memory);
cuda_memset_with_size_tracking_async(
d_smart_copy_in, 0, max_pbs_count * sizeof(int32_t), streams[0],
gpu_indexes[0], allocate_gpu_memory);
cuda_memset_with_size_tracking_async(
d_smart_copy_out, 0, max_pbs_count * sizeof(int32_t), streams[0],
gpu_indexes[0], allocate_gpu_memory);
}

int_sum_ciphertexts_vec_memory(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params params, uint32_t num_blocks_in_radix,
uint32_t max_num_radix_in_vec, CudaRadixCiphertextFFI *current_blocks,
CudaRadixCiphertextFFI *small_lwe_vector,
int_radix_lut<Torus> *reused_lut, bool allocate_gpu_memory,
uint32_t max_num_radix_in_vec, CudaRadixCiphertextFFI *new_blocks,
CudaRadixCiphertextFFI *old_blocks,
CudaRadixCiphertextFFI *small_lwe_vector, bool allocate_gpu_memory,
uint64_t *size_tracker) {
this->mem_reuse = true;
mem_reuse = true;
gpu_memory_allocated = allocate_gpu_memory;
this->params = params;
this->max_total_blocks_in_vec = num_blocks_in_radix * max_num_radix_in_vec;
this->num_blocks_in_radix = num_blocks_in_radix;
this->max_num_radix_in_vec = max_num_radix_in_vec;
this->gpu_memory_allocated = allocate_gpu_memory;
this->size_tracker = size_tracker;

this->current_blocks = current_blocks;
int max_pbs_count = num_blocks_in_radix * max_num_radix_in_vec;

// assign gpu memory for intermediate buffers
this->new_blocks = new_blocks;
this->old_blocks = old_blocks;
this->small_lwe_vector = small_lwe_vector;
this->luts_message_carry = reused_lut;
setup_index_buffers(streams, gpu_indexes);
new_blocks_copy = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], new_blocks_copy, max_pbs_count,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);

d_smart_copy_in = (int32_t *)cuda_malloc_with_size_tracking_async(
max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0],
size_tracker, allocate_gpu_memory);
d_smart_copy_out = (int32_t *)cuda_malloc_with_size_tracking_async(
max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0],
size_tracker, allocate_gpu_memory);
cuda_memset_with_size_tracking_async(
d_smart_copy_in, 0, max_pbs_count * sizeof(int32_t), streams[0],
gpu_indexes[0], allocate_gpu_memory);
cuda_memset_with_size_tracking_async(
d_smart_copy_out, 0, max_pbs_count * sizeof(int32_t), streams[0],
gpu_indexes[0], allocate_gpu_memory);
}

void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
cuda_drop_with_size_tracking_async(d_degrees, streams[0], gpu_indexes[0],
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_pbs_counters, streams[0],
cuda_drop_with_size_tracking_async(d_smart_copy_in, streams[0],
gpu_indexes[0], gpu_memory_allocated);

cuda_drop_with_size_tracking_async(d_columns_data, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_columns_counter, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_columns, streams[0], gpu_indexes[0],
gpu_memory_allocated);

cuda_drop_with_size_tracking_async(d_new_columns_data, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_new_columns_counter, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_new_columns, streams[0],
cuda_drop_with_size_tracking_async(d_smart_copy_out, streams[0],
gpu_indexes[0], gpu_memory_allocated);

if (!mem_reuse) {
release_radix_ciphertext_async(streams[0], gpu_indexes[0], current_blocks,
release_radix_ciphertext_async(streams[0], gpu_indexes[0], new_blocks,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], old_blocks,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0],
small_lwe_vector, gpu_memory_allocated);
luts_message_carry->release(streams, gpu_indexes, gpu_count);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);

delete current_blocks;
delete new_blocks;
delete old_blocks;
delete small_lwe_vector;
delete luts_message_carry;
}
release_radix_ciphertext_async(streams[0], gpu_indexes[0], new_blocks_copy,
gpu_memory_allocated);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
delete new_blocks_copy;
}
};
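For readability, a hedged note on what the three d_pbs_counters slots allocated by this struct appear to hold (this matches how h_pbs_counters is read back later in the host loop); the enum name below is an assumption, not part of the diff.

// Illustrative only: the three uint32_t counters are used as
//   [0] total ciphertexts to bootstrap this pass,
//   [1] ciphertexts going through the message LUT,
//   [2] non-zero when at least one column still needs another reduction pass.
enum PbsCounterSlot : uint32_t { TOTAL_CTS = 0, MESSAGE_CTS = 1, NEEDS_PROCESSING = 2 };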
// For sequential algorithm in group propagation
template <typename Torus> struct int_seq_group_prop_memory {

@@ -2628,7 +2549,7 @@ template <typename Torus> struct int_mul_memory {
// radix_lwe_left except the last blocks of each shift
int msb_vector_block_count = num_radix_blocks * (num_radix_blocks - 1) / 2;

int total_block_count = num_radix_blocks * num_radix_blocks;
int total_block_count = lsb_vector_block_count + msb_vector_block_count;

// allocate memory for intermediate buffers
vector_result_sb = new CudaRadixCiphertextFFI;
@@ -2641,13 +2562,13 @@ template <typename Torus> struct int_mul_memory {
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
small_lwe_vector = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], small_lwe_vector, 2 * total_block_count,
streams[0], gpu_indexes[0], small_lwe_vector, total_block_count,
params.small_lwe_dimension, size_tracker, allocate_gpu_memory);

// create int_radix_lut objects for lsb, msb, message, carry
// luts_array -> lut = {lsb_acc, msb_acc}
luts_array = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count,
params, 2, 2 * total_block_count,
params, 2, total_block_count,
allocate_gpu_memory, size_tracker);
auto lsb_acc = luts_array->get_lut(0, 0);
auto msb_acc = luts_array->get_lut(0, 1);
@@ -2681,10 +2602,9 @@ template <typename Torus> struct int_mul_memory {

luts_array->broadcast_lut(streams, gpu_indexes, 0);
// create memory object for sum ciphertexts
// create memory object for sum ciphertexts
sum_ciphertexts_mem = new int_sum_ciphertexts_vec_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
2 * num_radix_blocks, vector_result_sb, small_lwe_vector, luts_array,
2 * num_radix_blocks, block_mul_res, vector_result_sb, small_lwe_vector,
allocate_gpu_memory, size_tracker);
uint32_t uses_carry = 0;
uint32_t requested_flag = outputFlag::FLAG_NONE;

@@ -520,7 +520,8 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
if (num_radix_blocks > lut->num_blocks)
PANIC("Cuda error: num radix blocks on which lut is applied should be "
"smaller or equal to the number of lut radix blocks")
if (num_radix_blocks > lwe_array_out->num_radix_blocks)
if (num_radix_blocks > lwe_array_out->num_radix_blocks ||
num_radix_blocks > lwe_array_in->num_radix_blocks)
PANIC("Cuda error: num radix blocks on which lut is applied should be "
"smaller or equal to the number of input & output radix blocks")

@@ -226,8 +226,7 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI *radix_lwe_vec, bool reduce_degrees_for_single_carry_propagation,
int8_t *mem_ptr, void *const *bsks,
CudaRadixCiphertextFFI *radix_lwe_vec, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

@@ -235,59 +234,64 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
if (radix_lwe_vec->num_radix_blocks % radix_lwe_out->num_radix_blocks != 0)
PANIC("Cuda error: input vector length should be a multiple of the "
"output's number of radix blocks")
// FIXME: this should not be necessary, we should make sure sum_ctxt works in
// the general case
for (int i = 0; i < radix_lwe_vec->num_radix_blocks; i++) {
radix_lwe_vec->degrees[i] = mem->params.message_modulus - 1;
}
switch (mem->params.polynomial_size) {
case 512:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<512>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, mem,
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
nullptr);
break;
case 1024:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
AmortizedDegree<1024>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, mem,
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
nullptr);
break;
case 2048:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
AmortizedDegree<2048>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, mem,
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
nullptr);
break;
case 4096:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
AmortizedDegree<4096>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, mem,
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
nullptr);
break;
case 8192:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
AmortizedDegree<8192>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, mem,
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
nullptr);
break;
case 16384:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
AmortizedDegree<16384>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, mem,
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
nullptr);
break;
default:
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "

@@ -20,11 +20,28 @@
#include <fstream>
#include <iostream>
#include <omp.h>
#include <queue>
#include <sstream>
#include <string>
#include <vector>

template <typename Torus>
__global__ void smart_copy(Torus *dst, Torus *src, int32_t *id_out,
int32_t *id_in, size_t lwe_size) {
size_t tid = threadIdx.x;
size_t b_id = blockIdx.x;
size_t stride = blockDim.x;

auto input_id = id_in[b_id];
auto output_id = id_out[b_id];

auto cur_src = (input_id >= 0) ? &src[input_id * lwe_size] : nullptr;
auto cur_dst = &dst[output_id * lwe_size];

for (int i = tid; i < lwe_size; i += stride) {
cur_dst[i] = (input_id >= 0) ? cur_src[i] : 0;
}
}
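A small hedged reference for the copy pattern smart_copy implements: one CUDA block per (id_out, id_in) pair, copying an LWE-sized row and writing zeros when the input index is negative. The host-side equivalent below is an assumption for illustration, not part of the diff.

// Illustrative host-side equivalent of the kernel above (reference only).
#include <cstddef>
#include <cstdint>
template <typename Torus>
void smart_copy_host(Torus *dst, const Torus *src, const int32_t *id_out,
                     const int32_t *id_in, size_t num_pairs, size_t lwe_size) {
  for (size_t b = 0; b < num_pairs; ++b)
    for (size_t i = 0; i < lwe_size; ++i)
      dst[size_t(id_out[b]) * lwe_size + i] =
          (id_in[b] >= 0) ? src[size_t(id_in[b]) * lwe_size + i] : Torus(0);
}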
template <typename Torus, class params>
__global__ void
all_shifted_lhs_rhs(Torus const *radix_lwe_left, Torus *lsb_ciphertext,
@@ -77,173 +94,33 @@ all_shifted_lhs_rhs(Torus const *radix_lwe_left, Torus *lsb_ciphertext,
}
}

__global__ inline void radix_vec_to_columns(uint32_t *const *const columns,
uint32_t *const columns_counter,
const uint64_t *const degrees,
const uint32_t num_radix_blocks,
const uint32_t num_radix_in_vec) {
template <typename Torus>
__global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
uint32_t chunk_size, uint32_t block_size,
uint32_t num_blocks) {

const uint32_t idx = threadIdx.x;
size_t cnt = 0;
for (int i = 0; i < num_radix_in_vec; i++) {
size_t ct_id = i * num_radix_blocks + idx;
if (degrees[ct_id] != 0) {
columns[idx][cnt] = ct_id;
++cnt;
size_t stride = blockDim.x;
size_t chunk_id = blockIdx.x;
size_t chunk_elem_size = chunk_size * num_blocks * block_size;
size_t radix_elem_size = num_blocks * block_size;
auto src_chunk = &input_blocks[chunk_id * chunk_elem_size];
auto dst_radix = &result_blocks[chunk_id * radix_elem_size];
size_t block_stride = blockIdx.y * block_size;
auto result = &dst_radix[block_stride];

// init shared mem with first radix of chunk
size_t tid = threadIdx.x;
for (int i = tid; i < block_size; i += stride) {
result[i] = src_chunk[block_stride + i];
}

// accumulate rest of the radixes
for (int r_id = 1; r_id < chunk_size; r_id++) {
auto cur_src_radix = &src_chunk[r_id * radix_elem_size];
for (int i = tid; i < block_size; i += stride) {
result[i] += cur_src_radix[block_stride + i];
}
}
columns_counter[idx] = cnt;
}

template <typename Torus>
__global__ inline void prepare_new_columns_and_pbs_indexes(
uint32_t *const *const new_columns, uint32_t *const new_columns_counter,
Torus *const pbs_indexes_in, Torus *const pbs_indexes_out,
Torus *const lut_indexes, uint32_t *const pbs_counters,
const uint32_t *const *const columns, const uint32_t *const columns_counter,
const uint32_t chunk_size) {
__shared__ uint32_t counter, sharedOr;

if (threadIdx.x == 0) {
counter = 0;
sharedOr = 0;
}
__syncthreads();

const uint32_t base_id = threadIdx.x;
const uint32_t column_len = columns_counter[base_id];

uint32_t ct_count = 0;
for (uint32_t i = 0; i + chunk_size <= column_len; i += chunk_size) {
// those indexes are for message ciphertexts
// for message ciphertexts in and out index should be same
const uint32_t in_index = columns[base_id][i];
new_columns[base_id][ct_count] = in_index;
const uint32_t pbs_index = atomicAdd(&counter, 1);
pbs_indexes_in[pbs_index] = in_index;
pbs_indexes_out[pbs_index] = in_index;
lut_indexes[pbs_index] = 0;
++ct_count;
}
// ct1 ct2 ct3
// pbs_indexes: 0, 1, 2
// pbs_indexes: 2, 1, 0

__syncthreads();
uint32_t message_count = counter;

if (base_id > 0) {
const uint32_t prev_base_id = base_id - 1;
const uint32_t prev_column_len = columns_counter[prev_base_id];

for (uint32_t i = 0; i + chunk_size <= prev_column_len; i += chunk_size) {
// those indexes are for carry ciphertexts
// for carry ciphertexts input is same as for message
// output will be placed to next block in the column
const uint32_t in_index = columns[prev_base_id][i];
const uint32_t out_index = columns[prev_base_id][i + 1];
new_columns[base_id][ct_count] = out_index;
const uint32_t pbs_index = atomicAdd(&counter, 1);
pbs_indexes_in[pbs_index] = in_index;
pbs_indexes_out[pbs_index] = out_index;
lut_indexes[pbs_index] = 1;
++ct_count;
}
}

const uint32_t start_index = column_len - column_len % chunk_size;
for (uint32_t i = start_index; i < column_len; ++i) {
new_columns[base_id][ct_count] = columns[base_id][i];
++ct_count;
}

new_columns_counter[base_id] = ct_count;

if (ct_count > chunk_size) {
atomicOr(&sharedOr, 1);
}
__syncthreads();

if (threadIdx.x == 0) {
pbs_counters[0] = counter;
pbs_counters[1] = message_count;
pbs_counters[2] = sharedOr;
}
}

template <typename Torus>
__global__ inline void prepare_final_pbs_indexes(
Torus *const pbs_indexes_in, Torus *const pbs_indexes_out,
Torus *const lut_indexes, const uint32_t num_radix_blocks) {
int idx = threadIdx.x;
pbs_indexes_in[idx] = idx % num_radix_blocks;
pbs_indexes_out[idx] = idx + idx / num_radix_blocks;
lut_indexes[idx] = idx / num_radix_blocks;
}
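A worked example of the index pattern this kernel produces, assuming num_radix_blocks = 4 and a launch of 2 * num_radix_blocks threads; the values are written out by hand and are illustrative only.

// idx              : 0 1 2 3 4 5 6 7
// pbs_indexes_in   : 0 1 2 3 0 1 2 3   (idx % num_radix_blocks)
// pbs_indexes_out  : 0 1 2 3 5 6 7 8   (idx + idx / num_radix_blocks)
// lut_indexes      : 0 0 0 0 1 1 1 1   (0 = message LUT, 1 = carry LUT)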
template <typename Torus>
__global__ void calculate_chunks(Torus *const input_blocks,
const uint32_t *const *const columns,
const uint32_t *const columns_counter,
const uint32_t chunk_size,
const uint32_t block_size) {

const uint32_t part_size = blockDim.x;
const uint32_t base_id = blockIdx.x;
const uint32_t part_id = blockIdx.y;
const uint32_t coef_id = part_id * part_size + threadIdx.x;

if (coef_id >= block_size)
return;

const uint32_t column_len = columns_counter[base_id];

if (column_len >= chunk_size) {
const uint32_t num_chunks = column_len / chunk_size;
Torus result = 0;

for (uint32_t chunk_id = 0; chunk_id < num_chunks; ++chunk_id) {
const uint32_t first_ct_id = columns[base_id][chunk_id * chunk_size];
result = input_blocks[first_ct_id * block_size + coef_id];

for (uint32_t ct_id = 1; ct_id < chunk_size; ++ct_id) {
const uint32_t cur_ct_id =
columns[base_id][chunk_id * chunk_size + ct_id];
result += input_blocks[cur_ct_id * block_size + coef_id];
}

input_blocks[first_ct_id * block_size + coef_id] = result;
}
}
}

template <typename Torus>
__global__ void calculate_final_chunk_into_radix(
Torus *const out_radix, const Torus *const input_blocks,
const uint32_t *const *const columns, const uint32_t *const columns_counter,
const uint32_t chunk_size, const uint32_t block_size) {

const uint32_t part_size = blockDim.x;
const uint32_t base_id = blockIdx.x;
const uint32_t part_id = blockIdx.y;
const uint32_t coef_id = part_id * part_size + threadIdx.x;

if (coef_id >= block_size)
return;

const uint32_t column_len = columns_counter[base_id];

Torus result = 0;
if (column_len) {
const uint32_t first_ct_id = columns[base_id][0];
result = input_blocks[first_ct_id * block_size + coef_id];

for (uint32_t i = 1; i < column_len; ++i) {
const uint32_t cur_ct_it = columns[base_id][i];
result += input_blocks[cur_ct_it * block_size + coef_id];
}
}
out_radix[base_id * block_size + coef_id] = result;
}

template <typename Torus, class params>
@@ -290,65 +167,6 @@ __global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
(process_msb) ? cur_msb_ct[params::degree] : 0;
}
}

inline bool at_least_one_column_needs_processing(
const uint64_t *const degrees, const uint32_t num_radix_blocks,
const uint32_t num_radix_in_vec, const uint32_t chunk_size) {
std::vector<uint32_t> columns_count(num_radix_blocks, 0);

for (size_t column = 0; column < num_radix_blocks; ++column) {
for (size_t block = 0; block < num_radix_in_vec; ++block) {
const size_t block_index = block * num_radix_blocks + column;
if (degrees[block_index]) {
columns_count[column]++;
if (columns_count[column] > chunk_size) {
return true;
}
}
}
}
return false;
}

inline void calculate_final_degrees(uint64_t *const out_degrees,
const uint64_t *const input_degrees,
size_t num_blocks, size_t num_radix_in_vec,
size_t chunk_size,
uint64_t message_modulus) {

auto get_degree = [message_modulus](uint64_t degree) -> uint64_t {
return std::min(message_modulus - 1, degree);
};
std::vector<std::queue<uint64_t>> columns(num_blocks);
for (size_t i = 0; i < num_radix_in_vec; ++i) {
for (size_t j = 0; j < num_blocks; ++j) {
if (input_degrees[i * num_blocks + j])
columns[j].push(input_degrees[i * num_blocks + j]);
}
}

for (size_t i = 0; i < num_blocks; ++i) {
auto &col = columns[i];
while (col.size() > 1) {
uint32_t cur_degree = 0;
size_t mn = std::min(chunk_size, col.size());
for (int j = 0; j < mn; ++j) {
cur_degree += col.front();
col.pop();
}
const uint64_t new_degree = get_degree(cur_degree);
col.push(new_degree);
if ((i + 1) < num_blocks) {
columns[i + 1].push(new_degree);
}
}
}

for (int i = 0; i < num_blocks; i++) {
out_degrees[i] = (columns[i].empty()) ? 0 : columns[i].front();
}
}
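For context, the chunk_size used here and in the kernels above is computed later in the host function as (message_modulus * carry_modulus - 1) / (message_modulus - 1): the number of blocks whose message parts can be summed before the carry space saturates. The small compile-time check below assumes the usual 2_2 parameters (message_modulus = 4, carry_modulus = 4) purely for illustration.

// Illustrative check (assumption: message_modulus = 4, carry_modulus = 4).
constexpr uint64_t example_msg_mod = 4, example_carry_mod = 4;
constexpr uint64_t example_chunk_size =
    (example_msg_mod * example_carry_mod - 1) / (example_msg_mod - 1);
static_assert(example_chunk_size == 5,
              "five degree-3 blocks fit in a block before saturation (5 * 3 <= 15)");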
template <typename Torus>
__host__ uint64_t scratch_cuda_integer_partial_sum_ciphertexts_vec_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -363,107 +181,15 @@ __host__ uint64_t scratch_cuda_integer_partial_sum_ciphertexts_vec_kb(
return size_tracker;
}

__global__ inline void DEBUG_PRINT_COLUMNS(uint32_t *const *const columns,
uint32_t *const columns_counter,
const uint32_t num_radix_blocks) {
printf("cuda_columns_counter:\n");
for (int i = 0; i < num_radix_blocks; i++) {
printf("%d ", columns_counter[i]);
}
printf("\n");
printf("cuda_columns:\n");

for (int i = 0; i < num_radix_blocks; i++) {
printf("column[%d]: ", i);
for (int j = 0; j < columns_counter[i]; j++)
{
printf("%d ", columns[i][j]);
}
printf("\n");
}

printf("\n");

}

__global__ inline void DEBUG_PRINT_COLUMNS_DATA(uint32_t *const *const columns,
uint32_t *const columns_counter,
uint64_t* data,
const uint32_t num_radix_blocks, size_t lwe_size) {

uint64_t delta = 576460752303423488ULL;
__syncthreads();
printf("cuda_new_columns:\n");
__syncthreads();
for (int i = 0; i < num_radix_blocks; i++) {
__syncthreads();
printf("column[%d]: ", i);
__syncthreads();
for (int j = 0; j < columns_counter[i]; j++)
{
__syncthreads();
auto cur_data =data[ columns[i][j] * lwe_size + lwe_size - 1];
cur_data /= delta;
printf("%llu ", cur_data);
__syncthreads();
}
__syncthreads();
printf("\n");
__syncthreads();
}

__syncthreads();
printf("\n");
__syncthreads();

}

template<typename Torus, bool input, bool clear>
__global__ inline void DEBUG_PRINT_PBS_DATA(Torus * data, Torus* input_indexes, Torus*
output_indexes, Torus *lut_indexes, size_t lwe_size, int num) {
printf("input_output_indexes: \n");

for (int i = 0; i < num; i++) {
auto input_val = data[input_indexes[i] * lwe_size + lwe_size -1];
auto output_val = data[output_indexes[i] * lwe_size + lwe_size -1];

auto val = input ? input_val : output_val;
auto val_clear = clear ? val / 576460752303423488ULL : val;

printf("%d %lu %lu %lu %lu %lu\n", i, input_indexes[i], output_indexes[i], lut_indexes[i],
val_clear, val);
}
}

//template<typename Torus>
//__global__ inline void DEBUG_PRINT_RADIX(Torus * data, size_t num_blocks, size_t lwe_size) {
// for (int i = 0; i < num_blocks; i++) {
// auto val = data[i * lwe_size + lwe_size - 1];
// auto val_clear = val / 576460752303423488ULL;
// printf("cuda_partial_sum_result: %lu %lu\n", val, val_clear);
// }
//}

template <typename Torus, class params>
__host__ void host_integer_partial_sum_ciphertexts_vec_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI *terms, bool reduce_degrees_for_single_carry_propagation, void *const
*bsks, uint64_t *const *ksks,
CudaRadixCiphertextFFI *terms, void *const *bsks, uint64_t *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr,
uint32_t num_radix_blocks, uint32_t num_radix_in_vec) {
// cudaDeviceSynchronize();
// print_body<Torus>("cuda_input_partial_sum", (Torus*)terms->ptr, num_radix_blocks * num_radix_in_vec,
// 2048,
// 576460752303423488ULL);

// for (int i = 0; i <num_radix_blocks * num_radix_in_vec; i++ ) {
// printf("cuda_input_degrees: %d\n", terms->degrees[i]);
// }
// cudaDeviceSynchronize();
auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension;
auto big_lwe_size = big_lwe_dimension + 1;
uint32_t num_radix_blocks, uint32_t num_radix_in_vec,
int_radix_lut<Torus> *reused_lut) {

if (terms->lwe_dimension != radix_lwe_out->lwe_dimension)
PANIC("Cuda error: output and input radix ciphertexts should have the same "
@@ -473,28 +199,22 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
PANIC("Cuda error: input vector does not have enough blocks")
if (num_radix_blocks > radix_lwe_out->num_radix_blocks)
PANIC("Cuda error: output does not have enough blocks")

auto current_blocks = mem_ptr->current_blocks;
auto new_blocks = mem_ptr->new_blocks;
auto new_blocks_copy = mem_ptr->new_blocks_copy;
auto old_blocks = mem_ptr->old_blocks;
auto small_lwe_vector = mem_ptr->small_lwe_vector;
auto d_degrees = mem_ptr->d_degrees;
auto d_columns = mem_ptr->d_columns;
auto d_columns_counter = mem_ptr->d_columns_counter;
auto d_new_columns = mem_ptr->d_new_columns;
auto d_new_columns_counter = mem_ptr->d_new_columns_counter;
auto d_pbs_indexes_in = mem_ptr->luts_message_carry->lwe_indexes_in;
auto d_pbs_indexes_out = mem_ptr->luts_message_carry->lwe_indexes_out;
auto d_pbs_counters = mem_ptr->d_pbs_counters;

auto luts_message_carry = mem_ptr->luts_message_carry;
auto d_smart_copy_in = mem_ptr->d_smart_copy_in;
auto d_smart_copy_out = mem_ptr->d_smart_copy_out;

auto message_modulus = mem_ptr->params.message_modulus;
auto carry_modulus = mem_ptr->params.carry_modulus;
auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension;
auto big_lwe_size = big_lwe_dimension + 1;
auto glwe_dimension = mem_ptr->params.glwe_dimension;
auto polynomial_size = mem_ptr->params.polynomial_size;
auto small_lwe_dimension = mem_ptr->params.small_lwe_dimension;
auto chunk_size =
(mem_ptr->params.message_modulus * mem_ptr->params.carry_modulus - 1) /
(mem_ptr->params.message_modulus - 1);

size_t total_blocks_in_vec = num_radix_blocks * num_radix_in_vec;
auto small_lwe_size = small_lwe_dimension + 1;

// In the case of extracting a single LWE this parameters are dummy
uint32_t num_many_lut = 1;
@@ -508,202 +228,244 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
terms, 0, num_radix_blocks);
return;
}
if (old_blocks != terms) {
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], old_blocks,
terms);
}
if (num_radix_in_vec == 2) {
CudaRadixCiphertextFFI terms_slice;
as_radix_ciphertext_slice<Torus>(&terms_slice, terms, num_radix_blocks,
2 * num_radix_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, terms,
&terms_slice, num_radix_blocks);
CudaRadixCiphertextFFI old_blocks_slice;
as_radix_ciphertext_slice<Torus>(&old_blocks_slice, old_blocks,
num_radix_blocks, 2 * num_radix_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
&old_blocks_slice, num_radix_blocks);
return;
}

if (current_blocks != terms) {
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
current_blocks, terms);
size_t r = num_radix_in_vec;
size_t total_modulus = message_modulus * carry_modulus;
size_t message_max = message_modulus - 1;
size_t chunk_size = (total_modulus - 1) / message_max;

size_t h_lwe_idx_in[terms->num_radix_blocks];
size_t h_lwe_idx_out[terms->num_radix_blocks];
int32_t h_smart_copy_in[terms->num_radix_blocks];
int32_t h_smart_copy_out[terms->num_radix_blocks];

/// Here it is important to query the default max shared memory on device 0
/// instead of cuda_get_max_shared_memory,
/// to avoid bugs with tree_add_chunks trying to use too much shared memory
auto max_shared_memory = 0;
check_cuda_error(cudaDeviceGetAttribute(
&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock, 0));

// create lut object for message and carry
// we allocate luts_message_carry in the host function (instead of scratch)
// to reduce average memory consumption
int_radix_lut<Torus> *luts_message_carry;
size_t ch_amount = r / chunk_size;
if (!ch_amount)
ch_amount++;
if (reused_lut == nullptr) {
luts_message_carry = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, mem_ptr->params, 2,
2 * ch_amount * num_radix_blocks, true, nullptr);
} else {
luts_message_carry = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, mem_ptr->params, 2,
2 * ch_amount * num_radix_blocks, reused_lut, true, nullptr);
}
auto message_acc = luts_message_carry->get_lut(0, 0);
auto carry_acc = luts_message_carry->get_lut(0, 1);

cuda_memcpy_async_to_gpu(d_degrees, current_blocks->degrees,
total_blocks_in_vec * sizeof(uint64_t), streams[0],
gpu_indexes[0]);
// define functions for each accumulator
auto lut_f_message = [message_modulus](Torus x) -> Torus {
return x % message_modulus;
};
auto lut_f_carry = [message_modulus](Torus x) -> Torus {
return x / message_modulus;
};

cuda_set_device(gpu_indexes[0]);
radix_vec_to_columns<<<1, num_radix_blocks, 0, streams[0]>>>(
d_columns, d_columns_counter, d_degrees, num_radix_blocks,
num_radix_in_vec);
// generate accumulators
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], message_acc,
luts_message_carry->get_degree(0), luts_message_carry->get_max_degree(0),
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
lut_f_message, true);
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], carry_acc, luts_message_carry->get_degree(1),
luts_message_carry->get_max_degree(1), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, lut_f_carry, true);
luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
bool needs_processing = at_least_one_column_needs_processing(
current_blocks->degrees, num_radix_blocks, num_radix_in_vec, chunk_size);
while (r > 2) {
size_t cur_total_blocks = r * num_radix_blocks;
size_t ch_amount = r / chunk_size;
if (!ch_amount)
ch_amount++;
dim3 add_grid(ch_amount, num_radix_blocks, 1);

int number_of_threads = min(256, params::degree);
int part_count = (big_lwe_size + number_of_threads - 1) / number_of_threads;
const dim3 number_of_blocks_2d(num_radix_blocks, part_count, 1);
cuda_set_device(gpu_indexes[0]);
tree_add_chunks<Torus><<<add_grid, 512, 0, streams[0]>>>(
(Torus *)new_blocks->ptr, (Torus *)old_blocks->ptr,
std::min(r, chunk_size), big_lwe_size, num_radix_blocks);

// h_pbs_counters[0] - total ciphertexts
// h_pbs_counters[1] - message ciphertexts
// h_pbs_counters[2] - at_least_one_column_needs_processing
uint32_t *h_pbs_counters;
cudaMallocHost((void **)&h_pbs_counters, 3 * sizeof(uint32_t));
if (mem_ptr->mem_reuse) {
mem_ptr->setup_lookup_tables(streams, gpu_indexes, gpu_count);
}
check_cuda_error(cudaGetLastError());

size_t total_count = 0;
size_t message_count = 0;
size_t carry_count = 0;
size_t sm_copy_count = 0;

while (needs_processing) {
// cudaDeviceSynchronize();
// DEBUG_PRINT_COLUMNS<<<1, 1, 0, streams[0]>>>(d_columns, d_columns_counter, num_radix_blocks);
// DEBUG_PRINT_COLUMNS_DATA<<<1, 1, 0, streams[0]>>>(d_columns, d_columns_counter, (uint64_t *)
// (current_blocks->ptr), num_radix_blocks, big_lwe_size);
calculate_chunks<Torus>
<<<number_of_blocks_2d, number_of_threads, 0, streams[0]>>>(
(Torus *)(current_blocks->ptr), d_columns, d_columns_counter,
chunk_size, big_lwe_size);
generate_ids_update_degrees(
terms->degrees, h_lwe_idx_in, h_lwe_idx_out, h_smart_copy_in,
h_smart_copy_out, ch_amount, r, num_radix_blocks, chunk_size,
message_max, total_count, message_count, carry_count, sm_copy_count);
auto lwe_indexes_in = luts_message_carry->lwe_indexes_in;
auto lwe_indexes_out = luts_message_carry->lwe_indexes_out;
luts_message_carry->set_lwe_indexes(streams[0], gpu_indexes[0],
h_lwe_idx_in, h_lwe_idx_out);

prepare_new_columns_and_pbs_indexes<<<1, num_radix_blocks, 0, streams[0]>>>(
d_new_columns, d_new_columns_counter, d_pbs_indexes_in,
d_pbs_indexes_out, luts_message_carry->get_lut_indexes(0, 0),
d_pbs_counters, d_columns, d_columns_counter, chunk_size);
size_t copy_size = sm_copy_count * sizeof(int32_t);
cuda_memcpy_async_to_gpu(d_smart_copy_in, h_smart_copy_in, copy_size,
streams[0], gpu_indexes[0]);
cuda_memcpy_async_to_gpu(d_smart_copy_out, h_smart_copy_out, copy_size,
streams[0], gpu_indexes[0]);

cuda_memcpy_async_to_cpu(h_pbs_counters, d_pbs_counters,
3 * sizeof(uint32_t), streams[0], gpu_indexes[0]);
// inside d_smart_copy_in there are only -1 values
// it's fine to call smart_copy with same pointer
// as source and destination
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], new_blocks_copy, 0, r * num_radix_blocks,
new_blocks, 0, r * num_radix_blocks);
smart_copy<Torus><<<sm_copy_count, 1024, 0, streams[0]>>>(
(Torus *)new_blocks->ptr, (Torus *)new_blocks_copy->ptr,
d_smart_copy_out, d_smart_copy_in, big_lwe_size);
check_cuda_error(cudaGetLastError());

cuda_synchronize_stream(streams[0], gpu_indexes[0]);
if (carry_count > 0)
cuda_set_value_async<Torus>(
streams[0], gpu_indexes[0],
luts_message_carry->get_lut_indexes(0, message_count), 1,
carry_count);

const uint32_t total_ciphertexts = h_pbs_counters[0];
const uint32_t total_messages = h_pbs_counters[1];
needs_processing = (h_pbs_counters[2] != 0);

auto active_gpu_count = get_active_gpu_count(total_ciphertexts, gpu_count);

// DEBUG_PRINT_PBS_DATA<Torus, true, true><<<1, 1, 0, streams[0]>>>(
// (Torus *)(current_blocks->ptr), d_pbs_indexes_in, d_pbs_indexes_out,
// luts_message_carry->get_lut_indexes(0, 0), big_lwe_size, total_ciphertexts
// );
luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
/// For multi GPU execution we create vectors of pointers for inputs and
/// outputs
std::vector<Torus *> new_blocks_vec = luts_message_carry->lwe_array_in_vec;
std::vector<Torus *> small_lwe_vector_vec =
luts_message_carry->lwe_after_ks_vec;
std::vector<Torus *> lwe_after_pbs_vec =
luts_message_carry->lwe_after_pbs_vec;
std::vector<Torus *> lwe_trivial_indexes_vec =
luts_message_carry->lwe_trivial_indexes_vec;

auto active_gpu_count = get_active_gpu_count(total_count, gpu_count);
if (active_gpu_count == 1) {
/// Apply KS to go from a big LWE dimension to a small LWE dimension
/// After this keyswitch execution, we need to synchronize the streams
/// because the keyswitch and PBS do not operate on the same number of
/// inputs
execute_keyswitch_async<Torus>(
streams, gpu_indexes, 1, (Torus *)small_lwe_vector->ptr,
d_pbs_indexes_in, (Torus *)current_blocks->ptr, d_pbs_indexes_in,
lwe_indexes_in, (Torus *)new_blocks->ptr, lwe_indexes_in, ksks,
polynomial_size * glwe_dimension, small_lwe_dimension,
mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, message_count);

/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs_async<Torus>(
streams, gpu_indexes, 1, (Torus *)new_blocks->ptr, lwe_indexes_out,
luts_message_carry->lut_vec, luts_message_carry->lut_indexes_vec,
(Torus *)small_lwe_vector->ptr, lwe_indexes_in, bsks,
ms_noise_reduction_key, luts_message_carry->buffer, glwe_dimension,
small_lwe_dimension, polynomial_size, mem_ptr->params.pbs_base_log,
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
total_count, mem_ptr->params.pbs_type, num_many_lut, lut_stride);
} else {
cuda_synchronize_stream(streams[0], gpu_indexes[0]);

multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, new_blocks_vec,
(Torus *)new_blocks->ptr, luts_message_carry->h_lwe_indexes_in,
luts_message_carry->using_trivial_lwe_indexes, message_count,
big_lwe_size);

/// Apply KS to go from a big LWE dimension to a small LWE dimension
/// After this keyswitch execution, we need to synchronize the streams
/// because the keyswitch and PBS do not operate on the same number of
/// inputs
execute_keyswitch_async<Torus>(
streams, gpu_indexes, active_gpu_count, small_lwe_vector_vec,
lwe_trivial_indexes_vec, new_blocks_vec, lwe_trivial_indexes_vec,
ksks, big_lwe_dimension, small_lwe_dimension,
mem_ptr->params.ks_base_log, mem_ptr->params.ks_level,
total_messages);
mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, total_count);

/// Copy data back to GPU 0, rebuild the lwe array, and scatter again on a
/// different configuration
multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, gpu_count, (Torus *)small_lwe_vector->ptr,
small_lwe_vector_vec, luts_message_carry->h_lwe_indexes_in,
luts_message_carry->using_trivial_lwe_indexes, message_count,
small_lwe_size);
/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}

multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, gpu_count, small_lwe_vector_vec,
(Torus *)small_lwe_vector->ptr, luts_message_carry->h_lwe_indexes_in,
luts_message_carry->using_trivial_lwe_indexes, total_count,
small_lwe_size);

/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs_async<Torus>(
streams, gpu_indexes, 1, (Torus *)current_blocks->ptr,
d_pbs_indexes_out, luts_message_carry->lut_vec,
luts_message_carry->lut_indexes_vec, (Torus *)small_lwe_vector->ptr,
d_pbs_indexes_in, bsks, ms_noise_reduction_key,
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, luts_message_carry->lut_vec,
luts_message_carry->lut_indexes_vec, small_lwe_vector_vec,
lwe_trivial_indexes_vec, bsks, ms_noise_reduction_key,
luts_message_carry->buffer, glwe_dimension, small_lwe_dimension,
polynomial_size, mem_ptr->params.pbs_base_log,
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
total_ciphertexts, mem_ptr->params.pbs_type, num_many_lut,
lut_stride);
} else {
cuda_memcpy_async_to_cpu(luts_message_carry->h_lwe_indexes_in,
luts_message_carry->lwe_indexes_in,
total_ciphertexts * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_memcpy_async_to_cpu(luts_message_carry->h_lwe_indexes_out,
luts_message_carry->lwe_indexes_out,
total_ciphertexts * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
total_count, mem_ptr->params.pbs_type, num_many_lut, lut_stride);

luts_message_carry->using_trivial_lwe_indexes = false;
luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);

integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, active_gpu_count, current_blocks,
current_blocks, bsks, ksks, ms_noise_reduction_key,
luts_message_carry, total_ciphertexts);
multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, (Torus *)new_blocks->ptr,
lwe_after_pbs_vec, luts_message_carry->h_lwe_indexes_out,
luts_message_carry->using_trivial_lwe_indexes, total_count,
big_lwe_size);
/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}
cuda_set_device(gpu_indexes[0]);
// DEBUG_PRINT_PBS_DATA<Torus, false, true><<<1, 1, 0, streams[0]>>>(
// (Torus *)(current_blocks->ptr), d_pbs_indexes_in, d_pbs_indexes_out,
// luts_message_carry->get_lut_indexes(0, 0), big_lwe_size, total_ciphertexts
// );

std::swap(d_columns, d_new_columns);
std::swap(d_columns_counter, d_new_columns_counter);
}
cudaFreeHost(h_pbs_counters);
calculate_final_chunk_into_radix<Torus>
<<<number_of_blocks_2d, number_of_threads, 0, streams[0]>>>(
(Torus *)(radix_lwe_out->ptr), (Torus *)(current_blocks->ptr),
d_columns, d_columns_counter, chunk_size, big_lwe_size);

if (reduce_degrees_for_single_carry_propagation) {
prepare_final_pbs_indexes<Torus><<<1, 2 * num_radix_blocks, 0, streams[0]>>>(
d_pbs_indexes_in, d_pbs_indexes_out,
luts_message_carry->get_lut_indexes(0, 0), num_radix_blocks);

cuda_memset_async(
(Torus *)(current_blocks->ptr) + big_lwe_size * num_radix_blocks, 0,
big_lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]);

auto active_gpu_count = get_active_gpu_count(2 * num_radix_blocks, gpu_count);

if (active_gpu_count == 1) {
execute_keyswitch_async<Torus>(
streams, gpu_indexes, 1, (Torus *)small_lwe_vector->ptr,
d_pbs_indexes_in, (Torus *)radix_lwe_out->ptr, d_pbs_indexes_in, ksks,
big_lwe_dimension, small_lwe_dimension, mem_ptr->params.ks_base_log,
mem_ptr->params.ks_level, num_radix_blocks);

execute_pbs_async<Torus>(
streams, gpu_indexes, 1, (Torus *)current_blocks->ptr,
d_pbs_indexes_out, luts_message_carry->lut_vec,
luts_message_carry->lut_indexes_vec, (Torus *)small_lwe_vector->ptr,
d_pbs_indexes_in, bsks, ms_noise_reduction_key,
luts_message_carry->buffer, glwe_dimension, small_lwe_dimension,
polynomial_size, mem_ptr->params.pbs_base_log,
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
2 * num_radix_blocks, mem_ptr->params.pbs_type, num_many_lut,
lut_stride);
} else {
cuda_memcpy_async_to_cpu(luts_message_carry->h_lwe_indexes_in,
luts_message_carry->lwe_indexes_in,
2 * num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_memcpy_async_to_cpu(luts_message_carry->h_lwe_indexes_out,
luts_message_carry->lwe_indexes_out,
2 * num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);

luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
luts_message_carry->using_trivial_lwe_indexes = false;

integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, active_gpu_count, current_blocks, radix_lwe_out,
bsks, ksks, ms_noise_reduction_key, luts_message_carry,
2 * num_radix_blocks);
for (uint i = 0; i < total_count; i++) {
auto degrees_index = luts_message_carry->h_lut_indexes[i];
new_blocks->degrees[i] = luts_message_carry->degrees[degrees_index];
new_blocks->noise_levels[i] = NoiseLevel::NOMINAL;
}
// cudaDeviceSynchronize();
// print_body<Torus>("cuda_before_add", (Torus*)radix_lwe_out->ptr, num_radix_blocks, 2048,
// 576460752303423488ULL);
// cudaDeviceSynchronize();
calculate_final_degrees(radix_lwe_out->degrees, terms->degrees,
num_radix_blocks, num_radix_in_vec, chunk_size,
mem_ptr->params.message_modulus);
cuda_set_device(gpu_indexes[0]);
CudaRadixCiphertextFFI current_blocks_slice;
as_radix_ciphertext_slice<Torus>(&current_blocks_slice, current_blocks,
num_radix_blocks, 2 * num_radix_blocks);

host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out,
current_blocks, &current_blocks_slice, num_radix_blocks);
// printf("add_happened\n");
int rem_blocks = (r > chunk_size) ? r % chunk_size * num_radix_blocks : 0;
int new_blocks_created = 2 * ch_amount * num_radix_blocks;

if (rem_blocks > 0)
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], new_blocks, new_blocks_created,
new_blocks_created + rem_blocks, old_blocks,
cur_total_blocks - rem_blocks, cur_total_blocks);
std::swap(new_blocks, old_blocks);
r = (new_blocks_created + rem_blocks) / num_radix_blocks;
}
luts_message_carry->release(streams, gpu_indexes, gpu_count);
delete (luts_message_carry);

// cudaDeviceSynchronize();
//
// print_body<Torus>("cuda_out_after_add", (Torus*)radix_lwe_out->ptr, num_radix_blocks, 2048,
// 576460752303423488ULL);
// cudaDeviceSynchronize();
CudaRadixCiphertextFFI old_blocks_slice;
as_radix_ciphertext_slice<Torus>(&old_blocks_slice, old_blocks,
num_radix_blocks, 2 * num_radix_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
&old_blocks_slice, num_radix_blocks);
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
@@ -837,28 +599,14 @@ __host__ void host_integer_mult_radix_kb(
size_t b_id = i % num_blocks;
terms_degree_msb[i] = (b_id > r_id) ? message_modulus - 2 : 0;
}


for (int i = 0; i < num_blocks * 2 * num_blocks; i++)
{
auto cur_ptr = (Torus*)vector_result_sb->ptr;
cur_ptr += i * 2049 + 2048;
print_debug<Torus>("", cur_ptr, 1);
}

for (int i = 0; i < num_blocks * 2 * num_blocks; i++) {
printf("%d\n", vector_result_sb->degrees[i]);
}
host_integer_partial_sum_ciphertexts_vec_kb<Torus, params>(
streams, gpu_indexes, gpu_count, radix_lwe_out, vector_result_sb, true, bsks,
streams, gpu_indexes, gpu_count, radix_lwe_out, vector_result_sb, bsks,
ksks, ms_noise_reduction_key, mem_ptr->sum_ciphertexts_mem, num_blocks,
2 * num_blocks);
2 * num_blocks, mem_ptr->luts_array);

auto scp_mem_ptr = mem_ptr->sc_prop_mem;
uint32_t requested_flag = outputFlag::FLAG_NONE;
uint32_t uses_carry = 0;


host_propagate_single_carry<Torus>(
streams, gpu_indexes, gpu_count, radix_lwe_out, nullptr, nullptr,
scp_mem_ptr, bsks, ksks, ms_noise_reduction_key, requested_flag,

@@ -115,10 +115,13 @@ __host__ void host_integer_scalar_mul_radix(
set_zero_radix_ciphertext_slice_async<T>(streams[0], gpu_indexes[0],
lwe_array, 0, num_radix_blocks);
} else {
for (int i = 0; i < j * num_radix_blocks; i++) {
all_shifted_buffer->degrees[i] = message_modulus - 1;
}
host_integer_partial_sum_ciphertexts_vec_kb<T, params>(
streams, gpu_indexes, gpu_count, lwe_array, all_shifted_buffer, true, bsks,
streams, gpu_indexes, gpu_count, lwe_array, all_shifted_buffer, bsks,
ksks, ms_noise_reduction_key, mem->sum_ciphertexts_vec_mem,
num_radix_blocks, j);
num_radix_blocks, j, nullptr);

auto scp_mem_ptr = mem->sc_prop_mem;
uint32_t requested_flag = outputFlag::FLAG_NONE;

@@ -37,21 +37,18 @@ template <typename T> void print_debug(const char *name, const T *src, int N) {
printf("\n");
}


template <typename T>
__global__ void print_body_kernel(T *src, int N, int lwe_dimension, T delta) {
__global__ void print_body_kernel(T *src, int N, int lwe_dimension) {
for (int i = 0; i < N; i++) {
T body = src[i * (lwe_dimension + 1) + lwe_dimension];
T clear = body / delta;
printf("(%lu, %lu), ", body, clear);
printf("%lu, ", src[i * (lwe_dimension + 1) + lwe_dimension]);
}
}

template <typename T>
void print_body(const char *name, T *src, int n, int lwe_dimension, T delta) {
void print_body(const char *name, T *src, int n, int lwe_dimension) {
printf("%s: ", name);
cudaDeviceSynchronize();
print_body_kernel<<<1, 1>>>(src, n, lwe_dimension, delta);
print_body_kernel<<<1, 1>>>(src, n, lwe_dimension);
cudaDeviceSynchronize();
printf("\n");
}

@@ -1018,7 +1018,6 @@ unsafe extern "C" {
gpu_count: u32,
radix_lwe_out: *mut CudaRadixCiphertextFFI,
radix_lwe_vec: *mut CudaRadixCiphertextFFI,
reduce_degrees_for_single_carry_propagation: bool,
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,

@@ -3954,7 +3954,6 @@ pub unsafe fn unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async<
streams: &CudaStreams,
result: &mut CudaRadixCiphertext,
radix_list: &mut CudaRadixCiphertext,
reduce_degrees_for_single_carry_propagation: bool,
bootstrapping_key: &CudaVec<B>,
keyswitch_key: &CudaVec<T>,
message_modulus: MessageModulus,
@@ -4049,7 +4048,6 @@ pub unsafe fn unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async<
streams.len() as u32,
&raw mut cuda_ffi_result,
&raw mut cuda_ffi_radix_list,
reduce_degrees_for_single_carry_propagation,
mem_ptr,
bootstrapping_key.ptr.as_ptr(),
keyswitch_key.ptr.as_ptr(),

@@ -337,7 +337,6 @@ impl CudaServerKey {
&self,
result: &mut T,
ciphertexts: &[T],
reduce_degrees_for_single_carry_propagation: bool,
streams: &CudaStreams,
) {
if ciphertexts.is_empty() {
@@ -378,7 +377,6 @@ impl CudaServerKey {
streams,
result.as_mut(),
&mut terms,
reduce_degrees_for_single_carry_propagation,
&d_bsk.d_vec,
&self.key_switching_key.d_vec,
self.message_modulus,
@@ -404,7 +402,6 @@ impl CudaServerKey {
streams,
result.as_mut(),
&mut terms,
reduce_degrees_for_single_carry_propagation,
&d_multibit_bsk.d_vec,
&self.key_switching_key.d_vec,
self.message_modulus,
@@ -448,7 +445,7 @@ impl CudaServerKey {
streams: &CudaStreams,
) -> T {
let mut result = self
.unchecked_partial_sum_ciphertexts_async(ciphertexts, true, streams)
.unchecked_partial_sum_ciphertexts_async(ciphertexts, streams)
.unwrap();

self.propagate_single_carry_assign_async(&mut result, streams, None, OutputFlag::None);
@@ -461,8 +458,7 @@ impl CudaServerKey {
ciphertexts: &[T],
streams: &CudaStreams,
) -> Option<T> {
let result = unsafe { self.unchecked_partial_sum_ciphertexts_async(ciphertexts,
false, streams) };
let result = unsafe { self.unchecked_partial_sum_ciphertexts_async(ciphertexts, streams) };
streams.synchronize();
result
}
@@ -474,7 +470,6 @@ impl CudaServerKey {
pub unsafe fn unchecked_partial_sum_ciphertexts_async<T: CudaIntegerRadixCiphertext>(
&self,
ciphertexts: &[T],
reduce_degrees_for_single_carry_propagation: bool,
streams: &CudaStreams,
) -> Option<T> {
if ciphertexts.is_empty() {
@@ -488,8 +483,7 @@ impl CudaServerKey {
return Some(result);
}

self.unchecked_partial_sum_ciphertexts_assign_async(&mut result, ciphertexts,
reduce_degrees_for_single_carry_propagation, streams);
self.unchecked_partial_sum_ciphertexts_assign_async(&mut result, ciphertexts, streams);

Some(result)
}

@@ -93,7 +93,8 @@ impl CudaServerKey {
},
);

let mut output_cts: T = self.create_trivial_zero_radix_async(num_ct_blocks, streams);
let mut output_cts: T =
self.create_trivial_zero_radix_async(num_ct_blocks * num_ct_blocks, streams);

self.compute_prefix_sum_hillis_steele_async(
output_cts.as_mut(),
@@ -454,7 +455,7 @@ impl CudaServerKey {
cts.push(new_trivial);

let result = self
.unchecked_partial_sum_ciphertexts_async(&cts, false, streams)
.unchecked_partial_sum_ciphertexts_async(&cts, streams)
.expect("internal error, empty ciphertext count");

// This is the part where we extract message and carry blocks
@@ -496,6 +497,28 @@ impl CudaServerKey {
.as_mut_slice(0..lwe_size, 0)
.unwrap();

let mut carry_blocks_last = carry_blocks
.as_mut()
.d_blocks
.0
.d_vec
.as_mut_slice(
lwe_size * (counter_num_blocks - 1)..lwe_size * counter_num_blocks,
0,
)
.unwrap();

carry_blocks_last.copy_from_gpu_async(&trivial_last_block_slice, streams, 0);
carry_blocks.as_mut().info.blocks.last_mut().unwrap().degree =
Degree(self.message_modulus.0 - 1);
carry_blocks
.as_mut()
.info
.blocks
.last_mut()
.unwrap()
.noise_level = NoiseLevel::ZERO;

self.apply_lookup_table_async(
carry_blocks.as_mut(),
result.as_ref(),
@@ -504,43 +527,10 @@ impl CudaServerKey {
streams,
);

let mut rotated_carry_blocks: CudaSignedRadixCiphertext =
self.create_trivial_zero_radix(counter_num_blocks, streams);

let mut rotated_slice = rotated_carry_blocks
.as_mut()
.d_blocks
.0
.d_vec
.as_mut_slice(0..(counter_num_blocks) * lwe_size, 0)
.unwrap();

let first_block;
let last_blocks;
(first_block, last_blocks) = rotated_slice.split_at_mut(lwe_size, 0);

let mut tmp_carry_blocks3 = carry_blocks.duplicate(streams);
let carry_slice = tmp_carry_blocks3
.as_mut()
.d_blocks
.0
.d_vec
.as_mut_slice(0..(counter_num_blocks - 1) * lwe_size, 0)
.unwrap();

last_blocks
.unwrap()
.copy_from_gpu_async(&carry_slice, streams, 0);
first_block
.unwrap()
.copy_from_gpu_async(&trivial_last_block_slice, streams, 0);
let mut ciphertexts = Vec::<CudaSignedRadixCiphertext>::with_capacity(3);

for mut info in &mut rotated_carry_blocks.ciphertext.info.blocks {
info.degree = Degree(self.message_modulus.0 - 1);
}
ciphertexts.push(message_blocks);
ciphertexts.push(rotated_carry_blocks);
ciphertexts.push(carry_blocks);

let trivial_ct: CudaSignedRadixCiphertext =
self.create_trivial_radix_async(2u32, counter_num_blocks, streams);

@@ -1337,14 +1337,6 @@ impl CudaServerKey {
.unwrap();
let mut generates_or_propagates_degrees = vec![0; num_blocks];
let mut generates_or_propagates_noise_levels = vec![0; num_blocks];
for (i, block_index) in (block_range.clone()).enumerate() {
generates_or_propagates_degrees[i] =
generates_or_propagates.info.blocks[block_index].degree.0;
generates_or_propagates_noise_levels[i] = generates_or_propagates.info.blocks
[block_index]
.noise_level
.0;
}
let ct_modulus = output.d_blocks.ciphertext_modulus().raw_modulus_float();
let mut output_slice = output
.d_blocks

@@ -17,39 +17,39 @@ where
P: Into<TestParameters> + Clone,
{
// Binary Ops Executors
let add_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::add);
let sub_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::sub);
//let add_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::add);
//let sub_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::sub);
let bitwise_and_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::bitand);
let bitwise_or_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::bitor);
let bitwise_xor_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::bitxor);
let mul_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::mul);
let rotate_left_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::rotate_left);
let left_shift_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::left_shift);
let rotate_right_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::rotate_right);
let right_shift_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::right_shift);
let max_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::max);
let min_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::min);
//let rotate_left_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::rotate_left);
//let left_shift_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::left_shift);
//let rotate_right_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::rotate_right);
//let right_shift_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::right_shift);
//let max_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::max);
//let min_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::min);

// Binary Ops Clear functions
let clear_add = |x, y| x + y;
let clear_sub = |x, y| x - y;
//let clear_add = |x, y| x + y;
//let clear_sub = |x, y| x - y;
let clear_bitwise_and = |x, y| x & y;
let clear_bitwise_or = |x, y| x | y;
let clear_bitwise_xor = |x, y| x ^ y;
let clear_mul = |x, y| x * y;
// Warning this rotate definition only works with 64-bit ciphertexts
let clear_rotate_left = |x: u64, y: u64| x.rotate_left(y as u32);
let clear_left_shift = |x, y| x << y;
// Warning this rotate definition only works with 64-bit ciphertexts
let clear_rotate_right = |x: u64, y: u64| x.rotate_right(y as u32);
let clear_right_shift = |x, y| x >> y;
let clear_max = |x: u64, y: u64| max(x, y);
let clear_min = |x: u64, y: u64| min(x, y);
//let clear_rotate_left = |x: u64, y: u64| x.rotate_left(y as u32);
//let clear_left_shift = |x, y| x << y;
//// Warning this rotate definition only works with 64-bit ciphertexts
//let clear_rotate_right = |x: u64, y: u64| x.rotate_right(y as u32);
//let clear_right_shift = |x, y| x >> y;
//let clear_max = |x: u64, y: u64| max(x, y);
//let clear_min = |x: u64, y: u64| min(x, y);

#[allow(clippy::type_complexity)]
let mut binary_ops: Vec<(BinaryOpExecutor, &dyn Fn(u64, u64) -> u64, String)> = vec![
(Box::new(add_executor), &clear_add, "add".to_string()),
(Box::new(sub_executor), &clear_sub, "sub".to_string()),
//(Box::new(add_executor), &clear_add, "add".to_string()),
//(Box::new(sub_executor), &clear_sub, "sub".to_string()),
(
Box::new(bitwise_and_executor),
&clear_bitwise_and,
@@ -66,28 +66,28 @@ where
"bitxor".to_string(),
),
(Box::new(mul_executor), &clear_mul, "mul".to_string()),
(
Box::new(rotate_left_executor),
&clear_rotate_left,
"rotate left".to_string(),
),
(
Box::new(left_shift_executor),
&clear_left_shift,
"left shift".to_string(),
),
(
Box::new(rotate_right_executor),
&clear_rotate_right,
"rotate right".to_string(),
),
(
Box::new(right_shift_executor),
&clear_right_shift,
"right shift".to_string(),
),
(Box::new(max_executor), &clear_max, "max".to_string()),
(Box::new(min_executor), &clear_min, "min".to_string()),
//(
// Box::new(rotate_left_executor),
// &clear_rotate_left,
// "rotate left".to_string(),
//),
//(
// Box::new(left_shift_executor),
// &clear_left_shift,
// "left shift".to_string(),
//),
//(
// Box::new(rotate_right_executor),
// &clear_rotate_right,
// "rotate right".to_string(),
//),
//(
// Box::new(right_shift_executor),
// &clear_right_shift,
// "right shift".to_string(),
//),
//(Box::new(max_executor), &clear_max, "max".to_string()),
//(Box::new(min_executor), &clear_min, "min".to_string()),
];

// Unary Ops Executors
@@ -115,8 +115,8 @@ where
];

// Scalar binary Ops Executors
let scalar_add_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_add);
let scalar_sub_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_sub);
//let scalar_add_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_add);
//let scalar_sub_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_sub);
let scalar_bitwise_and_executor =
GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_bitand);
let scalar_bitwise_or_executor =
@@ -124,27 +124,27 @@ where
let scalar_bitwise_xor_executor =
GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_bitxor);
let scalar_mul_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_mul);
let scalar_rotate_left_executor =
GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_rotate_left);
let scalar_left_shift_executor =
GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_left_shift);
let scalar_rotate_right_executor =
GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_rotate_right);
let scalar_right_shift_executor =
GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_right_shift);
//let scalar_rotate_left_executor =
// GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_rotate_left);
//let scalar_left_shift_executor =
// GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_left_shift);
//let scalar_rotate_right_executor =
// GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_rotate_right);
//let scalar_right_shift_executor =
// GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_right_shift);

#[allow(clippy::type_complexity)]
let mut scalar_binary_ops: Vec<(ScalarBinaryOpExecutor, &dyn Fn(u64, u64) -> u64, String)> = vec![
(
Box::new(scalar_add_executor),
&clear_add,
"scalar add".to_string(),
),
(
Box::new(scalar_sub_executor),
&clear_sub,
"scalar sub".to_string(),
),
//(
// Box::new(scalar_add_executor),
// &clear_add,
// "scalar add".to_string(),
//),
//(
// Box::new(scalar_sub_executor),
// &clear_sub,
// "scalar sub".to_string(),
//),
(
Box::new(scalar_bitwise_and_executor),
&clear_bitwise_and,
@@ -165,26 +165,26 @@ where
&clear_mul,
"scalar mul".to_string(),
),
(
Box::new(scalar_rotate_left_executor),
&clear_rotate_left,
"scalar rotate left".to_string(),
),
(
Box::new(scalar_left_shift_executor),
&clear_left_shift,
"scalar left shift".to_string(),
),
(
Box::new(scalar_rotate_right_executor),
&clear_rotate_right,
"scalar rotate right".to_string(),
),
(
Box::new(scalar_right_shift_executor),
&clear_right_shift,
"scalar right shift".to_string(),
),
//(
// Box::new(scalar_rotate_left_executor),
// &clear_rotate_left,
// "scalar rotate left".to_string(),
//),
//(
// Box::new(scalar_left_shift_executor),
// &clear_left_shift,
// "scalar left shift".to_string(),
//),
//(
// Box::new(scalar_rotate_right_executor),
// &clear_rotate_right,
// "scalar rotate right".to_string(),
//),
//(
// Box::new(scalar_right_shift_executor),
// &clear_right_shift,
// "scalar right shift".to_string(),
//),
];

// Overflowing Ops Executors
@@ -249,37 +249,37 @@ where

// Comparison Ops Executors
let gt_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::gt);
let ge_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::ge);
let lt_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::lt);
let le_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::le);
let eq_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::eq);
let ne_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::ne);
//let ge_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::ge);
//let lt_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::lt);
//let le_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::le);
//let eq_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::eq);
//let ne_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::ne);

// Comparison Ops Clear functions
let clear_gt = |x: u64, y: u64| -> bool { x > y };
let clear_ge = |x: u64, y: u64| -> bool { x >= y };
let clear_lt = |x: u64, y: u64| -> bool { x < y };
let clear_le = |x: u64, y: u64| -> bool { x <= y };
let clear_eq = |x: u64, y: u64| -> bool { x == y };
let clear_ne = |x: u64, y: u64| -> bool { x != y };
//let clear_ge = |x: u64, y: u64| -> bool { x >= y };
//let clear_lt = |x: u64, y: u64| -> bool { x < y };
//let clear_le = |x: u64, y: u64| -> bool { x <= y };
//let clear_eq = |x: u64, y: u64| -> bool { x == y };
//let clear_ne = |x: u64, y: u64| -> bool { x != y };

#[allow(clippy::type_complexity)]
let mut comparison_ops: Vec<(ComparisonOpExecutor, &dyn Fn(u64, u64) -> bool, String)> = vec![
(Box::new(gt_executor), &clear_gt, "gt".to_string()),
(Box::new(ge_executor), &clear_ge, "ge".to_string()),
(Box::new(lt_executor), &clear_lt, "lt".to_string()),
(Box::new(le_executor), &clear_le, "le".to_string()),
(Box::new(eq_executor), &clear_eq, "eq".to_string()),
(Box::new(ne_executor), &clear_ne, "ne".to_string()),
//(Box::new(ge_executor), &clear_ge, "ge".to_string()),
//(Box::new(lt_executor), &clear_lt, "lt".to_string()),
//(Box::new(le_executor), &clear_le, "le".to_string()),
//(Box::new(eq_executor), &clear_eq, "eq".to_string()),
//(Box::new(ne_executor), &clear_ne, "ne".to_string()),
];

// Scalar Comparison Ops Executors
let scalar_gt_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_gt);
let scalar_ge_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_ge);
let scalar_lt_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_lt);
let scalar_le_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_le);
let scalar_eq_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_eq);
let scalar_ne_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_ne);
//let scalar_ge_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_ge);
//let scalar_lt_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_lt);
//let scalar_le_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_le);
//let scalar_eq_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_eq);
//let scalar_ne_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_ne);

#[allow(clippy::type_complexity)]
let mut scalar_comparison_ops: Vec<(
@@ -292,31 +292,31 @@ where
&clear_gt,
"scalar gt".to_string(),
),
(
Box::new(scalar_ge_executor),
&clear_ge,
"scalar ge".to_string(),
),
(
Box::new(scalar_lt_executor),
&clear_lt,
"scalar lt".to_string(),
),
(
Box::new(scalar_le_executor),
&clear_le,
"scalar le".to_string(),
),
(
Box::new(scalar_eq_executor),
&clear_eq,
"scalar eq".to_string(),
),
(
Box::new(scalar_ne_executor),
&clear_ne,
"scalar ne".to_string(),
),
//(
// Box::new(scalar_ge_executor),
// &clear_ge,
// "scalar ge".to_string(),
//),
//(
// Box::new(scalar_lt_executor),
// &clear_lt,
// "scalar lt".to_string(),
//),
//(
// Box::new(scalar_le_executor),
// &clear_le,
// "scalar le".to_string(),
//),
//(
// Box::new(scalar_eq_executor),
// &clear_eq,
// "scalar eq".to_string(),
//),
//(
// Box::new(scalar_ne_executor),
// &clear_ne,
// "scalar ne".to_string(),
//),
];

// Select Executor

@@ -19,29 +19,29 @@ where
P: Into<TestParameters> + Clone,
{
// Binary Ops Executors
let add_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::add);
let sub_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::sub);
//let add_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::add);
//let sub_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::sub);
let bitwise_and_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::bitand);
let bitwise_or_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::bitor);
let bitwise_xor_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::bitxor);
let mul_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::mul);
let max_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::max);
let min_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::min);
//let max_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::max);
//let min_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::min);

// Binary Ops Clear functions
let clear_add = |x, y| x + y;
let clear_sub = |x, y| x - y;
//let clear_add = |x, y| x + y;
//let clear_sub = |x, y| x - y;
let clear_bitwise_and = |x, y| x & y;
let clear_bitwise_or = |x, y| x | y;
let clear_bitwise_xor = |x, y| x ^ y;
let clear_mul = |x, y| x * y;
let clear_max = |x: i64, y: i64| max(x, y);
let clear_min = |x: i64, y: i64| min(x, y);
//let clear_max = |x: i64, y: i64| max(x, y);
//let clear_min = |x: i64, y: i64| min(x, y);

#[allow(clippy::type_complexity)]
let mut binary_ops: Vec<(SignedBinaryOpExecutor, &dyn Fn(i64, i64) -> i64, String)> = vec![
(Box::new(add_executor), &clear_add, "add".to_string()),
(Box::new(sub_executor), &clear_sub, "sub".to_string()),
//(Box::new(add_executor), &clear_add, "add".to_string()),
//(Box::new(sub_executor), &clear_sub, "sub".to_string()),
(
Box::new(bitwise_and_executor),
&clear_bitwise_and,
@@ -58,14 +58,14 @@ where
"bitxor".to_string(),
),
(Box::new(mul_executor), &clear_mul, "mul".to_string()),
(Box::new(max_executor), &clear_max, "max".to_string()),
(Box::new(min_executor), &clear_min, "min".to_string()),
//(Box::new(max_executor), &clear_max, "max".to_string()),
//(Box::new(min_executor), &clear_min, "min".to_string()),
];

let rotate_left_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::rotate_left);
let left_shift_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::left_shift);
let rotate_right_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::rotate_right);
let right_shift_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::right_shift);
//let left_shift_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::left_shift);
//let rotate_right_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::rotate_right);
//let right_shift_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::right_shift);
// Warning this rotate definition only works with 64-bit ciphertexts
let clear_rotate_left = |x: i64, y: u64| x.rotate_left(y as u32);
let clear_left_shift = |x: i64, y: u64| x << y;
@@ -83,21 +83,21 @@ where
&clear_rotate_left,
"rotate left".to_string(),
),
(
Box::new(left_shift_executor),
&clear_left_shift,
"left shift".to_string(),
),
(
Box::new(rotate_right_executor),
&clear_rotate_right,
"rotate right".to_string(),
),
(
Box::new(right_shift_executor),
&clear_right_shift,
"right shift".to_string(),
),
//(
// Box::new(left_shift_executor),
// &clear_left_shift,
// "left shift".to_string(),
//),
//(
// Box::new(rotate_right_executor),
// &clear_rotate_right,
// "rotate right".to_string(),
//),
//(
// Box::new(right_shift_executor),
// &clear_right_shift,
// "right shift".to_string(),
//),
];

// Unary Ops Executors
@@ -125,8 +125,8 @@ where
];

// Scalar binary Ops Executors
let scalar_add_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_add);
let scalar_sub_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_sub);
//let scalar_add_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_add);
//let scalar_sub_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_sub);
let scalar_bitwise_and_executor =
GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_bitand);
let scalar_bitwise_or_executor =
@@ -141,16 +141,16 @@ where
&dyn Fn(i64, i64) -> i64,
String,
)> = vec![
(
Box::new(scalar_add_executor),
&clear_add,
"scalar add".to_string(),
),
(
Box::new(scalar_sub_executor),
&clear_sub,
"scalar sub".to_string(),
),
//(
// Box::new(scalar_add_executor),
// &clear_add,
// "scalar add".to_string(),
//),
//(
// Box::new(scalar_sub_executor),
// &clear_sub,
// "scalar sub".to_string(),
//),
(
Box::new(scalar_bitwise_and_executor),
&clear_bitwise_and,
@@ -175,12 +175,12 @@ where

let scalar_rotate_left_executor =
GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_rotate_left);
let scalar_left_shift_executor =
GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_left_shift);
let scalar_rotate_right_executor =
GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_rotate_right);
let scalar_right_shift_executor =
GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_right_shift);
//let scalar_left_shift_executor =
// GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_left_shift);
//let scalar_rotate_right_executor =
// GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_rotate_right);
//let scalar_right_shift_executor =
// GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_right_shift);
#[allow(clippy::type_complexity)]
let mut scalar_shift_rotate_ops: Vec<(
SignedScalarShiftRotateExecutor,
@@ -192,21 +192,21 @@ where
&clear_rotate_left,
"scalar rotate left".to_string(),
),
(
Box::new(scalar_left_shift_executor),
&clear_left_shift,
"scalar left shift".to_string(),
),
(
Box::new(scalar_rotate_right_executor),
&clear_rotate_right,
"scalar rotate right".to_string(),
),
(
Box::new(scalar_right_shift_executor),
&clear_right_shift,
"scalar right shift".to_string(),
),
//(
// Box::new(scalar_left_shift_executor),
// &clear_left_shift,
// "scalar left shift".to_string(),
//),
//(
// Box::new(scalar_rotate_right_executor),
// &clear_rotate_right,
// "scalar rotate right".to_string(),
//),
//(
// Box::new(scalar_right_shift_executor),
// &clear_right_shift,
// "scalar right shift".to_string(),
//),
];

// Overflowing Ops Executors
@@ -271,11 +271,11 @@ where

// Comparison Ops Executors
let gt_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::gt);
let ge_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::ge);
let lt_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::lt);
let le_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::le);
let eq_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::eq);
let ne_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::ne);
//let ge_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::ge);
//let lt_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::lt);
//let le_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::le);
//let eq_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::eq);
//let ne_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::ne);

// Comparison Ops Clear functions
let clear_gt = |x: i64, y: i64| -> bool { x > y };
@@ -292,20 +292,20 @@ where
String,
)> = vec![
(Box::new(gt_executor), &clear_gt, "gt".to_string()),
(Box::new(ge_executor), &clear_ge, "ge".to_string()),
(Box::new(lt_executor), &clear_lt, "lt".to_string()),
(Box::new(le_executor), &clear_le, "le".to_string()),
(Box::new(eq_executor), &clear_eq, "eq".to_string()),
(Box::new(ne_executor), &clear_ne, "ne".to_string()),
//(Box::new(ge_executor), &clear_ge, "ge".to_string()),
//(Box::new(lt_executor), &clear_lt, "lt".to_string()),
//(Box::new(le_executor), &clear_le, "le".to_string()),
//(Box::new(eq_executor), &clear_eq, "eq".to_string()),
//(Box::new(ne_executor), &clear_ne, "ne".to_string()),
];

// Scalar Comparison Ops Executors
let scalar_gt_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_gt);
let scalar_ge_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_ge);
let scalar_lt_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_lt);
let scalar_le_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_le);
let scalar_eq_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_eq);
let scalar_ne_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_ne);
//let scalar_ge_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_ge);
//let scalar_lt_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_lt);
//let scalar_le_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_le);
//let scalar_eq_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_eq);
//let scalar_ne_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_ne);

#[allow(clippy::type_complexity)]
let mut scalar_comparison_ops: Vec<(
@@ -318,31 +318,31 @@ where
&clear_gt,
"scalar gt".to_string(),
),
(
Box::new(scalar_ge_executor),
&clear_ge,
"scalar ge".to_string(),
),
(
Box::new(scalar_lt_executor),
&clear_lt,
"scalar lt".to_string(),
),
(
Box::new(scalar_le_executor),
&clear_le,
"scalar le".to_string(),
),
(
Box::new(scalar_eq_executor),
&clear_eq,
"scalar eq".to_string(),
),
(
Box::new(scalar_ne_executor),
&clear_ne,
"scalar ne".to_string(),
),
//(
// Box::new(scalar_ge_executor),
// &clear_ge,
// "scalar ge".to_string(),
//),
//(
// Box::new(scalar_lt_executor),
// &clear_lt,
// "scalar lt".to_string(),
//),
//(
// Box::new(scalar_le_executor),
// &clear_le,
// "scalar le".to_string(),
//),
//(
// Box::new(scalar_eq_executor),
// &clear_eq,
// "scalar eq".to_string(),
//),
//(
// Box::new(scalar_ne_executor),
// &clear_ne,
// "scalar ne".to_string(),
//),
];

// Select Executor

@@ -289,11 +289,6 @@ impl ServerKey {
.unchecked_partial_sum_ciphertexts_vec_parallelized(cts, None)
.expect("internal error, empty ciphertext count");

for block in &result.blocks {
let val_clear = block.ct.get_body().data / 576460752303423488u64;
println!("cpu_after_first_partial_sum: {:?} {:?} {:?}", block.ct.get_body().data,
val_clear, block.degree.0);
}
// This is the part where we extract message and carry blocks
// while inverting their bits
let (message_blocks, carry_blocks) = rayon::join(
@@ -341,11 +336,6 @@ impl ServerKey {
)
.unwrap();

for block in &result.blocks {
let val_clear = block.ct.get_body().data / 576460752303423488u64;
println!("cpu_after_last_sum: {:?} {:?} {:?}", block.ct.get_body().data,
val_clear, block.degree.0);
}
self.cast_to_unsigned(result, counter_num_blocks)
}


@@ -450,27 +450,8 @@ impl ServerKey {
return;
}

// for block in lhs.blocks() {
// println!("lhs_degrees: {:?}", block.degree)
// }
//
// for block in rhs.blocks() {
// println!("rhs_degrees: {:?}", block.degree)
// }
//
let terms = self.compute_terms_for_mul_low(lhs, rhs);
println!("cpu_terms");

for radix_ct in &terms {
for block in radix_ct.blocks() {
println!("{:?}", block.ct.get_body().data);
}
}
for radix_ct in &terms {
for block in radix_ct.blocks() {
println!("{:?}", block.degree.0);
}
}
if let Some(result) = self.unchecked_sum_ciphertexts_vec_parallelized(terms) {
*lhs = result;
} else {

@@ -1,5 +1,5 @@
use crate::integer::ciphertext::IntegerRadixCiphertext;
use crate::integer::{BooleanBlock, IntegerCiphertext, RadixCiphertext, ServerKey, SignedRadixCiphertext};
use crate::integer::{BooleanBlock, IntegerCiphertext, RadixCiphertext, ServerKey};
use crate::shortint::ciphertext::Degree;
use crate::shortint::Ciphertext;
use rayon::prelude::*;
@@ -158,17 +158,6 @@ impl ServerKey {
{
let mut result =
self.unchecked_partial_sum_ciphertexts_vec_parallelized(ciphertexts, None)?;
println!("cpu_after_sum_ct");
for block in result.blocks() {
println!("{:?}", block.ct.get_body().data)
}

println!("cpu_after_sum_clear");
for block in result.blocks() {
let body = block.ct.get_body().data;
let delta = 576460752303423488u64;
println!("{:?}", body / delta)
}

self.full_propagate_parallelized(&mut result);
assert!(result.block_carries_are_empty());
@@ -209,14 +198,6 @@ impl ServerKey {
}
});

for radix in &ciphertexts {
//let radix_ct : SignedRadixCiphertext = radix;
for block in radix.blocks() {
let val_clear = block.ct.get_body().data / 576460752303423488u64;
println!("cpu_input_sum: {:?} {:?} {:?}", block.ct.get_body().data,
val_clear, block.degree.0);
}
}
self.unchecked_sum_ciphertexts_vec_parallelized(ciphertexts)
}


@@ -3,4 +3,4 @@ pub(crate) mod test_random_op_sequence;
pub(crate) mod test_signed_erc20;
pub(crate) mod test_signed_random_op_sequence;
pub(crate) const NB_CTXT_LONG_RUN: usize = 32;
pub(crate) const NB_TESTS_LONG_RUN: usize = 20000;
pub(crate) const NB_TESTS_LONG_RUN: usize = 200;

@@ -588,6 +588,9 @@ pub(crate) fn random_op_sequence_test<P>(
left_vec[i].blocks.iter().map(|b| b.degree.0).collect();
let input_degrees_right: Vec<u64> =
right_vec[i].blocks.iter().map(|b| b.degree.0).collect();
let output_degrees: Vec<u64> =
res.blocks.iter().map(|b| b.degree.0).collect();
println!("Input degrees left: {input_degrees_left:?}, right {input_degrees_right:?}, Output degrees {:?}", output_degrees);
let decrypted_res: u64 = cks.decrypt(&res);
let expected_res: u64 = clear_fn(clear_left, clear_right);


@@ -680,6 +680,9 @@ pub(crate) fn signed_random_op_sequence_test<P>(
left_vec[i].blocks.iter().map(|b| b.degree.0).collect();
let input_degrees_right: Vec<u64> =
right_vec[i].blocks.iter().map(|b| b.degree.0).collect();
let output_degrees: Vec<u64> =
res.blocks.iter().map(|b| b.degree.0).collect();
println!("Input degrees left: {input_degrees_left:?}, right {input_degrees_right:?}, Output degrees {:?}", output_degrees);
let decrypt_signed_res: i64 = cks.decrypt_signed(&res);
let expected_res: i64 = clear_fn(clear_left, clear_right);

@@ -731,6 +734,9 @@ pub(crate) fn signed_random_op_sequence_test<P>(
"Determinism check failed on unary op {fn_name} with clear input {clear_input}.",
);
let input_degrees: Vec<u64> = input.blocks.iter().map(|b| b.degree.0).collect();
let output_degrees: Vec<u64> =
res.blocks.iter().map(|b| b.degree.0).collect();
println!("Output degrees {:?}", output_degrees);
let decrypt_signed_res: i64 = cks.decrypt_signed(&res);
let expected_res: i64 = clear_fn(clear_input);
if i % 2 == 0 {
@@ -774,6 +780,9 @@ pub(crate) fn signed_random_op_sequence_test<P>(
);
let input_degrees_left: Vec<u64> =
left_vec[i].blocks.iter().map(|b| b.degree.0).collect();
let output_degrees: Vec<u64> =
res.blocks.iter().map(|b| b.degree.0).collect();
println!("Output degrees {:?}", output_degrees);
let decrypt_signed_res: i64 = cks.decrypt_signed(&res);
let expected_res: i64 = clear_fn(clear_left, clear_right);

@@ -829,6 +838,9 @@ pub(crate) fn signed_random_op_sequence_test<P>(
left_vec[i].blocks.iter().map(|b| b.degree.0).collect();
let input_degrees_right: Vec<u64> =
right_vec[i].blocks.iter().map(|b| b.degree.0).collect();
let output_degrees: Vec<u64> =
res.blocks.iter().map(|b| b.degree.0).collect();
println!("Output degrees {:?}", output_degrees);
let decrypt_signed_res: i64 = cks.decrypt_signed(&res);
let decrypt_signed_overflow = cks.decrypt_bool(&overflow);
let (expected_res, expected_overflow) = clear_fn(clear_left, clear_right);
@@ -889,6 +901,9 @@ pub(crate) fn signed_random_op_sequence_test<P>(
);
let input_degrees_left: Vec<u64> =
left_vec[i].blocks.iter().map(|b| b.degree.0).collect();
let output_degrees: Vec<u64> =
res.blocks.iter().map(|b| b.degree.0).collect();
println!("Output degrees {:?}", output_degrees);
let decrypt_signed_res: i64 = cks.decrypt_signed(&res);
let decrypt_signed_overflow = cks.decrypt_bool(&overflow);
let (expected_res, expected_overflow) = clear_fn(clear_left, clear_right);
@@ -1020,6 +1035,9 @@ pub(crate) fn signed_random_op_sequence_test<P>(
left_vec[i].blocks.iter().map(|b| b.degree.0).collect();
let input_degrees_right: Vec<u64> =
right_vec[i].blocks.iter().map(|b| b.degree.0).collect();
let output_degrees: Vec<u64> =
res.blocks.iter().map(|b| b.degree.0).collect();
println!("Output degrees {:?}", output_degrees);
let decrypt_signed_res: i64 = cks.decrypt_signed(&res);
let expected_res = clear_fn(clear_bool, clear_left, clear_right);

@@ -1081,6 +1099,12 @@ pub(crate) fn signed_random_op_sequence_test<P>(
left_vec[i].blocks.iter().map(|b| b.degree.0).collect();
let input_degrees_right: Vec<u64> =
right_vec[i].blocks.iter().map(|b| b.degree.0).collect();
let output_degrees_q: Vec<u64> =
res_q.blocks.iter().map(|b| b.degree.0).collect();
let output_degrees_r: Vec<u64> =
res_r.blocks.iter().map(|b| b.degree.0).collect();
println!("Output degrees {:?}", output_degrees_q);
println!("Output degrees {:?}", output_degrees_r);
let decrypt_signed_res_q: i64 = cks.decrypt_signed(&res_q);
let decrypt_signed_res_r: i64 = cks.decrypt_signed(&res_r);
let (expected_res_q, expected_res_r) = clear_fn(clear_left, clear_right);
@@ -1147,6 +1171,12 @@ pub(crate) fn signed_random_op_sequence_test<P>(
);
let input_degrees_left: Vec<u64> =
left_vec[i].blocks.iter().map(|b| b.degree.0).collect();
let output_q_degrees: Vec<u64> =
res_r.blocks.iter().map(|b| b.degree.0).collect();
let output_r_degrees: Vec<u64> =
res_r.blocks.iter().map(|b| b.degree.0).collect();
println!("Output r degrees {:?}", output_r_degrees);
println!("Output q degrees {:?}", output_q_degrees);
let decrypt_signed_res_q: i64 = cks.decrypt_signed(&res_q);
let decrypt_signed_res_r: i64 = cks.decrypt_signed(&res_r);
let (expected_res_q, expected_res_r) = clear_fn(clear_left, clear_right);
@@ -1205,6 +1235,9 @@ pub(crate) fn signed_random_op_sequence_test<P>(
"Determinism check failed on op {fn_name} with clear input {clear_input}.",
);
let input_degrees: Vec<u64> = input.blocks.iter().map(|b| b.degree.0).collect();
let output_degrees: Vec<u64> =
res.blocks.iter().map(|b| b.degree.0).collect();
println!("Output degrees {:?}", output_degrees);
let cast_res = sks.cast_to_signed(res, NB_CTXT_LONG_RUN);
let decrypt_signed_res: i64 = cks.decrypt_signed(&cast_res);
let expected_res = clear_fn(clear_input) as i64;
@@ -1252,6 +1285,9 @@ pub(crate) fn signed_random_op_sequence_test<P>(
left_vec[i].blocks.iter().map(|b| b.degree.0).collect();
let input_degrees_right: Vec<u64> =
unsigned_right.blocks.iter().map(|b| b.degree.0).collect();
let output_degrees: Vec<u64> =
res.blocks.iter().map(|b| b.degree.0).collect();
println!("Output degrees {:?}", output_degrees);
let decrypt_signed_res: i64 = cks.decrypt_signed(&res);
let expected_res: i64 = clear_fn(clear_left, clear_right as u64);

@@ -1297,6 +1333,9 @@ pub(crate) fn signed_random_op_sequence_test<P>(
);
let input_degrees_left: Vec<u64> =
left_vec[i].blocks.iter().map(|b| b.degree.0).collect();
let output_degrees: Vec<u64> =
res.blocks.iter().map(|b| b.degree.0).collect();
println!("Output degrees {:?}", output_degrees);
let decrypt_signed_res: i64 = cks.decrypt_signed(&res);
let expected_res: i64 = clear_fn(clear_left, clear_right as u64);