Compare commits

...

1 Commit

Author SHA1 Message Date
Beka Barbakadze
84e43630b4 feat(gpu): Implements optimized division algorithm for message_2_carry_2 2025-09-01 15:13:04 +04:00
6 changed files with 1295 additions and 274 deletions

View File

@@ -2,27 +2,7 @@
#define CUDA_INTEGER_COMPRESSION_H
#include "../../pbs/pbs_enums.h"
typedef struct {
void *ptr;
uint32_t num_radix_blocks;
uint32_t lwe_dimension;
} CudaLweCiphertextListFFI;
typedef struct {
void *ptr;
uint32_t storage_log_modulus;
uint32_t lwe_per_glwe;
// Input LWEs are grouped in chunks of `lwe_per_glwe` (the last chunk may be
// smaller). Each chunk is packed into one GLWE with `lwe_per_glwe` bodies
// (one per LWE in the chunk), so the total number of bodies equals the
// number of input LWEs.
uint32_t total_lwe_bodies_count;
uint32_t glwe_dimension;
uint32_t polynomial_size;
} CudaPackedGlweCiphertextListFFI;
#include "../integer.h"
extern "C" {
uint64_t scratch_cuda_integer_compress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,

View File

@@ -80,6 +80,26 @@ typedef struct {
bool const divisor_has_more_bits_than_numerator;
} CudaScalarDivisorFFI;
typedef struct {
void *ptr;
uint32_t num_radix_blocks;
uint32_t lwe_dimension;
} CudaLweCiphertextListFFI;
typedef struct {
void *ptr;
uint32_t storage_log_modulus;
uint32_t lwe_per_glwe;
// Input LWEs are grouped in chunks of `lwe_per_glwe` (the last chunk may be
// smaller). Each chunk is packed into one GLWE with `lwe_per_glwe` bodies
// (one per LWE in the chunk), so the total number of bodies equals the
// number of input LWEs.
uint32_t total_lwe_bodies_count;
uint32_t glwe_dimension;
uint32_t polynomial_size;
} CudaPackedGlweCiphertextListFFI;
uint64_t scratch_cuda_apply_univariate_lut_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,

View File

@@ -4153,6 +4153,771 @@ template <typename Torus> struct int_comparison_buffer {
}
};
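// Scratch for radix subtraction (a sketch of the intent, inferred from the
// members below): rhs is negated into neg_rhs_array, then the result is
// carry-propagated with the same machinery as addition (sc_prop_mem).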
template <typename Torus> struct int_sub_and_propagate {
int_radix_params params;
bool allocate_gpu_memory;
CudaRadixCiphertextFFI *neg_rhs_array;
int_sc_prop_memory<Torus> *sc_prop_mem;
int_sub_and_propagate(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
const int_radix_params params,
uint32_t num_radix_blocks, uint32_t requested_flag_in,
bool allocate_gpu_memory, uint64_t &size_tracker) {
this->params = params;
this->allocate_gpu_memory = allocate_gpu_memory;
this->sc_prop_mem = new int_sc_prop_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
requested_flag_in, (uint32_t)0, allocate_gpu_memory, size_tracker);
this->neg_rhs_array = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], neg_rhs_array, num_radix_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
sc_prop_mem->release(streams, gpu_indexes, gpu_count);
delete sc_prop_mem;
release_radix_ciphertext_async(streams[0], gpu_indexes[0], neg_rhs_array,
allocate_gpu_memory);
delete neg_rhs_array;
}
};
template <typename Torus> struct int_bitop_buffer {
int_radix_params params;
int_radix_lut<Torus> *lut;
BITOP_TYPE op;
bool gpu_memory_allocated;
int_bitop_buffer(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, BITOP_TYPE op, int_radix_params params,
uint32_t num_radix_blocks, bool allocate_gpu_memory,
uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
this->op = op;
this->params = params;
switch (op) {
case BITAND:
case BITOR:
case BITXOR:
lut = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
num_radix_blocks, allocate_gpu_memory,
size_tracker);
{
auto lut_bivariate_f = [op](Torus lhs, Torus rhs) -> Torus {
if (op == BITOP_TYPE::BITAND) {
// AND
return lhs & rhs;
} else if (op == BITOP_TYPE::BITOR) {
// OR
return lhs | rhs;
} else {
// XOR
return lhs ^ rhs;
}
};
generate_device_accumulator_bivariate<Torus>(
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_bivariate_f, gpu_memory_allocated);
lut->broadcast_lut(streams, gpu_indexes);
}
break;
default:
// Scalar OP
lut = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
params.message_modulus, num_radix_blocks,
allocate_gpu_memory, size_tracker);
for (int i = 0; i < params.message_modulus; i++) {
auto rhs = i;
auto lut_univariate_scalar_f = [op, rhs](Torus x) -> Torus {
if (op == BITOP_TYPE::SCALAR_BITAND) {
// AND
return x & rhs;
} else if (op == BITOP_TYPE::SCALAR_BITOR) {
// OR
return x | rhs;
} else {
// XOR
return x ^ rhs;
}
};
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], lut->get_lut(0, i), lut->get_degree(i),
lut->get_max_degree(i), params.glwe_dimension,
params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_univariate_scalar_f,
gpu_memory_allocated);
lut->broadcast_lut(streams, gpu_indexes);
}
}
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
lut->release(streams, gpu_indexes, gpu_count);
delete lut;
}
};
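// Scratch memory for the block-by-block division specialized to
// message_modulus = 4 / carry_modulus = 4 (the 2_2 parameters). It holds the
// precomputed multiples of the divisor (d, 2*d, 3*d), the three candidate
// subtraction results tried at each iteration, and the sub-streams used to
// run those candidates concurrently.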
template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
bool gpu_memory_allocated;
int_radix_params params;
uint32_t active_gpu_count;
// memory objects for other operations
int_borrow_prop_memory<Torus> *overflow_sub_mem_1;
int_borrow_prop_memory<Torus> *overflow_sub_mem_2;
int_borrow_prop_memory<Torus> *overflow_sub_mem_3;
int_comparison_buffer<Torus> *comparison_buffer_1;
int_comparison_buffer<Torus> *comparison_buffer_2;
int_comparison_buffer<Torus> *comparison_buffer_3;
int_sub_and_propagate<Torus> *sub_and_propagate_mem;
int_bitop_buffer<Torus> *bitor_mem_1;
int_bitop_buffer<Torus> *bitor_mem_2;
int_bitop_buffer<Torus> *bitor_mem_3;
int_logical_scalar_shift_buffer<Torus> *shift_mem;
// lookup tables
int_radix_lut<Torus> *message_extract_lut_1;
int_radix_lut<Torus> *message_extract_lut_2;
int_radix_lut<Torus> *zero_out_if_not_1_lut_1;
int_radix_lut<Torus> *zero_out_if_not_1_lut_2;
int_radix_lut<Torus> *zero_out_if_not_2_lut_1;
int_radix_lut<Torus> *zero_out_if_not_2_lut_2;
int_radix_lut<Torus> *quotient_lut_1;
int_radix_lut<Torus> *quotient_lut_2;
int_radix_lut<Torus> *quotient_lut_3;
// sub streams
cudaStream_t *sub_streams_1;
cudaStream_t *sub_streams_2;
cudaStream_t *sub_streams_3;
cudaStream_t *sub_streams_4;
cudaStream_t *sub_streams_5;
cudaStream_t *sub_streams_6;
cudaStream_t *sub_streams_7;
// temporary device buffers
CudaRadixCiphertextFFI *d1; // num_blocks + 1
CudaRadixCiphertextFFI *d2; // num_blocks + 1
CudaRadixCiphertextFFI *d3; // num_blocks + 1
CudaRadixCiphertextFFI *low1; // num_blocks
CudaRadixCiphertextFFI *low2; // num_blocks
CudaRadixCiphertextFFI *low3; // num_blocks
CudaRadixCiphertextFFI *rem; // num_blocks
CudaRadixCiphertextFFI *sub_result_1; // num_blocks
CudaRadixCiphertextFFI *sub_result_2; // num_blocks
CudaRadixCiphertextFFI *sub_result_3; // num_blocks
CudaRadixCiphertextFFI *sub_1_overflowed; // num_blocks
CudaRadixCiphertextFFI *sub_2_overflowed; // num_blocks
CudaRadixCiphertextFFI *sub_3_overflowed; // num_blocks
CudaRadixCiphertextFFI *comparison_blocks_1; // num_blocks
CudaRadixCiphertextFFI *comparison_blocks_2; // num_blocks
CudaRadixCiphertextFFI *comparison_blocks_3; // num_blocks
CudaRadixCiphertextFFI *cmp_1; // boolean block
CudaRadixCiphertextFFI *cmp_2; // boolean block
CudaRadixCiphertextFFI *cmp_3; // boolean block
CudaRadixCiphertextFFI *c0; // single block
CudaRadixCiphertextFFI *c1; // single block
CudaRadixCiphertextFFI *c2; // single block
CudaRadixCiphertextFFI *c3; // single block
CudaRadixCiphertextFFI *q1; // single block
CudaRadixCiphertextFFI *q2; // single block
CudaRadixCiphertextFFI *q3; // single block
Torus **first_indexes_for_overflow_sub;
Torus **second_indexes_for_overflow_sub;
Torus **scalars_for_overflow_sub;
uint32_t max_indexes_to_erase;
// Allocates (and zero-initializes, when GPU memory allocation is enabled)
// the temporary arrays used by the cuda integer div_rem_2_2 operation
void init_temporary_buffers(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
uint32_t num_blocks, bool allocate_gpu_memory,
uint64_t &size_tracker) {
// more than one block temporary arrays
d1 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], d1, num_blocks + 1,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
d2 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], d2, num_blocks + 1,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
d3 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], d3, num_blocks + 1,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
low1 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], low1, num_blocks, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
low2 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], low2, num_blocks, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
low3 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], low3, num_blocks, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
rem = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], rem, num_blocks, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
sub_result_1 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], sub_result_1, num_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
sub_result_2 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], sub_result_2, num_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
sub_result_3 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], sub_result_3, num_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
sub_1_overflowed = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], sub_1_overflowed, 1,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
sub_2_overflowed = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], sub_2_overflowed, 1,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
sub_3_overflowed = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], sub_3_overflowed, 1,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
comparison_blocks_1 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], comparison_blocks_1, num_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
comparison_blocks_2 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], comparison_blocks_2, num_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
comparison_blocks_3 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], comparison_blocks_3, num_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
// boolean blocks or single block temporary arrays
cmp_1 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], cmp_1, 1, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
cmp_2 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], cmp_2, 1, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
cmp_3 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], cmp_3, 1, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
c0 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], c0, 1, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
c1 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], c1, 1, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
c2 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], c2, 1, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
c3 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], c3, 1, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
q1 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], q1, 1, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
q2 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], q2, 1, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
q3 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], q3, 1, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
}
// initialize lookup tables for div_rem_2_2 operation
void init_lookup_tables(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
uint32_t num_blocks, bool allocate_gpu_memory,
uint64_t &size_tracker) {
message_extract_lut_1 =
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
num_blocks, allocate_gpu_memory, size_tracker);
message_extract_lut_2 =
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
num_blocks, allocate_gpu_memory, size_tracker);
zero_out_if_not_1_lut_1 =
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
num_blocks, allocate_gpu_memory, size_tracker);
zero_out_if_not_1_lut_2 =
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
num_blocks, allocate_gpu_memory, size_tracker);
zero_out_if_not_2_lut_1 =
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
num_blocks, allocate_gpu_memory, size_tracker);
zero_out_if_not_2_lut_2 =
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
num_blocks, allocate_gpu_memory, size_tracker);
quotient_lut_1 =
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1, 1,
allocate_gpu_memory, size_tracker);
quotient_lut_2 =
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1, 1,
allocate_gpu_memory, size_tracker);
quotient_lut_3 =
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1, 1,
allocate_gpu_memory, size_tracker);
auto message_modulus = params.message_modulus;
auto lut_f_message_extract = [message_modulus](Torus x) -> Torus {
return x % message_modulus;
};
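// Input is packed as 2 * block + condition (see conditional_update in the
// division loop): keep the block value only when the condition bit is 1.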
auto zero_out_if_not_1_lut_f = [](Torus x) -> Torus {
Torus block = x / 2;
bool condition = (x & 1) == 1;
return block * (Torus)condition;
};
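// Input is packed as 3 * block + condition, with condition in {0, 1, 2}:
// keep the block value only when condition == 2.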
auto zero_out_if_not_2_lut_f = [](Torus x) -> Torus {
Torus block = x / 3;
bool condition = (x % 3) == 2;
return block * (Torus)condition;
};
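// The quotient LUTs map the selection flags c1, c2, c3 (computed in the
// division loop) to the candidate quotient digits 1, 2 and 3; at most one
// candidate fires per iteration (none when the digit is 0), so the digits
// can simply be summed.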
auto quotient_lut_1_f = [](Torus cond) -> Torus {
return (Torus)(cond == 2);
};
auto quotient_lut_2_f = [](Torus cond) -> Torus {
return (Torus)((cond == 2) * 2);
};
auto quotient_lut_3_f = [](Torus cond) -> Torus { return cond * 3; };
int_radix_lut<Torus> *luts[2] = {message_extract_lut_1,
message_extract_lut_2};
for (int j = 0; j < 2; j++) {
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], luts[j]->get_lut(0, 0),
luts[j]->get_degree(0), luts[j]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_f_message_extract, gpu_memory_allocated);
luts[j]->broadcast_lut(streams, gpu_indexes);
}
luts[0] = zero_out_if_not_1_lut_1;
luts[1] = zero_out_if_not_1_lut_2;
for (int j = 0; j < 2; j++) {
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], luts[j]->get_lut(0, 0),
luts[j]->get_degree(0), luts[j]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, zero_out_if_not_1_lut_f, gpu_memory_allocated);
luts[j]->broadcast_lut(streams, gpu_indexes);
}
luts[0] = zero_out_if_not_2_lut_1;
luts[1] = zero_out_if_not_2_lut_2;
for (int j = 0; j < 2; j++) {
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], luts[j]->get_lut(0, 0),
luts[j]->get_degree(0), luts[j]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, zero_out_if_not_2_lut_f, gpu_memory_allocated);
luts[j]->broadcast_lut(streams, gpu_indexes);
}
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], quotient_lut_1->get_lut(0, 0),
quotient_lut_1->get_degree(0), quotient_lut_1->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, quotient_lut_1_f, gpu_memory_allocated);
quotient_lut_1->broadcast_lut(streams, gpu_indexes);
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], quotient_lut_2->get_lut(0, 0),
quotient_lut_2->get_degree(0), quotient_lut_2->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, quotient_lut_2_f, gpu_memory_allocated);
quotient_lut_2->broadcast_lut(streams, gpu_indexes);
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], quotient_lut_3->get_lut(0, 0),
quotient_lut_3->get_degree(0), quotient_lut_3->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, quotient_lut_3_f, gpu_memory_allocated);
quotient_lut_3->broadcast_lut(streams, gpu_indexes);
}
unsigned_int_div_rem_2_2_memory(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params params,
uint32_t num_blocks, bool allocate_gpu_memory,
uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
active_gpu_count = get_active_gpu_count(2 * num_blocks, gpu_count);
this->params = params;
uint32_t compute_overflow = 1;
overflow_sub_mem_1 = new int_borrow_prop_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_blocks, compute_overflow,
allocate_gpu_memory, size_tracker);
overflow_sub_mem_2 = new int_borrow_prop_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_blocks, compute_overflow,
allocate_gpu_memory, size_tracker);
overflow_sub_mem_3 = new int_borrow_prop_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_blocks, compute_overflow,
allocate_gpu_memory, size_tracker);
uint32_t group_size = overflow_sub_mem_1->group_size;
bool use_seq = overflow_sub_mem_1->prop_simu_group_carries_mem
->use_sequential_algorithm_to_resolve_group_carries;
create_indexes_for_overflow_sub(streams, gpu_indexes, num_blocks,
group_size, use_seq, allocate_gpu_memory,
size_tracker);
comparison_buffer_1 = new int_comparison_buffer<Torus>(
streams, gpu_indexes, gpu_count, COMPARISON_TYPE::EQ, params,
num_blocks, false, allocate_gpu_memory, size_tracker);
comparison_buffer_2 = new int_comparison_buffer<Torus>(
streams, gpu_indexes, gpu_count, COMPARISON_TYPE::EQ, params,
num_blocks, false, allocate_gpu_memory, size_tracker);
comparison_buffer_3 = new int_comparison_buffer<Torus>(
streams, gpu_indexes, gpu_count, COMPARISON_TYPE::EQ, params,
num_blocks, false, allocate_gpu_memory, size_tracker);
sub_and_propagate_mem = new int_sub_and_propagate<Torus>(
streams, gpu_indexes, gpu_count, params, num_blocks + 1,
outputFlag::FLAG_NONE, allocate_gpu_memory, size_tracker);
bitor_mem_1 = new int_bitop_buffer<Torus>(
streams, gpu_indexes, gpu_count, BITOP_TYPE::BITOR, params, num_blocks,
allocate_gpu_memory, size_tracker);
bitor_mem_2 = new int_bitop_buffer<Torus>(
streams, gpu_indexes, gpu_count, BITOP_TYPE::BITOR, params, num_blocks,
allocate_gpu_memory, size_tracker);
bitor_mem_3 = new int_bitop_buffer<Torus>(
streams, gpu_indexes, gpu_count, BITOP_TYPE::BITOR, params, num_blocks,
allocate_gpu_memory, size_tracker);
shift_mem = new int_logical_scalar_shift_buffer<Torus>(
streams, gpu_indexes, gpu_count, SHIFT_OR_ROTATE_TYPE::LEFT_SHIFT,
params, 2 * num_blocks, allocate_gpu_memory, size_tracker);
init_lookup_tables(streams, gpu_indexes, gpu_count, num_blocks,
allocate_gpu_memory, size_tracker);
init_temporary_buffers(streams, gpu_indexes, gpu_count, num_blocks,
allocate_gpu_memory, size_tracker);
sub_streams_1 =
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
sub_streams_2 =
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
sub_streams_3 =
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
sub_streams_4 =
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
sub_streams_5 =
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
sub_streams_6 =
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
sub_streams_7 =
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
for (uint j = 0; j < active_gpu_count; j++) {
sub_streams_1[j] = cuda_create_stream(gpu_indexes[j]);
sub_streams_2[j] = cuda_create_stream(gpu_indexes[j]);
sub_streams_3[j] = cuda_create_stream(gpu_indexes[j]);
sub_streams_4[j] = cuda_create_stream(gpu_indexes[j]);
sub_streams_5[j] = cuda_create_stream(gpu_indexes[j]);
sub_streams_6[j] = cuda_create_stream(gpu_indexes[j]);
sub_streams_7[j] = cuda_create_stream(gpu_indexes[j]);
}
}
void create_indexes_for_overflow_sub(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t num_blocks, uint32_t group_size,
bool use_seq, bool allocate_gpu_memory,
uint64_t &size_tracker) {
max_indexes_to_erase = num_blocks;
first_indexes_for_overflow_sub =
(Torus **)malloc(num_blocks * sizeof(Torus *));
second_indexes_for_overflow_sub =
(Torus **)malloc(num_blocks * sizeof(Torus *));
scalars_for_overflow_sub = (Torus **)malloc(num_blocks * sizeof(Torus *));
Torus *h_lut_indexes = (Torus *)malloc(num_blocks * sizeof(Torus));
Torus *h_scalar = (Torus *)malloc(num_blocks * sizeof(Torus));
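// The division loop runs the overflowing subtraction on every prefix length
// from 1 to num_blocks, so the LUT index vectors (and padding-bit scalars)
// are precomputed here for each possible block count.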
// Extra LUT indexes for the first step of the overflowing subtraction
for (int nb = 1; nb <= num_blocks; nb++) {
first_indexes_for_overflow_sub[nb - 1] =
(Torus *)cuda_malloc_with_size_tracking_async(
nb * sizeof(Torus), streams[0], gpu_indexes[0], size_tracker,
allocate_gpu_memory);
for (int index = 0; index < nb; index++) {
uint32_t grouping_index = index / group_size;
bool is_in_first_grouping = (grouping_index == 0);
uint32_t index_in_grouping = index % group_size;
bool is_last_index = (index == (nb - 1));
if (is_last_index) {
if (nb == 1) {
h_lut_indexes[index] = 2 * group_size;
} else {
h_lut_indexes[index] = 2;
}
} else if (is_in_first_grouping) {
h_lut_indexes[index] = index_in_grouping;
} else {
h_lut_indexes[index] = index_in_grouping + group_size;
}
}
cuda_memcpy_with_size_tracking_async_to_gpu(
first_indexes_for_overflow_sub[nb - 1], h_lut_indexes,
nb * sizeof(Torus), streams[0], gpu_indexes[0], allocate_gpu_memory);
}
// Extra LUT indexes (and padding-bit scalars) for the second step
for (int nb = 1; nb <= num_blocks; nb++) {
second_indexes_for_overflow_sub[nb - 1] =
(Torus *)cuda_malloc_with_size_tracking_async(
nb * sizeof(Torus), streams[0], gpu_indexes[0], size_tracker,
allocate_gpu_memory);
scalars_for_overflow_sub[nb - 1] =
(Torus *)cuda_malloc_with_size_tracking_async(
nb * sizeof(Torus), streams[0], gpu_indexes[0], size_tracker,
allocate_gpu_memory);
for (int index = 0; index < nb; index++) {
uint32_t grouping_index = index / group_size;
bool is_in_first_grouping = (grouping_index == 0);
uint32_t index_in_grouping = index % group_size;
if (is_in_first_grouping) {
h_lut_indexes[index] = index_in_grouping;
} else if (index_in_grouping == (group_size - 1)) {
if (use_seq) {
int inner_index = (grouping_index - 1) % (group_size - 1);
h_lut_indexes[index] = inner_index + 2 * group_size;
} else {
h_lut_indexes[index] = 2 * group_size;
}
} else {
h_lut_indexes[index] = index_in_grouping + group_size;
}
bool may_have_its_padding_bit_set =
!is_in_first_grouping && (index_in_grouping == group_size - 1);
if (may_have_its_padding_bit_set) {
if (use_seq) {
h_scalar[index] = 1 << ((grouping_index - 1) % (group_size - 1));
} else {
h_scalar[index] = 1;
}
} else {
h_scalar[index] = 0;
}
}
cuda_memcpy_with_size_tracking_async_to_gpu(
second_indexes_for_overflow_sub[nb - 1], h_lut_indexes,
nb * sizeof(Torus), streams[0], gpu_indexes[0], allocate_gpu_memory);
cuda_memcpy_with_size_tracking_async_to_gpu(
scalars_for_overflow_sub[nb - 1], h_scalar, nb * sizeof(Torus),
streams[0], gpu_indexes[0], allocate_gpu_memory);
}
free(h_lut_indexes);
free(h_scalar);
};
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
// release and delete integer ops memory objects
overflow_sub_mem_1->release(streams, gpu_indexes, gpu_count);
overflow_sub_mem_2->release(streams, gpu_indexes, gpu_count);
overflow_sub_mem_3->release(streams, gpu_indexes, gpu_count);
comparison_buffer_1->release(streams, gpu_indexes, gpu_count);
comparison_buffer_2->release(streams, gpu_indexes, gpu_count);
comparison_buffer_3->release(streams, gpu_indexes, gpu_count);
sub_and_propagate_mem->release(streams, gpu_indexes, gpu_count);
bitor_mem_1->release(streams, gpu_indexes, gpu_count);
bitor_mem_2->release(streams, gpu_indexes, gpu_count);
bitor_mem_3->release(streams, gpu_indexes, gpu_count);
shift_mem->release(streams, gpu_indexes, gpu_count);
delete overflow_sub_mem_1;
delete overflow_sub_mem_2;
delete overflow_sub_mem_3;
delete comparison_buffer_1;
delete comparison_buffer_2;
delete comparison_buffer_3;
delete sub_and_propagate_mem;
delete bitor_mem_1;
delete bitor_mem_2;
delete bitor_mem_3;
delete shift_mem;
// release and delete lut objects
message_extract_lut_1->release(streams, gpu_indexes, gpu_count);
message_extract_lut_2->release(streams, gpu_indexes, gpu_count);
zero_out_if_not_1_lut_1->release(streams, gpu_indexes, gpu_count);
zero_out_if_not_1_lut_2->release(streams, gpu_indexes, gpu_count);
zero_out_if_not_2_lut_1->release(streams, gpu_indexes, gpu_count);
zero_out_if_not_2_lut_2->release(streams, gpu_indexes, gpu_count);
quotient_lut_1->release(streams, gpu_indexes, gpu_count);
quotient_lut_2->release(streams, gpu_indexes, gpu_count);
quotient_lut_3->release(streams, gpu_indexes, gpu_count);
delete message_extract_lut_1;
delete message_extract_lut_2;
delete zero_out_if_not_1_lut_1;
delete zero_out_if_not_1_lut_2;
delete zero_out_if_not_2_lut_1;
delete zero_out_if_not_2_lut_2;
delete quotient_lut_1;
delete quotient_lut_2;
delete quotient_lut_3;
// release and delete temporary buffers
release_radix_ciphertext_async(streams[0], gpu_indexes[0], d1,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], d2,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], d3,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], low1,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], low2,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], low3,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], rem,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], sub_result_1,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], sub_result_2,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], sub_result_3,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], sub_1_overflowed,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], sub_2_overflowed,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], sub_3_overflowed,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0],
comparison_blocks_1, gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0],
comparison_blocks_2, gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0],
comparison_blocks_3, gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], cmp_1,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], cmp_2,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], cmp_3,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], c0,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], c1,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], c2,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], c3,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], q1,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], q2,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], q3,
gpu_memory_allocated);
delete d1;
delete d2;
delete d3;
delete low1;
delete low2;
delete low3;
delete rem;
delete sub_result_1;
delete sub_result_2;
delete sub_result_3;
delete sub_1_overflowed;
delete sub_2_overflowed;
delete sub_3_overflowed;
delete comparison_blocks_1;
delete comparison_blocks_2;
delete comparison_blocks_3;
delete cmp_1;
delete cmp_2;
delete cmp_3;
delete c0;
delete c1;
delete c2;
delete c3;
delete q1;
delete q2;
delete q3;
for (int i = 0; i < max_indexes_to_erase; i++) {
cuda_drop_with_size_tracking_async(first_indexes_for_overflow_sub[i],
streams[0], gpu_indexes[0],
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(second_indexes_for_overflow_sub[i],
streams[0], gpu_indexes[0],
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(scalars_for_overflow_sub[i],
streams[0], gpu_indexes[0],
gpu_memory_allocated);
}
free(first_indexes_for_overflow_sub);
free(second_indexes_for_overflow_sub);
free(scalars_for_overflow_sub);
}
};
template <typename Torus> struct unsigned_int_div_rem_memory {
int_radix_params params;
uint32_t active_gpu_count;
@@ -4162,6 +4927,7 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
int_logical_scalar_shift_buffer<Torus> *shift_mem_2;
int_borrow_prop_memory<Torus> *overflow_sub_mem;
int_comparison_buffer<Torus> *comparison_buffer;
unsigned_int_div_rem_2_2_memory<Torus> *div_rem_2_2_mem;
// lookup tables
int_radix_lut<Torus> **masking_luts_1;
@@ -4209,7 +4975,6 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
uint32_t const *gpu_indexes, uint32_t gpu_count,
uint32_t num_blocks, bool allocate_gpu_memory,
uint64_t &size_tracker) {
// non boolean temporary arrays, with `num_blocks` blocks
remainder1 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
@@ -4349,7 +5114,6 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
luts[j]->broadcast_lut(streams, gpu_indexes);
}
// Give name to closures to improve readability
auto overflow_happened = [](uint64_t overflow_sum) {
return overflow_sum != 0;
};
@@ -4458,8 +5222,15 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
active_gpu_count = get_active_gpu_count(2 * num_blocks, gpu_count);
this->params = params;
if (params.message_modulus == 4 && params.carry_modulus == 4) {
div_rem_2_2_mem = new unsigned_int_div_rem_2_2_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_blocks,
allocate_gpu_memory, size_tracker);
return;
}
shift_mem_1 = new int_logical_scalar_shift_buffer<Torus>(
streams, gpu_indexes, gpu_count, SHIFT_OR_ROTATE_TYPE::LEFT_SHIFT,
params, 2 * num_blocks, allocate_gpu_memory, size_tracker);
@@ -4602,6 +5373,12 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
if (params.message_modulus == 4 && params.carry_modulus == 4) {
div_rem_2_2_mem->release(streams, gpu_indexes, gpu_count);
delete div_rem_2_2_mem;
return;
}
uint32_t num_bits_in_message = 31 - __builtin_clz(params.message_modulus);
// release and delete other operation memory objects
@@ -4609,6 +5386,7 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
shift_mem_2->release(streams, gpu_indexes, gpu_count);
overflow_sub_mem->release(streams, gpu_indexes, gpu_count);
comparison_buffer->release(streams, gpu_indexes, gpu_count);
delete shift_mem_1;
delete shift_mem_2;
delete overflow_sub_mem;
@@ -4750,89 +5528,6 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
}
};
template <typename Torus> struct int_bitop_buffer {
int_radix_params params;
int_radix_lut<Torus> *lut;
BITOP_TYPE op;
bool gpu_memory_allocated;
int_bitop_buffer(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, BITOP_TYPE op, int_radix_params params,
uint32_t num_radix_blocks, bool allocate_gpu_memory,
uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
this->op = op;
this->params = params;
switch (op) {
case BITAND:
case BITOR:
case BITXOR:
lut = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
num_radix_blocks, allocate_gpu_memory,
size_tracker);
{
auto lut_bivariate_f = [op](Torus lhs, Torus rhs) -> Torus {
if (op == BITOP_TYPE::BITAND) {
// AND
return lhs & rhs;
} else if (op == BITOP_TYPE::BITOR) {
// OR
return lhs | rhs;
} else {
// XOR
return lhs ^ rhs;
}
};
generate_device_accumulator_bivariate<Torus>(
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_bivariate_f, gpu_memory_allocated);
lut->broadcast_lut(streams, gpu_indexes);
}
break;
default:
// Scalar OP
lut = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
params.message_modulus, num_radix_blocks,
allocate_gpu_memory, size_tracker);
for (int i = 0; i < params.message_modulus; i++) {
auto rhs = i;
auto lut_univariate_scalar_f = [op, rhs](Torus x) -> Torus {
if (op == BITOP_TYPE::SCALAR_BITAND) {
// AND
return x & rhs;
} else if (op == BITOP_TYPE::SCALAR_BITOR) {
// OR
return x | rhs;
} else {
// XOR
return x ^ rhs;
}
};
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], lut->get_lut(0, i), lut->get_degree(i),
lut->get_max_degree(i), params.glwe_dimension,
params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_univariate_scalar_f,
gpu_memory_allocated);
lut->broadcast_lut(streams, gpu_indexes);
}
}
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
lut->release(streams, gpu_indexes, gpu_count);
delete lut;
}
};
template <typename Torus> struct int_scalar_mul_buffer {
int_radix_params params;
int_logical_scalar_shift_buffer<Torus> *logical_scalar_shift_buffer;
@@ -5204,45 +5899,6 @@ template <typename Torus> struct int_scalar_mul_high_buffer {
}
};
template <typename Torus> struct int_sub_and_propagate {
int_radix_params params;
bool allocate_gpu_memory;
CudaRadixCiphertextFFI *neg_rhs_array;
int_sc_prop_memory<Torus> *sc_prop_mem;
int_sub_and_propagate(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
const int_radix_params params,
uint32_t num_radix_blocks, uint32_t requested_flag_in,
bool allocate_gpu_memory, uint64_t &size_tracker) {
this->params = params;
this->allocate_gpu_memory = allocate_gpu_memory;
this->sc_prop_mem = new int_sc_prop_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
requested_flag_in, (uint32_t)0, allocate_gpu_memory, size_tracker);
this->neg_rhs_array = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], neg_rhs_array, num_radix_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
sc_prop_mem->release(streams, gpu_indexes, gpu_count);
delete sc_prop_mem;
release_radix_ciphertext_async(streams[0], gpu_indexes[0], neg_rhs_array,
allocate_gpu_memory);
delete neg_rhs_array;
}
};
template <typename Torus> struct int_extend_radix_with_sign_msb_buffer {
int_radix_params params;

View File

@@ -4,6 +4,7 @@
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer/abs.cuh"
#include "integer/cast.cuh"
#include "integer/comparison.cuh"
#include "integer/integer.cuh"
#include "integer/integer_utilities.h"
@@ -32,6 +33,356 @@ __host__ uint64_t scratch_cuda_integer_div_rem_kb(
return size_tracker;
}
template <typename Torus>
__host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *quotient,
CudaRadixCiphertextFFI *remainder, CudaRadixCiphertextFFI const *numerator,
CudaRadixCiphertextFFI const *divisor, void *const *bsks,
uint64_t *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
unsigned_int_div_rem_2_2_memory<uint64_t> *mem_ptr) {
// alias
auto radix_params = mem_ptr->params;
auto num_blocks = quotient->num_radix_blocks;
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], remainder,
numerator);
set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
quotient, 0, num_blocks);
quotient->num_radix_blocks = 0;
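// The quotient starts empty; one digit (radix block) is produced per loop
// iteration, MSB first, and inserted at block index 0.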
// Computes 2*d by extending and shifting
auto extend_2xd_f = [&](cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count) {
// d2 is allocated with num_blocks + 1 blocks, so we extend by one block.
host_extend_radix_with_trivial_zero_blocks_msb<Torus>(mem_ptr->d2, divisor,
streams, gpu_indexes);
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, mem_ptr->d2, 1, mem_ptr->shift_mem,
bsks, ksks, ms_noise_reduction_key, mem_ptr->d2->num_radix_blocks);
};
// Computes 3*d = 4*d - d using block shift and subtraction
auto extend_3xd_f = [&](cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count) {
// d1 is allocated with num_blocks + 1 blocks, so we extend by one block.
host_extend_radix_with_trivial_zero_blocks_msb<Torus>(mem_ptr->d1, divisor,
streams, gpu_indexes);
host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
mem_ptr->d3, mem_ptr->d1, 1,
mem_ptr->d1->num_radix_blocks);
set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
mem_ptr->d3, 0, 1);
host_sub_and_propagate_single_carry(
streams, gpu_indexes, gpu_count, mem_ptr->d3, mem_ptr->d1, nullptr,
nullptr, mem_ptr->sub_and_propagate_mem, bsks, ksks,
ms_noise_reduction_key, outputFlag::FLAG_NONE, 0);
// trim d1 by one MSB block, back to num_blocks blocks
mem_ptr->d1->num_radix_blocks -= 1;
};
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
extend_2xd_f(mem_ptr->sub_streams_1, gpu_indexes, gpu_count);
extend_3xd_f(mem_ptr->sub_streams_2, gpu_indexes, gpu_count);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
}
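// At this point d1 = d (num_blocks blocks), d2 = 2*d and d3 = 3*d
// (num_blocks + 1 blocks each). The main loop performs schoolbook division
// in base 4 (the message modulus): for each numerator block, MSB to LSB, it
// tries rem - d, rem - 2*d and rem - 3*d in parallel and keeps the largest
// multiple that does not underflow.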
for (int block_index = num_blocks - 1; block_index >= 0; block_index--) {
uint32_t slice_len = num_blocks - block_index;
mem_ptr->low1->num_radix_blocks = slice_len;
mem_ptr->low2->num_radix_blocks = slice_len;
mem_ptr->low3->num_radix_blocks = slice_len;
mem_ptr->rem->num_radix_blocks = slice_len;
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
mem_ptr->low1, 0, slice_len,
mem_ptr->d1, 0, slice_len);
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
mem_ptr->low2, 0, slice_len,
mem_ptr->d2, 0, slice_len);
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
mem_ptr->low3, 0, slice_len,
mem_ptr->d3, 0, slice_len);
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], mem_ptr->rem, 0, slice_len, remainder,
block_index, num_blocks);
uint32_t compute_overflow = 1;
uint32_t uses_input_borrow = 0;
auto first_indexes =
mem_ptr->first_indexes_for_overflow_sub[mem_ptr->rem->num_radix_blocks -
1];
auto second_indexes =
mem_ptr
->second_indexes_for_overflow_sub[mem_ptr->rem->num_radix_blocks -
1];
auto scalar_indexes =
mem_ptr->scalars_for_overflow_sub[mem_ptr->rem->num_radix_blocks - 1];
auto sub_result_f = [&](cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *sub_result,
CudaRadixCiphertextFFI *sub_overflowed,
int_borrow_prop_memory<Torus> *overflow_sub_mem,
CudaRadixCiphertextFFI *low) {
sub_result->num_radix_blocks = low->num_radix_blocks;
overflow_sub_mem->update_lut_indexes(streams, gpu_indexes, first_indexes,
second_indexes, scalar_indexes,
mem_ptr->rem->num_radix_blocks);
host_integer_overflowing_sub<uint64_t>(
streams, gpu_indexes, gpu_count, sub_result, mem_ptr->rem, low,
sub_overflowed, (const CudaRadixCiphertextFFI *)nullptr,
overflow_sub_mem, bsks, ksks, ms_noise_reduction_key,
compute_overflow, uses_input_borrow);
};
auto cmp_f = [&](cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count,
CudaRadixCiphertextFFI *out_boolean_block,
CudaRadixCiphertextFFI *comparison_blocks,
CudaRadixCiphertextFFI *d,
int_comparison_buffer<Torus> *comparison_buffer) {
CudaRadixCiphertextFFI *d_msb = new CudaRadixCiphertextFFI;
uint32_t slice_start = num_blocks - block_index;
uint32_t slice_end = d->num_radix_blocks;
as_radix_ciphertext_slice<Torus>(d_msb, d, slice_start, slice_end);
comparison_blocks->num_radix_blocks = d_msb->num_radix_blocks;
if (d_msb->num_radix_blocks == 0) {
cuda_memset_async((Torus *)out_boolean_block->ptr, 0,
sizeof(Torus) *
(out_boolean_block->lwe_dimension + 1),
streams[0], gpu_indexes[0]);
} else {
host_compare_blocks_with_zero<Torus>(
streams, gpu_indexes, gpu_count, comparison_blocks, d_msb,
comparison_buffer, bsks, ksks, ms_noise_reduction_key,
d_msb->num_radix_blocks, comparison_buffer->is_zero_lut);
are_all_comparisons_block_true(
streams, gpu_indexes, gpu_count, out_boolean_block,
comparison_blocks, comparison_buffer, bsks, ksks,
ms_noise_reduction_key, comparison_blocks->num_radix_blocks);
host_negation<Torus>(
streams[0], gpu_indexes[0], (Torus *)out_boolean_block->ptr,
(Torus *)out_boolean_block->ptr, radix_params.big_lwe_dimension, 1);
// we add the plaintext encoding of 1 directly; this block only runs for
// message_modulus = 4 and carry_modulus = 4 (4 plaintext bits plus one
// padding bit, hence the shift by 5).
const Torus encoded_scalar = 1ULL << (sizeof(Torus) * 8 - 5);
host_addition_plaintext_scalar<Torus>(
streams[0], gpu_indexes[0], (Torus *)out_boolean_block->ptr,
(Torus *)out_boolean_block->ptr, encoded_scalar,
radix_params.big_lwe_dimension, 1);
}
delete d_msb;
};
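// The three candidate subtractions and the three MSB-emptiness checks are
// independent of one another, so they are dispatched on six sub-streams and
// joined below.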
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
sub_result_f(mem_ptr->sub_streams_1, gpu_indexes, gpu_count,
mem_ptr->sub_result_1, mem_ptr->sub_1_overflowed,
mem_ptr->overflow_sub_mem_1, mem_ptr->low3);
sub_result_f(mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
mem_ptr->sub_result_2, mem_ptr->sub_2_overflowed,
mem_ptr->overflow_sub_mem_2, mem_ptr->low2);
sub_result_f(mem_ptr->sub_streams_3, gpu_indexes, gpu_count,
mem_ptr->sub_result_3, mem_ptr->sub_3_overflowed,
mem_ptr->overflow_sub_mem_3, mem_ptr->low1);
cmp_f(mem_ptr->sub_streams_4, gpu_indexes, gpu_count, mem_ptr->cmp_1,
mem_ptr->comparison_blocks_1, mem_ptr->d3,
mem_ptr->comparison_buffer_1);
cmp_f(mem_ptr->sub_streams_5, gpu_indexes, gpu_count, mem_ptr->cmp_2,
mem_ptr->comparison_blocks_2, mem_ptr->d2,
mem_ptr->comparison_buffer_2);
cmp_f(mem_ptr->sub_streams_6, gpu_indexes, gpu_count, mem_ptr->cmp_3,
mem_ptr->comparison_blocks_3, mem_ptr->d1,
mem_ptr->comparison_buffer_3);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_4[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_5[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_6[j], gpu_indexes[j]);
}
auto r1 = mem_ptr->sub_result_3;
auto r2 = mem_ptr->sub_result_2;
auto r3 = mem_ptr->sub_result_1;
auto o1 = mem_ptr->sub_3_overflowed;
auto o2 = mem_ptr->sub_2_overflowed;
auto o3 = mem_ptr->sub_1_overflowed;
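// After this renaming, rX / oX denote rem - X*d and its borrow flag
// (sub_result_f was called with low3 = 3*d first, hence the reversal).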
// bitor: o3 |= cmp_1, accounting for the high blocks of 3*d
host_integer_radix_bitop_kb(mem_ptr->sub_streams_1, gpu_indexes, gpu_count,
o3, o3, mem_ptr->cmp_1, mem_ptr->bitor_mem_1,
bsks, ksks, ms_noise_reduction_key);
// bitor: o2 |= cmp_2, accounting for the high blocks of 2*d
host_integer_radix_bitop_kb(mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
o2, o2, mem_ptr->cmp_2, mem_ptr->bitor_mem_2,
bsks, ksks, ms_noise_reduction_key);
// bitor: o1 |= cmp_3, accounting for the high blocks of d
host_integer_radix_bitop_kb(mem_ptr->sub_streams_3, gpu_indexes, gpu_count,
o1, o1, mem_ptr->cmp_3, mem_ptr->bitor_mem_3,
bsks, ksks, ms_noise_reduction_key);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
}
// The cx variables tell whether the corresponding subtraction result should
// be kept, and which value the quotient block should take:
//
// - c3 and c0 take values in {0, 1}
// - c2 and c1 take values in {0, 1, 2}, where 2 means true and 0 or 1 mean
//   false
// c3 = !o3
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
mem_ptr->c3, 0, 1, o3, 0, 1);
host_negation<Torus>(streams[0], gpu_indexes[0], (Torus *)mem_ptr->c3->ptr,
(Torus *)mem_ptr->c3->ptr,
radix_params.big_lwe_dimension, 1);
const Torus encoded_scalar = 1ULL << (sizeof(Torus) * 8 - 5);
host_addition_plaintext_scalar<Torus>(
streams[0], gpu_indexes[0], (Torus *)mem_ptr->c3->ptr,
(Torus *)mem_ptr->c3->ptr, encoded_scalar,
radix_params.big_lwe_dimension, 1);
// c2 = !o2 + o3
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
mem_ptr->c2, 0, 1, o2, 0, 1);
host_negation<Torus>(streams[0], gpu_indexes[0], (Torus *)mem_ptr->c2->ptr,
(Torus *)mem_ptr->c2->ptr,
radix_params.big_lwe_dimension, 1);
host_addition_plaintext_scalar<Torus>(
streams[0], gpu_indexes[0], (Torus *)mem_ptr->c2->ptr,
(Torus *)mem_ptr->c2->ptr, encoded_scalar,
radix_params.big_lwe_dimension, 1);
host_addition<Torus>(streams[0], gpu_indexes[0], mem_ptr->c2, mem_ptr->c2,
o3, 1, 4, 4);
// c1 = !o1 + o2
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
mem_ptr->c1, 0, 1, o1, 0, 1);
host_negation<Torus>(streams[0], gpu_indexes[0], (Torus *)mem_ptr->c1->ptr,
(Torus *)mem_ptr->c1->ptr,
radix_params.big_lwe_dimension, 1);
host_addition_plaintext_scalar<Torus>(
streams[0], gpu_indexes[0], (Torus *)mem_ptr->c1->ptr,
(Torus *)mem_ptr->c1->ptr, encoded_scalar,
radix_params.big_lwe_dimension, 1);
host_addition<Torus>(streams[0], gpu_indexes[0], mem_ptr->c1, mem_ptr->c1,
o2, 1, 4, 4);
// c0 = o1 (direct copy)
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
mem_ptr->c0, 0, 1, o1, 0, 1);
auto conditional_update = [&](cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
CudaRadixCiphertextFFI *cx,
CudaRadixCiphertextFFI *rx,
int_radix_lut<Torus> *lut, Torus factor) {
auto rx_list = to_lwe_ciphertext_list(rx);
host_cleartext_multiplication<Torus>(streams[0], gpu_indexes[0],
(Torus *)rx->ptr, &rx_list, factor);
host_add_the_same_block_to_all_blocks<Torus>(streams[0], gpu_indexes[0],
rx, rx, cx, 4, 4);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, rx, rx, bsks, ksks,
ms_noise_reduction_key, lut, rx->num_radix_blocks);
};
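// conditional_update packs each block of the candidate as
// factor * block + cx, then the zero_out LUT (defined in
// integer_utilities.h) keeps the block only when cx encodes "true": 1 for
// factor 2, 2 for factor 3.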
auto calculate_quotient_bits =
[&](cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *q,
CudaRadixCiphertextFFI *c, int_radix_lut<Torus> *lut) {
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, q, c, bsks, ksks,
ms_noise_reduction_key, lut, 1);
};
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
conditional_update(mem_ptr->sub_streams_1, gpu_indexes, gpu_count,
mem_ptr->c3, r3, mem_ptr->zero_out_if_not_1_lut_1, 2);
conditional_update(mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
mem_ptr->c2, r2, mem_ptr->zero_out_if_not_2_lut_1, 3);
conditional_update(mem_ptr->sub_streams_3, gpu_indexes, gpu_count,
mem_ptr->c1, r1, mem_ptr->zero_out_if_not_2_lut_2, 3);
conditional_update(mem_ptr->sub_streams_4, gpu_indexes, gpu_count,
mem_ptr->c0, mem_ptr->rem,
mem_ptr->zero_out_if_not_1_lut_2, 2);
calculate_quotient_bits(mem_ptr->sub_streams_5, gpu_indexes, 1, mem_ptr->q1,
mem_ptr->c1, mem_ptr->quotient_lut_1);
calculate_quotient_bits(mem_ptr->sub_streams_6, gpu_indexes, 1, mem_ptr->q2,
mem_ptr->c2, mem_ptr->quotient_lut_2);
calculate_quotient_bits(mem_ptr->sub_streams_7, gpu_indexes, 1, mem_ptr->q3,
mem_ptr->c3, mem_ptr->quotient_lut_3);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_4[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_5[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_6[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_7[j], gpu_indexes[j]);
}
host_addition<Torus>(streams[0], gpu_indexes[0], mem_ptr->rem, mem_ptr->rem,
r3, mem_ptr->rem->num_radix_blocks, 4, 4);
host_addition<Torus>(streams[0], gpu_indexes[0], mem_ptr->rem, mem_ptr->rem,
r2, mem_ptr->rem->num_radix_blocks, 4, 4);
host_addition<Torus>(streams[0], gpu_indexes[0], mem_ptr->rem, mem_ptr->rem,
r1, mem_ptr->rem->num_radix_blocks, 4, 4);
host_addition<Torus>(streams[0], gpu_indexes[0], mem_ptr->q1, mem_ptr->q1,
mem_ptr->q2, 1, 4, 4);
host_addition<Torus>(streams[0], gpu_indexes[0], mem_ptr->q1, mem_ptr->q1,
mem_ptr->q3, 1, 4, 4);
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem_ptr->sub_streams_1, gpu_indexes, gpu_count, mem_ptr->rem,
mem_ptr->rem, bsks, ksks, ms_noise_reduction_key,
mem_ptr->message_extract_lut_1, mem_ptr->rem->num_radix_blocks);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem_ptr->sub_streams_2, gpu_indexes, gpu_count, mem_ptr->q1,
mem_ptr->q1, bsks, ksks, ms_noise_reduction_key,
mem_ptr->message_extract_lut_2, 1);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
}
size_t tmp_rem_size = mem_ptr->rem->num_radix_blocks;
mem_ptr->rem->num_radix_blocks = remainder->num_radix_blocks;
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], remainder, block_index,
remainder->num_radix_blocks, mem_ptr->rem, 0, tmp_rem_size);
mem_ptr->rem->num_radix_blocks = tmp_rem_size;
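// Prepend the new quotient digit: digits arrive MSB first, so inserting at
// block 0 pushes the previously produced digits toward the MSB and leaves
// the quotient in little-endian block order.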
insert_block_in_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
mem_ptr->q1, quotient, 0);
}
}
template <typename Torus>
__host__ void host_unsigned_integer_div_rem_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -50,6 +401,14 @@ __host__ void host_unsigned_integer_div_rem_kb(
remainder->lwe_dimension != divisor->lwe_dimension ||
remainder->lwe_dimension != quotient->lwe_dimension)
PANIC("Cuda error: input and output lwe dimension must be equal")
if (mem_ptr->params.message_modulus == 4 &&
mem_ptr->params.carry_modulus == 4) {
host_unsigned_integer_div_rem_kb_block_by_block_2_2<Torus>(
streams, gpu_indexes, gpu_count, quotient, remainder, numerator,
divisor, bsks, ksks, ms_noise_reduction_key, mem_ptr->div_rem_2_2_mem);
return;
}
auto radix_params = mem_ptr->params;
auto num_blocks = quotient->num_radix_blocks;

View File

@@ -7,6 +7,12 @@
#include "utils/helper_profile.cuh"
#include "utils/kernel_dimensions.cuh"
inline CudaLweCiphertextListFFI
to_lwe_ciphertext_list(CudaRadixCiphertextFFI *radix) {
return {.ptr = radix->ptr,
.num_radix_blocks = radix->num_radix_blocks,
.lwe_dimension = radix->lwe_dimension};
}
template <typename Torus>
void create_zero_radix_ciphertext_async(cudaStream_t const stream,
uint32_t const gpu_index,

View File

@@ -105,134 +105,6 @@ const _: () = {
ms_input_variance
) - 32usize];
};
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct CudaLweCiphertextListFFI {
pub ptr: *mut ffi::c_void,
pub num_radix_blocks: u32,
pub lwe_dimension: u32,
}
#[allow(clippy::unnecessary_operation, clippy::identity_op)]
const _: () = {
["Size of CudaLweCiphertextListFFI"]
[::std::mem::size_of::<CudaLweCiphertextListFFI>() - 16usize];
["Alignment of CudaLweCiphertextListFFI"]
[::std::mem::align_of::<CudaLweCiphertextListFFI>() - 8usize];
["Offset of field: CudaLweCiphertextListFFI::ptr"]
[::std::mem::offset_of!(CudaLweCiphertextListFFI, ptr) - 0usize];
["Offset of field: CudaLweCiphertextListFFI::num_radix_blocks"]
[::std::mem::offset_of!(CudaLweCiphertextListFFI, num_radix_blocks) - 8usize];
["Offset of field: CudaLweCiphertextListFFI::lwe_dimension"]
[::std::mem::offset_of!(CudaLweCiphertextListFFI, lwe_dimension) - 12usize];
};
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct CudaPackedGlweCiphertextListFFI {
pub ptr: *mut ffi::c_void,
pub storage_log_modulus: u32,
pub lwe_per_glwe: u32,
pub total_lwe_bodies_count: u32,
pub glwe_dimension: u32,
pub polynomial_size: u32,
}
#[allow(clippy::unnecessary_operation, clippy::identity_op)]
const _: () = {
["Size of CudaPackedGlweCiphertextListFFI"]
[::std::mem::size_of::<CudaPackedGlweCiphertextListFFI>() - 32usize];
["Alignment of CudaPackedGlweCiphertextListFFI"]
[::std::mem::align_of::<CudaPackedGlweCiphertextListFFI>() - 8usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::ptr"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, ptr) - 0usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::storage_log_modulus"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, storage_log_modulus) - 8usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::lwe_per_glwe"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, lwe_per_glwe) - 12usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::total_lwe_bodies_count"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, total_lwe_bodies_count) - 16usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::glwe_dimension"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, glwe_dimension) - 20usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::polynomial_size"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, polynomial_size) - 24usize];
};
unsafe extern "C" {
pub fn scratch_cuda_integer_compress_radix_ciphertext_64(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
mem_ptr: *mut *mut i8,
compression_glwe_dimension: u32,
compression_polynomial_size: u32,
lwe_dimension: u32,
ks_level: u32,
ks_base_log: u32,
num_radix_blocks: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: PBS_TYPE,
lwe_per_glwe: u32,
allocate_gpu_memory: bool,
) -> u64;
}
unsafe extern "C" {
pub fn scratch_cuda_integer_decompress_radix_ciphertext_64(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
mem_ptr: *mut *mut i8,
encryption_glwe_dimension: u32,
encryption_polynomial_size: u32,
compression_glwe_dimension: u32,
compression_polynomial_size: u32,
lwe_dimension: u32,
pbs_level: u32,
pbs_base_log: u32,
num_blocks_to_decompress: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: PBS_TYPE,
allocate_gpu_memory: bool,
allocate_ms_array: bool,
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_compress_radix_ciphertext_64(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
glwe_array_out: *mut CudaPackedGlweCiphertextListFFI,
lwe_array_in: *const CudaLweCiphertextListFFI,
fp_ksk: *const *mut ffi::c_void,
mem_ptr: *mut i8,
);
}
unsafe extern "C" {
pub fn cuda_integer_decompress_radix_ciphertext_64(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
lwe_array_out: *mut CudaLweCiphertextListFFI,
glwe_in: *const CudaPackedGlweCiphertextListFFI,
indexes_array: *const u32,
bsks: *const *mut ffi::c_void,
mem_ptr: *mut i8,
);
}
unsafe extern "C" {
pub fn cleanup_cuda_integer_compress_radix_ciphertext_64(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
mem_ptr_void: *mut *mut i8,
);
}
unsafe extern "C" {
pub fn cleanup_cuda_integer_decompress_radix_ciphertext_64(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
mem_ptr_void: *mut *mut i8,
);
}
pub const SHIFT_OR_ROTATE_TYPE_LEFT_SHIFT: SHIFT_OR_ROTATE_TYPE = 0;
pub const SHIFT_OR_ROTATE_TYPE_RIGHT_SHIFT: SHIFT_OR_ROTATE_TYPE = 1;
pub const SHIFT_OR_ROTATE_TYPE_LEFT_ROTATE: SHIFT_OR_ROTATE_TYPE = 2;
@@ -367,6 +239,55 @@ const _: () = {
divisor_has_more_bits_than_numerator
) - 60usize];
};
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct CudaLweCiphertextListFFI {
pub ptr: *mut ffi::c_void,
pub num_radix_blocks: u32,
pub lwe_dimension: u32,
}
#[allow(clippy::unnecessary_operation, clippy::identity_op)]
const _: () = {
["Size of CudaLweCiphertextListFFI"]
[::std::mem::size_of::<CudaLweCiphertextListFFI>() - 16usize];
["Alignment of CudaLweCiphertextListFFI"]
[::std::mem::align_of::<CudaLweCiphertextListFFI>() - 8usize];
["Offset of field: CudaLweCiphertextListFFI::ptr"]
[::std::mem::offset_of!(CudaLweCiphertextListFFI, ptr) - 0usize];
["Offset of field: CudaLweCiphertextListFFI::num_radix_blocks"]
[::std::mem::offset_of!(CudaLweCiphertextListFFI, num_radix_blocks) - 8usize];
["Offset of field: CudaLweCiphertextListFFI::lwe_dimension"]
[::std::mem::offset_of!(CudaLweCiphertextListFFI, lwe_dimension) - 12usize];
};
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct CudaPackedGlweCiphertextListFFI {
pub ptr: *mut ffi::c_void,
pub storage_log_modulus: u32,
pub lwe_per_glwe: u32,
pub total_lwe_bodies_count: u32,
pub glwe_dimension: u32,
pub polynomial_size: u32,
}
#[allow(clippy::unnecessary_operation, clippy::identity_op)]
const _: () = {
["Size of CudaPackedGlweCiphertextListFFI"]
[::std::mem::size_of::<CudaPackedGlweCiphertextListFFI>() - 32usize];
["Alignment of CudaPackedGlweCiphertextListFFI"]
[::std::mem::align_of::<CudaPackedGlweCiphertextListFFI>() - 8usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::ptr"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, ptr) - 0usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::storage_log_modulus"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, storage_log_modulus) - 8usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::lwe_per_glwe"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, lwe_per_glwe) - 12usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::total_lwe_bodies_count"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, total_lwe_bodies_count) - 16usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::glwe_dimension"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, glwe_dimension) - 20usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::polynomial_size"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, polynomial_size) - 24usize];
};
unsafe extern "C" {
pub fn scratch_cuda_apply_univariate_lut_kb_64(
streams: *const *mut ffi::c_void,
@@ -1934,6 +1855,85 @@ unsafe extern "C" {
mem_ptr_void: *mut *mut i8,
);
}
unsafe extern "C" {
pub fn scratch_cuda_integer_compress_radix_ciphertext_64(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
mem_ptr: *mut *mut i8,
compression_glwe_dimension: u32,
compression_polynomial_size: u32,
lwe_dimension: u32,
ks_level: u32,
ks_base_log: u32,
num_radix_blocks: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: PBS_TYPE,
lwe_per_glwe: u32,
allocate_gpu_memory: bool,
) -> u64;
}
unsafe extern "C" {
pub fn scratch_cuda_integer_decompress_radix_ciphertext_64(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
mem_ptr: *mut *mut i8,
encryption_glwe_dimension: u32,
encryption_polynomial_size: u32,
compression_glwe_dimension: u32,
compression_polynomial_size: u32,
lwe_dimension: u32,
pbs_level: u32,
pbs_base_log: u32,
num_blocks_to_decompress: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: PBS_TYPE,
allocate_gpu_memory: bool,
allocate_ms_array: bool,
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_compress_radix_ciphertext_64(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
glwe_array_out: *mut CudaPackedGlweCiphertextListFFI,
lwe_array_in: *const CudaLweCiphertextListFFI,
fp_ksk: *const *mut ffi::c_void,
mem_ptr: *mut i8,
);
}
unsafe extern "C" {
pub fn cuda_integer_decompress_radix_ciphertext_64(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
lwe_array_out: *mut CudaLweCiphertextListFFI,
glwe_in: *const CudaPackedGlweCiphertextListFFI,
indexes_array: *const u32,
bsks: *const *mut ffi::c_void,
mem_ptr: *mut i8,
);
}
unsafe extern "C" {
pub fn cleanup_cuda_integer_compress_radix_ciphertext_64(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
mem_ptr_void: *mut *mut i8,
);
}
unsafe extern "C" {
pub fn cleanup_cuda_integer_decompress_radix_ciphertext_64(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
mem_ptr_void: *mut *mut i8,
);
}
pub const KS_TYPE_BIG_TO_SMALL: KS_TYPE = 0;
pub const KS_TYPE_SMALL_TO_BIG: KS_TYPE = 1;
pub type KS_TYPE = ffi::c_uint;