From 5f253b1d86b12c190de694c9cd2a99de281c1577 Mon Sep 17 00:00:00 2001
From: bbarbakadze
Date: Wed, 24 Sep 2025 18:22:00 +0000
Subject: [PATCH] refactor div_2_2 memory

---
 .../cuda/include/integer/integer_utilities.h | 260 +++++++-----------
 1 file changed, 93 insertions(+), 167 deletions(-)

diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
index 6fd663bcd..9e9356abe 100644
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
@@ -4307,12 +4307,6 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
 
   // sub streams
   cudaStream_t *sub_streams_1;
-  cudaStream_t *sub_streams_2;
-  cudaStream_t *sub_streams_3;
-  cudaStream_t *sub_streams_4;
-  cudaStream_t *sub_streams_5;
-  cudaStream_t *sub_streams_6;
-  cudaStream_t *sub_streams_7;
 
   // temporary device buffers
   CudaRadixCiphertextFFI *d1; // num_blocks + 1
@@ -4347,9 +4341,6 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
   CudaRadixCiphertextFFI *cmp_2; // boolean block
   CudaRadixCiphertextFFI *cmp_3; // boolean block
   CudaRadixCiphertextFFI *c0;    // single block
-  // CudaRadixCiphertextFFI *c1; // single block
-  // CudaRadixCiphertextFFI *c2; // single block
-  // CudaRadixCiphertextFFI *c3; // single block
   CudaRadixCiphertextFFI *q1;    // single block
   CudaRadixCiphertextFFI *q2;    // single block
   CudaRadixCiphertextFFI *q3;    // single block
@@ -4379,7 +4370,6 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
                               uint32_t const *gpu_indexes, uint32_t num_blocks,
                               bool allocate_gpu_memory, uint64_t &size_tracker) {
-    // more than one block temporary arrays
     tmp_gpu_0 = new CudaRadixCiphertextFFI;
     create_zero_radix_ciphertext_async<Torus>(
         streams[0], gpu_indexes[0], tmp_gpu_0, num_blocks + 1,
         params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
@@ -4421,175 +4411,109 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
     tmp_gpu_1 = new CudaRadixCiphertextFFI;
     create_zero_radix_ciphertext_async<Torus>(
         streams[1], gpu_indexes[1], tmp_gpu_1, num_blocks + 1,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+        params.big_lwe_dimension, tmp_size_tracker, allocate_gpu_memory);
     d2 = new CudaRadixCiphertextFFI;
     create_zero_radix_ciphertext_async<Torus>(
         streams[1], gpu_indexes[1], d2, num_blocks + 1,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+        params.big_lwe_dimension, tmp_size_tracker, allocate_gpu_memory);
     low2 = new CudaRadixCiphertextFFI;
     create_zero_radix_ciphertext_async<Torus>(
         streams[1], gpu_indexes[1], low2, num_blocks, params.big_lwe_dimension,
-        size_tracker, allocate_gpu_memory);
+        tmp_size_tracker, allocate_gpu_memory);
     rem2 = new CudaRadixCiphertextFFI;
     create_zero_radix_ciphertext_async<Torus>(
         streams[1], gpu_indexes[1], rem2, num_blocks, params.big_lwe_dimension,
-        size_tracker, allocate_gpu_memory);
+        tmp_size_tracker, allocate_gpu_memory);
     divisor_gpu_1 = new CudaRadixCiphertextFFI;
     create_zero_radix_ciphertext_async<Torus>(
         streams[1], gpu_indexes[1], divisor_gpu_1, num_blocks,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+        params.big_lwe_dimension, tmp_size_tracker, allocate_gpu_memory);
     remainder_gpu_1 = new CudaRadixCiphertextFFI;
     create_zero_radix_ciphertext_async<Torus>(
         streams[1], gpu_indexes[1], remainder_gpu_1, num_blocks,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+        params.big_lwe_dimension, tmp_size_tracker, allocate_gpu_memory);
     sub_result_2 = new CudaRadixCiphertextFFI;
     create_zero_radix_ciphertext_async<Torus>(
         streams[1], gpu_indexes[1], sub_result_2, num_blocks,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+        params.big_lwe_dimension, tmp_size_tracker, allocate_gpu_memory);
     sub_2_overflowed = new CudaRadixCiphertextFFI;
     create_zero_radix_ciphertext_async<Torus>(
         streams[1], gpu_indexes[1], sub_2_overflowed, 1,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+        params.big_lwe_dimension, tmp_size_tracker, allocate_gpu_memory);
     comparison_blocks_2 = new CudaRadixCiphertextFFI;
     create_zero_radix_ciphertext_async<Torus>(
         streams[1], gpu_indexes[1], comparison_blocks_2, num_blocks,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+        params.big_lwe_dimension, tmp_size_tracker, allocate_gpu_memory);
     cmp_2 = new CudaRadixCiphertextFFI;
     create_zero_radix_ciphertext_async<Torus>(
         streams[1], gpu_indexes[1], cmp_2, 1, params.big_lwe_dimension,
-        size_tracker, allocate_gpu_memory);
+        tmp_size_tracker, allocate_gpu_memory);
     q2 = new CudaRadixCiphertextFFI;
     create_zero_radix_ciphertext_async<Torus>(
         streams[1], gpu_indexes[1], q2, 1, params.big_lwe_dimension,
-        size_tracker, allocate_gpu_memory);
+        tmp_size_tracker, allocate_gpu_memory);
 
     tmp_gpu_2 = new CudaRadixCiphertextFFI;
     create_zero_radix_ciphertext_async<Torus>(
         streams[2], gpu_indexes[2], tmp_gpu_2, num_blocks + 1,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+        params.big_lwe_dimension, tmp_size_tracker, allocate_gpu_memory);
     d1 = new CudaRadixCiphertextFFI;
     create_zero_radix_ciphertext_async<Torus>(
         streams[2], gpu_indexes[2], d1, num_blocks + 1,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+        params.big_lwe_dimension, tmp_size_tracker, allocate_gpu_memory);
     low1 = new CudaRadixCiphertextFFI;
     create_zero_radix_ciphertext_async<Torus>(
         streams[2], gpu_indexes[2], low1, num_blocks, params.big_lwe_dimension,
-        size_tracker, allocate_gpu_memory);
+        tmp_size_tracker, allocate_gpu_memory);
     rem1 = new CudaRadixCiphertextFFI;
     create_zero_radix_ciphertext_async<Torus>(
         streams[2], gpu_indexes[2], rem1, num_blocks, params.big_lwe_dimension,
-        size_tracker, allocate_gpu_memory);
+        tmp_size_tracker, allocate_gpu_memory);
     divisor_gpu_2 = new CudaRadixCiphertextFFI;
     create_zero_radix_ciphertext_async<Torus>(
         streams[2], gpu_indexes[2], divisor_gpu_2, num_blocks,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+        params.big_lwe_dimension, tmp_size_tracker, allocate_gpu_memory);
     remainder_gpu_2 = new CudaRadixCiphertextFFI;
     create_zero_radix_ciphertext_async<Torus>(
         streams[2], gpu_indexes[2], remainder_gpu_2, num_blocks,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+        params.big_lwe_dimension, tmp_size_tracker, allocate_gpu_memory);
     sub_result_3 = new CudaRadixCiphertextFFI;
     create_zero_radix_ciphertext_async<Torus>(
         streams[2], gpu_indexes[2], sub_result_3, num_blocks,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+        params.big_lwe_dimension, tmp_size_tracker, allocate_gpu_memory);
     sub_3_overflowed = new CudaRadixCiphertextFFI;
     create_zero_radix_ciphertext_async<Torus>(
         streams[2], gpu_indexes[2], sub_3_overflowed, 1,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+        params.big_lwe_dimension, tmp_size_tracker, allocate_gpu_memory);
     comparison_blocks_3 = new CudaRadixCiphertextFFI;
     create_zero_radix_ciphertext_async<Torus>(
         streams[2], gpu_indexes[2], comparison_blocks_3, num_blocks,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+        params.big_lwe_dimension, tmp_size_tracker, allocate_gpu_memory);
     cmp_3 = new CudaRadixCiphertextFFI;
     create_zero_radix_ciphertext_async<Torus>(
         streams[2], gpu_indexes[2], cmp_3, 1, params.big_lwe_dimension,
-        size_tracker, allocate_gpu_memory);
+        tmp_size_tracker, allocate_gpu_memory);
     q1 = new CudaRadixCiphertextFFI;
     create_zero_radix_ciphertext_async<Torus>(
         streams[2], gpu_indexes[2], q1, 1, params.big_lwe_dimension,
-        size_tracker, allocate_gpu_memory);
+        tmp_size_tracker, allocate_gpu_memory);
 
     tmp_gpu_3 = new CudaRadixCiphertextFFI;
     create_zero_radix_ciphertext_async<Torus>(
         streams[3], gpu_indexes[3], tmp_gpu_3, num_blocks + 1,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+        params.big_lwe_dimension, tmp_size_tracker, allocate_gpu_memory);
     rem0 = new CudaRadixCiphertextFFI;
     create_zero_radix_ciphertext_async<Torus>(
         streams[3], gpu_indexes[3], rem0, num_blocks, params.big_lwe_dimension,
-        size_tracker, allocate_gpu_memory);
+        tmp_size_tracker, allocate_gpu_memory);
     remainder_gpu_3 = new CudaRadixCiphertextFFI;
     create_zero_radix_ciphertext_async<Torus>(
         streams[3], gpu_indexes[3], remainder_gpu_3, num_blocks,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+        params.big_lwe_dimension, tmp_size_tracker, allocate_gpu_memory);
     c0 = new CudaRadixCiphertextFFI;
     create_zero_radix_ciphertext_async<Torus>(
         streams[3], gpu_indexes[3], c0, 1, params.big_lwe_dimension,
-        size_tracker, allocate_gpu_memory);
-
-    // comparison_blocks_1 = new CudaRadixCiphertextFFI;
-    // create_zero_radix_ciphertext_async<Torus>(
-    //     streams[0], gpu_indexes[0], comparison_blocks_1, num_blocks,
-    //     params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
-
-    // comparison_blocks_2 = new CudaRadixCiphertextFFI;
-    // create_zero_radix_ciphertext_async<Torus>(
-    //     streams[0], gpu_indexes[0], comparison_blocks_2, num_blocks,
-    //     params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
-
-    // comparison_blocks_3 = new CudaRadixCiphertextFFI;
-    // create_zero_radix_ciphertext_async<Torus>(
-    //     streams[0], gpu_indexes[0], comparison_blocks_3, num_blocks,
-    //     params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
-
-    // // boolean blocks or single block temporary arrays
-    // cmp_1 = new CudaRadixCiphertextFFI;
-    // create_zero_radix_ciphertext_async<Torus>(
-    //     streams[0], gpu_indexes[0], cmp_1, 1, params.big_lwe_dimension,
-    //     size_tracker, allocate_gpu_memory);
-
-    // cmp_2 = new CudaRadixCiphertextFFI;
-    // create_zero_radix_ciphertext_async<Torus>(
-    //     streams[0], gpu_indexes[0], cmp_2, 1, params.big_lwe_dimension,
-    //     size_tracker, allocate_gpu_memory);
-
-    // cmp_3 = new CudaRadixCiphertextFFI;
-    // create_zero_radix_ciphertext_async<Torus>(
-    //     streams[0], gpu_indexes[0], cmp_3, 1, params.big_lwe_dimension,
-    //     size_tracker, allocate_gpu_memory);
-
-    // c0 = new CudaRadixCiphertextFFI;
-    // create_zero_radix_ciphertext_async<Torus>(
-    //     streams[0], gpu_indexes[0], c0, 1, params.big_lwe_dimension,
-    //     size_tracker, allocate_gpu_memory);
-
-    // c1 = new CudaRadixCiphertextFFI;
-    // create_zero_radix_ciphertext_async<Torus>(
-    //     streams[0], gpu_indexes[0], c1, 1, params.big_lwe_dimension,
-    //     size_tracker, allocate_gpu_memory);
-
-    // c2 = new CudaRadixCiphertextFFI;
-    // create_zero_radix_ciphertext_async<Torus>(
-    //     streams[0], gpu_indexes[0], c2, 1, params.big_lwe_dimension,
-    //     size_tracker, allocate_gpu_memory);
-
-    // c3 = new CudaRadixCiphertextFFI;
-    // create_zero_radix_ciphertext_async<Torus>(
-    //     streams[0], gpu_indexes[0], c3, 1, params.big_lwe_dimension,
-    //     size_tracker, allocate_gpu_memory);
-
-    // q1 = new CudaRadixCiphertextFFI;
-    // create_zero_radix_ciphertext_async<Torus>(
-    //     streams[0], gpu_indexes[0], q1, 1, params.big_lwe_dimension,
-    //     size_tracker, allocate_gpu_memory);
-
-    // q2 = new CudaRadixCiphertextFFI;
-    // create_zero_radix_ciphertext_async<Torus>(
-    //     streams[0], gpu_indexes[0], q2, 1, params.big_lwe_dimension,
-    //     size_tracker, allocate_gpu_memory);
-
-    // q3 = new CudaRadixCiphertextFFI;
-    // create_zero_radix_ciphertext_async<Torus>(
-    //     streams[0], gpu_indexes[0], q3, 1, params.big_lwe_dimension,
-    //     size_tracker, allocate_gpu_memory);
+        tmp_size_tracker, allocate_gpu_memory);
   }
 
   // initialize lookup tables for div_rem_2_2 operation
@@ -4604,15 +4528,15 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
     zero_out_if_not_2_lut_1 =
         new int_radix_lut<Torus>(&streams[1], &gpu_indexes[1], 1, params, 1,
-                                 num_blocks, allocate_gpu_memory, size_tracker);
+                                 num_blocks, allocate_gpu_memory, tmp_size_tracker);
 
     zero_out_if_not_2_lut_2 =
         new int_radix_lut<Torus>(&streams[2], &gpu_indexes[2], 1, params, 1,
-                                 num_blocks, allocate_gpu_memory, size_tracker);
+                                 num_blocks, allocate_gpu_memory, tmp_size_tracker);
 
     zero_out_if_not_1_lut_2 =
         new int_radix_lut<Torus>(&streams[3], &gpu_indexes[3], 1, params, 1,
-                                 num_blocks, allocate_gpu_memory, size_tracker);
+                                 num_blocks, allocate_gpu_memory, tmp_size_tracker);
 
     auto zero_out_if_not_1_lut_f = [](Torus x) -> Torus {
       Torus block = x / 2;
@@ -4652,10 +4576,10 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
 
     quotient_lut_1 =
         new int_radix_lut<Torus>(&streams[2], &gpu_indexes[2], 1, params, 1, 1,
-                                 allocate_gpu_memory, size_tracker);
+                                 allocate_gpu_memory, tmp_size_tracker);
     quotient_lut_2 =
         new int_radix_lut<Torus>(&streams[1], &gpu_indexes[1], 1, params, 1, 1,
-                                 allocate_gpu_memory, size_tracker);
+                                 allocate_gpu_memory, tmp_size_tracker);
     quotient_lut_3 =
         new int_radix_lut<Torus>(&streams[0], &gpu_indexes[0], 1, params, 1, 1,
                                  allocate_gpu_memory, size_tracker);
@@ -4719,7 +4643,6 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
                              uint64_t &size_tracker) {
     gpu_memory_allocated = allocate_gpu_memory;
-    // printf("gpu_count: %d\n", gpu_count);
     if (gpu_count < 4) {
       PANIC("GPU count should be at least 4 when using div_rem_2_2");
     }
 
@@ -4732,7 +4655,7 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
     shift_mem = new int_logical_scalar_shift_buffer<Torus>(
         &streams[1], &gpu_indexes[1], 1, SHIFT_OR_ROTATE_TYPE::LEFT_SHIFT,
-        params, 2 * num_blocks, allocate_gpu_memory, size_tracker);
+        params, 2 * num_blocks, allocate_gpu_memory, tmp_size_tracker);
     uint32_t compute_overflow = 1;
     overflow_sub_mem_1 = new int_borrow_prop_memory<Torus>(
         &streams[0], &gpu_indexes[0], 1, params, num_blocks, compute_overflow,
         allocate_gpu_memory, size_tracker);
     overflow_sub_mem_2 = new int_borrow_prop_memory<Torus>(
         &streams[1], &gpu_indexes[1], 1, params, num_blocks, compute_overflow,
-        allocate_gpu_memory, size_tracker);
+        allocate_gpu_memory, tmp_size_tracker);
     overflow_sub_mem_3 = new int_borrow_prop_memory<Torus>(
         &streams[2], &gpu_indexes[2], 1, params, num_blocks, compute_overflow,
-        allocate_gpu_memory, size_tracker);
+        allocate_gpu_memory, tmp_size_tracker);
     uint32_t group_size = overflow_sub_mem_1->group_size;
     bool use_seq = overflow_sub_mem_1->prop_simu_group_carries_mem
                        ->use_sequential_algorithm_to_resolve_group_carries;
@@ -4773,19 +4696,19 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
         num_blocks, false, allocate_gpu_memory, size_tracker);
     comparison_buffer_2 = new int_comparison_buffer<Torus>(
         &streams[1], &gpu_indexes[1], 1, COMPARISON_TYPE::EQ, params,
-        num_blocks, false, allocate_gpu_memory, size_tracker);
+        num_blocks, false, allocate_gpu_memory, tmp_size_tracker);
     comparison_buffer_3 = new int_comparison_buffer<Torus>(
         &streams[2], &gpu_indexes[2], 1, COMPARISON_TYPE::EQ, params,
-        num_blocks, false, allocate_gpu_memory, size_tracker);
+        num_blocks, false, allocate_gpu_memory, tmp_size_tracker);
 
     bitor_mem_1 = new int_bitop_buffer<Torus>(
         &streams[0], &gpu_indexes[0], 1, BITOP_TYPE::BITOR, params, num_blocks,
         allocate_gpu_memory, size_tracker);
     bitor_mem_2 = new int_bitop_buffer<Torus>(
         &streams[1], &gpu_indexes[1], 1, BITOP_TYPE::BITOR, params, num_blocks,
-        allocate_gpu_memory, size_tracker);
+        allocate_gpu_memory, tmp_size_tracker);
     bitor_mem_3 = new int_bitop_buffer<Torus>(
         &streams[2], &gpu_indexes[2], 1, BITOP_TYPE::BITOR, params, num_blocks,
-        allocate_gpu_memory, size_tracker);
+        allocate_gpu_memory, tmp_size_tracker);
 
     init_lookup_tables(streams, gpu_indexes, gpu_count, num_blocks,
                        allocate_gpu_memory, size_tracker);
@@ -4793,25 +4716,8 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
                            allocate_gpu_memory, size_tracker);
 
     sub_streams_1 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
-    sub_streams_2 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
-    // sub_streams_3 =
-    //     (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
-    // sub_streams_4 =
-    //     (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
-    // sub_streams_5 =
-    //     (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
-    // sub_streams_6 =
-    //     (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
-    // sub_streams_7 =
-    //     (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
 
     for (uint j = 0; j < gpu_count; j++) {
       sub_streams_1[j] = cuda_create_stream(gpu_indexes[j]);
-      sub_streams_2[j] = cuda_create_stream(gpu_indexes[j]);
-      // sub_streams_3[j] = cuda_create_stream(gpu_indexes[j]);
-      // sub_streams_4[j] = cuda_create_stream(gpu_indexes[j]);
-      // sub_streams_5[j] = cuda_create_stream(gpu_indexes[j]);
-      // sub_streams_6[j] = cuda_create_stream(gpu_indexes[j]);
-      // sub_streams_7[j] = cuda_create_stream(gpu_indexes[j]);
     }
   }
@@ -4972,16 +4878,27 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
     bitor_mem_3->release(&streams[2], &gpu_indexes[2], 1);
 
     delete sub_and_propagate_mem;
+    sub_and_propagate_mem = nullptr;
     delete shift_mem;
+    shift_mem = nullptr;
    delete overflow_sub_mem_1;
+    overflow_sub_mem_1 = nullptr;
     delete overflow_sub_mem_2;
+    overflow_sub_mem_2 = nullptr;
     delete overflow_sub_mem_3;
+    overflow_sub_mem_3 = nullptr;
     delete comparison_buffer_1;
+    comparison_buffer_1 = nullptr;
     delete comparison_buffer_2;
+    comparison_buffer_2 = nullptr;
     delete comparison_buffer_3;
+    comparison_buffer_3 = nullptr;
     delete bitor_mem_1;
+    bitor_mem_1 = nullptr;
     delete bitor_mem_2;
+    bitor_mem_2 = nullptr;
     delete bitor_mem_3;
+    bitor_mem_3 = nullptr;
 
     // release and delete lut objects
     message_extract_lut_1->release(streams, gpu_indexes, gpu_count);
@@ -4995,14 +4912,23 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
     quotient_lut_3->release(&streams[0], &gpu_indexes[0], gpu_count);
 
     delete message_extract_lut_1;
+    message_extract_lut_1 = nullptr;
     delete message_extract_lut_2;
+    message_extract_lut_2 = nullptr;
     delete zero_out_if_not_1_lut_1;
+    zero_out_if_not_1_lut_1 = nullptr;
     delete zero_out_if_not_1_lut_2;
+    zero_out_if_not_1_lut_2 = nullptr;
     delete zero_out_if_not_2_lut_1;
+    zero_out_if_not_2_lut_1 = nullptr;
     delete zero_out_if_not_2_lut_2;
+    zero_out_if_not_2_lut_2 = nullptr;
     delete quotient_lut_1;
+    quotient_lut_1 = nullptr;
     delete quotient_lut_2;
+    quotient_lut_2 = nullptr;
     delete quotient_lut_3;
+    quotient_lut_3 = nullptr;
 
     // release and delete temporary buffers
     release_radix_ciphertext_async(streams[2], gpu_indexes[2], d1,
@@ -5076,76 +5002,76 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
     release_radix_ciphertext_async(streams[0], gpu_indexes[0], q3,
                                    gpu_memory_allocated);
 
-    // release_radix_ciphertext_async(streams[0], gpu_indexes[0],
-    //                                comparison_blocks_1,
-    //                                gpu_memory_allocated);
-    // release_radix_ciphertext_async(streams[0], gpu_indexes[0],
-    //                                comparison_blocks_2,
-    //                                gpu_memory_allocated);
-    // release_radix_ciphertext_async(streams[0], gpu_indexes[0],
-    //                                comparison_blocks_3,
-    //                                gpu_memory_allocated);
-    // release_radix_ciphertext_async(streams[0], gpu_indexes[0], cmp_1,
-    //                                gpu_memory_allocated);
-    // release_radix_ciphertext_async(streams[0], gpu_indexes[0], cmp_2,
-    //                                gpu_memory_allocated);
-    // release_radix_ciphertext_async(streams[0], gpu_indexes[0], cmp_3,
-    //                                gpu_memory_allocated);
-    // release_radix_ciphertext_async(streams[0], gpu_indexes[0], c0,
-    //                                gpu_memory_allocated);
-    // release_radix_ciphertext_async(streams[0], gpu_indexes[0], c1,
-    //                                gpu_memory_allocated);
-    // release_radix_ciphertext_async(streams[0], gpu_indexes[0], c2,
-    //                                gpu_memory_allocated);
-    // release_radix_ciphertext_async(streams[0], gpu_indexes[0], c3,
-    //                                gpu_memory_allocated);
-
     delete d1;
+    d1 = nullptr;
     delete d2;
+    d2 = nullptr;
     delete d3;
+    d3 = nullptr;
     delete low1;
+    low1 = nullptr;
     delete low2;
+    low2 = nullptr;
     delete low3;
+    low3 = nullptr;
     delete rem0;
+    rem0 = nullptr;
     delete rem1;
+    rem1 = nullptr;
     delete rem2;
+    rem2 = nullptr;
     delete rem3;
+    rem3 = nullptr;
     delete sub_result_1;
+    sub_result_1 = nullptr;
     delete sub_result_2;
+    sub_result_2 = nullptr;
     delete sub_result_3;
+    sub_result_3 = nullptr;
     delete sub_1_overflowed;
+    sub_1_overflowed = nullptr;
     delete sub_2_overflowed;
+    sub_2_overflowed = nullptr;
     delete sub_3_overflowed;
+    sub_3_overflowed = nullptr;
     delete tmp_gpu_0;
+    tmp_gpu_0 = nullptr;
     delete tmp_gpu_1;
+    tmp_gpu_1 = nullptr;
     delete tmp_gpu_2;
+    tmp_gpu_2 = nullptr;
     delete tmp_gpu_3;
+    tmp_gpu_3 = nullptr;
     delete divisor_gpu_1;
+    divisor_gpu_1 = nullptr;
     delete divisor_gpu_2;
+    divisor_gpu_2 = nullptr;
     delete remainder_gpu_1;
+    remainder_gpu_1 = nullptr;
     delete remainder_gpu_2;
+    remainder_gpu_2 = nullptr;
     delete remainder_gpu_3;
+    remainder_gpu_3 = nullptr;
     delete comparison_blocks_1;
+    comparison_blocks_1 = nullptr;
     delete comparison_blocks_2;
+    comparison_blocks_2 = nullptr;
     delete comparison_blocks_3;
+    comparison_blocks_3 = nullptr;
     delete cmp_1;
+    cmp_1 = nullptr;
     delete cmp_2;
+    cmp_2 = nullptr;
     delete cmp_3;
+    cmp_3 = nullptr;
     delete c0;
+    c0 = nullptr;
     delete q1;
+    q1 = nullptr;
     delete q2;
+    q2 = nullptr;
     delete q3;
-
-    // delete comparison_blocks_1;
-    // delete comparison_blocks_2;
-    // delete comparison_blocks_3;
-    // delete cmp_1;
-    // delete cmp_2;
-    // delete cmp_3;
-    // delete c0;
-    // delete c1;
-    // delete c2;
-    // delete c3;
+    q3 = nullptr;
 
     for (int i = 0; i < max_indexes_to_erase; i++) {
       cuda_drop_with_size_tracking_async(
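
Reviewer note: the two patterns this patch applies can be summarized outside the
diff with a small standalone C++ sketch. This is illustrative code only, not
code from integer_utilities.h; the helper names alloc_tracked and
release_and_null are hypothetical. It mirrors what the hunks above show:
(1) buffers and LUTs that live on the secondary GPUs (streams[1]..streams[3])
now report their allocation sizes into a separate tmp_size_tracker instead of
the caller-visible size_tracker, and (2) release() now pairs every delete with
a nullptr reset so the struct never keeps a dangling pointer.

#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for a device buffer descriptor.
struct Buffer {
  uint64_t bytes;
};

// Pattern 1: allocations on GPU 0 count toward the caller-visible
// size_tracker; allocations on other GPUs go into a temporary tracker.
Buffer *alloc_tracked(uint32_t gpu_index, uint64_t bytes,
                      uint64_t &size_tracker, uint64_t &tmp_size_tracker) {
  uint64_t &tracker = (gpu_index == 0) ? size_tracker : tmp_size_tracker;
  tracker += bytes;
  return new Buffer{bytes};
}

// Pattern 2: delete-then-null, mirroring the "delete x; x = nullptr;" pairs
// added to release(), so a repeated release is harmless and a stale pointer
// is easy to detect.
template <typename T> void release_and_null(T *&ptr) {
  delete ptr;
  ptr = nullptr;
}

int main() {
  uint64_t size_tracker = 0, tmp_size_tracker = 0;
  Buffer *on_gpu0 = alloc_tracked(0, 1024, size_tracker, tmp_size_tracker);
  Buffer *on_gpu1 = alloc_tracked(1, 2048, size_tracker, tmp_size_tracker);
  std::printf("size_tracker=%llu tmp_size_tracker=%llu\n",
              (unsigned long long)size_tracker,
              (unsigned long long)tmp_size_tracker);
  release_and_null(on_gpu0); // on_gpu0 == nullptr afterwards
  release_and_null(on_gpu1);
  return 0;
}

Whether tmp_size_tracker is later merged back into size_tracker or simply
discarded is not visible in these hunks; the sketch only reflects the routing
change itself.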