refactor div_2_2 memory

2026-01-08 22:28:01 -05:00 · 2025-09-24 18:22:00 +00:00
parent 657f449454
commit 5f253b1d86
1 changed files with 93 additions and 167 deletions
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
@@ -4307,12 +4307,6 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {

  // sub streams
  cudaStream_t *sub_streams_1;
-  cudaStream_t *sub_streams_2;
-  cudaStream_t *sub_streams_3;
-  cudaStream_t *sub_streams_4;
-  cudaStream_t *sub_streams_5;
-  cudaStream_t *sub_streams_6;
-  cudaStream_t *sub_streams_7;

  // temporary device buffers
  CudaRadixCiphertextFFI *d1;                  // num_blocks + 1
@@ -4347,9 +4341,6 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
  CudaRadixCiphertextFFI *cmp_2;               // boolean block
  CudaRadixCiphertextFFI *cmp_3;               // boolean block
  CudaRadixCiphertextFFI *c0;                  // single block
-  // CudaRadixCiphertextFFI *c1;                  // single block
-  // CudaRadixCiphertextFFI *c2;                  // single block
-  // CudaRadixCiphertextFFI *c3;                  // single block
  CudaRadixCiphertextFFI *q1; // single block
  CudaRadixCiphertextFFI *q2; // single block
  CudaRadixCiphertextFFI *q3; // single block
@@ -4379,7 +4370,6 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
                              uint32_t const *gpu_indexes, uint32_t num_blocks,
                              bool allocate_gpu_memory,
                              uint64_t &size_tracker) {
-
    // more than one block temporary arrays
    tmp_gpu_0 = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
@@ -4421,175 +4411,109 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
    tmp_gpu_1 = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams[1], gpu_indexes[1], tmp_gpu_1, num_blocks + 1,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+        params.big_lwe_dimension, tmp_size_tracker, allocate_gpu_memory);
    d2 = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams[1], gpu_indexes[1], d2, num_blocks + 1,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+        params.big_lwe_dimension, tmp_size_tracker, allocate_gpu_memory);
    low2 = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams[1], gpu_indexes[1], low2, num_blocks, params.big_lwe_dimension,
-        size_tracker, allocate_gpu_memory);
+        tmp_size_tracker, allocate_gpu_memory);
    rem2 = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams[1], gpu_indexes[1], rem2, num_blocks, params.big_lwe_dimension,
-        size_tracker, allocate_gpu_memory);
+        tmp_size_tracker, allocate_gpu_memory);
    divisor_gpu_1 = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams[1], gpu_indexes[1], divisor_gpu_1, num_blocks,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+        params.big_lwe_dimension, tmp_size_tracker, allocate_gpu_memory);
    remainder_gpu_1 = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams[1], gpu_indexes[1], remainder_gpu_1, num_blocks,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+        params.big_lwe_dimension, tmp_size_tracker, allocate_gpu_memory);
    sub_result_2 = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams[1], gpu_indexes[1], sub_result_2, num_blocks,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+        params.big_lwe_dimension, tmp_size_tracker, allocate_gpu_memory);
    sub_2_overflowed = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams[1], gpu_indexes[1], sub_2_overflowed, 1,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+        params.big_lwe_dimension, tmp_size_tracker, allocate_gpu_memory);
    comparison_blocks_2 = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams[1], gpu_indexes[1], comparison_blocks_2, num_blocks,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+        params.big_lwe_dimension, tmp_size_tracker, allocate_gpu_memory);
    cmp_2 = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams[1], gpu_indexes[1], cmp_2, 1, params.big_lwe_dimension,
-        size_tracker, allocate_gpu_memory);
+        tmp_size_tracker, allocate_gpu_memory);
    q2 = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams[1], gpu_indexes[1], q2, 1, params.big_lwe_dimension,
-        size_tracker, allocate_gpu_memory);
+        tmp_size_tracker, allocate_gpu_memory);

    tmp_gpu_2 = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams[2], gpu_indexes[2], tmp_gpu_2, num_blocks + 1,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+        params.big_lwe_dimension, tmp_size_tracker, allocate_gpu_memory);
    d1 = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams[2], gpu_indexes[2], d1, num_blocks + 1,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+        params.big_lwe_dimension, tmp_size_tracker, allocate_gpu_memory);
    low1 = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams[2], gpu_indexes[2], low1, num_blocks, params.big_lwe_dimension,
-        size_tracker, allocate_gpu_memory);
+        tmp_size_tracker, allocate_gpu_memory);
    rem1 = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams[2], gpu_indexes[2], rem1, num_blocks, params.big_lwe_dimension,
-        size_tracker, allocate_gpu_memory);
+        tmp_size_tracker, allocate_gpu_memory);
    divisor_gpu_2 = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams[2], gpu_indexes[2], divisor_gpu_2, num_blocks,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+        params.big_lwe_dimension, tmp_size_tracker, allocate_gpu_memory);
    remainder_gpu_2 = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams[2], gpu_indexes[2], remainder_gpu_2, num_blocks,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+        params.big_lwe_dimension, tmp_size_tracker, allocate_gpu_memory);
    sub_result_3 = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams[2], gpu_indexes[2], sub_result_3, num_blocks,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+        params.big_lwe_dimension, tmp_size_tracker, allocate_gpu_memory);
    sub_3_overflowed = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams[2], gpu_indexes[2], sub_3_overflowed, 1,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+        params.big_lwe_dimension, tmp_size_tracker, allocate_gpu_memory);
    comparison_blocks_3 = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams[2], gpu_indexes[2], comparison_blocks_3, num_blocks,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+        params.big_lwe_dimension, tmp_size_tracker, allocate_gpu_memory);
    cmp_3 = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams[2], gpu_indexes[2], cmp_3, 1, params.big_lwe_dimension,
-        size_tracker, allocate_gpu_memory);
+        tmp_size_tracker, allocate_gpu_memory);
    q1 = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams[2], gpu_indexes[2], q1, 1, params.big_lwe_dimension,
-        size_tracker, allocate_gpu_memory);
+        tmp_size_tracker, allocate_gpu_memory);

    tmp_gpu_3 = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams[3], gpu_indexes[3], tmp_gpu_3, num_blocks + 1,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+        params.big_lwe_dimension, tmp_size_tracker, allocate_gpu_memory);
    rem0 = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams[3], gpu_indexes[3], rem0, num_blocks, params.big_lwe_dimension,
-        size_tracker, allocate_gpu_memory);
+        tmp_size_tracker, allocate_gpu_memory);
    remainder_gpu_3 = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams[3], gpu_indexes[3], remainder_gpu_3, num_blocks,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+        params.big_lwe_dimension, tmp_size_tracker, allocate_gpu_memory);
    c0 = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams[3], gpu_indexes[3], c0, 1, params.big_lwe_dimension,
-        size_tracker, allocate_gpu_memory);
-
-    // comparison_blocks_1 = new CudaRadixCiphertextFFI;
-    // create_zero_radix_ciphertext_async<Torus>(
-    //     streams[0], gpu_indexes[0], comparison_blocks_1, num_blocks,
-    //     params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
-
-    // comparison_blocks_2 = new CudaRadixCiphertextFFI;
-    // create_zero_radix_ciphertext_async<Torus>(
-    //     streams[0], gpu_indexes[0], comparison_blocks_2, num_blocks,
-    //     params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
-
-    // comparison_blocks_3 = new CudaRadixCiphertextFFI;
-    // create_zero_radix_ciphertext_async<Torus>(
-    //     streams[0], gpu_indexes[0], comparison_blocks_3, num_blocks,
-    //     params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
-
-    // // boolean blocks or single block temporary arrays
-    // cmp_1 = new CudaRadixCiphertextFFI;
-    // create_zero_radix_ciphertext_async<Torus>(
-    //     streams[0], gpu_indexes[0], cmp_1, 1, params.big_lwe_dimension,
-    //     size_tracker, allocate_gpu_memory);
-
-    // cmp_2 = new CudaRadixCiphertextFFI;
-    // create_zero_radix_ciphertext_async<Torus>(
-    //     streams[0], gpu_indexes[0], cmp_2, 1, params.big_lwe_dimension,
-    //     size_tracker, allocate_gpu_memory);
-
-    // cmp_3 = new CudaRadixCiphertextFFI;
-    // create_zero_radix_ciphertext_async<Torus>(
-    //     streams[0], gpu_indexes[0], cmp_3, 1, params.big_lwe_dimension,
-    //     size_tracker, allocate_gpu_memory);
-
-    // c0 = new CudaRadixCiphertextFFI;
-    // create_zero_radix_ciphertext_async<Torus>(
-    //     streams[0], gpu_indexes[0], c0, 1, params.big_lwe_dimension,
-    //     size_tracker, allocate_gpu_memory);
-
-    // c1 = new CudaRadixCiphertextFFI;
-    // create_zero_radix_ciphertext_async<Torus>(
-    //     streams[0], gpu_indexes[0], c1, 1, params.big_lwe_dimension,
-    //     size_tracker, allocate_gpu_memory);
-
-    // c2 = new CudaRadixCiphertextFFI;
-    // create_zero_radix_ciphertext_async<Torus>(
-    //     streams[0], gpu_indexes[0], c2, 1, params.big_lwe_dimension,
-    //     size_tracker, allocate_gpu_memory);
-
-    // c3 = new CudaRadixCiphertextFFI;
-    // create_zero_radix_ciphertext_async<Torus>(
-    //     streams[0], gpu_indexes[0], c3, 1, params.big_lwe_dimension,
-    //     size_tracker, allocate_gpu_memory);
-
-    // q1 = new CudaRadixCiphertextFFI;
-    // create_zero_radix_ciphertext_async<Torus>(
-    //     streams[0], gpu_indexes[0], q1, 1, params.big_lwe_dimension,
-    //     size_tracker, allocate_gpu_memory);
-
-    // q2 = new CudaRadixCiphertextFFI;
-    // create_zero_radix_ciphertext_async<Torus>(
-    //     streams[0], gpu_indexes[0], q2, 1, params.big_lwe_dimension,
-    //     size_tracker, allocate_gpu_memory);
-
-    // q3 = new CudaRadixCiphertextFFI;
-    // create_zero_radix_ciphertext_async<Torus>(
-    //     streams[0], gpu_indexes[0], q3, 1, params.big_lwe_dimension,
-    //     size_tracker, allocate_gpu_memory);
+        tmp_size_tracker, allocate_gpu_memory);
  }

  // initialize lookup tables for div_rem_2_2 operation
@@ -4604,15 +4528,15 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {

    zero_out_if_not_2_lut_1 =
        new int_radix_lut<Torus>(&streams[1], &gpu_indexes[1], 1, params, 1,
-                                 num_blocks, allocate_gpu_memory, size_tracker);
+                                 num_blocks, allocate_gpu_memory, tmp_size_tracker);

    zero_out_if_not_2_lut_2 =
        new int_radix_lut<Torus>(&streams[2], &gpu_indexes[2], 1, params, 1,
-                                 num_blocks, allocate_gpu_memory, size_tracker);
+                                 num_blocks, allocate_gpu_memory, tmp_size_tracker);

    zero_out_if_not_1_lut_2 =
        new int_radix_lut<Torus>(&streams[3], &gpu_indexes[3], 1, params, 1,
-                                 num_blocks, allocate_gpu_memory, size_tracker);
+                                 num_blocks, allocate_gpu_memory, tmp_size_tracker);

    auto zero_out_if_not_1_lut_f = [](Torus x) -> Torus {
      Torus block = x / 2;
@@ -4652,10 +4576,10 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {

    quotient_lut_1 =
        new int_radix_lut<Torus>(&streams[2], &gpu_indexes[2], 1, params, 1, 1,
-                                 allocate_gpu_memory, size_tracker);
+                                 allocate_gpu_memory, tmp_size_tracker);
    quotient_lut_2 =
        new int_radix_lut<Torus>(&streams[1], &gpu_indexes[1], 1, params, 1, 1,
-                                 allocate_gpu_memory, size_tracker);
+                                 allocate_gpu_memory, tmp_size_tracker);
    quotient_lut_3 =
        new int_radix_lut<Torus>(&streams[0], &gpu_indexes[0], 1, params, 1, 1,
                                 allocate_gpu_memory, size_tracker);
@@ -4719,7 +4643,6 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
                                  uint64_t &size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;

-    // printf("gpu_count: %d\n", gpu_count);
    if (gpu_count < 4) {
      PANIC("GPU count should be greater than 4m when using div_rem_2_2");
    }
@@ -4732,7 +4655,7 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {

    shift_mem = new int_logical_scalar_shift_buffer<Torus>(
        &streams[1], &gpu_indexes[1], 1, SHIFT_OR_ROTATE_TYPE::LEFT_SHIFT,
-        params, 2 * num_blocks, allocate_gpu_memory, size_tracker);
+        params, 2 * num_blocks, allocate_gpu_memory, tmp_size_tracker);

    uint32_t compute_overflow = 1;
    overflow_sub_mem_1 = new int_borrow_prop_memory<Torus>(
@@ -4740,10 +4663,10 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
        allocate_gpu_memory, size_tracker);
    overflow_sub_mem_2 = new int_borrow_prop_memory<Torus>(
        &streams[1], &gpu_indexes[1], 1, params, num_blocks, compute_overflow,
-        allocate_gpu_memory, size_tracker);
+        allocate_gpu_memory, tmp_size_tracker);
    overflow_sub_mem_3 = new int_borrow_prop_memory<Torus>(
        &streams[2], &gpu_indexes[2], 1, params, num_blocks, compute_overflow,
-        allocate_gpu_memory, size_tracker);
+        allocate_gpu_memory, tmp_size_tracker);
    uint32_t group_size = overflow_sub_mem_1->group_size;
    bool use_seq = overflow_sub_mem_1->prop_simu_group_carries_mem
                       ->use_sequential_algorithm_to_resolve_group_carries;
@@ -4773,19 +4696,19 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
        num_blocks, false, allocate_gpu_memory, size_tracker);
    comparison_buffer_2 = new int_comparison_buffer<Torus>(
        &streams[1], &gpu_indexes[1], 1, COMPARISON_TYPE::EQ, params,
-        num_blocks, false, allocate_gpu_memory, size_tracker);
+        num_blocks, false, allocate_gpu_memory, tmp_size_tracker);
    comparison_buffer_3 = new int_comparison_buffer<Torus>(
        &streams[2], &gpu_indexes[2], 1, COMPARISON_TYPE::EQ, params,
-        num_blocks, false, allocate_gpu_memory, size_tracker);
+        num_blocks, false, allocate_gpu_memory, tmp_size_tracker);
    bitor_mem_1 = new int_bitop_buffer<Torus>(
        &streams[0], &gpu_indexes[0], 1, BITOP_TYPE::BITOR, params, num_blocks,
        allocate_gpu_memory, size_tracker);
    bitor_mem_2 = new int_bitop_buffer<Torus>(
        &streams[1], &gpu_indexes[1], 1, BITOP_TYPE::BITOR, params, num_blocks,
-        allocate_gpu_memory, size_tracker);
+        allocate_gpu_memory, tmp_size_tracker);
    bitor_mem_3 = new int_bitop_buffer<Torus>(
        &streams[2], &gpu_indexes[2], 1, BITOP_TYPE::BITOR, params, num_blocks,
-        allocate_gpu_memory, size_tracker);
+        allocate_gpu_memory, tmp_size_tracker);

    init_lookup_tables(streams, gpu_indexes, gpu_count, num_blocks,
                       allocate_gpu_memory, size_tracker);
@@ -4793,25 +4716,8 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
                           allocate_gpu_memory, size_tracker);

    sub_streams_1 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
-    sub_streams_2 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
-    // sub_streams_3 =
-    //     (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
-    // sub_streams_4 =
-    //     (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
-    // sub_streams_5 =
-    //     (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
-    // sub_streams_6 =
-    //     (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
-    // sub_streams_7 =
-    //     (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
    for (uint j = 0; j < gpu_count; j++) {
      sub_streams_1[j] = cuda_create_stream(gpu_indexes[j]);
-      sub_streams_2[j] = cuda_create_stream(gpu_indexes[j]);
-      // sub_streams_3[j] = cuda_create_stream(gpu_indexes[j]);
-      // sub_streams_4[j] = cuda_create_stream(gpu_indexes[j]);
-      // sub_streams_5[j] = cuda_create_stream(gpu_indexes[j]);
-      // sub_streams_6[j] = cuda_create_stream(gpu_indexes[j]);
-      // sub_streams_7[j] = cuda_create_stream(gpu_indexes[j]);
    }
  }

@@ -4972,16 +4878,27 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
    bitor_mem_3->release(&streams[2], &gpu_indexes[2], 1);

    delete sub_and_propagate_mem;
+    sub_and_propagate_mem = nullptr;
    delete shift_mem;
+    shift_mem = nullptr;
    delete overflow_sub_mem_1;
+    overflow_sub_mem_1 = nullptr;
    delete overflow_sub_mem_2;
+    overflow_sub_mem_2 = nullptr;
    delete overflow_sub_mem_3;
+    overflow_sub_mem_3 = nullptr;
    delete comparison_buffer_1;
+    comparison_buffer_1 = nullptr;
    delete comparison_buffer_2;
+    comparison_buffer_2 = nullptr;
    delete comparison_buffer_3;
+    comparison_buffer_3 = nullptr;
    delete bitor_mem_1;
+    bitor_mem_1 = nullptr;
    delete bitor_mem_2;
+    bitor_mem_2 = nullptr;
    delete bitor_mem_3;
+    bitor_mem_3 = nullptr;

    // release and delete lut objects
    message_extract_lut_1->release(streams, gpu_indexes, gpu_count);
@@ -4995,14 +4912,23 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
    quotient_lut_3->release(&streams[0], &gpu_indexes[0], gpu_count);

    delete message_extract_lut_1;
+    message_extract_lut_1 = nullptr;
    delete message_extract_lut_2;
+    message_extract_lut_2 = nullptr;
    delete zero_out_if_not_1_lut_1;
+    zero_out_if_not_1_lut_1 = nullptr;
    delete zero_out_if_not_1_lut_2;
+    zero_out_if_not_1_lut_2 = nullptr;
    delete zero_out_if_not_2_lut_1;
+    zero_out_if_not_2_lut_1 = nullptr;
    delete zero_out_if_not_2_lut_2;
+    zero_out_if_not_2_lut_2 = nullptr;
    delete quotient_lut_1;
+    quotient_lut_1 = nullptr;
    delete quotient_lut_2;
+    quotient_lut_2 = nullptr;
    delete quotient_lut_3;
+    quotient_lut_3 = nullptr;

    // release and delete temporary buffers
    release_radix_ciphertext_async(streams[2], gpu_indexes[2], d1,
@@ -5076,76 +5002,76 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
    release_radix_ciphertext_async(streams[0], gpu_indexes[0], q3,
                                   gpu_memory_allocated);

-    // release_radix_ciphertext_async(streams[0], gpu_indexes[0],
-    //                                comparison_blocks_1,
-    //                                gpu_memory_allocated);
-    // release_radix_ciphertext_async(streams[0], gpu_indexes[0],
-    //                                comparison_blocks_2,
-    //                                gpu_memory_allocated);
-    // release_radix_ciphertext_async(streams[0], gpu_indexes[0],
-    //                                comparison_blocks_3,
-    //                                gpu_memory_allocated);
-    // release_radix_ciphertext_async(streams[0], gpu_indexes[0], cmp_1,
-    //                                gpu_memory_allocated);
-    // release_radix_ciphertext_async(streams[0], gpu_indexes[0], cmp_2,
-    //                                gpu_memory_allocated);
-    // release_radix_ciphertext_async(streams[0], gpu_indexes[0], cmp_3,
-    //                                gpu_memory_allocated);
-    // release_radix_ciphertext_async(streams[0], gpu_indexes[0], c0,
-    //                                gpu_memory_allocated);
-    // release_radix_ciphertext_async(streams[0], gpu_indexes[0], c1,
-    //                                gpu_memory_allocated);
-    // release_radix_ciphertext_async(streams[0], gpu_indexes[0], c2,
-    //                                gpu_memory_allocated);
-    // release_radix_ciphertext_async(streams[0], gpu_indexes[0], c3,
-    //                                gpu_memory_allocated);
-
    delete d1;
+    d1 = nullptr;
    delete d2;
+    d2 = nullptr;
    delete d3;
+    d3 = nullptr;
    delete low1;
+    low1 = nullptr;
    delete low2;
+    low2 = nullptr;
    delete low3;
+    low3 = nullptr;
    delete rem0;
+    rem0 = nullptr;
    delete rem1;
+    rem1 = nullptr;
    delete rem2;
+    rem2 = nullptr;
    delete rem3;
+    rem3 = nullptr;
    delete sub_result_1;
+    sub_result_1 = nullptr;
    delete sub_result_2;
+    sub_result_2 = nullptr;
    delete sub_result_3;
+    sub_result_3 = nullptr;
    delete sub_1_overflowed;
+    sub_1_overflowed = nullptr;
    delete sub_2_overflowed;
+    sub_2_overflowed = nullptr;
    delete sub_3_overflowed;
+    sub_3_overflowed = nullptr;
    delete tmp_gpu_0;
+    tmp_gpu_0 = nullptr;
    delete tmp_gpu_1;
+    tmp_gpu_1 = nullptr;
    delete tmp_gpu_2;
+    tmp_gpu_2 = nullptr;
    delete tmp_gpu_3;
+    tmp_gpu_3 = nullptr;
    delete divisor_gpu_1;
+    divisor_gpu_1 = nullptr;
    delete divisor_gpu_2;
+    divisor_gpu_2 = nullptr;
    delete remainder_gpu_1;
+    remainder_gpu_1 = nullptr;
    delete remainder_gpu_2;
+    remainder_gpu_2 = nullptr;
    delete remainder_gpu_3;
+    remainder_gpu_3 = nullptr;
    delete comparison_blocks_1;
+    comparison_blocks_1 = nullptr;
    delete comparison_blocks_2;
+    comparison_blocks_2 = nullptr;
    delete comparison_blocks_3;
+    comparison_blocks_3 = nullptr;
    delete cmp_1;
+    cmp_1 = nullptr;
    delete cmp_2;
+    cmp_2 = nullptr;
    delete cmp_3;
+    cmp_3 = nullptr;
    delete c0;
+    c0 = nullptr;
    delete q1;
+    q1 = nullptr;
    delete q2;
+    q2 = nullptr;
    delete q3;
-
-    // delete comparison_blocks_1;
-    // delete comparison_blocks_2;
-    // delete comparison_blocks_3;
-    // delete cmp_1;
-    // delete cmp_2;
-    // delete cmp_3;
-    // delete c0;
-    // delete c1;
-    // delete c2;
-    // delete c3;
+    q3 = nullptr;

    for (int i = 0; i < max_indexes_to_erase; i++) {
      cuda_drop_with_size_tracking_async(