print

debug
2026-01-11 15:48:20 -05:00 · 2025-07-16 11:18:32 +02:00 · 2025-07-16 11:13:44 +02:00 · 2025-07-16 11:09:29 +02:00 · 2025-07-16 11:04:00 +02:00 · 2025-07-16 10:56:45 +02:00
6 changed files with 170 additions and 28 deletions
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
@@ -305,6 +305,7 @@ template <typename Torus> struct int_radix_lut {
    ///////////////
    active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+      printf("Active GPUs in int_radix_lut: %d, gpu count: %d\n", active_gpu_count, gpu_count);
    for (uint i = 0; i < active_gpu_count; i++) {
      cuda_set_device(i);
      int8_t *gpu_pbs_buffer;
@@ -447,6 +448,7 @@ template <typename Torus> struct int_radix_lut {
    // LUT is used as a trivial encryption and must be initialized outside
    // this constructor
    active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
+      printf("Active GPUs in int_radix_lut 1: %d, gpu count: %d\n", active_gpu_count, gpu_count);
    for (uint i = 0; i < active_gpu_count; i++) {
      auto lut = (Torus *)cuda_malloc_with_size_tracking_async(
          num_luts * lut_buffer_size, streams[i], gpu_indexes[i], size_tracker,
@@ -520,6 +522,7 @@ template <typename Torus> struct int_radix_lut {

    ///////////////
    active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
+      printf("Active GPUs in int_radix_lut 2: %d, gpu count: %d\n", active_gpu_count, gpu_count);
    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
    for (uint i = 0; i < active_gpu_count; i++) {
      cuda_set_device(i);
@@ -827,6 +830,7 @@ template <typename InputTorus> struct int_noise_squashing_lut {

    ///////////////
    active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
+      printf("Active GPUs in int_noise_squash_lut 1: %d, gpu count: %d\n", active_gpu_count, gpu_count);
    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
    for (uint i = 0; i < active_gpu_count; i++) {
      cuda_set_device(i);
@@ -2789,6 +2793,7 @@ template <typename Torus> struct int_borrow_prop_memory {
    }

    active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
+      printf("Active GPUs in int_borrow_prop: %d, gpu count: %d\n", active_gpu_count, gpu_count);
    sub_streams_1 =
        (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
    sub_streams_2 =
@@ -2880,6 +2885,7 @@ template <typename Torus> struct int_zero_out_if_buffer {
    gpu_memory_allocated = allocate_gpu_memory;
    this->params = params;
    active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
+      printf("Active GPUs in int_zero_out_if: %d, gpu count: %d\n", active_gpu_count, gpu_count);

    tmp = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
@@ -3285,6 +3291,7 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
      bool allocate_gpu_memory, uint64_t &size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
    active_gpu_count = get_active_gpu_count(1, gpu_count);
+      printf("Active GPUs in int_arithmetic_scalar_shift: %d, gpu count: %d\n", active_gpu_count, gpu_count);
    // In the arithmetic shift, a PBS has to be applied to the last rotated
    // block twice: once to shift it, once to compute the padding block to be
    // copied onto all blocks to the left of the last rotated block
@@ -3920,6 +3927,7 @@ template <typename Torus> struct int_comparison_buffer {
    this->is_signed = is_signed;

    active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
+      printf("Active GPUs in int_compar: %d, gpu count: %d\n", active_gpu_count, gpu_count);

    identity_lut_f = [](Torus x) -> Torus { return x; };

@@ -4425,6 +4433,7 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
                              uint64_t &size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
    active_gpu_count = get_active_gpu_count(2 * num_blocks, gpu_count);
+      printf("Active GPUs in int_div_rem: %d, gpu count: %d\n", active_gpu_count, gpu_count);

    this->params = params;
    shift_mem_1 = new int_logical_scalar_shift_buffer<Torus>(
--- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
@@ -40,6 +40,7 @@ __host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index,
                                    uint32_t num_radix_blocks) {

  cuda_set_device(gpu_index);
+  printf("GPU %d\n", gpu_index);
  int num_blocks = 0, num_threads = 0;
  int num_entries = (lwe_dimension + 1);
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
@@ -218,6 +219,8 @@ __host__ void is_at_least_one_comparisons_block_true(
  while (remaining_blocks > 0) {
    // Split in max_value chunks
    int num_chunks = (remaining_blocks + max_value - 1) / max_value;
+    cudaDeviceSynchronize();
+    printf("Is at least one comparison block true chunks %d\n", num_chunks);

    // Since all blocks encrypt either 0 or 1, we can sum max_value of them
    // as in the worst case we will be adding `max_value` ones
@@ -228,6 +231,10 @@ __host__ void is_at_least_one_comparisons_block_true(
    for (int i = 0; i < num_chunks; i++) {
      uint32_t chunk_length =
          std::min(max_value, begin_remaining_blocks - i * max_value);
+      cudaDeviceSynchronize();
+      printf("chunk length %d, accumulator blocks: %d, input blocks: %d\n", chunk_length,
+             buffer->tmp_block_accumulated->num_radix_blocks,
+             mem_ptr->tmp_lwe_array_out->num_radix_blocks);
      chunk_lengths[i] = chunk_length;
      accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], accumulator,
                                   input_blocks, big_lwe_dimension,
@@ -243,6 +250,8 @@ __host__ void is_at_least_one_comparisons_block_true(

    // Applies the LUT
    if (remaining_blocks == 1) {
+        cudaDeviceSynchronize();
+        printf("Last lut\n");
      // In the last iteration we copy the output to the final address
      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, lwe_array_out,
@@ -250,6 +259,8 @@ __host__ void is_at_least_one_comparisons_block_true(
          lut, 1);
      return;
    } else {
+        cudaDeviceSynchronize();
+        printf("lut with %d blocks\n", num_chunks);
      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
          buffer->tmp_block_accumulated, bsks, ksks, ms_noise_reduction_key,
@@ -296,6 +307,8 @@ __host__ void host_compare_blocks_with_zero(
  // Accumulator
  auto sum = lwe_array_out;

+  cudaDeviceSynchronize();
+  printf("Here in compare blocks with zero\n");
  if (num_radix_blocks == 1) {
    // Just copy
    copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], sum, 0,
@@ -305,10 +318,16 @@ __host__ void host_compare_blocks_with_zero(
    uint32_t remainder_blocks = num_radix_blocks;
    auto sum_i = (Torus *)sum->ptr;
    auto chunk = (Torus *)lwe_array_in->ptr;
+    int blocks_check = sum->num_radix_blocks;
+    cudaDeviceSynchronize();
+    printf("Here in compare blocks with zero sum %d input %d\n", sum->num_radix_blocks, lwe_array_in->num_radix_blocks);
    while (remainder_blocks > 1) {
+        cudaDeviceSynchronize();
+        printf("Here in compare blocks with zero remainder blocks %d\n", remainder_blocks);
      uint32_t chunk_size =
          std::min(remainder_blocks, num_elements_to_fill_carry);

+      printf("Chunk size: %d, sum_i blocks: %d, remainder blocks: %d\n", chunk_size, blocks_check, remainder_blocks);
      accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], sum_i, chunk,
                                   big_lwe_dimension, chunk_size);

@@ -318,8 +337,11 @@ __host__ void host_compare_blocks_with_zero(
      // Update operands
      chunk += (chunk_size - 1) * big_lwe_size;
      sum_i += big_lwe_size;
+      blocks_check -= 1;
    }
  }
+    cudaDeviceSynchronize();
+    printf("Here in compare blocks with zero num sum blocks: %d\n", num_sum_blocks);

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, sum, bsks, ksks,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
@@ -80,6 +80,9 @@ __host__ void host_unsigned_integer_div_rem_kb(
  set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
                                               quotient, 0, num_blocks);

+  cudaDeviceSynchronize();
+  printf("Here 0\n");
+
  for (int i = total_bits - 1; i >= 0; i--) {
    uint32_t pos_in_block = i % num_bits_in_message;
    uint32_t msb_bit_set = total_bits - 1 - i;
@@ -89,6 +92,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
    // and all blocks after it are also trivial zeros
    // This number is in range 1..=num_bocks -1
    uint32_t first_trivial_block = last_non_trivial_block + 1;
+    printf("num blocks: %d, first trivial block: %d\n", num_blocks, first_trivial_block);
    reset_radix_ciphertext_blocks(interesting_remainder1, first_trivial_block);
    reset_radix_ciphertext_blocks(interesting_remainder2, first_trivial_block);
    reset_radix_ciphertext_blocks(interesting_divisor, first_trivial_block);
@@ -243,18 +247,28 @@ __host__ void host_unsigned_integer_div_rem_kb(
    for (uint j = 0; j < gpu_count; j++) {
      cuda_synchronize_stream(streams[j], gpu_indexes[j]);
    }
+      cudaDeviceSynchronize();
+      printf("Here 1\n");
    // interesting_divisor
    trim_last_interesting_divisor_bits(mem_ptr->sub_streams_1, gpu_indexes,
                                       gpu_count);
+      cudaDeviceSynchronize();
+      printf("Here 2\n");
    // divisor_ms_blocks
    trim_first_divisor_ms_bits(mem_ptr->sub_streams_2, gpu_indexes, gpu_count);
    // interesting_remainder1
    // numerator_block_stack
+      cudaDeviceSynchronize();
+      printf("Here 3\n");
    left_shift_interesting_remainder1(mem_ptr->sub_streams_3, gpu_indexes,
                                      gpu_count);
+      cudaDeviceSynchronize();
+      printf("Here 4\n");
    // interesting_remainder2
    left_shift_interesting_remainder2(mem_ptr->sub_streams_4, gpu_indexes,
                                      gpu_count);
+      cudaDeviceSynchronize();
+      printf("Here 5\n");
    for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
      cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
      cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
@@ -318,6 +332,12 @@ __host__ void host_unsigned_integer_div_rem_kb(
          subtraction_overflowed, (const CudaRadixCiphertextFFI *)nullptr,
          mem_ptr->overflow_sub_mem, bsks, ksks, ms_noise_reduction_key,
          compute_borrow, uses_input_borrow);
+      for (uint i = 0; i < gpu_count; i++) {
+          cuda_set_device(gpu_indexes[i]);
+          cudaDeviceSynchronize();
+          printf("Synchronize gpu %d\n", i);
+          check_cuda_error(cudaGetLastError());
+      }
    };

    // fills:
@@ -326,6 +346,13 @@ __host__ void host_unsigned_integer_div_rem_kb(
                                          uint32_t const *gpu_indexes,
                                          uint32_t gpu_count) {
      auto trivial_blocks = divisor_ms_blocks;
+      printf("Trivial blocks: %d\n", trivial_blocks->num_radix_blocks);
+        for (uint i = 0; i < gpu_count; i++) {
+            cuda_set_device(gpu_indexes[i]);
+            cudaDeviceSynchronize();
+            printf("Synchronize gpu %d\n", i);
+            check_cuda_error(cudaGetLastError());
+        }
      if (trivial_blocks->num_radix_blocks == 0) {
        set_zero_radix_ciphertext_slice_async<Torus>(
            streams[0], gpu_indexes[0], at_least_one_upper_block_is_non_zero, 0,
@@ -341,6 +368,8 @@ __host__ void host_unsigned_integer_div_rem_kb(
            trivial_blocks->num_radix_blocks,
            mem_ptr->comparison_buffer->eq_buffer->is_non_zero_lut);

+        cudaDeviceSynchronize();
+          printf("Before is at least one comparisons block true %d\n", mem_ptr->tmp_1->num_radix_blocks);
        is_at_least_one_comparisons_block_true<Torus>(
            streams, gpu_indexes, gpu_count,
            at_least_one_upper_block_is_non_zero, mem_ptr->tmp_1,
@@ -370,12 +399,20 @@ __host__ void host_unsigned_integer_div_rem_kb(
    }
    // new_remainder
    // subtraction_overflowed
+      cudaDeviceSynchronize();
+      printf("Here 6 before overlfow sub\n");
    do_overflowing_sub(mem_ptr->sub_streams_1, gpu_indexes, gpu_count);
    // at_least_one_upper_block_is_non_zero
+      cudaDeviceSynchronize();
+      printf("Here 7\n");
    check_divisor_upper_blocks(mem_ptr->sub_streams_2, gpu_indexes, gpu_count);
    // cleaned_merged_interesting_remainder
+      cudaDeviceSynchronize();
+      printf("Here 8\n");
    create_clean_version_of_merged_remainder(mem_ptr->sub_streams_3,
                                             gpu_indexes, gpu_count);
+      cudaDeviceSynchronize();
+      printf("Here 9\n");
    for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
      cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
      cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
@@ -441,13 +478,21 @@ __host__ void host_unsigned_integer_div_rem_kb(
      cuda_synchronize_stream(streams[j], gpu_indexes[j]);
    }
    // cleaned_merged_interesting_remainder
+      cudaDeviceSynchronize();
+      printf("Here 10\n");
    conditionally_zero_out_merged_interesting_remainder(mem_ptr->sub_streams_1,
                                                        gpu_indexes, gpu_count);
+      cudaDeviceSynchronize();
+      printf("Here 11\n");
    // new_remainder
    conditionally_zero_out_merged_new_remainder(mem_ptr->sub_streams_2,
                                                gpu_indexes, gpu_count);
+      cudaDeviceSynchronize();
+      printf("Here 12\n");
    // quotient
    set_quotient_bit(mem_ptr->sub_streams_3, gpu_indexes, gpu_count);
+      cudaDeviceSynchronize();
+      printf("Here 13\n");
    for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
      cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
      cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
@@ -482,10 +527,14 @@ __host__ void host_unsigned_integer_div_rem_kb(
  for (uint j = 0; j < gpu_count; j++) {
    cuda_synchronize_stream(streams[j], gpu_indexes[j]);
  }
+    cudaDeviceSynchronize();
+    printf("Here 14\n");
  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      mem_ptr->sub_streams_1, gpu_indexes, gpu_count, remainder, remainder,
      bsks, ksks, ms_noise_reduction_key, mem_ptr->message_extract_lut_1,
      num_blocks);
+    cudaDeviceSynchronize();
+    printf("Here 15\n");
  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      mem_ptr->sub_streams_2, gpu_indexes, gpu_count, quotient, quotient, bsks,
      ksks, ms_noise_reduction_key, mem_ptr->message_extract_lut_2, num_blocks);
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -536,6 +536,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
  std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;

  auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
+  printf("Active GPUs in lut univ: %d, gpu count: %d\n", active_gpu_count, gpu_count);
  if (active_gpu_count == 1) {
    execute_keyswitch_async<Torus>(
        streams, gpu_indexes, 1, lwe_after_ks_vec[0],
@@ -2114,12 +2115,24 @@ void host_single_borrow_propagate(
        streams[0], gpu_indexes[0], lwe_array, lwe_array, input_borrow, 1,
        message_modulus, carry_modulus);
  }
+    for (uint i = 0; i < gpu_count; i++) {
+        cuda_set_device(gpu_indexes[i]);
+        cudaDeviceSynchronize();
+        printf("Synchronize gpu %d\n", i);
+        check_cuda_error(cudaGetLastError());
+    }
  // Step 1
  host_compute_shifted_blocks_and_borrow_states<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array,
      mem->shifted_blocks_borrow_state_mem, bsks, ksks, ms_noise_reduction_key,
      lut_stride, num_many_lut);

+    for (uint i = 0; i < gpu_count; i++) {
+        cuda_set_device(gpu_indexes[i]);
+        cudaDeviceSynchronize();
+        printf("Synchronize gpu %d\n", i);
+        check_cuda_error(cudaGetLastError());
+    }
  auto borrow_states = mem->shifted_blocks_borrow_state_mem->borrow_states;
  copy_radix_ciphertext_slice_async<Torus>(
      streams[0], gpu_indexes[0], mem->overflow_block, 0, 1, borrow_states,
@@ -2131,6 +2144,12 @@ void host_single_borrow_propagate(
      mem->prop_simu_group_carries_mem, bsks, ksks, ms_noise_reduction_key,
      num_radix_blocks, num_groups);

+    for (uint i = 0; i < gpu_count; i++) {
+        cuda_set_device(gpu_indexes[i]);
+        cudaDeviceSynchronize();
+        printf("Synchronize gpu %d\n", i);
+        check_cuda_error(cudaGetLastError());
+    }
  auto shifted_blocks =
      (Torus *)mem->shifted_blocks_borrow_state_mem->shifted_blocks->ptr;
  auto prepared_blocks = mem->prop_simu_group_carries_mem->prepared_blocks;
@@ -2140,10 +2159,22 @@ void host_single_borrow_propagate(
                          (Torus *)prepared_blocks->ptr, shifted_blocks,
                          simulators, big_lwe_dimension, num_radix_blocks);

+    for (uint i = 0; i < gpu_count; i++) {
+        cuda_set_device(gpu_indexes[i]);
+        cudaDeviceSynchronize();
+        printf("Synchronize gpu %d\n", i);
+        check_cuda_error(cudaGetLastError());
+    }
  host_integer_radix_add_scalar_one_inplace<Torus>(
      streams, gpu_indexes, gpu_count, prepared_blocks, message_modulus,
      carry_modulus);

+    for (uint i = 0; i < gpu_count; i++) {
+        cuda_set_device(gpu_indexes[i]);
+        cudaDeviceSynchronize();
+        printf("Synchronize gpu %d\n", i);
+        check_cuda_error(cudaGetLastError());
+    }
  if (compute_overflow == outputFlag::FLAG_OVERFLOW) {
    CudaRadixCiphertextFFI shifted_simulators;
    as_radix_ciphertext_slice<Torus>(
@@ -2152,6 +2183,12 @@ void host_single_borrow_propagate(
    host_addition<Torus>(streams[0], gpu_indexes[0], mem->overflow_block,
                         mem->overflow_block, &shifted_simulators, 1);
  }
+    for (uint i = 0; i < gpu_count; i++) {
+        cuda_set_device(gpu_indexes[i]);
+        cudaDeviceSynchronize();
+        printf("Synchronize gpu %d\n", i);
+        check_cuda_error(cudaGetLastError());
+    }
  CudaRadixCiphertextFFI resolved_borrows;
  as_radix_ciphertext_slice<Torus>(
      &resolved_borrows, mem->prop_simu_group_carries_mem->resolved_carries,
@@ -2165,49 +2202,60 @@ void host_single_borrow_propagate(
                         mem->overflow_block, &resolved_borrows, 1);
  }

-  cuda_event_record(mem->incoming_events[0], streams[0], gpu_indexes[0]);
-  for (int j = 0; j < mem->active_gpu_count; j++) {
-    cuda_stream_wait_event(mem->sub_streams_1[j], mem->incoming_events[0],
-                           gpu_indexes[j]);
-    cuda_stream_wait_event(mem->sub_streams_2[j], mem->incoming_events[0],
-                           gpu_indexes[j]);
-  }
-
+    for (uint i = 0; i < gpu_count; i++) {
+        cuda_set_device(gpu_indexes[i]);
+        cudaDeviceSynchronize();
+        printf("Synchronize gpu %d\n", i);
+        check_cuda_error(cudaGetLastError());
+    }
  if (compute_overflow == outputFlag::FLAG_OVERFLOW) {
    auto borrow_flag = mem->lut_borrow_flag;
    integer_radix_apply_univariate_lookup_table_kb<Torus>(
-        mem->sub_streams_1, gpu_indexes, gpu_count, overflow_block,
+        streams, gpu_indexes, gpu_count, overflow_block,
        mem->overflow_block, bsks, ksks, ms_noise_reduction_key, borrow_flag,
        1);
  }
-  for (int j = 0; j < mem->active_gpu_count; j++) {
-    cuda_event_record(mem->outgoing_events1[j], mem->sub_streams_1[j],
-                      gpu_indexes[j]);
-  }

+    for (uint i = 0; i < gpu_count; i++) {
+        cuda_set_device(gpu_indexes[i]);
+        cudaDeviceSynchronize();
+        printf("Synchronize gpu %d\n", i);
+        check_cuda_error(cudaGetLastError());
+    }
  // subtract borrow and cleanup prepared blocks
  auto resolved_carries = mem->prop_simu_group_carries_mem->resolved_carries;
  host_negation<Torus>(
-      mem->sub_streams_2[0], gpu_indexes[0], (Torus *)resolved_carries->ptr,
+      streams[0], gpu_indexes[0], (Torus *)resolved_carries->ptr,
      (Torus *)resolved_carries->ptr, big_lwe_dimension, num_groups);

+    for (uint i = 0; i < gpu_count; i++) {
+        cuda_set_device(gpu_indexes[i]);
+        cudaDeviceSynchronize();
+        printf("Synchronize gpu %d\n", i);
+        check_cuda_error(cudaGetLastError());
+    }
  host_radix_sum_in_groups<Torus>(
-      mem->sub_streams_2[0], gpu_indexes[0], prepared_blocks, prepared_blocks,
+      streams[0], gpu_indexes[0], prepared_blocks, prepared_blocks,
      resolved_carries, num_radix_blocks, mem->group_size);

+    for (uint i = 0; i < gpu_count; i++) {
+        cuda_set_device(gpu_indexes[i]);
+        cudaDeviceSynchronize();
+        printf("Synchronize gpu %d\n", i);
+        check_cuda_error(cudaGetLastError());
+    }
  auto message_extract = mem->lut_message_extract;
+  printf("lut blocks: %d, call with %d\n", message_extract->num_blocks, num_radix_blocks);
  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      mem->sub_streams_2, gpu_indexes, gpu_count, lwe_array, prepared_blocks,
+      streams, gpu_indexes, gpu_count, lwe_array, prepared_blocks,
      bsks, ksks, ms_noise_reduction_key, message_extract, num_radix_blocks);
+    for (uint i = 0; i < gpu_count; i++) {
+        cuda_set_device(gpu_indexes[i]);
+        cudaDeviceSynchronize();
+        printf("Synchronize gpu %d\n", i);
+        check_cuda_error(cudaGetLastError());
+    }

-  for (int j = 0; j < mem->active_gpu_count; j++) {
-    cuda_event_record(mem->outgoing_events2[j], mem->sub_streams_2[j],
-                      gpu_indexes[j]);
-    cuda_stream_wait_event(streams[0], mem->outgoing_events1[j],
-                           gpu_indexes[0]);
-    cuda_stream_wait_event(streams[0], mem->outgoing_events2[j],
-                           gpu_indexes[0]);
-  }
 }

 /// num_radix_blocks corresponds to the number of blocks on which to apply the
--- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
@@ -165,10 +165,22 @@ __host__ void host_integer_overflowing_sub(
      stream[0], gpu_indexes[0], output, input_left, input_right, num_blocks,
      radix_params.message_modulus, radix_params.carry_modulus);

+    for (uint i = 0; i < gpu_count; i++) {
+        cuda_set_device(gpu_indexes[i]);
+        cudaDeviceSynchronize();
+        printf("Synchronize gpu %d\n", i);
+        check_cuda_error(cudaGetLastError());
+    }
  host_single_borrow_propagate<Torus>(
      streams, gpu_indexes, gpu_count, output, overflow_block, input_borrow,
      (int_borrow_prop_memory<Torus> *)mem_ptr, bsks, (Torus **)(ksks),
      ms_noise_reduction_key, num_groups, compute_overflow, uses_input_borrow);
+    for (uint i = 0; i < gpu_count; i++) {
+        cuda_set_device(gpu_indexes[i]);
+        cudaDeviceSynchronize();
+        printf("Synchronize gpu %d\n", i);
+        check_cuda_error(cudaGetLastError());
+    }
  POP_RANGE()
 }

--- a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cu
@@ -39,10 +39,12 @@ int32_t cuda_setup_multi_gpu(int device_0_id) {
 }

 int get_active_gpu_count(int num_inputs, int gpu_count) {
-  int active_gpu_count = gpu_count;
-  if (gpu_count > num_inputs) {
-    active_gpu_count = num_inputs;
-  }
+  int threshold_number_of_inputs = 10;
+  int ceil_div_inputs = std::max(1, (num_inputs + threshold_number_of_inputs - 1) /
+                        threshold_number_of_inputs);
+  printf("ceil div inputs: %dn, gpu_count: %d\n", ceil_div_inputs, gpu_count);
+  int active_gpu_count = std::min(ceil_div_inputs, gpu_count);
+  printf("active gpus: %d\n", active_gpu_count);
  return active_gpu_count;
 }
Author	SHA1	Message	Date
Agnes Leroy	ba99b2feda	print	2025-07-16 11:18:32 +02:00
Agnes Leroy	b0dfd5609c	debug	2025-07-16 11:13:44 +02:00
Agnes Leroy	5083efcdc9	debug	2025-07-16 11:09:29 +02:00
Agnes Leroy	2d676fe0cb	debug	2025-07-16 11:04:00 +02:00
Agnes Leroy	ec18468ab4	debug	2025-07-16 10:56:45 +02:00
Agnes Leroy	1541281769	debug	2025-07-16 10:52:44 +02:00
Agnes Leroy	7e15535bda	print	2025-07-16 10:46:10 +02:00
Agnes Leroy	35ee34da1b	print	2025-07-15 16:22:43 +02:00
Agnes Leroy	589528ffd4	print	2025-07-15 16:12:18 +02:00
Agnes Leroy	dfcc37ec98	lower threshold for testing	2025-07-15 15:55:20 +02:00
Agnes Leroy	0e56b47903	print	2025-07-15 15:54:15 +02:00
Agnes Leroy	9fac4b81d1	print	2025-07-15 12:03:22 +02:00
Agnes Leroy	c26637e6fa	debug	2025-07-15 11:48:17 +02:00
Agnes Leroy	3bf603ae1e	chore(gpu): change multi gpu logic	2025-07-15 11:31:04 +02:00