Compare commits

...

1 Commit

Author        SHA1        Message                                      Date
Agnes Leroy   651ff8fd4e  chore(gpu): change active gpu count logic    2025-07-16 18:24:01 +02:00
5 changed files with 30 additions and 30 deletions

View File

@@ -6,6 +6,7 @@
 extern std::mutex m;
 extern bool p2p_enabled;
+extern const int THRESHOLD_MULTI_GPU;
 extern "C" {
 int32_t cuda_setup_multi_gpu(int device_0_id);

View File

@@ -308,8 +308,9 @@ template <typename Torus> struct int_radix_lut {
     for (uint i = 0; i < active_gpu_count; i++) {
       cuda_set_device(i);
       int8_t *gpu_pbs_buffer;
-      auto num_blocks_on_gpu =
-          get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count);
+      auto num_blocks_on_gpu = std::max(
+          THRESHOLD_MULTI_GPU,
+          get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count));
       uint64_t size = 0;
       execute_scratch_pbs<Torus>(
@@ -524,8 +525,9 @@ template <typename Torus> struct int_radix_lut {
     for (uint i = 0; i < active_gpu_count; i++) {
       cuda_set_device(i);
       int8_t *gpu_pbs_buffer;
-      auto num_blocks_on_gpu =
-          get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count);
+      auto num_blocks_on_gpu = std::max(
+          THRESHOLD_MULTI_GPU,
+          get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count));
       uint64_t size = 0;
       execute_scratch_pbs<Torus>(
@@ -830,8 +832,9 @@ template <typename InputTorus> struct int_noise_squashing_lut {
     cuda_synchronize_stream(streams[0], gpu_indexes[0]);
     for (uint i = 0; i < active_gpu_count; i++) {
       cuda_set_device(i);
-      auto num_radix_blocks_on_gpu =
-          get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count);
+      auto num_radix_blocks_on_gpu = std::max(
+          THRESHOLD_MULTI_GPU,
+          get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count));
       int8_t *gpu_pbs_buffer;
       uint64_t size = 0;
       execute_scratch_pbs_128(streams[i], gpu_indexes[i], &gpu_pbs_buffer,
@@ -4957,7 +4960,6 @@ template <typename Torus> struct int_div_rem_memory {
   // sub streams
   cudaStream_t *sub_streams_1;
   cudaStream_t *sub_streams_2;
-  cudaStream_t *sub_streams_3;
   // temporary device buffers
   CudaRadixCiphertextFFI *positive_numerator;
@@ -4973,7 +4975,7 @@ template <typename Torus> struct int_div_rem_memory {
                     bool allocate_gpu_memory, uint64_t &size_tracker) {
     gpu_memory_allocated = allocate_gpu_memory;
-    this->active_gpu_count = get_active_gpu_count(2 * num_blocks, gpu_count);
+    this->active_gpu_count = get_active_gpu_count(num_blocks, gpu_count);
     this->params = params;
     this->is_signed = is_signed;
@@ -5038,16 +5040,11 @@ template <typename Torus> struct int_div_rem_memory {
         params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
     // init sub streams
-    sub_streams_1 =
-        (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
-    sub_streams_2 =
-        (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
-    sub_streams_3 =
-        (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
-    for (uint j = 0; j < active_gpu_count; j++) {
+    sub_streams_1 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
+    sub_streams_2 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
+    for (uint j = 0; j < gpu_count; j++) {
       sub_streams_1[j] = cuda_create_stream(gpu_indexes[j]);
       sub_streams_2[j] = cuda_create_stream(gpu_indexes[j]);
-      sub_streams_3[j] = cuda_create_stream(gpu_indexes[j]);
     }
     // init lookup tables
@@ -5111,14 +5108,12 @@ template <typename Torus> struct int_div_rem_memory {
     delete compare_signed_bits_lut;
     // release sub streams
-    for (uint i = 0; i < active_gpu_count; i++) {
+    for (uint i = 0; i < gpu_count; i++) {
       cuda_destroy_stream(sub_streams_1[i], gpu_indexes[i]);
       cuda_destroy_stream(sub_streams_2[i], gpu_indexes[i]);
-      cuda_destroy_stream(sub_streams_3[i], gpu_indexes[i]);
     }
     free(sub_streams_1);
     free(sub_streams_2);
-    free(sub_streams_3);
     // delete temporary buffers
     delete positive_numerator;
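Note on the sizing changes in the three scratch loops above: each per-GPU PBS scratch buffer is now sized for at least THRESHOLD_MULTI_GPU radix blocks, even when the split across active GPUs would hand that GPU fewer. A minimal sketch of the sizing rule, assuming an even-split get_num_inputs_on_gpu helper (the real helper's distribution may differ):

#include <algorithm>

const int THRESHOLD_MULTI_GPU = 12;

// Assumed even split with the remainder going to the first GPUs; illustrative only.
int get_num_inputs_on_gpu(int total_inputs, int gpu_index, int gpu_count) {
  int base = total_inputs / gpu_count;
  int remainder = total_inputs % gpu_count;
  return base + (gpu_index < remainder ? 1 : 0);
}

// Sizing rule used in the scratch loops of this patch: never size a per-GPU
// buffer for fewer than THRESHOLD_MULTI_GPU blocks.
int num_blocks_on_gpu(int num_radix_blocks, int gpu_index, int active_gpu_count) {
  return std::max(THRESHOLD_MULTI_GPU,
                  get_num_inputs_on_gpu(num_radix_blocks, gpu_index,
                                        active_gpu_count));
}

For example, 30 radix blocks split over 3 active GPUs give an even share of 10, but every scratch buffer is still sized for 12 blocks.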

View File

@@ -530,11 +530,13 @@ __host__ void host_integer_div_rem_kb(
   }
   host_integer_abs_kb<Torus>(
-      int_mem_ptr->sub_streams_1, gpu_indexes, gpu_count, positive_numerator,
-      bsks, ksks, ms_noise_reduction_key, int_mem_ptr->abs_mem_1, true);
-  host_integer_abs_kb<Torus>(
-      int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count, positive_divisor,
-      bsks, ksks, ms_noise_reduction_key, int_mem_ptr->abs_mem_2, true);
+      int_mem_ptr->sub_streams_1, gpu_indexes, int_mem_ptr->active_gpu_count,
+      positive_numerator, bsks, ksks, ms_noise_reduction_key,
+      int_mem_ptr->abs_mem_1, true);
+  host_integer_abs_kb<Torus>(int_mem_ptr->sub_streams_2, gpu_indexes,
+                             int_mem_ptr->active_gpu_count, positive_divisor,
+                             bsks, ksks, ms_noise_reduction_key,
+                             int_mem_ptr->abs_mem_2, true);
   for (uint j = 0; j < int_mem_ptr->active_gpu_count; j++) {
     cuda_synchronize_stream(int_mem_ptr->sub_streams_1[j], gpu_indexes[j]);
     cuda_synchronize_stream(int_mem_ptr->sub_streams_2[j], gpu_indexes[j]);
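For context, the hunk above keeps the two absolute-value computations on separate sub-stream sets and then joins them per active GPU. A simplified sketch of that fork/join pattern, written against the plain CUDA runtime API rather than the repository's cuda_* wrappers, with the kernel work itself omitted:

#include <cuda_runtime.h>
#include <vector>

// Create one stream per active GPU for each of the two branches, fork the work,
// then join both branches on every active GPU before continuing.
void fork_join_two_branches(const std::vector<int> &gpu_indexes,
                            int active_gpu_count) {
  std::vector<cudaStream_t> sub_streams_1(active_gpu_count);
  std::vector<cudaStream_t> sub_streams_2(active_gpu_count);
  for (int j = 0; j < active_gpu_count; j++) {
    cudaSetDevice(gpu_indexes[j]);
    cudaStreamCreate(&sub_streams_1[j]);
    cudaStreamCreate(&sub_streams_2[j]);
  }
  // Fork: branch 1 (e.g. |numerator|) would be enqueued on sub_streams_1,
  //       branch 2 (e.g. |divisor|) on sub_streams_2, on each active GPU.
  // Join: wait for both branches on every active GPU.
  for (int j = 0; j < active_gpu_count; j++) {
    cudaSetDevice(gpu_indexes[j]);
    cudaStreamSynchronize(sub_streams_1[j]);
    cudaStreamSynchronize(sub_streams_2[j]);
    cudaStreamDestroy(sub_streams_1[j]);
    cudaStreamDestroy(sub_streams_2[j]);
  }
}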

View File

@@ -5,6 +5,7 @@
 std::mutex m;
 bool p2p_enabled = false;
+const int THRESHOLD_MULTI_GPU = 12;
 // Enable bidirectional p2p access between all available GPUs and device_0_id
 int32_t cuda_setup_multi_gpu(int device_0_id) {
@@ -39,10 +40,9 @@ int32_t cuda_setup_multi_gpu(int device_0_id) {
 }
 int get_active_gpu_count(int num_inputs, int gpu_count) {
-  int active_gpu_count = gpu_count;
-  if (gpu_count > num_inputs) {
-    active_gpu_count = num_inputs;
-  }
+  int ceil_div_inputs =
+      std::max(1, (num_inputs + THRESHOLD_MULTI_GPU - 1) / THRESHOLD_MULTI_GPU);
+  int active_gpu_count = std::min(ceil_div_inputs, gpu_count);
   return active_gpu_count;
 }
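The rewritten get_active_gpu_count above no longer clamps the GPU count to the number of inputs; it activates one GPU per chunk of THRESHOLD_MULTI_GPU inputs, capped by the number of available GPUs. A standalone sketch with illustrative values (the sample counts are not taken from the repository):

#include <algorithm>
#include <cstdio>

const int THRESHOLD_MULTI_GPU = 12;

int get_active_gpu_count(int num_inputs, int gpu_count) {
  // One GPU per chunk of THRESHOLD_MULTI_GPU inputs (ceiling division), at least one.
  int ceil_div_inputs =
      std::max(1, (num_inputs + THRESHOLD_MULTI_GPU - 1) / THRESHOLD_MULTI_GPU);
  return std::min(ceil_div_inputs, gpu_count);
}

int main() {
  // With 4 GPUs available: 8 inputs -> 1 GPU, 30 inputs -> 3 GPUs, 128 inputs -> 4 GPUs.
  printf("%d %d %d\n", get_active_gpu_count(8, 4), get_active_gpu_count(30, 4),
         get_active_gpu_count(128, 4));
  return 0;
}

Under the previous logic, 8 inputs on a 4-GPU machine would have been spread over all 4 GPUs; with the threshold of 12 they now stay on a single GPU.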

View File

@@ -51,7 +51,8 @@ void multi_gpu_alloc_lwe_async(cudaStream_t const *streams,
   dest.resize(gpu_count);
   for (uint i = 0; i < gpu_count; i++) {
     uint64_t size_tracker_on_gpu_i = 0;
-    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
+    auto inputs_on_gpu = std::max(
+        THRESHOLD_MULTI_GPU, get_num_inputs_on_gpu(num_inputs, i, gpu_count));
     Torus *d_array = (Torus *)cuda_malloc_with_size_tracking_async(
         inputs_on_gpu * lwe_size * sizeof(Torus), streams[i], gpu_indexes[i],
         size_tracker_on_gpu_i, allocate_gpu_memory);
@@ -80,7 +81,8 @@ void multi_gpu_alloc_lwe_many_lut_output_async(
   dest.resize(gpu_count);
   for (uint i = 0; i < gpu_count; i++) {
     uint64_t size_tracker = 0;
-    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
+    auto inputs_on_gpu = std::max(
+        THRESHOLD_MULTI_GPU, get_num_inputs_on_gpu(num_inputs, i, gpu_count));
     Torus *d_array = (Torus *)cuda_malloc_with_size_tracking_async(
         num_many_lut * inputs_on_gpu * lwe_size * sizeof(Torus), streams[i],
         gpu_indexes[i], size_tracker, allocate_gpu_memory);
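One consequence of the allocation floor above, shown with illustrative numbers (a 64-bit Torus and lwe_size = 2049, i.e. an LWE dimension of 2048 plus the body; neither value comes from this patch): a GPU whose even share would be 10 ciphertexts still gets a buffer sized for 12.

#include <algorithm>
#include <cstdint>
#include <cstdio>

const int THRESHOLD_MULTI_GPU = 12;

int main() {
  int even_share = 10; // hypothetical result of get_num_inputs_on_gpu
  int lwe_size = 2049; // illustrative LWE size
  int inputs_on_gpu = std::max(THRESHOLD_MULTI_GPU, even_share);
  unsigned long long bytes =
      (unsigned long long)inputs_on_gpu * lwe_size * sizeof(std::uint64_t);
  // Prints: per-GPU LWE buffer: 196704 bytes (sized for 12 inputs)
  printf("per-GPU LWE buffer: %llu bytes (sized for %d inputs)\n", bytes,
         inputs_on_gpu);
  return 0;
}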