feat(gpu): improve communication scheme

Replace the blocking cross-GPU stream synchronizations in the multi-GPU LUT
path with CUDA events, pass the device-resident LWE index arrays
(lwe_indexes_in/out) to the scatter/gather helpers instead of their host
mirrors, and batch the per-LWE gpu-to-gpu copies into pack/unpack kernels
followed by a single bulk transfer per GPU. The pinned-host staging of the LWE
indexes in the partial-sum path is no longer needed and is removed.

Guillermo Oyarzun
2025-08-21 18:46:30 +02:00
committed by Agnès Leroy
parent e3686ed4ba
commit 88c3df8331
6 changed files with 195 additions and 122 deletions

View File

@@ -324,6 +324,10 @@ template <typename Torus> struct int_radix_lut {
uint32_t *gpu_indexes;
bool gpu_memory_allocated;
cudaEvent_t event_scatter_in;
cudaEvent_t event_scatter_out[8];
cudaEvent_t event_broadcast;
int_radix_lut(cudaStream_t const *streams, uint32_t const *input_gpu_indexes,
uint32_t gpu_count, int_radix_params params, uint32_t num_luts,
uint32_t num_radix_blocks, bool allocate_gpu_memory,
@@ -363,6 +367,15 @@ template <typename Torus> struct int_radix_lut {
buffer.push_back(gpu_pbs_buffer);
}
// if(active_gpu_count > 1){
event_scatter_in = cuda_create_event(gpu_indexes[0]);
event_broadcast = cuda_create_event(gpu_indexes[0]);
for (int i = 0; i < active_gpu_count; i++) {
event_scatter_out[i] = cuda_create_event(gpu_indexes[i]);
}
//}
// Allocate LUT
// LUT is used as a trivial encryption and must be initialized outside
// this constructor
@@ -579,7 +592,14 @@ template <typename Torus> struct int_radix_lut {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
buffer.push_back(gpu_pbs_buffer);
}
// if(active_gpu_count > 1){
event_scatter_in = cuda_create_event(gpu_indexes[0]);
event_broadcast = cuda_create_event(gpu_indexes[0]);
for (int i = 0; i < active_gpu_count; i++) {
event_scatter_out[i] = cuda_create_event(gpu_indexes[i]);
}
//}
// Allocate LUT
// LUT is used as a trivial encryption and must be initialized outside
// this constructor
@@ -718,9 +738,10 @@ template <typename Torus> struct int_radix_lut {
auto src_lut = lut_vec[0];
auto src_lut_indexes = lut_indexes_vec[0];
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
cuda_event_record(event_broadcast, streams[0], gpu_indexes[0]);
for (uint i = 0; i < active_gpu_count; i++) {
if (gpu_indexes[i] != gpu_indexes[0]) {
cuda_stream_wait_event(streams[i], event_broadcast, gpu_indexes[i]);
auto dst_lut = lut_vec[i];
auto dst_lut_indexes = lut_indexes_vec[i];
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
@@ -745,7 +766,13 @@ template <typename Torus> struct int_radix_lut {
cuda_drop_with_size_tracking_async(lut_indexes_vec[i], streams[i],
gpu_indexes[i], gpu_memory_allocated);
}
// if( active_gpu_count > 1) {
for (uint i = 0; i < active_gpu_count; i++) {
cuda_event_destroy(event_scatter_out[i], gpu_indexes[i]);
}
cuda_event_destroy(event_scatter_in, gpu_indexes[0]);
cuda_event_destroy(event_broadcast, gpu_indexes[0]);
//}
cuda_drop_with_size_tracking_async(lwe_indexes_in, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(lwe_indexes_out, streams[0],
@@ -1614,6 +1641,7 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
this->allocated_luts_message_carry = false;
this->reduce_degrees_for_single_carry_propagation =
reduce_degrees_for_single_carry_propagation;
setup_index_buffers(streams, gpu_indexes, size_tracker);
// because we setup the lut in the host function for sum_ciphertexts to save
// memory, the size_tracker is topped up here to give an upper bound on the
// memory used
@@ -1661,6 +1689,7 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
this->current_blocks = current_blocks;
this->small_lwe_vector = small_lwe_vector;
this->luts_message_carry = reused_lut;
setup_index_buffers(streams, gpu_indexes, size_tracker);
}
@@ -5910,6 +5939,9 @@ template <typename Torus> struct int_count_of_consecutive_bits_buffer {
delete ct_prepared;
ct_prepared = nullptr;
propagate_mem->release(streams, gpu_indexes, gpu_count);
delete propagate_mem;
prepare_mem->release(streams, gpu_indexes, gpu_count);
delete prepare_mem;
prepare_mem = nullptr;
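
The events added to int_radix_lut above are created once per LUT (event_scatter_in and event_broadcast on GPU 0, one event_scatter_out per active GPU) and destroyed in the release path. Below is a minimal sketch of that lifecycle using the raw CUDA runtime API, since cuda_create_event / cuda_event_destroy are project wrappers; the struct name, kMaxGpus and the cudaEventDisableTiming flag are assumptions of the sketch, not part of the commit.

#include <cuda_runtime.h>
#include <cstdint>

constexpr int kMaxGpus = 8; // mirrors the fixed event_scatter_out[8] array

struct cross_gpu_events {
  cudaEvent_t scatter_in;            // recorded on GPU 0's stream
  cudaEvent_t broadcast;             // recorded on GPU 0's stream
  cudaEvent_t scatter_out[kMaxGpus]; // one per active GPU

  void create(uint32_t const *gpu_indexes, uint32_t active_gpu_count) {
    // Events must be created on the device whose stream will record them.
    cudaSetDevice(gpu_indexes[0]);
    cudaEventCreateWithFlags(&scatter_in, cudaEventDisableTiming);
    cudaEventCreateWithFlags(&broadcast, cudaEventDisableTiming);
    for (uint32_t i = 0; i < active_gpu_count; i++) {
      cudaSetDevice(gpu_indexes[i]);
      cudaEventCreateWithFlags(&scatter_out[i], cudaEventDisableTiming);
    }
  }

  void destroy(uint32_t const *gpu_indexes, uint32_t active_gpu_count) {
    for (uint32_t i = 0; i < active_gpu_count; i++) {
      cudaSetDevice(gpu_indexes[i]);
      cudaEventDestroy(scatter_out[i]);
    }
    cudaSetDevice(gpu_indexes[0]);
    cudaEventDestroy(scatter_in);
    cudaEventDestroy(broadcast);
  }
};

Disabling timing keeps the events as pure synchronization primitives, which is all the fork/join scheme in this commit needs.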

View File

@@ -365,13 +365,13 @@ host_integer_decompress(cudaStream_t const *streams,
/// Make sure all data that should be on GPU 0 is indeed there
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
/// With multiple GPUs we push to the vectors on each GPU then when we
/// gather data to GPU 0 we can copy back to the original indexing
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
extracted_lwe, lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes,
lut->active_gpu_count, num_blocks_to_decompress,
compression_params.small_lwe_dimension + 1);
/// With multiple GPUs we push to the vectors on each GPU then when we
/// gather data to GPU 0 we can copy back to the original indexing
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec, extracted_lwe,
lut->lwe_indexes_in, lut->using_trivial_lwe_indexes,
lut->active_gpu_count, num_blocks_to_decompress,
compression_params.small_lwe_dimension + 1);
/// Apply PBS
execute_pbs_async<Torus, Torus>(
@@ -385,12 +385,11 @@ host_integer_decompress(cudaStream_t const *streams,
num_blocks_to_decompress, encryption_params.pbs_type, num_many_lut,
lut_stride);
/// Copy data back to GPU 0 and release vecs
multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, (Torus *)d_lwe_array_out->ptr,
lwe_after_pbs_vec, lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes, num_blocks_to_decompress,
encryption_params.big_lwe_dimension + 1);
/// Copy data back to GPU 0 and release vecs
multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, (Torus *)d_lwe_array_out->ptr,
lwe_after_pbs_vec, lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
num_blocks_to_decompress, encryption_params.big_lwe_dimension + 1);
/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {

View File

@@ -567,16 +567,22 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride);
} else {
/// Make sure all data that should be on GPU 0 is indeed there
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
// cuda_synchronize_stream(streams[0], gpu_indexes[0]);
cuda_event_record(lut->event_scatter_in, streams[0], gpu_indexes[0]);
for (int j = 1; j < active_gpu_count; j++) {
cuda_stream_wait_event(streams[j], lut->event_scatter_in, gpu_indexes[j]);
}
/// With multiple GPUs we push to the vectors on each GPU then when we
/// gather data to GPU 0 we can copy back to the original indexing
PUSH_RANGE("scatter")
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
(Torus *)lwe_array_in->ptr, lut->h_lwe_indexes_in,
(Torus *)lwe_array_in->ptr, lut->lwe_indexes_in,
lut->using_trivial_lwe_indexes, lut->active_gpu_count, num_radix_blocks,
big_lwe_dimension + 1);
POP_RANGE()
/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch_async<Torus>(streams, gpu_indexes, active_gpu_count,
lwe_after_ks_vec, lwe_trivial_indexes_vec,
@@ -595,16 +601,26 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
num_many_lut, lut_stride);
/// Copy data back to GPU 0 and release vecs
multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
(Torus *)lwe_array_out->ptr,
lwe_after_pbs_vec, lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes,
num_radix_blocks, big_lwe_dimension + 1);
/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
PUSH_RANGE("gather")
multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, (Torus *)lwe_array_out->ptr,
lwe_after_pbs_vec, lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
num_radix_blocks, big_lwe_dimension + 1);
POP_RANGE()
// other gpus record their events
for (int j = 1; j < active_gpu_count; j++) {
cuda_event_record(lut->event_scatter_out[j], streams[j], gpu_indexes[j]);
}
// GPU 0 waits for all
for (int j = 1; j < active_gpu_count; j++) {
cuda_stream_wait_event(streams[0], lut->event_scatter_out[j],
gpu_indexes[0]);
}
// /// Synchronize all GPUs
// for (uint i = 0; i < active_gpu_count; i++) {
// cuda_synchronize_stream(streams[i], gpu_indexes[i]);
// }
}
for (uint i = 0; i < num_radix_blocks; i++) {
auto degrees_index = lut->h_lut_indexes[i];
@@ -674,16 +690,20 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride);
} else {
/// Make sure all data that should be on GPU 0 is indeed there
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
// cuda_synchronize_stream(streams[0], gpu_indexes[0]);
cuda_event_record(lut->event_scatter_in, streams[0], gpu_indexes[0]);
for (int j = 1; j < active_gpu_count; j++) {
cuda_stream_wait_event(streams[j], lut->event_scatter_in, gpu_indexes[j]);
}
/// With multiple GPUs we push to the vectors on each GPU then when we
/// gather data to GPU 0 we can copy back to the original indexing
PUSH_RANGE("scatter")
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
(Torus *)lwe_array_in->ptr, lut->h_lwe_indexes_in,
(Torus *)lwe_array_in->ptr, lut->lwe_indexes_in,
lut->using_trivial_lwe_indexes, lut->active_gpu_count, num_radix_blocks,
big_lwe_dimension + 1);
POP_RANGE()
/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch_async<Torus>(streams, gpu_indexes, active_gpu_count,
lwe_after_ks_vec, lwe_trivial_indexes_vec,
@@ -702,16 +722,27 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
num_many_lut, lut_stride);
/// Copy data back to GPU 0 and release vecs
PUSH_RANGE("gather")
multi_gpu_gather_many_lut_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, (Torus *)lwe_array_out->ptr,
lwe_after_pbs_vec, lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes, num_radix_blocks, big_lwe_dimension + 1,
num_many_lut);
POP_RANGE()
/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
// other gpus record their events
for (int j = 1; j < active_gpu_count; j++) {
cuda_event_record(lut->event_scatter_out[j], streams[j], gpu_indexes[j]);
}
// GPU 0 waits for all
for (int j = 1; j < active_gpu_count; j++) {
cuda_stream_wait_event(streams[0], lut->event_scatter_out[j],
gpu_indexes[0]);
}
// /// Synchronize all GPUs
// for (uint i = 0; i < active_gpu_count; i++) {
// cuda_synchronize_stream(streams[i], gpu_indexes[i]);
// }
}
for (uint i = 0; i < lwe_array_out->num_radix_blocks; i++) {
auto degrees_index = lut->h_lut_indexes[i % lut->num_blocks];
@@ -795,13 +826,18 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride);
} else {
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
// cuda_synchronize_stream(streams[0], gpu_indexes[0]);
cuda_event_record(lut->event_scatter_in, streams[0], gpu_indexes[0]);
for (int j = 1; j < active_gpu_count; j++) {
cuda_stream_wait_event(streams[j], lut->event_scatter_in, gpu_indexes[j]);
}
PUSH_RANGE("scatter")
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
(Torus *)lwe_array_pbs_in->ptr, lut->h_lwe_indexes_in,
(Torus *)lwe_array_pbs_in->ptr, lut->lwe_indexes_in,
lut->using_trivial_lwe_indexes, lut->active_gpu_count, num_radix_blocks,
big_lwe_dimension + 1);
POP_RANGE()
/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch_async<Torus>(streams, gpu_indexes, active_gpu_count,
lwe_after_ks_vec, lwe_trivial_indexes_vec,
@@ -820,16 +856,26 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
num_many_lut, lut_stride);
/// Copy data back to GPU 0 and release vecs
multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
(Torus *)(lwe_array_out->ptr),
lwe_after_pbs_vec, lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes,
num_radix_blocks, big_lwe_dimension + 1);
/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
PUSH_RANGE("gather")
multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, (Torus *)(lwe_array_out->ptr),
lwe_after_pbs_vec, lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
num_radix_blocks, big_lwe_dimension + 1);
POP_RANGE()
// other gpus record their events
for (int j = 1; j < active_gpu_count; j++) {
cuda_event_record(lut->event_scatter_out[j], streams[j], gpu_indexes[j]);
}
// GPU 0 waits for all
for (int j = 1; j < active_gpu_count; j++) {
cuda_stream_wait_event(streams[0], lut->event_scatter_out[j],
gpu_indexes[0]);
}
// /// Synchronize all GPUs
// for (uint i = 0; i < active_gpu_count; i++) {
// cuda_synchronize_stream(streams[i], gpu_indexes[i]);
// }
}
for (uint i = 0; i < num_radix_blocks; i++) {
auto degrees_index = lut->h_lut_indexes[i];
@@ -1000,7 +1046,6 @@ void generate_device_accumulator_no_encoding(
cuda_memcpy_with_size_tracking_async_to_gpu(
acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
stream, gpu_index, gpu_memory_allocated);
cuda_synchronize_stream(stream, gpu_index);
free(h_lut);
}
@@ -1104,8 +1149,8 @@ void generate_device_accumulator_bivariate_with_factor(
h_lut, glwe_dimension, polynomial_size, message_modulus, carry_modulus, f,
factor);
cuda_synchronize_stream(stream, gpu_index);
// copy host lut and lut_indexes_vec to device
// cuda_synchronize_stream(stream, gpu_index);
// copy host lut and lut_indexes_vec to device
cuda_memcpy_with_size_tracking_async_to_gpu(
acc_bivariate, h_lut,
(glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream, gpu_index,
@@ -1137,7 +1182,6 @@ void generate_device_accumulator_with_encoding(
cuda_memcpy_with_size_tracking_async_to_gpu(
acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
stream, gpu_index, gpu_memory_allocated);
cuda_synchronize_stream(stream, gpu_index);
free(h_lut);
}
@@ -2363,7 +2407,7 @@ __host__ void integer_radix_apply_noise_squashing_kb(
/// gather data to GPU 0 we can copy back to the original indexing
multi_gpu_scatter_lwe_async<InputTorus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
(InputTorus *)lwe_array_pbs_in->ptr, lut->h_lwe_indexes_in,
(InputTorus *)lwe_array_pbs_in->ptr, lut->lwe_indexes_in,
lut->using_trivial_lwe_indexes, lut->active_gpu_count,
lwe_array_out->num_radix_blocks, lut->input_big_lwe_dimension + 1);
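
The scheme used in the apply_*_lookup_table paths above is a fork/join between GPU 0's stream and the worker streams: GPU 0 records event_scatter_in and every other stream waits on it before the scatter, then after the gather each worker records event_scatter_out[j] and GPU 0's stream waits on all of them, so no host-side cuda_synchronize_stream is needed. A hedged sketch of just that ordering with the raw runtime API (fork_join and its parameters are illustrative names):

#include <cuda_runtime.h>
#include <cstdint>

// in_evt must have been created on GPU 0, out_evts[j] on the GPU that owns
// streams[j] (as done in the int_radix_lut constructor).
void fork_join(cudaStream_t const *streams, uint32_t active_gpu_count,
               cudaEvent_t in_evt, cudaEvent_t const *out_evts) {
  // Fork: make everything already enqueued on streams[0] visible to the
  // worker streams without blocking the host.
  cudaEventRecord(in_evt, streams[0]);
  for (uint32_t j = 1; j < active_gpu_count; j++)
    cudaStreamWaitEvent(streams[j], in_evt, 0);

  // ... per-GPU work (scatter, keyswitch, PBS, gather) is enqueued here ...

  // Join: streams[0] does not run past this point until every worker stream
  // has reached its recorded event.
  for (uint32_t j = 1; j < active_gpu_count; j++)
    cudaEventRecord(out_evts[j], streams[j]);
  for (uint32_t j = 1; j < active_gpu_count; j++)
    cudaStreamWaitEvent(streams[0], out_evts[j], 0);
}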

View File

@@ -415,29 +415,6 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
total_ciphertexts, mem_ptr->params.pbs_type, num_many_lut,
lut_stride);
} else {
Torus *h_lwe_indexes_in_pinned;
Torus *h_lwe_indexes_out_pinned;
cudaMallocHost((void **)&h_lwe_indexes_in_pinned,
total_ciphertexts * sizeof(Torus));
cudaMallocHost((void **)&h_lwe_indexes_out_pinned,
total_ciphertexts * sizeof(Torus));
for (uint32_t i = 0; i < total_ciphertexts; i++) {
h_lwe_indexes_in_pinned[i] = luts_message_carry->h_lwe_indexes_in[i];
h_lwe_indexes_out_pinned[i] = luts_message_carry->h_lwe_indexes_out[i];
}
cuda_memcpy_async_to_cpu(
h_lwe_indexes_in_pinned, luts_message_carry->lwe_indexes_in,
total_ciphertexts * sizeof(Torus), streams[0], gpu_indexes[0]);
cuda_memcpy_async_to_cpu(
h_lwe_indexes_out_pinned, luts_message_carry->lwe_indexes_out,
total_ciphertexts * sizeof(Torus), streams[0], gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint32_t i = 0; i < total_ciphertexts; i++) {
luts_message_carry->h_lwe_indexes_in[i] = h_lwe_indexes_in_pinned[i];
luts_message_carry->h_lwe_indexes_out[i] = h_lwe_indexes_out_pinned[i];
}
cudaFreeHost(h_lwe_indexes_in_pinned);
cudaFreeHost(h_lwe_indexes_out_pinned);
luts_message_carry->broadcast_lut(streams, gpu_indexes);
luts_message_carry->using_trivial_lwe_indexes = false;
@@ -491,29 +468,6 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
lut_stride);
} else {
uint32_t num_blocks_in_apply_lut = 2 * num_radix_blocks;
Torus *h_lwe_indexes_in_pinned;
Torus *h_lwe_indexes_out_pinned;
cudaMallocHost((void **)&h_lwe_indexes_in_pinned,
num_blocks_in_apply_lut * sizeof(Torus));
cudaMallocHost((void **)&h_lwe_indexes_out_pinned,
num_blocks_in_apply_lut * sizeof(Torus));
for (uint32_t i = 0; i < num_blocks_in_apply_lut; i++) {
h_lwe_indexes_in_pinned[i] = luts_message_carry->h_lwe_indexes_in[i];
h_lwe_indexes_out_pinned[i] = luts_message_carry->h_lwe_indexes_out[i];
}
cuda_memcpy_async_to_cpu(
h_lwe_indexes_in_pinned, luts_message_carry->lwe_indexes_in,
num_blocks_in_apply_lut * sizeof(Torus), streams[0], gpu_indexes[0]);
cuda_memcpy_async_to_cpu(
h_lwe_indexes_out_pinned, luts_message_carry->lwe_indexes_out,
num_blocks_in_apply_lut * sizeof(Torus), streams[0], gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint32_t i = 0; i < num_blocks_in_apply_lut; i++) {
luts_message_carry->h_lwe_indexes_in[i] = h_lwe_indexes_in_pinned[i];
luts_message_carry->h_lwe_indexes_out[i] = h_lwe_indexes_out_pinned[i];
}
cudaFreeHost(h_lwe_indexes_in_pinned);
cudaFreeHost(h_lwe_indexes_out_pinned);
luts_message_carry->broadcast_lut(streams, gpu_indexes);
luts_message_carry->using_trivial_lwe_indexes = false;

View File

@@ -56,7 +56,7 @@ void host_integer_grouped_oprf(
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
seeded_lwe_input, lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes,
seeded_lwe_input, lut->lwe_indexes_in, lut->using_trivial_lwe_indexes,
active_gpu_count, num_blocks_to_process,
mem_ptr->params.small_lwe_dimension + 1);
@@ -72,9 +72,8 @@ void host_integer_grouped_oprf(
multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, (Torus *)radix_lwe_out->ptr,
lwe_after_pbs_vec, lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes, num_blocks_to_process,
mem_ptr->params.big_lwe_dimension + 1);
lwe_after_pbs_vec, lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
num_blocks_to_process, mem_ptr->params.big_lwe_dimension + 1);
for (uint32_t i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);

View File

@@ -93,6 +93,30 @@ void multi_gpu_alloc_lwe_many_lut_output_async(
}
}
// Each block handles one lwe
template <typename Torus>
__global__ void pack_data(Torus *d_packed_vector, Torus const *d_vector,
Torus const *d_indexes, int lwe_size) {
int output_offset = blockIdx.x * lwe_size;
int input_offset = d_indexes[blockIdx.x] * lwe_size;
for (int ind = threadIdx.x; ind < lwe_size; ind += blockDim.x) {
d_packed_vector[ind + output_offset] = d_vector[ind + input_offset];
}
}
// Each block handles one lwe
template <typename Torus>
__global__ void unpack_data(Torus *d_vector, Torus const *d_packed_vector,
Torus const *d_indexes, int lwe_size) {
int input_offset = blockIdx.x * lwe_size;
int output_offset = d_indexes[blockIdx.x] * lwe_size;
for (int ind = threadIdx.x; ind < lwe_size; ind += blockDim.x) {
d_vector[ind + output_offset] = d_packed_vector[ind + input_offset];
}
}
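
pack_data and unpack_data let the scatter/gather helpers below replace a loop of inputs_on_gpu small gpu-to-gpu copies with one kernel launch plus one large copy per GPU. Here is a minimal host-side sketch of the scatter direction, with the cross-stream event handshake collapsed onto the source stream for brevity; pack_then_copy and its parameters are illustrative, and peer access or unified addressing between the GPUs is assumed.

#include <cuda_runtime.h>
#include <cstdint>

template <typename Torus>
void pack_then_copy(Torus *d_dest_on_gpu_i, Torus const *d_src,
                    Torus const *d_src_indexes, uint32_t inputs_on_gpu,
                    uint32_t lwe_size, cudaStream_t src_stream) {
  size_t bytes = (size_t)inputs_on_gpu * lwe_size * sizeof(Torus);
  Torus *d_packed = nullptr;
  cudaMallocAsync((void **)&d_packed, bytes, src_stream);
  // One block per LWE; each thread strides over the lwe_size coefficients.
  pack_data<Torus><<<inputs_on_gpu, 1024, 0, src_stream>>>(
      d_packed, d_src, d_src_indexes, (int)lwe_size);
  // One large transfer instead of inputs_on_gpu small ones.
  cudaMemcpyAsync(d_dest_on_gpu_i, d_packed, bytes, cudaMemcpyDeviceToDevice,
                  src_stream);
  cudaFreeAsync(d_packed, src_stream);
}

The block size of 1024 matches the launch configuration used below; the strided loop inside pack_data handles lwe_size values larger than one block.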
/// Load an array residing on one GPU to all active gpus
/// and split the array among them.
/// The input indexing logic is given by an index array.
@@ -102,7 +126,7 @@ template <typename Torus>
void multi_gpu_scatter_lwe_async(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, std::vector<Torus *> &dest,
Torus const *src, Torus const *h_src_indexes,
Torus const *src, Torus const *d_src_indexes,
bool is_trivial_index,
uint32_t max_active_gpu_count,
uint32_t num_inputs, uint32_t lwe_size) {
@@ -110,7 +134,7 @@ void multi_gpu_scatter_lwe_async(cudaStream_t const *streams,
if (max_active_gpu_count < gpu_count)
PANIC("Cuda error: number of gpus in scatter should be <= number of gpus "
"used to create the lut")
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
// cuda_synchronize_stream(streams[0], gpu_indexes[0]);
dest.resize(gpu_count);
for (uint i = 0; i < gpu_count; i++) {
auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
@@ -127,18 +151,28 @@ void multi_gpu_scatter_lwe_async(cudaStream_t const *streams,
gpu_indexes[i], true);
} else {
if (h_src_indexes == nullptr)
if (d_src_indexes == nullptr)
PANIC("Cuda error: source indexes should be initialized!");
auto src_indexes = h_src_indexes + gpu_offset;
Torus *d_packed_vector = (Torus *)cuda_malloc_async(
inputs_on_gpu * lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]);
for (uint j = 0; j < inputs_on_gpu; j++) {
auto d_dest = dest[i] + j * lwe_size;
auto d_src = src + src_indexes[j] * lwe_size;
cudaEvent_t temp_event2 = cuda_create_event(gpu_indexes[0]);
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
d_dest, d_src, lwe_size * sizeof(Torus), streams[i], gpu_indexes[i],
true);
}
pack_data<Torus><<<inputs_on_gpu, 1024, 0, streams[0]>>>(
d_packed_vector, (Torus *)src, (Torus *)d_src_indexes + gpu_offset,
lwe_size);
check_cuda_error(cudaGetLastError());
cuda_event_record(temp_event2, streams[0], gpu_indexes[0]);
cuda_stream_wait_event(streams[i], temp_event2, gpu_indexes[i]);
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
dest[i], d_packed_vector, inputs_on_gpu * lwe_size * sizeof(Torus),
streams[i], gpu_indexes[i], true);
cudaEvent_t temp_event = cuda_create_event(gpu_indexes[i]);
cuda_event_record(temp_event, streams[i], gpu_indexes[i]);
cuda_stream_wait_event(streams[0], temp_event, gpu_indexes[0]);
cuda_drop_async(d_packed_vector, streams[0], gpu_indexes[0]);
}
}
}
@@ -150,7 +184,7 @@ template <typename Torus>
void multi_gpu_gather_lwe_async(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
Torus *dest, const std::vector<Torus *> &src,
Torus *h_dest_indexes, bool is_trivial_index,
Torus *d_dest_indexes, bool is_trivial_index,
uint32_t num_inputs, uint32_t lwe_size) {
for (uint i = 0; i < gpu_count; i++) {
@@ -168,19 +202,30 @@ void multi_gpu_gather_lwe_async(cudaStream_t const *streams,
d_dest, d_src, inputs_on_gpu * lwe_size * sizeof(Torus), streams[i],
gpu_indexes[i], true);
} else {
if (h_dest_indexes == nullptr)
if (d_dest_indexes == nullptr)
PANIC("Cuda error: destination indexes should be initialized!");
auto dest_indexes = h_dest_indexes + gpu_offset;
Torus *d_packed_vector = (Torus *)cuda_malloc_async(
inputs_on_gpu * lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]);
cudaEvent_t temp_event2 = cuda_create_event(gpu_indexes[0]);
for (uint j = 0; j < inputs_on_gpu; j++) {
auto d_dest = dest + dest_indexes[j] * lwe_size;
auto d_src = src[i] + j * lwe_size;
cuda_event_record(temp_event2, streams[0], gpu_indexes[0]);
cuda_stream_wait_event(streams[i], temp_event2, gpu_indexes[i]);
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
d_dest, d_src, lwe_size * sizeof(Torus), streams[i], gpu_indexes[i],
true);
}
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
d_packed_vector, src[i], inputs_on_gpu * lwe_size * sizeof(Torus),
streams[i], gpu_indexes[i], true);
cudaEvent_t temp_event3 = cuda_create_event(gpu_indexes[i]);
cuda_event_record(temp_event3, streams[i], gpu_indexes[i]);
cuda_stream_wait_event(streams[0], temp_event3, gpu_indexes[0]);
unpack_data<Torus><<<inputs_on_gpu, 1024, 0, streams[0]>>>(
dest, d_packed_vector, (Torus *)d_dest_indexes + gpu_offset,
lwe_size);
check_cuda_error(cudaGetLastError());
cuda_drop_async(d_packed_vector, streams[0], gpu_indexes[0]);
}
}
}
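
In the gather direction above, the temporary events order work across two streams: the staging buffer is allocated on streams[0] and later consumed there by unpack_data, while the bulk copy out of GPU i runs on streams[i]. A sketch of that handshake with the raw runtime API; copy_then_unpack and its parameters are illustrative, and peer access between the GPUs plus exclusive use of the events is assumed.

// (same includes as the sketch above; relies on the unpack_data template)
template <typename Torus>
void copy_then_unpack(Torus *d_dest, Torus const *d_src_on_gpu_i,
                      Torus const *d_dest_indexes, uint32_t inputs_on_gpu,
                      uint32_t lwe_size, int gpu0, int gpu_i,
                      cudaStream_t stream0, cudaStream_t stream_i) {
  size_t bytes = (size_t)inputs_on_gpu * lwe_size * sizeof(Torus);

  // Staging buffer and the "ready" event belong to GPU 0 / stream0.
  cudaSetDevice(gpu0);
  Torus *d_packed = nullptr;
  cudaMallocAsync((void **)&d_packed, bytes, stream0);
  cudaEvent_t ready, done;
  cudaEventCreateWithFlags(&ready, cudaEventDisableTiming);
  cudaEventRecord(ready, stream0); // allocation is ordered before the copy

  // The copy runs on GPU i's stream, so its event is created there.
  cudaSetDevice(gpu_i);
  cudaEventCreateWithFlags(&done, cudaEventDisableTiming);
  cudaStreamWaitEvent(stream_i, ready, 0);
  cudaMemcpyAsync(d_packed, d_src_on_gpu_i, bytes, cudaMemcpyDeviceToDevice,
                  stream_i);
  cudaEventRecord(done, stream_i);

  // GPU 0 unpacks into the indexed destination only after the copy.
  cudaSetDevice(gpu0);
  cudaStreamWaitEvent(stream0, done, 0);
  unpack_data<Torus><<<inputs_on_gpu, 1024, 0, stream0>>>(
      d_dest, d_packed, d_dest_indexes, (int)lwe_size);
  cudaFreeAsync(d_packed, stream0);
  // Destroying is safe with work still pending; resources are released once
  // the recorded work completes.
  cudaEventDestroy(ready);
  cudaEventDestroy(done);
}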