feat(gpu): improve communication scheme

Replace the blocking cross-GPU stream synchronizations in the multi-GPU LUT
path with CUDA events, pass the device-resident LWE index arrays
(lwe_indexes_in/out) to the scatter/gather helpers instead of their host
mirrors, and batch the per-LWE gpu-to-gpu copies into pack/unpack kernels
followed by a single bulk transfer per GPU. The pinned-host staging of the LWE
indexes in the partial-sum path is no longer needed and is removed.

Guillermo Oyarzun
2025-08-21 18:46:30 +02:00
committed by Agnès Leroy
parent e3686ed4ba
commit 88c3df8331
6 changed files with 195 additions and 122 deletions

View File

@@ -324,6 +324,10 @@ template <typename Torus> struct int_radix_lut {
uint32_t *gpu_indexes;
bool gpu_memory_allocated;
cudaEvent_t event_scatter_in;
cudaEvent_t event_scatter_out[8];
cudaEvent_t event_broadcast;
int_radix_lut(cudaStream_t const *streams, uint32_t const *input_gpu_indexes,
uint32_t gpu_count, int_radix_params params, uint32_t num_luts,
uint32_t num_radix_blocks, bool allocate_gpu_memory,
@@ -363,6 +367,15 @@ template <typename Torus> struct int_radix_lut {
buffer.push_back(gpu_pbs_buffer);
}
// if(active_gpu_count > 1){
event_scatter_in = cuda_create_event(gpu_indexes[0]);
event_broadcast = cuda_create_event(gpu_indexes[0]);
for (int i = 0; i < active_gpu_count; i++) {
event_scatter_out[i] = cuda_create_event(gpu_indexes[i]);
}
//}
// Allocate LUT
// LUT is used as a trivial encryption and must be initialized outside
// this constructor
@@ -579,7 +592,14 @@ template <typename Torus> struct int_radix_lut {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
buffer.push_back(gpu_pbs_buffer);
}
// if(active_gpu_count > 1){
event_scatter_in = cuda_create_event(gpu_indexes[0]);
event_broadcast = cuda_create_event(gpu_indexes[0]);
for (int i = 0; i < active_gpu_count; i++) {
event_scatter_out[i] = cuda_create_event(gpu_indexes[i]);
}
//}
// Allocate LUT
// LUT is used as a trivial encryption and must be initialized outside
// this constructor
@@ -718,9 +738,10 @@ template <typename Torus> struct int_radix_lut {
auto src_lut = lut_vec[0];
auto src_lut_indexes = lut_indexes_vec[0];
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
cuda_event_record(event_broadcast, streams[0], gpu_indexes[0]);
for (uint i = 0; i < active_gpu_count; i++) {
if (gpu_indexes[i] != gpu_indexes[0]) {
cuda_stream_wait_event(streams[i], event_broadcast, gpu_indexes[i]);
auto dst_lut = lut_vec[i];
auto dst_lut_indexes = lut_indexes_vec[i];
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
@@ -745,7 +766,13 @@ template <typename Torus> struct int_radix_lut {
cuda_drop_with_size_tracking_async(lut_indexes_vec[i], streams[i],
gpu_indexes[i], gpu_memory_allocated);
}
// if( active_gpu_count > 1) {
for (uint i = 0; i < active_gpu_count; i++) {
cuda_event_destroy(event_scatter_out[i], gpu_indexes[i]);
}
cuda_event_destroy(event_scatter_in, gpu_indexes[0]);
cuda_event_destroy(event_broadcast, gpu_indexes[0]);
//}
cuda_drop_with_size_tracking_async(lwe_indexes_in, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(lwe_indexes_out, streams[0],
@@ -1614,6 +1641,7 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
this->allocated_luts_message_carry = false;
this->reduce_degrees_for_single_carry_propagation =
reduce_degrees_for_single_carry_propagation;
setup_index_buffers(streams, gpu_indexes, size_tracker);
// because we setup the lut in the host function for sum_ciphertexts to save
// memory, the size_tracker is topped up here to give an upper bound on the
// memory used
@@ -1661,6 +1689,7 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
this->current_blocks = current_blocks;
this->small_lwe_vector = small_lwe_vector;
this->luts_message_carry = reused_lut;
setup_index_buffers(streams, gpu_indexes, size_tracker);
}
@@ -5910,6 +5939,9 @@ template <typename Torus> struct int_count_of_consecutive_bits_buffer {
delete ct_prepared;
ct_prepared = nullptr;
propagate_mem->release(streams, gpu_indexes, gpu_count);
delete propagate_mem;
prepare_mem->release(streams, gpu_indexes, gpu_count);
delete prepare_mem;
prepare_mem = nullptr;
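
The events added to int_radix_lut above are created once per LUT (event_scatter_in and event_broadcast on GPU 0, one event_scatter_out per active GPU) and destroyed in the release path. Below is a minimal sketch of that lifecycle using the raw CUDA runtime API, since cuda_create_event / cuda_event_destroy are project wrappers; the struct name, kMaxGpus and the cudaEventDisableTiming flag are assumptions of the sketch, not part of the commit.

#include <cuda_runtime.h>
#include <cstdint>

constexpr int kMaxGpus = 8; // mirrors the fixed event_scatter_out[8] array

struct cross_gpu_events {
  cudaEvent_t scatter_in;            // recorded on GPU 0's stream
  cudaEvent_t broadcast;             // recorded on GPU 0's stream
  cudaEvent_t scatter_out[kMaxGpus]; // one per active GPU

  void create(uint32_t const *gpu_indexes, uint32_t active_gpu_count) {
    // Events must be created on the device whose stream will record them.
    cudaSetDevice(gpu_indexes[0]);
    cudaEventCreateWithFlags(&scatter_in, cudaEventDisableTiming);
    cudaEventCreateWithFlags(&broadcast, cudaEventDisableTiming);
    for (uint32_t i = 0; i < active_gpu_count; i++) {
      cudaSetDevice(gpu_indexes[i]);
      cudaEventCreateWithFlags(&scatter_out[i], cudaEventDisableTiming);
    }
  }

  void destroy(uint32_t const *gpu_indexes, uint32_t active_gpu_count) {
    for (uint32_t i = 0; i < active_gpu_count; i++) {
      cudaSetDevice(gpu_indexes[i]);
      cudaEventDestroy(scatter_out[i]);
    }
    cudaSetDevice(gpu_indexes[0]);
    cudaEventDestroy(scatter_in);
    cudaEventDestroy(broadcast);
  }
};

Disabling timing keeps the events as pure synchronization primitives, which is all the fork/join scheme in this commit needs.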

View File

@@ -365,13 +365,13 @@ host_integer_decompress(cudaStream_t const *streams,
/// Make sure all data that should be on GPU 0 is indeed there
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
/// With multiple GPUs we push to the vectors on each GPU then when we
/// gather data to GPU 0 we can copy back to the original indexing
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
extracted_lwe, lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes,
lut->active_gpu_count, num_blocks_to_decompress,
compression_params.small_lwe_dimension + 1);
/// With multiple GPUs we push to the vectors on each GPU then when we
/// gather data to GPU 0 we can copy back to the original indexing
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec, extracted_lwe,
lut->lwe_indexes_in, lut->using_trivial_lwe_indexes,
lut->active_gpu_count, num_blocks_to_decompress,
compression_params.small_lwe_dimension + 1);
/// Apply PBS
execute_pbs_async<Torus, Torus>(
@@ -385,12 +385,11 @@ host_integer_decompress(cudaStream_t const *streams,
num_blocks_to_decompress, encryption_params.pbs_type, num_many_lut,
lut_stride);
/// Copy data back to GPU 0 and release vecs
multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, (Torus *)d_lwe_array_out->ptr,
lwe_after_pbs_vec, lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes, num_blocks_to_decompress,
encryption_params.big_lwe_dimension + 1);
/// Copy data back to GPU 0 and release vecs
multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, (Torus *)d_lwe_array_out->ptr,
lwe_after_pbs_vec, lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
num_blocks_to_decompress, encryption_params.big_lwe_dimension + 1);
/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {

View File

@@ -567,16 +567,22 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride);
} else {
/// Make sure all data that should be on GPU 0 is indeed there
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
// cuda_synchronize_stream(streams[0], gpu_indexes[0]);
cuda_event_record(lut->event_scatter_in, streams[0], gpu_indexes[0]);
for (int j = 1; j < active_gpu_count; j++) {
cuda_stream_wait_event(streams[j], lut->event_scatter_in, gpu_indexes[j]);
}
/// With multiple GPUs we push to the vectors on each GPU then when we
/// gather data to GPU 0 we can copy back to the original indexing
PUSH_RANGE("scatter")
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
(Torus *)lwe_array_in->ptr, lut->h_lwe_indexes_in,
(Torus *)lwe_array_in->ptr, lut->lwe_indexes_in,
lut->using_trivial_lwe_indexes, lut->active_gpu_count, num_radix_blocks,
big_lwe_dimension + 1);
POP_RANGE()
/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch_async<Torus>(streams, gpu_indexes, active_gpu_count,
lwe_after_ks_vec, lwe_trivial_indexes_vec,
@@ -595,16 +601,26 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
num_many_lut, lut_stride);
/// Copy data back to GPU 0 and release vecs
multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
(Torus *)lwe_array_out->ptr,
lwe_after_pbs_vec, lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes,
num_radix_blocks, big_lwe_dimension + 1);
/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
PUSH_RANGE("gather")
multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, (Torus *)lwe_array_out->ptr,
lwe_after_pbs_vec, lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
num_radix_blocks, big_lwe_dimension + 1);
POP_RANGE()
// other gpus record their events
for (int j = 1; j < active_gpu_count; j++) {
cuda_event_record(lut->event_scatter_out[j], streams[j], gpu_indexes[j]);
}
// GPU 0 waits for all
for (int j = 1; j < active_gpu_count; j++) {
cuda_stream_wait_event(streams[0], lut->event_scatter_out[j],
gpu_indexes[0]);
}
// /// Synchronize all GPUs
// for (uint i = 0; i < active_gpu_count; i++) {
// cuda_synchronize_stream(streams[i], gpu_indexes[i]);
// }
}
for (uint i = 0; i < num_radix_blocks; i++) {
auto degrees_index = lut->h_lut_indexes[i];
@@ -674,16 +690,20 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride);
} else {
/// Make sure all data that should be on GPU 0 is indeed there
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
// cuda_synchronize_stream(streams[0], gpu_indexes[0]);
cuda_event_record(lut->event_scatter_in, streams[0], gpu_indexes[0]);
for (int j = 1; j < active_gpu_count; j++) {
cuda_stream_wait_event(streams[j], lut->event_scatter_in, gpu_indexes[j]);
}
/// With multiple GPUs we push to the vectors on each GPU then when we
/// gather data to GPU 0 we can copy back to the original indexing
PUSH_RANGE("scatter")
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
(Torus *)lwe_array_in->ptr, lut->h_lwe_indexes_in,
(Torus *)lwe_array_in->ptr, lut->lwe_indexes_in,
lut->using_trivial_lwe_indexes, lut->active_gpu_count, num_radix_blocks,
big_lwe_dimension + 1);
POP_RANGE()
/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch_async<Torus>(streams, gpu_indexes, active_gpu_count,
lwe_after_ks_vec, lwe_trivial_indexes_vec,
@@ -702,16 +722,27 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
num_many_lut, lut_stride);
/// Copy data back to GPU 0 and release vecs
PUSH_RANGE("gather")
multi_gpu_gather_many_lut_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, (Torus *)lwe_array_out->ptr,
lwe_after_pbs_vec, lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes, num_radix_blocks, big_lwe_dimension + 1,
num_many_lut);
POP_RANGE()
/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
// other gpus record their events
for (int j = 1; j < active_gpu_count; j++) {
cuda_event_record(lut->event_scatter_out[j], streams[j], gpu_indexes[j]);
}
// GPU 0 waits for all
for (int j = 1; j < active_gpu_count; j++) {
cuda_stream_wait_event(streams[0], lut->event_scatter_out[j],
gpu_indexes[0]);
}
// /// Synchronize all GPUs
// for (uint i = 0; i < active_gpu_count; i++) {
// cuda_synchronize_stream(streams[i], gpu_indexes[i]);
// }
}
for (uint i = 0; i < lwe_array_out->num_radix_blocks; i++) {
auto degrees_index = lut->h_lut_indexes[i % lut->num_blocks];
@@ -795,13 +826,18 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride);
} else {
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
// cuda_synchronize_stream(streams[0], gpu_indexes[0]);
cuda_event_record(lut->event_scatter_in, streams[0], gpu_indexes[0]);
for (int j = 1; j < active_gpu_count; j++) {
cuda_stream_wait_event(streams[j], lut->event_scatter_in, gpu_indexes[j]);
}
PUSH_RANGE("scatter")
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
(Torus *)lwe_array_pbs_in->ptr, lut->h_lwe_indexes_in,
(Torus *)lwe_array_pbs_in->ptr, lut->lwe_indexes_in,
lut->using_trivial_lwe_indexes, lut->active_gpu_count, num_radix_blocks,
big_lwe_dimension + 1);
POP_RANGE()
/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch_async<Torus>(streams, gpu_indexes, active_gpu_count,
lwe_after_ks_vec, lwe_trivial_indexes_vec,
@@ -820,16 +856,26 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
num_many_lut, lut_stride);
/// Copy data back to GPU 0 and release vecs
multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
(Torus *)(lwe_array_out->ptr),
lwe_after_pbs_vec, lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes,
num_radix_blocks, big_lwe_dimension + 1);
/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
PUSH_RANGE("gather")
multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, (Torus *)(lwe_array_out->ptr),
lwe_after_pbs_vec, lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
num_radix_blocks, big_lwe_dimension + 1);
POP_RANGE()
// other gpus record their events
for (int j = 1; j < active_gpu_count; j++) {
cuda_event_record(lut->event_scatter_out[j], streams[j], gpu_indexes[j]);
}
// GPU 0 waits for all
for (int j = 1; j < active_gpu_count; j++) {
cuda_stream_wait_event(streams[0], lut->event_scatter_out[j],
gpu_indexes[0]);
}
// /// Synchronize all GPUs
// for (uint i = 0; i < active_gpu_count; i++) {
// cuda_synchronize_stream(streams[i], gpu_indexes[i]);
// }
}
for (uint i = 0; i < num_radix_blocks; i++) {
auto degrees_index = lut->h_lut_indexes[i];
@@ -1000,7 +1046,6 @@ void generate_device_accumulator_no_encoding(
cuda_memcpy_with_size_tracking_async_to_gpu(
acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
stream, gpu_index, gpu_memory_allocated);
cuda_synchronize_stream(stream, gpu_index);
free(h_lut);
}
@@ -1104,8 +1149,8 @@ void generate_device_accumulator_bivariate_with_factor(
h_lut, glwe_dimension, polynomial_size, message_modulus, carry_modulus, f,
factor);
cuda_synchronize_stream(stream, gpu_index);
// copy host lut and lut_indexes_vec to device
// cuda_synchronize_stream(stream, gpu_index);
// copy host lut and lut_indexes_vec to device
cuda_memcpy_with_size_tracking_async_to_gpu(
acc_bivariate, h_lut,
(glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream, gpu_index,
@@ -1137,7 +1182,6 @@ void generate_device_accumulator_with_encoding(
cuda_memcpy_with_size_tracking_async_to_gpu(
acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
stream, gpu_index, gpu_memory_allocated);
cuda_synchronize_stream(stream, gpu_index);
free(h_lut);
}
@@ -2363,7 +2407,7 @@ __host__ void integer_radix_apply_noise_squashing_kb(
/// gather data to GPU 0 we can copy back to the original indexing
multi_gpu_scatter_lwe_async<InputTorus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
(InputTorus *)lwe_array_pbs_in->ptr, lut->h_lwe_indexes_in,
(InputTorus *)lwe_array_pbs_in->ptr, lut->lwe_indexes_in,
lut->using_trivial_lwe_indexes, lut->active_gpu_count,
lwe_array_out->num_radix_blocks, lut->input_big_lwe_dimension + 1);
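
The scheme used in the apply_*_lookup_table paths above is a fork/join between GPU 0's stream and the worker streams: GPU 0 records event_scatter_in and every other stream waits on it before the scatter, then after the gather each worker records event_scatter_out[j] and GPU 0's stream waits on all of them, so no host-side cuda_synchronize_stream is needed. A hedged sketch of just that ordering with the raw runtime API (fork_join and its parameters are illustrative names):

#include <cuda_runtime.h>
#include <cstdint>

// in_evt must have been created on GPU 0, out_evts[j] on the GPU that owns
// streams[j] (as done in the int_radix_lut constructor).
void fork_join(cudaStream_t const *streams, uint32_t active_gpu_count,
               cudaEvent_t in_evt, cudaEvent_t const *out_evts) {
  // Fork: make everything already enqueued on streams[0] visible to the
  // worker streams without blocking the host.
  cudaEventRecord(in_evt, streams[0]);
  for (uint32_t j = 1; j < active_gpu_count; j++)
    cudaStreamWaitEvent(streams[j], in_evt, 0);

  // ... per-GPU work (scatter, keyswitch, PBS, gather) is enqueued here ...

  // Join: streams[0] does not run past this point until every worker stream
  // has reached its recorded event.
  for (uint32_t j = 1; j < active_gpu_count; j++)
    cudaEventRecord(out_evts[j], streams[j]);
  for (uint32_t j = 1; j < active_gpu_count; j++)
    cudaStreamWaitEvent(streams[0], out_evts[j], 0);
}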

View File

@@ -415,29 +415,6 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
total_ciphertexts, mem_ptr->params.pbs_type, num_many_lut,
lut_stride);
} else {
Torus *h_lwe_indexes_in_pinned;
Torus *h_lwe_indexes_out_pinned;
cudaMallocHost((void **)&h_lwe_indexes_in_pinned,
total_ciphertexts * sizeof(Torus));
cudaMallocHost((void **)&h_lwe_indexes_out_pinned,
total_ciphertexts * sizeof(Torus));
for (uint32_t i = 0; i < total_ciphertexts; i++) {
h_lwe_indexes_in_pinned[i] = luts_message_carry->h_lwe_indexes_in[i];
h_lwe_indexes_out_pinned[i] = luts_message_carry->h_lwe_indexes_out[i];
}
cuda_memcpy_async_to_cpu(
h_lwe_indexes_in_pinned, luts_message_carry->lwe_indexes_in,
total_ciphertexts * sizeof(Torus), streams[0], gpu_indexes[0]);
cuda_memcpy_async_to_cpu(
h_lwe_indexes_out_pinned, luts_message_carry->lwe_indexes_out,
total_ciphertexts * sizeof(Torus), streams[0], gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint32_t i = 0; i < total_ciphertexts; i++) {
luts_message_carry->h_lwe_indexes_in[i] = h_lwe_indexes_in_pinned[i];
luts_message_carry->h_lwe_indexes_out[i] = h_lwe_indexes_out_pinned[i];
}
cudaFreeHost(h_lwe_indexes_in_pinned);
cudaFreeHost(h_lwe_indexes_out_pinned);
luts_message_carry->broadcast_lut(streams, gpu_indexes);
luts_message_carry->using_trivial_lwe_indexes = false;
@@ -491,29 +468,6 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
lut_stride);
} else {
uint32_t num_blocks_in_apply_lut = 2 * num_radix_blocks;
Torus *h_lwe_indexes_in_pinned;
Torus *h_lwe_indexes_out_pinned;
cudaMallocHost((void **)&h_lwe_indexes_in_pinned,
num_blocks_in_apply_lut * sizeof(Torus));
cudaMallocHost((void **)&h_lwe_indexes_out_pinned,
num_blocks_in_apply_lut * sizeof(Torus));
for (uint32_t i = 0; i < num_blocks_in_apply_lut; i++) {
h_lwe_indexes_in_pinned[i] = luts_message_carry->h_lwe_indexes_in[i];
h_lwe_indexes_out_pinned[i] = luts_message_carry->h_lwe_indexes_out[i];
}
cuda_memcpy_async_to_cpu(
h_lwe_indexes_in_pinned, luts_message_carry->lwe_indexes_in,
num_blocks_in_apply_lut * sizeof(Torus), streams[0], gpu_indexes[0]);
cuda_memcpy_async_to_cpu(
h_lwe_indexes_out_pinned, luts_message_carry->lwe_indexes_out,
num_blocks_in_apply_lut * sizeof(Torus), streams[0], gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint32_t i = 0; i < num_blocks_in_apply_lut; i++) {
luts_message_carry->h_lwe_indexes_in[i] = h_lwe_indexes_in_pinned[i];
luts_message_carry->h_lwe_indexes_out[i] = h_lwe_indexes_out_pinned[i];
}
cudaFreeHost(h_lwe_indexes_in_pinned);
cudaFreeHost(h_lwe_indexes_out_pinned);
luts_message_carry->broadcast_lut(streams, gpu_indexes);
luts_message_carry->using_trivial_lwe_indexes = false;

View File

@@ -56,7 +56,7 @@ void host_integer_grouped_oprf(
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
seeded_lwe_input, lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes,
seeded_lwe_input, lut->lwe_indexes_in, lut->using_trivial_lwe_indexes,
active_gpu_count, num_blocks_to_process,
mem_ptr->params.small_lwe_dimension + 1);
@@ -72,9 +72,8 @@ void host_integer_grouped_oprf(
multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, (Torus *)radix_lwe_out->ptr,
lwe_after_pbs_vec, lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes, num_blocks_to_process,
mem_ptr->params.big_lwe_dimension + 1);
lwe_after_pbs_vec, lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
num_blocks_to_process, mem_ptr->params.big_lwe_dimension + 1);
for (uint32_t i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);

View File

@@ -93,6 +93,30 @@ void multi_gpu_alloc_lwe_many_lut_output_async(
}
}
// Each block handles one lwe
template <typename Torus>
__global__ void pack_data(Torus *d_packed_vector, Torus const *d_vector,
Torus const *d_indexes, int lwe_size) {
int output_offset = blockIdx.x * lwe_size;
int input_offset = d_indexes[blockIdx.x] * lwe_size;
for (int ind = threadIdx.x; ind < lwe_size; ind += blockDim.x) {
d_packed_vector[ind + output_offset] = d_vector[ind + input_offset];
}
}
// Each block handles one lwe
template <typename Torus>
__global__ void unpack_data(Torus *d_vector, Torus const *d_packed_vector,
Torus const *d_indexes, int lwe_size) {
int input_offset = blockIdx.x * lwe_size;
int output_offset = d_indexes[blockIdx.x] * lwe_size;
for (int ind = threadIdx.x; ind < lwe_size; ind += blockDim.x) {
d_vector[ind + output_offset] = d_packed_vector[ind + input_offset];
}
}
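
pack_data and unpack_data let the scatter/gather helpers below replace a loop of inputs_on_gpu small gpu-to-gpu copies with one kernel launch plus one large copy per GPU. Here is a minimal host-side sketch of the scatter direction, with the cross-stream event handshake collapsed onto the source stream for brevity; pack_then_copy and its parameters are illustrative, and peer access or unified addressing between the GPUs is assumed.

#include <cuda_runtime.h>
#include <cstdint>

template <typename Torus>
void pack_then_copy(Torus *d_dest_on_gpu_i, Torus const *d_src,
                    Torus const *d_src_indexes, uint32_t inputs_on_gpu,
                    uint32_t lwe_size, cudaStream_t src_stream) {
  size_t bytes = (size_t)inputs_on_gpu * lwe_size * sizeof(Torus);
  Torus *d_packed = nullptr;
  cudaMallocAsync((void **)&d_packed, bytes, src_stream);
  // One block per LWE; each thread strides over the lwe_size coefficients.
  pack_data<Torus><<<inputs_on_gpu, 1024, 0, src_stream>>>(
      d_packed, d_src, d_src_indexes, (int)lwe_size);
  // One large transfer instead of inputs_on_gpu small ones.
  cudaMemcpyAsync(d_dest_on_gpu_i, d_packed, bytes, cudaMemcpyDeviceToDevice,
                  src_stream);
  cudaFreeAsync(d_packed, src_stream);
}

The block size of 1024 matches the launch configuration used below; the strided loop inside pack_data handles lwe_size values larger than one block.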
/// Load an array residing on one GPU to all active gpus
/// and split the array among them.
/// The input indexing logic is given by an index array.
@@ -102,7 +126,7 @@ template <typename Torus>
void multi_gpu_scatter_lwe_async(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, std::vector<Torus *> &dest,
Torus const *src, Torus const *h_src_indexes,
Torus const *src, Torus const *d_src_indexes,
bool is_trivial_index,
uint32_t max_active_gpu_count,
uint32_t num_inputs, uint32_t lwe_size) {
@@ -110,7 +134,7 @@ void multi_gpu_scatter_lwe_async(cudaStream_t const *streams,
if (max_active_gpu_count < gpu_count)
PANIC("Cuda error: number of gpus in scatter should be <= number of gpus "
"used to create the lut")
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
// cuda_synchronize_stream(streams[0], gpu_indexes[0]);
dest.resize(gpu_count);
for (uint i = 0; i < gpu_count; i++) {
auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
@@ -127,18 +151,28 @@ void multi_gpu_scatter_lwe_async(cudaStream_t const *streams,
gpu_indexes[i], true);
} else {
if (h_src_indexes == nullptr)
if (d_src_indexes == nullptr)
PANIC("Cuda error: source indexes should be initialized!");
auto src_indexes = h_src_indexes + gpu_offset;
Torus *d_packed_vector = (Torus *)cuda_malloc_async(
inputs_on_gpu * lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]);
for (uint j = 0; j < inputs_on_gpu; j++) {
auto d_dest = dest[i] + j * lwe_size;
auto d_src = src + src_indexes[j] * lwe_size;
cudaEvent_t temp_event2 = cuda_create_event(gpu_indexes[0]);
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
d_dest, d_src, lwe_size * sizeof(Torus), streams[i], gpu_indexes[i],
true);
}
pack_data<Torus><<<inputs_on_gpu, 1024, 0, streams[0]>>>(
d_packed_vector, (Torus *)src, (Torus *)d_src_indexes + gpu_offset,
lwe_size);
check_cuda_error(cudaGetLastError());
cuda_event_record(temp_event2, streams[0], gpu_indexes[0]);
cuda_stream_wait_event(streams[i], temp_event2, gpu_indexes[i]);
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
dest[i], d_packed_vector, inputs_on_gpu * lwe_size * sizeof(Torus),
streams[i], gpu_indexes[i], true);
cudaEvent_t temp_event = cuda_create_event(gpu_indexes[i]);
cuda_event_record(temp_event, streams[i], gpu_indexes[i]);
cuda_stream_wait_event(streams[0], temp_event, gpu_indexes[0]);
cuda_drop_async(d_packed_vector, streams[0], gpu_indexes[0]);
}
}
}
@@ -150,7 +184,7 @@ template <typename Torus>
void multi_gpu_gather_lwe_async(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
Torus *dest, const std::vector<Torus *> &src,
Torus *h_dest_indexes, bool is_trivial_index,
Torus *d_dest_indexes, bool is_trivial_index,
uint32_t num_inputs, uint32_t lwe_size) {
for (uint i = 0; i < gpu_count; i++) {
@@ -168,19 +202,30 @@ void multi_gpu_gather_lwe_async(cudaStream_t const *streams,
d_dest, d_src, inputs_on_gpu * lwe_size * sizeof(Torus), streams[i],
gpu_indexes[i], true);
} else {
if (h_dest_indexes == nullptr)
if (d_dest_indexes == nullptr)
PANIC("Cuda error: destination indexes should be initialized!");
auto dest_indexes = h_dest_indexes + gpu_offset;
Torus *d_packed_vector = (Torus *)cuda_malloc_async(
inputs_on_gpu * lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]);
cudaEvent_t temp_event2 = cuda_create_event(gpu_indexes[0]);
for (uint j = 0; j < inputs_on_gpu; j++) {
auto d_dest = dest + dest_indexes[j] * lwe_size;
auto d_src = src[i] + j * lwe_size;
cuda_event_record(temp_event2, streams[0], gpu_indexes[0]);
cuda_stream_wait_event(streams[i], temp_event2, gpu_indexes[i]);
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
d_dest, d_src, lwe_size * sizeof(Torus), streams[i], gpu_indexes[i],
true);
}
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
d_packed_vector, src[i], inputs_on_gpu * lwe_size * sizeof(Torus),
streams[i], gpu_indexes[i], true);
cudaEvent_t temp_event3 = cuda_create_event(gpu_indexes[i]);
cuda_event_record(temp_event3, streams[i], gpu_indexes[i]);
cuda_stream_wait_event(streams[0], temp_event3, gpu_indexes[0]);
unpack_data<Torus><<<inputs_on_gpu, 1024, 0, streams[0]>>>(
dest, d_packed_vector, (Torus *)d_dest_indexes + gpu_offset,
lwe_size);
check_cuda_error(cudaGetLastError());
cuda_drop_async(d_packed_vector, streams[0], gpu_indexes[0]);
}
}
}
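
In the gather direction above, the temporary events order work across two streams: the staging buffer is allocated on streams[0] and later consumed there by unpack_data, while the bulk copy out of GPU i runs on streams[i]. A sketch of that handshake with the raw runtime API; copy_then_unpack and its parameters are illustrative, and peer access between the GPUs plus exclusive use of the events is assumed.

// (same includes as the sketch above; relies on the unpack_data template)
template <typename Torus>
void copy_then_unpack(Torus *d_dest, Torus const *d_src_on_gpu_i,
                      Torus const *d_dest_indexes, uint32_t inputs_on_gpu,
                      uint32_t lwe_size, int gpu0, int gpu_i,
                      cudaStream_t stream0, cudaStream_t stream_i) {
  size_t bytes = (size_t)inputs_on_gpu * lwe_size * sizeof(Torus);

  // Staging buffer and the "ready" event belong to GPU 0 / stream0.
  cudaSetDevice(gpu0);
  Torus *d_packed = nullptr;
  cudaMallocAsync((void **)&d_packed, bytes, stream0);
  cudaEvent_t ready, done;
  cudaEventCreateWithFlags(&ready, cudaEventDisableTiming);
  cudaEventRecord(ready, stream0); // allocation is ordered before the copy

  // The copy runs on GPU i's stream, so its event is created there.
  cudaSetDevice(gpu_i);
  cudaEventCreateWithFlags(&done, cudaEventDisableTiming);
  cudaStreamWaitEvent(stream_i, ready, 0);
  cudaMemcpyAsync(d_packed, d_src_on_gpu_i, bytes, cudaMemcpyDeviceToDevice,
                  stream_i);
  cudaEventRecord(done, stream_i);

  // GPU 0 unpacks into the indexed destination only after the copy.
  cudaSetDevice(gpu0);
  cudaStreamWaitEvent(stream0, done, 0);
  unpack_data<Torus><<<inputs_on_gpu, 1024, 0, stream0>>>(
      d_dest, d_packed, d_dest_indexes, (int)lwe_size);
  cudaFreeAsync(d_packed, stream0);
  // Destroying is safe with work still pending; resources are released once
  // the recorded work completes.
  cudaEventDestroy(ready);
  cudaEventDestroy(done);
}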