Mirror of https://github.com/zama-ai/tfhe-rs.git, synced 2026-01-09 14:47:56 -05:00

feat(gpu): improve communication scheme

Committed by: Agnès Leroy
Parent: e3686ed4ba
Commit: 88c3df8331
@@ -324,6 +324,10 @@ template <typename Torus> struct int_radix_lut {
  uint32_t *gpu_indexes;
  bool gpu_memory_allocated;

  cudaEvent_t event_scatter_in;
  cudaEvent_t event_scatter_out[8];
  cudaEvent_t event_broadcast;

  int_radix_lut(cudaStream_t const *streams, uint32_t const *input_gpu_indexes,
                uint32_t gpu_count, int_radix_params params, uint32_t num_luts,
                uint32_t num_radix_blocks, bool allocate_gpu_memory,
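Editor's note: the new struct members are CUDA events used to order work across per-GPU streams without blocking the host. A minimal sketch of the underlying pattern, written with plain CUDA runtime calls rather than the repo's cuda_create_event / cuda_event_record wrappers (names and counts here are illustrative only, not the library's API):

#include <cuda_runtime.h>

// Illustrative only: one producer stream hands data to several consumer
// streams through an event, instead of a host-side stream synchronize.
void fan_out_with_event(cudaStream_t producer, cudaStream_t *consumers,
                        int n_consumers) {
  cudaEvent_t ready;
  // Timing is not needed for pure ordering, so disable it for lower overhead.
  cudaEventCreateWithFlags(&ready, cudaEventDisableTiming);

  // Record the event after the producer's last enqueued operation.
  cudaEventRecord(ready, producer);

  // Each consumer stream waits on the event; the host does not block.
  for (int i = 0; i < n_consumers; ++i)
    cudaStreamWaitEvent(consumers[i], ready, 0);

  // Safe even with pending waits: CUDA defers the release until completion.
  cudaEventDestroy(ready);
}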
@@ -363,6 +367,15 @@ template <typename Torus> struct int_radix_lut {
      buffer.push_back(gpu_pbs_buffer);
    }

    // if(active_gpu_count > 1){
    event_scatter_in = cuda_create_event(gpu_indexes[0]);
    event_broadcast = cuda_create_event(gpu_indexes[0]);

    for (int i = 0; i < active_gpu_count; i++) {
      event_scatter_out[i] = cuda_create_event(gpu_indexes[i]);
    }
    //}

    // Allocate LUT
    // LUT is used as a trivial encryption and must be initialized outside
    // this constructor
@@ -579,7 +592,14 @@ template <typename Torus> struct int_radix_lut {
      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
      buffer.push_back(gpu_pbs_buffer);
    }
    // if(active_gpu_count > 1){
    event_scatter_in = cuda_create_event(gpu_indexes[0]);
    event_broadcast = cuda_create_event(gpu_indexes[0]);

    for (int i = 0; i < active_gpu_count; i++) {
      event_scatter_out[i] = cuda_create_event(gpu_indexes[i]);
    }
    //}
    // Allocate LUT
    // LUT is used as a trivial encryption and must be initialized outside
    // this constructor
@@ -718,9 +738,10 @@ template <typename Torus> struct int_radix_lut {
    auto src_lut = lut_vec[0];
    auto src_lut_indexes = lut_indexes_vec[0];

    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
    cuda_event_record(event_broadcast, streams[0], gpu_indexes[0]);
    for (uint i = 0; i < active_gpu_count; i++) {
      if (gpu_indexes[i] != gpu_indexes[0]) {
        cuda_stream_wait_event(streams[i], event_broadcast, gpu_indexes[i]);
        auto dst_lut = lut_vec[i];
        auto dst_lut_indexes = lut_indexes_vec[i];
        cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
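Editor's note: in broadcast_lut the blocking cuda_synchronize_stream on GPU 0 is replaced by recording event_broadcast on GPU 0's stream and making every other GPU's stream wait on it before the device-to-device copy. A minimal standalone sketch of that broadcast shape with the plain CUDA runtime (buffer names are made up; it assumes stream i lives on device i and peer access is available):

#include <cuda_runtime.h>
#include <vector>

// Broadcast a source buffer on device 0 to a copy on each other device,
// ordering the copies with an event instead of a host-side synchronize.
void broadcast_async(const void *src_dev0, std::vector<void *> &dst,
                     std::vector<cudaStream_t> &streams, size_t bytes) {
  cudaEvent_t src_ready;
  cudaSetDevice(0);
  cudaEventCreateWithFlags(&src_ready, cudaEventDisableTiming);
  cudaEventRecord(src_ready, streams[0]); // source data is ready past this point

  for (size_t i = 1; i < streams.size(); ++i) {
    cudaSetDevice((int)i);
    // The copy on stream i starts only once the event recorded on stream 0 fires.
    cudaStreamWaitEvent(streams[i], src_ready, 0);
    cudaMemcpyPeerAsync(dst[i], (int)i, src_dev0, 0, bytes, streams[i]);
  }
  cudaEventDestroy(src_ready);
}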
@@ -745,7 +766,13 @@ template <typename Torus> struct int_radix_lut {
      cuda_drop_with_size_tracking_async(lut_indexes_vec[i], streams[i],
                                         gpu_indexes[i], gpu_memory_allocated);
    }

    // if( active_gpu_count > 1) {
    for (uint i = 0; i < active_gpu_count; i++) {
      cuda_event_destroy(event_scatter_out[i], gpu_indexes[i]);
    }
    cuda_event_destroy(event_scatter_in, gpu_indexes[0]);
    cuda_event_destroy(event_broadcast, gpu_indexes[0]);
    //}
    cuda_drop_with_size_tracking_async(lwe_indexes_in, streams[0],
                                       gpu_indexes[0], gpu_memory_allocated);
    cuda_drop_with_size_tracking_async(lwe_indexes_out, streams[0],
@@ -1614,6 +1641,7 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
    this->allocated_luts_message_carry = false;
    this->reduce_degrees_for_single_carry_propagation =
        reduce_degrees_for_single_carry_propagation;

    setup_index_buffers(streams, gpu_indexes, size_tracker);
    // because we setup_lut in host function for sum_ciphertexts to save memory
    // the size_tracker is topped up here to have a max bound on the used memory
@@ -1661,6 +1689,7 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
    this->current_blocks = current_blocks;
    this->small_lwe_vector = small_lwe_vector;
    this->luts_message_carry = reused_lut;

    setup_index_buffers(streams, gpu_indexes, size_tracker);
  }

@@ -5910,6 +5939,9 @@ template <typename Torus> struct int_count_of_consecutive_bits_buffer {
    delete ct_prepared;
    ct_prepared = nullptr;

    propagate_mem->release(streams, gpu_indexes, gpu_count);
    delete propagate_mem;

    prepare_mem->release(streams, gpu_indexes, gpu_count);
    delete prepare_mem;
    prepare_mem = nullptr;

@@ -365,13 +365,13 @@ host_integer_decompress(cudaStream_t const *streams,
    /// Make sure all data that should be on GPU 0 is indeed there
    cuda_synchronize_stream(streams[0], gpu_indexes[0]);

    /// With multiple GPUs we push to the vectors on each GPU then when we
    /// gather data to GPU 0 we can copy back to the original indexing
    multi_gpu_scatter_lwe_async<Torus>(
        streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
        extracted_lwe, lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes,
        lut->active_gpu_count, num_blocks_to_decompress,
        compression_params.small_lwe_dimension + 1);
    /// With multiple GPUs we push to the vectors on each GPU then when we
    /// gather data to GPU 0 we can copy back to the original indexing
    multi_gpu_scatter_lwe_async<Torus>(
        streams, gpu_indexes, active_gpu_count, lwe_array_in_vec, extracted_lwe,
        lut->lwe_indexes_in, lut->using_trivial_lwe_indexes,
        lut->active_gpu_count, num_blocks_to_decompress,
        compression_params.small_lwe_dimension + 1);

    /// Apply PBS
    execute_pbs_async<Torus, Torus>(
@@ -385,12 +385,11 @@ host_integer_decompress(cudaStream_t const *streams,
        num_blocks_to_decompress, encryption_params.pbs_type, num_many_lut,
        lut_stride);

    /// Copy data back to GPU 0 and release vecs
    multi_gpu_gather_lwe_async<Torus>(
        streams, gpu_indexes, active_gpu_count, (Torus *)d_lwe_array_out->ptr,
        lwe_after_pbs_vec, lut->h_lwe_indexes_out,
        lut->using_trivial_lwe_indexes, num_blocks_to_decompress,
        encryption_params.big_lwe_dimension + 1);
    /// Copy data back to GPU 0 and release vecs
    multi_gpu_gather_lwe_async<Torus>(
        streams, gpu_indexes, active_gpu_count, (Torus *)d_lwe_array_out->ptr,
        lwe_after_pbs_vec, lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
        num_blocks_to_decompress, encryption_params.big_lwe_dimension + 1);

    /// Synchronize all GPUs
    for (uint i = 0; i < active_gpu_count; i++) {
@@ -567,16 +567,22 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
        grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride);
  } else {
    /// Make sure all data that should be on GPU 0 is indeed there
    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
    // cuda_synchronize_stream(streams[0], gpu_indexes[0]);

    cuda_event_record(lut->event_scatter_in, streams[0], gpu_indexes[0]);
    for (int j = 1; j < active_gpu_count; j++) {
      cuda_stream_wait_event(streams[j], lut->event_scatter_in, gpu_indexes[j]);
    }

    /// With multiple GPUs we push to the vectors on each GPU then when we
    /// gather data to GPU 0 we can copy back to the original indexing
    PUSH_RANGE("scatter")
    multi_gpu_scatter_lwe_async<Torus>(
        streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
        (Torus *)lwe_array_in->ptr, lut->h_lwe_indexes_in,
        (Torus *)lwe_array_in->ptr, lut->lwe_indexes_in,
        lut->using_trivial_lwe_indexes, lut->active_gpu_count, num_radix_blocks,
        big_lwe_dimension + 1);

    POP_RANGE()
    /// Apply KS to go from a big LWE dimension to a small LWE dimension
    execute_keyswitch_async<Torus>(streams, gpu_indexes, active_gpu_count,
                                   lwe_after_ks_vec, lwe_trivial_indexes_vec,
@@ -595,16 +601,26 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
        num_many_lut, lut_stride);

    /// Copy data back to GPU 0 and release vecs
    multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
                                      (Torus *)lwe_array_out->ptr,
                                      lwe_after_pbs_vec, lut->h_lwe_indexes_out,
                                      lut->using_trivial_lwe_indexes,
                                      num_radix_blocks, big_lwe_dimension + 1);

    /// Synchronize all GPUs
    for (uint i = 0; i < active_gpu_count; i++) {
      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
    PUSH_RANGE("gather")
    multi_gpu_gather_lwe_async<Torus>(
        streams, gpu_indexes, active_gpu_count, (Torus *)lwe_array_out->ptr,
        lwe_after_pbs_vec, lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
        num_radix_blocks, big_lwe_dimension + 1);
    POP_RANGE()
    // other gpus record their events
    for (int j = 1; j < active_gpu_count; j++) {
      cuda_event_record(lut->event_scatter_out[j], streams[j], gpu_indexes[j]);
    }
    // GPU 0 waits for all
    for (int j = 1; j < active_gpu_count; j++) {
      cuda_stream_wait_event(streams[0], lut->event_scatter_out[j],
                             gpu_indexes[0]);
    }

    // /// Synchronize all GPUs
    // for (uint i = 0; i < active_gpu_count; i++) {
    //   cuda_synchronize_stream(streams[i], gpu_indexes[i]);
    // }
  }
  for (uint i = 0; i < num_radix_blocks; i++) {
    auto degrees_index = lut->h_lut_indexes[i];
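Editor's note: the apply path now uses a fan-out/fan-in scheme: GPU 0 records event_scatter_in and the other GPUs' streams wait on it before the scatter, then each GPU records event_scatter_out[j] after the gather and GPU 0's stream waits on all of them, so later work queued on stream 0 is ordered without a host-side synchronize. A condensed sketch of that dependency shape with plain CUDA runtime calls (all names are illustrative, and the events are assumed to be created elsewhere):

#include <cuda_runtime.h>

// Illustrative fan-out/fan-in: stream 0 produces input, streams 1..n-1 consume
// it and produce partial results, and stream 0 only continues once all of them
// have signalled completion. No cudaStreamSynchronize is needed on the host.
void fan_out_fan_in(cudaStream_t *streams, cudaEvent_t in_ready,
                    cudaEvent_t *out_done, int n) {
  // Fan-out: input prepared on stream 0 becomes visible to the workers.
  cudaEventRecord(in_ready, streams[0]);
  for (int j = 1; j < n; ++j)
    cudaStreamWaitEvent(streams[j], in_ready, 0);

  // ... per-GPU work (scatter, keyswitch, PBS, gather) is enqueued here ...

  // Fan-in: every worker signals its stream, and stream 0 waits for all of them.
  for (int j = 1; j < n; ++j)
    cudaEventRecord(out_done[j], streams[j]);
  for (int j = 1; j < n; ++j)
    cudaStreamWaitEvent(streams[0], out_done[j], 0);
}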
@@ -674,16 +690,20 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
        grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride);
  } else {
    /// Make sure all data that should be on GPU 0 is indeed there
    cuda_synchronize_stream(streams[0], gpu_indexes[0]);

    // cuda_synchronize_stream(streams[0], gpu_indexes[0]);
    cuda_event_record(lut->event_scatter_in, streams[0], gpu_indexes[0]);
    for (int j = 1; j < active_gpu_count; j++) {
      cuda_stream_wait_event(streams[j], lut->event_scatter_in, gpu_indexes[j]);
    }
    /// With multiple GPUs we push to the vectors on each GPU then when we
    /// gather data to GPU 0 we can copy back to the original indexing
    PUSH_RANGE("scatter")
    multi_gpu_scatter_lwe_async<Torus>(
        streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
        (Torus *)lwe_array_in->ptr, lut->h_lwe_indexes_in,
        (Torus *)lwe_array_in->ptr, lut->lwe_indexes_in,
        lut->using_trivial_lwe_indexes, lut->active_gpu_count, num_radix_blocks,
        big_lwe_dimension + 1);

    POP_RANGE()
    /// Apply KS to go from a big LWE dimension to a small LWE dimension
    execute_keyswitch_async<Torus>(streams, gpu_indexes, active_gpu_count,
                                   lwe_after_ks_vec, lwe_trivial_indexes_vec,
@@ -702,16 +722,27 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
        num_many_lut, lut_stride);

    /// Copy data back to GPU 0 and release vecs
    PUSH_RANGE("gather")
    multi_gpu_gather_many_lut_lwe_async<Torus>(
        streams, gpu_indexes, active_gpu_count, (Torus *)lwe_array_out->ptr,
        lwe_after_pbs_vec, lut->h_lwe_indexes_out,
        lut->using_trivial_lwe_indexes, num_radix_blocks, big_lwe_dimension + 1,
        num_many_lut);
    POP_RANGE()

    /// Synchronize all GPUs
    for (uint i = 0; i < active_gpu_count; i++) {
      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
    // other gpus record their events
    for (int j = 1; j < active_gpu_count; j++) {
      cuda_event_record(lut->event_scatter_out[j], streams[j], gpu_indexes[j]);
    }
    // GPU 0 waits for all
    for (int j = 1; j < active_gpu_count; j++) {
      cuda_stream_wait_event(streams[0], lut->event_scatter_out[j],
                             gpu_indexes[0]);
    }
    // /// Synchronize all GPUs
    // for (uint i = 0; i < active_gpu_count; i++) {
    //   cuda_synchronize_stream(streams[i], gpu_indexes[i]);
    // }
  }
  for (uint i = 0; i < lwe_array_out->num_radix_blocks; i++) {
    auto degrees_index = lut->h_lut_indexes[i % lut->num_blocks];
@@ -795,13 +826,18 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
        small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
        grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride);
  } else {
    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
    // cuda_synchronize_stream(streams[0], gpu_indexes[0]);
    cuda_event_record(lut->event_scatter_in, streams[0], gpu_indexes[0]);
    for (int j = 1; j < active_gpu_count; j++) {
      cuda_stream_wait_event(streams[j], lut->event_scatter_in, gpu_indexes[j]);
    }
    PUSH_RANGE("scatter")
    multi_gpu_scatter_lwe_async<Torus>(
        streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
        (Torus *)lwe_array_pbs_in->ptr, lut->h_lwe_indexes_in,
        (Torus *)lwe_array_pbs_in->ptr, lut->lwe_indexes_in,
        lut->using_trivial_lwe_indexes, lut->active_gpu_count, num_radix_blocks,
        big_lwe_dimension + 1);

    POP_RANGE()
    /// Apply KS to go from a big LWE dimension to a small LWE dimension
    execute_keyswitch_async<Torus>(streams, gpu_indexes, active_gpu_count,
                                   lwe_after_ks_vec, lwe_trivial_indexes_vec,
@@ -820,16 +856,26 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
        num_many_lut, lut_stride);

    /// Copy data back to GPU 0 and release vecs
    multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
                                      (Torus *)(lwe_array_out->ptr),
                                      lwe_after_pbs_vec, lut->h_lwe_indexes_out,
                                      lut->using_trivial_lwe_indexes,
                                      num_radix_blocks, big_lwe_dimension + 1);

    /// Synchronize all GPUs
    for (uint i = 0; i < active_gpu_count; i++) {
      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
    PUSH_RANGE("gather")
    multi_gpu_gather_lwe_async<Torus>(
        streams, gpu_indexes, active_gpu_count, (Torus *)(lwe_array_out->ptr),
        lwe_after_pbs_vec, lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
        num_radix_blocks, big_lwe_dimension + 1);
    POP_RANGE()
    // other gpus record their events
    for (int j = 1; j < active_gpu_count; j++) {
      cuda_event_record(lut->event_scatter_out[j], streams[j], gpu_indexes[j]);
    }
    // GPU 0 waits for all
    for (int j = 1; j < active_gpu_count; j++) {
      cuda_stream_wait_event(streams[0], lut->event_scatter_out[j],
                             gpu_indexes[0]);
    }

    // /// Synchronize all GPUs
    // for (uint i = 0; i < active_gpu_count; i++) {
    //   cuda_synchronize_stream(streams[i], gpu_indexes[i]);
    // }
  }
  for (uint i = 0; i < num_radix_blocks; i++) {
    auto degrees_index = lut->h_lut_indexes[i];
@@ -1000,7 +1046,6 @@ void generate_device_accumulator_no_encoding(
  cuda_memcpy_with_size_tracking_async_to_gpu(
      acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
      stream, gpu_index, gpu_memory_allocated);

  cuda_synchronize_stream(stream, gpu_index);
  free(h_lut);
}
@@ -1104,8 +1149,8 @@ void generate_device_accumulator_bivariate_with_factor(
      h_lut, glwe_dimension, polynomial_size, message_modulus, carry_modulus, f,
      factor);

  cuda_synchronize_stream(stream, gpu_index);
  // copy host lut and lut_indexes_vec to device
  // cuda_synchronize_stream(stream, gpu_index);
  // copy host lut and lut_indexes_vec to device
  cuda_memcpy_with_size_tracking_async_to_gpu(
      acc_bivariate, h_lut,
      (glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream, gpu_index,
@@ -1137,7 +1182,6 @@ void generate_device_accumulator_with_encoding(
  cuda_memcpy_with_size_tracking_async_to_gpu(
      acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
      stream, gpu_index, gpu_memory_allocated);

  cuda_synchronize_stream(stream, gpu_index);
  free(h_lut);
}
@@ -2363,7 +2407,7 @@ __host__ void integer_radix_apply_noise_squashing_kb(
    /// gather data to GPU 0 we can copy back to the original indexing
    multi_gpu_scatter_lwe_async<InputTorus>(
        streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
        (InputTorus *)lwe_array_pbs_in->ptr, lut->h_lwe_indexes_in,
        (InputTorus *)lwe_array_pbs_in->ptr, lut->lwe_indexes_in,
        lut->using_trivial_lwe_indexes, lut->active_gpu_count,
        lwe_array_out->num_radix_blocks, lut->input_big_lwe_dimension + 1);

@@ -415,29 +415,6 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
          total_ciphertexts, mem_ptr->params.pbs_type, num_many_lut,
          lut_stride);
    } else {
      Torus *h_lwe_indexes_in_pinned;
      Torus *h_lwe_indexes_out_pinned;
      cudaMallocHost((void **)&h_lwe_indexes_in_pinned,
                     total_ciphertexts * sizeof(Torus));
      cudaMallocHost((void **)&h_lwe_indexes_out_pinned,
                     total_ciphertexts * sizeof(Torus));
      for (uint32_t i = 0; i < total_ciphertexts; i++) {
        h_lwe_indexes_in_pinned[i] = luts_message_carry->h_lwe_indexes_in[i];
        h_lwe_indexes_out_pinned[i] = luts_message_carry->h_lwe_indexes_out[i];
      }
      cuda_memcpy_async_to_cpu(
          h_lwe_indexes_in_pinned, luts_message_carry->lwe_indexes_in,
          total_ciphertexts * sizeof(Torus), streams[0], gpu_indexes[0]);
      cuda_memcpy_async_to_cpu(
          h_lwe_indexes_out_pinned, luts_message_carry->lwe_indexes_out,
          total_ciphertexts * sizeof(Torus), streams[0], gpu_indexes[0]);
      cuda_synchronize_stream(streams[0], gpu_indexes[0]);
      for (uint32_t i = 0; i < total_ciphertexts; i++) {
        luts_message_carry->h_lwe_indexes_in[i] = h_lwe_indexes_in_pinned[i];
        luts_message_carry->h_lwe_indexes_out[i] = h_lwe_indexes_out_pinned[i];
      }
      cudaFreeHost(h_lwe_indexes_in_pinned);
      cudaFreeHost(h_lwe_indexes_out_pinned);

      luts_message_carry->broadcast_lut(streams, gpu_indexes);
      luts_message_carry->using_trivial_lwe_indexes = false;
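Editor's note: the block removed in this hunk staged device-side index arrays through pinned host buffers before broadcasting the LUT; under the new scheme the indexes stay on the device. For context, a minimal sketch of the pinned-staging pattern being dropped, using the plain CUDA runtime instead of the repo's cuda_memcpy_async_to_cpu wrapper (types, sizes and names are illustrative):

#include <cuda_runtime.h>
#include <cstdint>

// Copy a device array to the host through pinned (page-locked) memory so the
// device-to-host transfer can run asynchronously on the given stream.
void stage_indexes_to_host(const uint64_t *d_indexes, uint64_t *h_out,
                           size_t n, cudaStream_t stream) {
  uint64_t *h_pinned = nullptr;
  cudaMallocHost((void **)&h_pinned, n * sizeof(uint64_t)); // pinned staging buffer

  cudaMemcpyAsync(h_pinned, d_indexes, n * sizeof(uint64_t),
                  cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream); // the host must wait before reading h_pinned

  for (size_t i = 0; i < n; ++i)
    h_out[i] = h_pinned[i];

  cudaFreeHost(h_pinned);
}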
@@ -491,29 +468,6 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
          lut_stride);
    } else {
      uint32_t num_blocks_in_apply_lut = 2 * num_radix_blocks;
      Torus *h_lwe_indexes_in_pinned;
      Torus *h_lwe_indexes_out_pinned;
      cudaMallocHost((void **)&h_lwe_indexes_in_pinned,
                     num_blocks_in_apply_lut * sizeof(Torus));
      cudaMallocHost((void **)&h_lwe_indexes_out_pinned,
                     num_blocks_in_apply_lut * sizeof(Torus));
      for (uint32_t i = 0; i < num_blocks_in_apply_lut; i++) {
        h_lwe_indexes_in_pinned[i] = luts_message_carry->h_lwe_indexes_in[i];
        h_lwe_indexes_out_pinned[i] = luts_message_carry->h_lwe_indexes_out[i];
      }
      cuda_memcpy_async_to_cpu(
          h_lwe_indexes_in_pinned, luts_message_carry->lwe_indexes_in,
          num_blocks_in_apply_lut * sizeof(Torus), streams[0], gpu_indexes[0]);
      cuda_memcpy_async_to_cpu(
          h_lwe_indexes_out_pinned, luts_message_carry->lwe_indexes_out,
          num_blocks_in_apply_lut * sizeof(Torus), streams[0], gpu_indexes[0]);
      cuda_synchronize_stream(streams[0], gpu_indexes[0]);
      for (uint32_t i = 0; i < num_blocks_in_apply_lut; i++) {
        luts_message_carry->h_lwe_indexes_in[i] = h_lwe_indexes_in_pinned[i];
        luts_message_carry->h_lwe_indexes_out[i] = h_lwe_indexes_out_pinned[i];
      }
      cudaFreeHost(h_lwe_indexes_in_pinned);
      cudaFreeHost(h_lwe_indexes_out_pinned);

      luts_message_carry->broadcast_lut(streams, gpu_indexes);
      luts_message_carry->using_trivial_lwe_indexes = false;

@@ -56,7 +56,7 @@ void host_integer_grouped_oprf(

  multi_gpu_scatter_lwe_async<Torus>(
      streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
      seeded_lwe_input, lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes,
      seeded_lwe_input, lut->lwe_indexes_in, lut->using_trivial_lwe_indexes,
      active_gpu_count, num_blocks_to_process,
      mem_ptr->params.small_lwe_dimension + 1);

@@ -72,9 +72,8 @@ void host_integer_grouped_oprf(

  multi_gpu_gather_lwe_async<Torus>(
      streams, gpu_indexes, active_gpu_count, (Torus *)radix_lwe_out->ptr,
      lwe_after_pbs_vec, lut->h_lwe_indexes_out,
      lut->using_trivial_lwe_indexes, num_blocks_to_process,
      mem_ptr->params.big_lwe_dimension + 1);
      lwe_after_pbs_vec, lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
      num_blocks_to_process, mem_ptr->params.big_lwe_dimension + 1);

  for (uint32_t i = 0; i < active_gpu_count; i++) {
    cuda_synchronize_stream(streams[i], gpu_indexes[i]);

@@ -93,6 +93,30 @@ void multi_gpu_alloc_lwe_many_lut_output_async(
    }
  }

// Each block handles one lwe
template <typename Torus>
__global__ void pack_data(Torus *d_packed_vector, Torus const *d_vector,
                          Torus const *d_indexes, int lwe_size) {

  int output_offset = blockIdx.x * lwe_size;
  int input_offset = d_indexes[blockIdx.x] * lwe_size;
  for (int ind = threadIdx.x; ind < lwe_size; ind += blockDim.x) {
    d_packed_vector[ind + output_offset] = d_vector[ind + input_offset];
  }
}

// Each block handles one lwe
template <typename Torus>
__global__ void unpack_data(Torus *d_vector, Torus const *d_packed_vector,
                            Torus const *d_indexes, int lwe_size) {

  int input_offset = blockIdx.x * lwe_size;
  int output_offset = d_indexes[blockIdx.x] * lwe_size;
  for (int ind = threadIdx.x; ind < lwe_size; ind += blockDim.x) {
    d_vector[ind + output_offset] = d_packed_vector[ind + input_offset];
  }
}

/// Load an array residing on one GPU to all active gpus
/// and split the array among them.
/// The input indexing logic is given by an index array.
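Editor's note: pack_data gathers the LWEs selected by a device-side index array into one contiguous buffer, so the scatter can issue a single large copy per GPU instead of one small copy per LWE; unpack_data does the inverse on the gather side. A hedged usage sketch of that idea, with the kernel specialized to uint64_t and all surrounding names invented for illustration (it assumes peer access or UVA between the two devices):

#include <cuda_runtime.h>
#include <cstdint>

// Same idea as pack_data above, specialized to uint64_t for the sketch:
// block b copies LWE number d_indexes[b] into slot b of the packed buffer.
__global__ void pack_u64(uint64_t *d_packed, const uint64_t *d_src,
                         const uint64_t *d_indexes, int lwe_size) {
  size_t out = (size_t)blockIdx.x * lwe_size;
  size_t in = (size_t)d_indexes[blockIdx.x] * lwe_size;
  for (int k = threadIdx.x; k < lwe_size; k += blockDim.x)
    d_packed[out + k] = d_src[in + k];
}

// Pack n LWEs on the source device, then move them in one contiguous copy.
void pack_and_send(uint64_t *d_dst_on_peer, const uint64_t *d_src,
                   const uint64_t *d_indexes, int n, int lwe_size,
                   cudaStream_t src_stream) {
  uint64_t *d_packed = nullptr;
  size_t bytes = (size_t)n * lwe_size * sizeof(uint64_t);
  cudaMallocAsync((void **)&d_packed, bytes, src_stream);

  pack_u64<<<n, 1024, 0, src_stream>>>(d_packed, d_src, d_indexes, lwe_size);

  // One large copy replaces n small per-LWE copies.
  cudaMemcpyAsync(d_dst_on_peer, d_packed, bytes, cudaMemcpyDeviceToDevice,
                  src_stream);
  cudaFreeAsync(d_packed, src_stream);
}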
@@ -102,7 +126,7 @@ template <typename Torus>
void multi_gpu_scatter_lwe_async(cudaStream_t const *streams,
                                 uint32_t const *gpu_indexes,
                                 uint32_t gpu_count, std::vector<Torus *> &dest,
                                 Torus const *src, Torus const *h_src_indexes,
                                 Torus const *src, Torus const *d_src_indexes,
                                 bool is_trivial_index,
                                 uint32_t max_active_gpu_count,
                                 uint32_t num_inputs, uint32_t lwe_size) {
@@ -110,7 +134,7 @@ void multi_gpu_scatter_lwe_async(cudaStream_t const *streams,
  if (max_active_gpu_count < gpu_count)
    PANIC("Cuda error: number of gpus in scatter should be <= number of gpus "
          "used to create the lut")
  cuda_synchronize_stream(streams[0], gpu_indexes[0]);
  // cuda_synchronize_stream(streams[0], gpu_indexes[0]);
  dest.resize(gpu_count);
  for (uint i = 0; i < gpu_count; i++) {
    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
@@ -127,18 +151,28 @@ void multi_gpu_scatter_lwe_async(cudaStream_t const *streams,
          gpu_indexes[i], true);

    } else {
      if (h_src_indexes == nullptr)
      if (d_src_indexes == nullptr)
        PANIC("Cuda error: source indexes should be initialized!");
      auto src_indexes = h_src_indexes + gpu_offset;
      Torus *d_packed_vector = (Torus *)cuda_malloc_async(
          inputs_on_gpu * lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]);

      for (uint j = 0; j < inputs_on_gpu; j++) {
        auto d_dest = dest[i] + j * lwe_size;
        auto d_src = src + src_indexes[j] * lwe_size;
      cudaEvent_t temp_event2 = cuda_create_event(gpu_indexes[0]);

        cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
            d_dest, d_src, lwe_size * sizeof(Torus), streams[i], gpu_indexes[i],
            true);
      }
      pack_data<Torus><<<inputs_on_gpu, 1024, 0, streams[0]>>>(
          d_packed_vector, (Torus *)src, (Torus *)d_src_indexes + gpu_offset,
          lwe_size);
      check_cuda_error(cudaGetLastError());
      cuda_event_record(temp_event2, streams[0], gpu_indexes[0]);
      cuda_stream_wait_event(streams[i], temp_event2, gpu_indexes[i]);

      cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
          dest[i], d_packed_vector, inputs_on_gpu * lwe_size * sizeof(Torus),
          streams[i], gpu_indexes[i], true);

      cudaEvent_t temp_event = cuda_create_event(gpu_indexes[i]);
      cuda_event_record(temp_event, streams[i], gpu_indexes[i]);
      cuda_stream_wait_event(streams[0], temp_event, gpu_indexes[0]);
      cuda_drop_async(d_packed_vector, streams[0], gpu_indexes[0]);
    }
  }
}
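Editor's note: inside the non-trivial-index branch the diff orders three steps across two streams: the pack kernel runs on GPU 0's stream, the large copy runs on GPU i's stream only after an event recorded behind the kernel, and the packed buffer is released on GPU 0's stream only after an event recorded behind the copy. A minimal sketch of that handshake with plain CUDA runtime calls (names are illustrative; cuda_malloc_async / cuda_drop_async above are the repo's wrappers, and peer access between the devices is assumed):

#include <cuda_runtime.h>

// Order: produce on stream_src -> consume on stream_dst -> release on stream_src,
// with no host-side synchronization in between.
void handshake(cudaStream_t stream_src, cudaStream_t stream_dst,
               void *d_staging, void *d_dst, size_t bytes) {
  cudaEvent_t packed_ready, copy_done;
  cudaEventCreateWithFlags(&packed_ready, cudaEventDisableTiming);
  cudaEventCreateWithFlags(&copy_done, cudaEventDisableTiming);

  // ... pack kernel enqueued on stream_src writes d_staging here ...
  cudaEventRecord(packed_ready, stream_src);

  // The consumer copies only after the staging buffer is fully written.
  cudaStreamWaitEvent(stream_dst, packed_ready, 0);
  cudaMemcpyAsync(d_dst, d_staging, bytes, cudaMemcpyDeviceToDevice, stream_dst);
  cudaEventRecord(copy_done, stream_dst);

  // The producer frees the staging buffer only after the copy has consumed it.
  cudaStreamWaitEvent(stream_src, copy_done, 0);
  cudaFreeAsync(d_staging, stream_src);

  cudaEventDestroy(packed_ready);
  cudaEventDestroy(copy_done);
}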
@@ -150,7 +184,7 @@ template <typename Torus>
void multi_gpu_gather_lwe_async(cudaStream_t const *streams,
                                uint32_t const *gpu_indexes, uint32_t gpu_count,
                                Torus *dest, const std::vector<Torus *> &src,
                                Torus *h_dest_indexes, bool is_trivial_index,
                                Torus *d_dest_indexes, bool is_trivial_index,
                                uint32_t num_inputs, uint32_t lwe_size) {

  for (uint i = 0; i < gpu_count; i++) {
@@ -168,19 +202,30 @@ void multi_gpu_gather_lwe_async(cudaStream_t const *streams,
          d_dest, d_src, inputs_on_gpu * lwe_size * sizeof(Torus), streams[i],
          gpu_indexes[i], true);
    } else {
      if (h_dest_indexes == nullptr)
      if (d_dest_indexes == nullptr)
        PANIC("Cuda error: destination indexes should be initialized!");

      auto dest_indexes = h_dest_indexes + gpu_offset;
      Torus *d_packed_vector = (Torus *)cuda_malloc_async(
          inputs_on_gpu * lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]);
      cudaEvent_t temp_event2 = cuda_create_event(gpu_indexes[0]);

      for (uint j = 0; j < inputs_on_gpu; j++) {
        auto d_dest = dest + dest_indexes[j] * lwe_size;
        auto d_src = src[i] + j * lwe_size;
      cuda_event_record(temp_event2, streams[0], gpu_indexes[0]);
      cuda_stream_wait_event(streams[i], temp_event2, gpu_indexes[i]);

        cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
            d_dest, d_src, lwe_size * sizeof(Torus), streams[i], gpu_indexes[i],
            true);
      }
      cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
          d_packed_vector, src[i], inputs_on_gpu * lwe_size * sizeof(Torus),
          streams[i], gpu_indexes[i], true);
      cudaEvent_t temp_event3 = cuda_create_event(gpu_indexes[i]);

      cuda_event_record(temp_event3, streams[i], gpu_indexes[i]);
      cuda_stream_wait_event(streams[0], temp_event3, gpu_indexes[0]);

      unpack_data<Torus><<<inputs_on_gpu, 1024, 0, streams[0]>>>(
          dest, d_packed_vector, (Torus *)d_dest_indexes + gpu_offset,
          lwe_size);
      check_cuda_error(cudaGetLastError());

      cuda_drop_async(d_packed_vector, streams[0], gpu_indexes[0]);
    }
  }
}