mirror of
https://github.com/zama-ai/tfhe-rs.git
synced 2026-01-11 15:48:20 -05:00
Compare commits
14 Commits
Workflows-
...
al/gpu_cou
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ba99b2feda | ||
|
|
b0dfd5609c | ||
|
|
5083efcdc9 | ||
|
|
2d676fe0cb | ||
|
|
ec18468ab4 | ||
|
|
1541281769 | ||
|
|
7e15535bda | ||
|
|
35ee34da1b | ||
|
|
589528ffd4 | ||
|
|
dfcc37ec98 | ||
|
|
0e56b47903 | ||
|
|
9fac4b81d1 | ||
|
|
c26637e6fa | ||
|
|
3bf603ae1e |
@@ -305,6 +305,7 @@ template <typename Torus> struct int_radix_lut {
|
||||
///////////////
|
||||
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
printf("Active GPUs in int_radix_lut: %d, gpu count: %d\n", active_gpu_count, gpu_count);
|
||||
for (uint i = 0; i < active_gpu_count; i++) {
|
||||
cuda_set_device(i);
|
||||
int8_t *gpu_pbs_buffer;
|
||||
@@ -447,6 +448,7 @@ template <typename Torus> struct int_radix_lut {
|
||||
// LUT is used as a trivial encryption and must be initialized outside
|
||||
// this constructor
|
||||
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
|
||||
printf("Active GPUs in int_radix_lut 1: %d, gpu count: %d\n", active_gpu_count, gpu_count);
|
||||
for (uint i = 0; i < active_gpu_count; i++) {
|
||||
auto lut = (Torus *)cuda_malloc_with_size_tracking_async(
|
||||
num_luts * lut_buffer_size, streams[i], gpu_indexes[i], size_tracker,
|
||||
@@ -520,6 +522,7 @@ template <typename Torus> struct int_radix_lut {
|
||||
|
||||
///////////////
|
||||
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
|
||||
printf("Active GPUs in int_radix_lut 2: %d, gpu count: %d\n", active_gpu_count, gpu_count);
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
for (uint i = 0; i < active_gpu_count; i++) {
|
||||
cuda_set_device(i);
|
||||
@@ -827,6 +830,7 @@ template <typename InputTorus> struct int_noise_squashing_lut {
|
||||
|
||||
///////////////
|
||||
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
|
||||
printf("Active GPUs in int_noise_squash_lut 1: %d, gpu count: %d\n", active_gpu_count, gpu_count);
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
for (uint i = 0; i < active_gpu_count; i++) {
|
||||
cuda_set_device(i);
|
||||
@@ -2789,6 +2793,7 @@ template <typename Torus> struct int_borrow_prop_memory {
|
||||
}
|
||||
|
||||
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
|
||||
printf("Active GPUs in int_borrow_prop: %d, gpu count: %d\n", active_gpu_count, gpu_count);
|
||||
sub_streams_1 =
|
||||
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
|
||||
sub_streams_2 =
|
||||
@@ -2880,6 +2885,7 @@ template <typename Torus> struct int_zero_out_if_buffer {
|
||||
gpu_memory_allocated = allocate_gpu_memory;
|
||||
this->params = params;
|
||||
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
|
||||
printf("Active GPUs in int_zero_out_if: %d, gpu count: %d\n", active_gpu_count, gpu_count);
|
||||
|
||||
tmp = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
@@ -3285,6 +3291,7 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
|
||||
bool allocate_gpu_memory, uint64_t &size_tracker) {
|
||||
gpu_memory_allocated = allocate_gpu_memory;
|
||||
active_gpu_count = get_active_gpu_count(1, gpu_count);
|
||||
printf("Active GPUs in int_arithmetic_scalar_shift: %d, gpu count: %d\n", active_gpu_count, gpu_count);
|
||||
// In the arithmetic shift, a PBS has to be applied to the last rotated
|
||||
// block twice: once to shift it, once to compute the padding block to be
|
||||
// copied onto all blocks to the left of the last rotated block
|
||||
@@ -3920,6 +3927,7 @@ template <typename Torus> struct int_comparison_buffer {
|
||||
this->is_signed = is_signed;
|
||||
|
||||
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
|
||||
printf("Active GPUs in int_compar: %d, gpu count: %d\n", active_gpu_count, gpu_count);
|
||||
|
||||
identity_lut_f = [](Torus x) -> Torus { return x; };
|
||||
|
||||
@@ -4425,6 +4433,7 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
|
||||
uint64_t &size_tracker) {
|
||||
gpu_memory_allocated = allocate_gpu_memory;
|
||||
active_gpu_count = get_active_gpu_count(2 * num_blocks, gpu_count);
|
||||
printf("Active GPUs in int_div_rem: %d, gpu count: %d\n", active_gpu_count, gpu_count);
|
||||
|
||||
this->params = params;
|
||||
shift_mem_1 = new int_logical_scalar_shift_buffer<Torus>(
|
||||
|
||||
@@ -40,6 +40,7 @@ __host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index,
|
||||
uint32_t num_radix_blocks) {
|
||||
|
||||
cuda_set_device(gpu_index);
|
||||
printf("GPU %d\n", gpu_index);
|
||||
int num_blocks = 0, num_threads = 0;
|
||||
int num_entries = (lwe_dimension + 1);
|
||||
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
|
||||
@@ -218,6 +219,8 @@ __host__ void is_at_least_one_comparisons_block_true(
|
||||
while (remaining_blocks > 0) {
|
||||
// Split in max_value chunks
|
||||
int num_chunks = (remaining_blocks + max_value - 1) / max_value;
|
||||
cudaDeviceSynchronize();
|
||||
printf("Is at least one comparison block true chunks %d\n", num_chunks);
|
||||
|
||||
// Since all blocks encrypt either 0 or 1, we can sum max_value of them
|
||||
// as in the worst case we will be adding `max_value` ones
|
||||
@@ -228,6 +231,10 @@ __host__ void is_at_least_one_comparisons_block_true(
|
||||
for (int i = 0; i < num_chunks; i++) {
|
||||
uint32_t chunk_length =
|
||||
std::min(max_value, begin_remaining_blocks - i * max_value);
|
||||
cudaDeviceSynchronize();
|
||||
printf("chunk length %d, accumulator blocks: %d, input blocks: %d\n", chunk_length,
|
||||
buffer->tmp_block_accumulated->num_radix_blocks,
|
||||
mem_ptr->tmp_lwe_array_out->num_radix_blocks);
|
||||
chunk_lengths[i] = chunk_length;
|
||||
accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], accumulator,
|
||||
input_blocks, big_lwe_dimension,
|
||||
@@ -243,6 +250,8 @@ __host__ void is_at_least_one_comparisons_block_true(
|
||||
|
||||
// Applies the LUT
|
||||
if (remaining_blocks == 1) {
|
||||
cudaDeviceSynchronize();
|
||||
printf("Last lut\n");
|
||||
// In the last iteration we copy the output to the final address
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out,
|
||||
@@ -250,6 +259,8 @@ __host__ void is_at_least_one_comparisons_block_true(
|
||||
lut, 1);
|
||||
return;
|
||||
} else {
|
||||
cudaDeviceSynchronize();
|
||||
printf("lut with %d blocks\n", num_chunks);
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
|
||||
buffer->tmp_block_accumulated, bsks, ksks, ms_noise_reduction_key,
|
||||
@@ -296,6 +307,8 @@ __host__ void host_compare_blocks_with_zero(
|
||||
// Accumulator
|
||||
auto sum = lwe_array_out;
|
||||
|
||||
cudaDeviceSynchronize();
|
||||
printf("Here in compare blocks with zero\n");
|
||||
if (num_radix_blocks == 1) {
|
||||
// Just copy
|
||||
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], sum, 0,
|
||||
@@ -305,10 +318,16 @@ __host__ void host_compare_blocks_with_zero(
|
||||
uint32_t remainder_blocks = num_radix_blocks;
|
||||
auto sum_i = (Torus *)sum->ptr;
|
||||
auto chunk = (Torus *)lwe_array_in->ptr;
|
||||
int blocks_check = sum->num_radix_blocks;
|
||||
cudaDeviceSynchronize();
|
||||
printf("Here in compare blocks with zero sum %d input %d\n", sum->num_radix_blocks, lwe_array_in->num_radix_blocks);
|
||||
while (remainder_blocks > 1) {
|
||||
cudaDeviceSynchronize();
|
||||
printf("Here in compare blocks with zero remainder blocks %d\n", remainder_blocks);
|
||||
uint32_t chunk_size =
|
||||
std::min(remainder_blocks, num_elements_to_fill_carry);
|
||||
|
||||
printf("Chunk size: %d, sum_i blocks: %d, remainder blocks: %d\n", chunk_size, blocks_check, remainder_blocks);
|
||||
accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], sum_i, chunk,
|
||||
big_lwe_dimension, chunk_size);
|
||||
|
||||
@@ -318,8 +337,11 @@ __host__ void host_compare_blocks_with_zero(
|
||||
// Update operands
|
||||
chunk += (chunk_size - 1) * big_lwe_size;
|
||||
sum_i += big_lwe_size;
|
||||
blocks_check -= 1;
|
||||
}
|
||||
}
|
||||
cudaDeviceSynchronize();
|
||||
printf("Here in compare blocks with zero num sum blocks: %d\n", num_sum_blocks);
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out, sum, bsks, ksks,
|
||||
|
||||
@@ -80,6 +80,9 @@ __host__ void host_unsigned_integer_div_rem_kb(
|
||||
set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
|
||||
quotient, 0, num_blocks);
|
||||
|
||||
cudaDeviceSynchronize();
|
||||
printf("Here 0\n");
|
||||
|
||||
for (int i = total_bits - 1; i >= 0; i--) {
|
||||
uint32_t pos_in_block = i % num_bits_in_message;
|
||||
uint32_t msb_bit_set = total_bits - 1 - i;
|
||||
@@ -89,6 +92,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
|
||||
// and all blocks after it are also trivial zeros
|
||||
// This number is in range 1..=num_bocks -1
|
||||
uint32_t first_trivial_block = last_non_trivial_block + 1;
|
||||
printf("num blocks: %d, first trivial block: %d\n", num_blocks, first_trivial_block);
|
||||
reset_radix_ciphertext_blocks(interesting_remainder1, first_trivial_block);
|
||||
reset_radix_ciphertext_blocks(interesting_remainder2, first_trivial_block);
|
||||
reset_radix_ciphertext_blocks(interesting_divisor, first_trivial_block);
|
||||
@@ -243,18 +247,28 @@ __host__ void host_unsigned_integer_div_rem_kb(
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
|
||||
}
|
||||
cudaDeviceSynchronize();
|
||||
printf("Here 1\n");
|
||||
// interesting_divisor
|
||||
trim_last_interesting_divisor_bits(mem_ptr->sub_streams_1, gpu_indexes,
|
||||
gpu_count);
|
||||
cudaDeviceSynchronize();
|
||||
printf("Here 2\n");
|
||||
// divisor_ms_blocks
|
||||
trim_first_divisor_ms_bits(mem_ptr->sub_streams_2, gpu_indexes, gpu_count);
|
||||
// interesting_remainder1
|
||||
// numerator_block_stack
|
||||
cudaDeviceSynchronize();
|
||||
printf("Here 3\n");
|
||||
left_shift_interesting_remainder1(mem_ptr->sub_streams_3, gpu_indexes,
|
||||
gpu_count);
|
||||
cudaDeviceSynchronize();
|
||||
printf("Here 4\n");
|
||||
// interesting_remainder2
|
||||
left_shift_interesting_remainder2(mem_ptr->sub_streams_4, gpu_indexes,
|
||||
gpu_count);
|
||||
cudaDeviceSynchronize();
|
||||
printf("Here 5\n");
|
||||
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
|
||||
@@ -318,6 +332,12 @@ __host__ void host_unsigned_integer_div_rem_kb(
|
||||
subtraction_overflowed, (const CudaRadixCiphertextFFI *)nullptr,
|
||||
mem_ptr->overflow_sub_mem, bsks, ksks, ms_noise_reduction_key,
|
||||
compute_borrow, uses_input_borrow);
|
||||
for (uint i = 0; i < gpu_count; i++) {
|
||||
cuda_set_device(gpu_indexes[i]);
|
||||
cudaDeviceSynchronize();
|
||||
printf("Synchronize gpu %d\n", i);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
};
|
||||
|
||||
// fills:
|
||||
@@ -326,6 +346,13 @@ __host__ void host_unsigned_integer_div_rem_kb(
|
||||
uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count) {
|
||||
auto trivial_blocks = divisor_ms_blocks;
|
||||
printf("Trivial blocks: %d\n", trivial_blocks->num_radix_blocks);
|
||||
for (uint i = 0; i < gpu_count; i++) {
|
||||
cuda_set_device(gpu_indexes[i]);
|
||||
cudaDeviceSynchronize();
|
||||
printf("Synchronize gpu %d\n", i);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
if (trivial_blocks->num_radix_blocks == 0) {
|
||||
set_zero_radix_ciphertext_slice_async<Torus>(
|
||||
streams[0], gpu_indexes[0], at_least_one_upper_block_is_non_zero, 0,
|
||||
@@ -341,6 +368,8 @@ __host__ void host_unsigned_integer_div_rem_kb(
|
||||
trivial_blocks->num_radix_blocks,
|
||||
mem_ptr->comparison_buffer->eq_buffer->is_non_zero_lut);
|
||||
|
||||
cudaDeviceSynchronize();
|
||||
printf("Before is at least one comparisons block true %d\n", mem_ptr->tmp_1->num_radix_blocks);
|
||||
is_at_least_one_comparisons_block_true<Torus>(
|
||||
streams, gpu_indexes, gpu_count,
|
||||
at_least_one_upper_block_is_non_zero, mem_ptr->tmp_1,
|
||||
@@ -370,12 +399,20 @@ __host__ void host_unsigned_integer_div_rem_kb(
|
||||
}
|
||||
// new_remainder
|
||||
// subtraction_overflowed
|
||||
cudaDeviceSynchronize();
|
||||
printf("Here 6 before overlfow sub\n");
|
||||
do_overflowing_sub(mem_ptr->sub_streams_1, gpu_indexes, gpu_count);
|
||||
// at_least_one_upper_block_is_non_zero
|
||||
cudaDeviceSynchronize();
|
||||
printf("Here 7\n");
|
||||
check_divisor_upper_blocks(mem_ptr->sub_streams_2, gpu_indexes, gpu_count);
|
||||
// cleaned_merged_interesting_remainder
|
||||
cudaDeviceSynchronize();
|
||||
printf("Here 8\n");
|
||||
create_clean_version_of_merged_remainder(mem_ptr->sub_streams_3,
|
||||
gpu_indexes, gpu_count);
|
||||
cudaDeviceSynchronize();
|
||||
printf("Here 9\n");
|
||||
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
|
||||
@@ -441,13 +478,21 @@ __host__ void host_unsigned_integer_div_rem_kb(
|
||||
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
|
||||
}
|
||||
// cleaned_merged_interesting_remainder
|
||||
cudaDeviceSynchronize();
|
||||
printf("Here 10\n");
|
||||
conditionally_zero_out_merged_interesting_remainder(mem_ptr->sub_streams_1,
|
||||
gpu_indexes, gpu_count);
|
||||
cudaDeviceSynchronize();
|
||||
printf("Here 11\n");
|
||||
// new_remainder
|
||||
conditionally_zero_out_merged_new_remainder(mem_ptr->sub_streams_2,
|
||||
gpu_indexes, gpu_count);
|
||||
cudaDeviceSynchronize();
|
||||
printf("Here 12\n");
|
||||
// quotient
|
||||
set_quotient_bit(mem_ptr->sub_streams_3, gpu_indexes, gpu_count);
|
||||
cudaDeviceSynchronize();
|
||||
printf("Here 13\n");
|
||||
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
|
||||
@@ -482,10 +527,14 @@ __host__ void host_unsigned_integer_div_rem_kb(
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
|
||||
}
|
||||
cudaDeviceSynchronize();
|
||||
printf("Here 14\n");
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
mem_ptr->sub_streams_1, gpu_indexes, gpu_count, remainder, remainder,
|
||||
bsks, ksks, ms_noise_reduction_key, mem_ptr->message_extract_lut_1,
|
||||
num_blocks);
|
||||
cudaDeviceSynchronize();
|
||||
printf("Here 15\n");
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
mem_ptr->sub_streams_2, gpu_indexes, gpu_count, quotient, quotient, bsks,
|
||||
ksks, ms_noise_reduction_key, mem_ptr->message_extract_lut_2, num_blocks);
|
||||
|
||||
@@ -536,6 +536,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
|
||||
std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
|
||||
|
||||
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
|
||||
printf("Active GPUs in lut univ: %d, gpu count: %d\n", active_gpu_count, gpu_count);
|
||||
if (active_gpu_count == 1) {
|
||||
execute_keyswitch_async<Torus>(
|
||||
streams, gpu_indexes, 1, lwe_after_ks_vec[0],
|
||||
@@ -2114,12 +2115,24 @@ void host_single_borrow_propagate(
|
||||
streams[0], gpu_indexes[0], lwe_array, lwe_array, input_borrow, 1,
|
||||
message_modulus, carry_modulus);
|
||||
}
|
||||
for (uint i = 0; i < gpu_count; i++) {
|
||||
cuda_set_device(gpu_indexes[i]);
|
||||
cudaDeviceSynchronize();
|
||||
printf("Synchronize gpu %d\n", i);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
// Step 1
|
||||
host_compute_shifted_blocks_and_borrow_states<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array,
|
||||
mem->shifted_blocks_borrow_state_mem, bsks, ksks, ms_noise_reduction_key,
|
||||
lut_stride, num_many_lut);
|
||||
|
||||
for (uint i = 0; i < gpu_count; i++) {
|
||||
cuda_set_device(gpu_indexes[i]);
|
||||
cudaDeviceSynchronize();
|
||||
printf("Synchronize gpu %d\n", i);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
auto borrow_states = mem->shifted_blocks_borrow_state_mem->borrow_states;
|
||||
copy_radix_ciphertext_slice_async<Torus>(
|
||||
streams[0], gpu_indexes[0], mem->overflow_block, 0, 1, borrow_states,
|
||||
@@ -2131,6 +2144,12 @@ void host_single_borrow_propagate(
|
||||
mem->prop_simu_group_carries_mem, bsks, ksks, ms_noise_reduction_key,
|
||||
num_radix_blocks, num_groups);
|
||||
|
||||
for (uint i = 0; i < gpu_count; i++) {
|
||||
cuda_set_device(gpu_indexes[i]);
|
||||
cudaDeviceSynchronize();
|
||||
printf("Synchronize gpu %d\n", i);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
auto shifted_blocks =
|
||||
(Torus *)mem->shifted_blocks_borrow_state_mem->shifted_blocks->ptr;
|
||||
auto prepared_blocks = mem->prop_simu_group_carries_mem->prepared_blocks;
|
||||
@@ -2140,10 +2159,22 @@ void host_single_borrow_propagate(
|
||||
(Torus *)prepared_blocks->ptr, shifted_blocks,
|
||||
simulators, big_lwe_dimension, num_radix_blocks);
|
||||
|
||||
for (uint i = 0; i < gpu_count; i++) {
|
||||
cuda_set_device(gpu_indexes[i]);
|
||||
cudaDeviceSynchronize();
|
||||
printf("Synchronize gpu %d\n", i);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
host_integer_radix_add_scalar_one_inplace<Torus>(
|
||||
streams, gpu_indexes, gpu_count, prepared_blocks, message_modulus,
|
||||
carry_modulus);
|
||||
|
||||
for (uint i = 0; i < gpu_count; i++) {
|
||||
cuda_set_device(gpu_indexes[i]);
|
||||
cudaDeviceSynchronize();
|
||||
printf("Synchronize gpu %d\n", i);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
if (compute_overflow == outputFlag::FLAG_OVERFLOW) {
|
||||
CudaRadixCiphertextFFI shifted_simulators;
|
||||
as_radix_ciphertext_slice<Torus>(
|
||||
@@ -2152,6 +2183,12 @@ void host_single_borrow_propagate(
|
||||
host_addition<Torus>(streams[0], gpu_indexes[0], mem->overflow_block,
|
||||
mem->overflow_block, &shifted_simulators, 1);
|
||||
}
|
||||
for (uint i = 0; i < gpu_count; i++) {
|
||||
cuda_set_device(gpu_indexes[i]);
|
||||
cudaDeviceSynchronize();
|
||||
printf("Synchronize gpu %d\n", i);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
CudaRadixCiphertextFFI resolved_borrows;
|
||||
as_radix_ciphertext_slice<Torus>(
|
||||
&resolved_borrows, mem->prop_simu_group_carries_mem->resolved_carries,
|
||||
@@ -2165,49 +2202,60 @@ void host_single_borrow_propagate(
|
||||
mem->overflow_block, &resolved_borrows, 1);
|
||||
}
|
||||
|
||||
cuda_event_record(mem->incoming_events[0], streams[0], gpu_indexes[0]);
|
||||
for (int j = 0; j < mem->active_gpu_count; j++) {
|
||||
cuda_stream_wait_event(mem->sub_streams_1[j], mem->incoming_events[0],
|
||||
gpu_indexes[j]);
|
||||
cuda_stream_wait_event(mem->sub_streams_2[j], mem->incoming_events[0],
|
||||
gpu_indexes[j]);
|
||||
}
|
||||
|
||||
for (uint i = 0; i < gpu_count; i++) {
|
||||
cuda_set_device(gpu_indexes[i]);
|
||||
cudaDeviceSynchronize();
|
||||
printf("Synchronize gpu %d\n", i);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
if (compute_overflow == outputFlag::FLAG_OVERFLOW) {
|
||||
auto borrow_flag = mem->lut_borrow_flag;
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
mem->sub_streams_1, gpu_indexes, gpu_count, overflow_block,
|
||||
streams, gpu_indexes, gpu_count, overflow_block,
|
||||
mem->overflow_block, bsks, ksks, ms_noise_reduction_key, borrow_flag,
|
||||
1);
|
||||
}
|
||||
for (int j = 0; j < mem->active_gpu_count; j++) {
|
||||
cuda_event_record(mem->outgoing_events1[j], mem->sub_streams_1[j],
|
||||
gpu_indexes[j]);
|
||||
}
|
||||
|
||||
for (uint i = 0; i < gpu_count; i++) {
|
||||
cuda_set_device(gpu_indexes[i]);
|
||||
cudaDeviceSynchronize();
|
||||
printf("Synchronize gpu %d\n", i);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
// subtract borrow and cleanup prepared blocks
|
||||
auto resolved_carries = mem->prop_simu_group_carries_mem->resolved_carries;
|
||||
host_negation<Torus>(
|
||||
mem->sub_streams_2[0], gpu_indexes[0], (Torus *)resolved_carries->ptr,
|
||||
streams[0], gpu_indexes[0], (Torus *)resolved_carries->ptr,
|
||||
(Torus *)resolved_carries->ptr, big_lwe_dimension, num_groups);
|
||||
|
||||
for (uint i = 0; i < gpu_count; i++) {
|
||||
cuda_set_device(gpu_indexes[i]);
|
||||
cudaDeviceSynchronize();
|
||||
printf("Synchronize gpu %d\n", i);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
host_radix_sum_in_groups<Torus>(
|
||||
mem->sub_streams_2[0], gpu_indexes[0], prepared_blocks, prepared_blocks,
|
||||
streams[0], gpu_indexes[0], prepared_blocks, prepared_blocks,
|
||||
resolved_carries, num_radix_blocks, mem->group_size);
|
||||
|
||||
for (uint i = 0; i < gpu_count; i++) {
|
||||
cuda_set_device(gpu_indexes[i]);
|
||||
cudaDeviceSynchronize();
|
||||
printf("Synchronize gpu %d\n", i);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
auto message_extract = mem->lut_message_extract;
|
||||
printf("lut blocks: %d, call with %d\n", message_extract->num_blocks, num_radix_blocks);
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
mem->sub_streams_2, gpu_indexes, gpu_count, lwe_array, prepared_blocks,
|
||||
streams, gpu_indexes, gpu_count, lwe_array, prepared_blocks,
|
||||
bsks, ksks, ms_noise_reduction_key, message_extract, num_radix_blocks);
|
||||
for (uint i = 0; i < gpu_count; i++) {
|
||||
cuda_set_device(gpu_indexes[i]);
|
||||
cudaDeviceSynchronize();
|
||||
printf("Synchronize gpu %d\n", i);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
for (int j = 0; j < mem->active_gpu_count; j++) {
|
||||
cuda_event_record(mem->outgoing_events2[j], mem->sub_streams_2[j],
|
||||
gpu_indexes[j]);
|
||||
cuda_stream_wait_event(streams[0], mem->outgoing_events1[j],
|
||||
gpu_indexes[0]);
|
||||
cuda_stream_wait_event(streams[0], mem->outgoing_events2[j],
|
||||
gpu_indexes[0]);
|
||||
}
|
||||
}
|
||||
|
||||
/// num_radix_blocks corresponds to the number of blocks on which to apply the
|
||||
|
||||
@@ -165,10 +165,22 @@ __host__ void host_integer_overflowing_sub(
|
||||
stream[0], gpu_indexes[0], output, input_left, input_right, num_blocks,
|
||||
radix_params.message_modulus, radix_params.carry_modulus);
|
||||
|
||||
for (uint i = 0; i < gpu_count; i++) {
|
||||
cuda_set_device(gpu_indexes[i]);
|
||||
cudaDeviceSynchronize();
|
||||
printf("Synchronize gpu %d\n", i);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
host_single_borrow_propagate<Torus>(
|
||||
streams, gpu_indexes, gpu_count, output, overflow_block, input_borrow,
|
||||
(int_borrow_prop_memory<Torus> *)mem_ptr, bsks, (Torus **)(ksks),
|
||||
ms_noise_reduction_key, num_groups, compute_overflow, uses_input_borrow);
|
||||
for (uint i = 0; i < gpu_count; i++) {
|
||||
cuda_set_device(gpu_indexes[i]);
|
||||
cudaDeviceSynchronize();
|
||||
printf("Synchronize gpu %d\n", i);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
POP_RANGE()
|
||||
}
|
||||
|
||||
|
||||
@@ -39,10 +39,12 @@ int32_t cuda_setup_multi_gpu(int device_0_id) {
|
||||
}
|
||||
|
||||
int get_active_gpu_count(int num_inputs, int gpu_count) {
|
||||
int active_gpu_count = gpu_count;
|
||||
if (gpu_count > num_inputs) {
|
||||
active_gpu_count = num_inputs;
|
||||
}
|
||||
int threshold_number_of_inputs = 10;
|
||||
int ceil_div_inputs = std::max(1, (num_inputs + threshold_number_of_inputs - 1) /
|
||||
threshold_number_of_inputs);
|
||||
printf("ceil div inputs: %dn, gpu_count: %d\n", ceil_div_inputs, gpu_count);
|
||||
int active_gpu_count = std::min(ceil_div_inputs, gpu_count);
|
||||
printf("active gpus: %d\n", active_gpu_count);
|
||||
return active_gpu_count;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user