Compare commits

...

14 Commits

Author SHA1 Message Date
Agnes Leroy
ba99b2feda print 2025-07-16 11:18:32 +02:00
Agnes Leroy
b0dfd5609c debug 2025-07-16 11:13:44 +02:00
Agnes Leroy
5083efcdc9 debug 2025-07-16 11:09:29 +02:00
Agnes Leroy
2d676fe0cb debug 2025-07-16 11:04:00 +02:00
Agnes Leroy
ec18468ab4 debug 2025-07-16 10:56:45 +02:00
Agnes Leroy
1541281769 debug 2025-07-16 10:52:44 +02:00
Agnes Leroy
7e15535bda print 2025-07-16 10:46:10 +02:00
Agnes Leroy
35ee34da1b print 2025-07-15 16:22:43 +02:00
Agnes Leroy
589528ffd4 print 2025-07-15 16:12:18 +02:00
Agnes Leroy
dfcc37ec98 lower threshold for testing 2025-07-15 15:55:20 +02:00
Agnes Leroy
0e56b47903 print 2025-07-15 15:54:15 +02:00
Agnes Leroy
9fac4b81d1 print 2025-07-15 12:03:22 +02:00
Agnes Leroy
c26637e6fa debug 2025-07-15 11:48:17 +02:00
Agnes Leroy
3bf603ae1e chore(gpu): change multi gpu logic 2025-07-15 11:31:04 +02:00
6 changed files with 170 additions and 28 deletions

View File

@@ -305,6 +305,7 @@ template <typename Torus> struct int_radix_lut {
///////////////
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
printf("Active GPUs in int_radix_lut: %d, gpu count: %d\n", active_gpu_count, gpu_count);
for (uint i = 0; i < active_gpu_count; i++) {
cuda_set_device(i);
int8_t *gpu_pbs_buffer;
@@ -447,6 +448,7 @@ template <typename Torus> struct int_radix_lut {
// LUT is used as a trivial encryption and must be initialized outside
// this constructor
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
printf("Active GPUs in int_radix_lut 1: %d, gpu count: %d\n", active_gpu_count, gpu_count);
for (uint i = 0; i < active_gpu_count; i++) {
auto lut = (Torus *)cuda_malloc_with_size_tracking_async(
num_luts * lut_buffer_size, streams[i], gpu_indexes[i], size_tracker,
@@ -520,6 +522,7 @@ template <typename Torus> struct int_radix_lut {
///////////////
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
printf("Active GPUs in int_radix_lut 2: %d, gpu count: %d\n", active_gpu_count, gpu_count);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint i = 0; i < active_gpu_count; i++) {
cuda_set_device(i);
@@ -827,6 +830,7 @@ template <typename InputTorus> struct int_noise_squashing_lut {
///////////////
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
printf("Active GPUs in int_noise_squash_lut 1: %d, gpu count: %d\n", active_gpu_count, gpu_count);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint i = 0; i < active_gpu_count; i++) {
cuda_set_device(i);
@@ -2789,6 +2793,7 @@ template <typename Torus> struct int_borrow_prop_memory {
}
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
printf("Active GPUs in int_borrow_prop: %d, gpu count: %d\n", active_gpu_count, gpu_count);
sub_streams_1 =
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
sub_streams_2 =
@@ -2880,6 +2885,7 @@ template <typename Torus> struct int_zero_out_if_buffer {
gpu_memory_allocated = allocate_gpu_memory;
this->params = params;
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
printf("Active GPUs in int_zero_out_if: %d, gpu count: %d\n", active_gpu_count, gpu_count);
tmp = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
@@ -3285,6 +3291,7 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
bool allocate_gpu_memory, uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
active_gpu_count = get_active_gpu_count(1, gpu_count);
printf("Active GPUs in int_arithmetic_scalar_shift: %d, gpu count: %d\n", active_gpu_count, gpu_count);
// In the arithmetic shift, a PBS has to be applied to the last rotated
// block twice: once to shift it, once to compute the padding block to be
// copied onto all blocks to the left of the last rotated block
@@ -3920,6 +3927,7 @@ template <typename Torus> struct int_comparison_buffer {
this->is_signed = is_signed;
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
printf("Active GPUs in int_compar: %d, gpu count: %d\n", active_gpu_count, gpu_count);
identity_lut_f = [](Torus x) -> Torus { return x; };
@@ -4425,6 +4433,7 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
active_gpu_count = get_active_gpu_count(2 * num_blocks, gpu_count);
printf("Active GPUs in int_div_rem: %d, gpu count: %d\n", active_gpu_count, gpu_count);
this->params = params;
shift_mem_1 = new int_logical_scalar_shift_buffer<Torus>(

View File

@@ -40,6 +40,7 @@ __host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index,
uint32_t num_radix_blocks) {
cuda_set_device(gpu_index);
printf("GPU %d\n", gpu_index);
int num_blocks = 0, num_threads = 0;
int num_entries = (lwe_dimension + 1);
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
@@ -218,6 +219,8 @@ __host__ void is_at_least_one_comparisons_block_true(
while (remaining_blocks > 0) {
// Split in max_value chunks
int num_chunks = (remaining_blocks + max_value - 1) / max_value;
cudaDeviceSynchronize();
printf("Is at least one comparison block true chunks %d\n", num_chunks);
// Since all blocks encrypt either 0 or 1, we can sum max_value of them
// as in the worst case we will be adding `max_value` ones
@@ -228,6 +231,10 @@ __host__ void is_at_least_one_comparisons_block_true(
for (int i = 0; i < num_chunks; i++) {
uint32_t chunk_length =
std::min(max_value, begin_remaining_blocks - i * max_value);
cudaDeviceSynchronize();
printf("chunk length %d, accumulator blocks: %d, input blocks: %d\n", chunk_length,
buffer->tmp_block_accumulated->num_radix_blocks,
mem_ptr->tmp_lwe_array_out->num_radix_blocks);
chunk_lengths[i] = chunk_length;
accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], accumulator,
input_blocks, big_lwe_dimension,
@@ -243,6 +250,8 @@ __host__ void is_at_least_one_comparisons_block_true(
// Applies the LUT
if (remaining_blocks == 1) {
cudaDeviceSynchronize();
printf("Last lut\n");
// In the last iteration we copy the output to the final address
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out,
@@ -250,6 +259,8 @@ __host__ void is_at_least_one_comparisons_block_true(
lut, 1);
return;
} else {
cudaDeviceSynchronize();
printf("lut with %d blocks\n", num_chunks);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
buffer->tmp_block_accumulated, bsks, ksks, ms_noise_reduction_key,
@@ -296,6 +307,8 @@ __host__ void host_compare_blocks_with_zero(
// Accumulator
auto sum = lwe_array_out;
cudaDeviceSynchronize();
printf("Here in compare blocks with zero\n");
if (num_radix_blocks == 1) {
// Just copy
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], sum, 0,
@@ -305,10 +318,16 @@ __host__ void host_compare_blocks_with_zero(
uint32_t remainder_blocks = num_radix_blocks;
auto sum_i = (Torus *)sum->ptr;
auto chunk = (Torus *)lwe_array_in->ptr;
int blocks_check = sum->num_radix_blocks;
cudaDeviceSynchronize();
printf("Here in compare blocks with zero sum %d input %d\n", sum->num_radix_blocks, lwe_array_in->num_radix_blocks);
while (remainder_blocks > 1) {
cudaDeviceSynchronize();
printf("Here in compare blocks with zero remainder blocks %d\n", remainder_blocks);
uint32_t chunk_size =
std::min(remainder_blocks, num_elements_to_fill_carry);
printf("Chunk size: %d, sum_i blocks: %d, remainder blocks: %d\n", chunk_size, blocks_check, remainder_blocks);
accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], sum_i, chunk,
big_lwe_dimension, chunk_size);
@@ -318,8 +337,11 @@ __host__ void host_compare_blocks_with_zero(
// Update operands
chunk += (chunk_size - 1) * big_lwe_size;
sum_i += big_lwe_size;
blocks_check -= 1;
}
}
cudaDeviceSynchronize();
printf("Here in compare blocks with zero num sum blocks: %d\n", num_sum_blocks);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, sum, bsks, ksks,

View File

@@ -80,6 +80,9 @@ __host__ void host_unsigned_integer_div_rem_kb(
set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
quotient, 0, num_blocks);
cudaDeviceSynchronize();
printf("Here 0\n");
for (int i = total_bits - 1; i >= 0; i--) {
uint32_t pos_in_block = i % num_bits_in_message;
uint32_t msb_bit_set = total_bits - 1 - i;
@@ -89,6 +92,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
// and all blocks after it are also trivial zeros
// This number is in range 1..=num_bocks -1
uint32_t first_trivial_block = last_non_trivial_block + 1;
printf("num blocks: %d, first trivial block: %d\n", num_blocks, first_trivial_block);
reset_radix_ciphertext_blocks(interesting_remainder1, first_trivial_block);
reset_radix_ciphertext_blocks(interesting_remainder2, first_trivial_block);
reset_radix_ciphertext_blocks(interesting_divisor, first_trivial_block);
@@ -243,18 +247,28 @@ __host__ void host_unsigned_integer_div_rem_kb(
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
cudaDeviceSynchronize();
printf("Here 1\n");
// interesting_divisor
trim_last_interesting_divisor_bits(mem_ptr->sub_streams_1, gpu_indexes,
gpu_count);
cudaDeviceSynchronize();
printf("Here 2\n");
// divisor_ms_blocks
trim_first_divisor_ms_bits(mem_ptr->sub_streams_2, gpu_indexes, gpu_count);
// interesting_remainder1
// numerator_block_stack
cudaDeviceSynchronize();
printf("Here 3\n");
left_shift_interesting_remainder1(mem_ptr->sub_streams_3, gpu_indexes,
gpu_count);
cudaDeviceSynchronize();
printf("Here 4\n");
// interesting_remainder2
left_shift_interesting_remainder2(mem_ptr->sub_streams_4, gpu_indexes,
gpu_count);
cudaDeviceSynchronize();
printf("Here 5\n");
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
@@ -318,6 +332,12 @@ __host__ void host_unsigned_integer_div_rem_kb(
subtraction_overflowed, (const CudaRadixCiphertextFFI *)nullptr,
mem_ptr->overflow_sub_mem, bsks, ksks, ms_noise_reduction_key,
compute_borrow, uses_input_borrow);
for (uint i = 0; i < gpu_count; i++) {
cuda_set_device(gpu_indexes[i]);
cudaDeviceSynchronize();
printf("Synchronize gpu %d\n", i);
check_cuda_error(cudaGetLastError());
}
};
// fills:
@@ -326,6 +346,13 @@ __host__ void host_unsigned_integer_div_rem_kb(
uint32_t const *gpu_indexes,
uint32_t gpu_count) {
auto trivial_blocks = divisor_ms_blocks;
printf("Trivial blocks: %d\n", trivial_blocks->num_radix_blocks);
for (uint i = 0; i < gpu_count; i++) {
cuda_set_device(gpu_indexes[i]);
cudaDeviceSynchronize();
printf("Synchronize gpu %d\n", i);
check_cuda_error(cudaGetLastError());
}
if (trivial_blocks->num_radix_blocks == 0) {
set_zero_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], at_least_one_upper_block_is_non_zero, 0,
@@ -341,6 +368,8 @@ __host__ void host_unsigned_integer_div_rem_kb(
trivial_blocks->num_radix_blocks,
mem_ptr->comparison_buffer->eq_buffer->is_non_zero_lut);
cudaDeviceSynchronize();
printf("Before is at least one comparisons block true %d\n", mem_ptr->tmp_1->num_radix_blocks);
is_at_least_one_comparisons_block_true<Torus>(
streams, gpu_indexes, gpu_count,
at_least_one_upper_block_is_non_zero, mem_ptr->tmp_1,
@@ -370,12 +399,20 @@ __host__ void host_unsigned_integer_div_rem_kb(
}
// new_remainder
// subtraction_overflowed
cudaDeviceSynchronize();
printf("Here 6 before overlfow sub\n");
do_overflowing_sub(mem_ptr->sub_streams_1, gpu_indexes, gpu_count);
// at_least_one_upper_block_is_non_zero
cudaDeviceSynchronize();
printf("Here 7\n");
check_divisor_upper_blocks(mem_ptr->sub_streams_2, gpu_indexes, gpu_count);
// cleaned_merged_interesting_remainder
cudaDeviceSynchronize();
printf("Here 8\n");
create_clean_version_of_merged_remainder(mem_ptr->sub_streams_3,
gpu_indexes, gpu_count);
cudaDeviceSynchronize();
printf("Here 9\n");
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
@@ -441,13 +478,21 @@ __host__ void host_unsigned_integer_div_rem_kb(
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
// cleaned_merged_interesting_remainder
cudaDeviceSynchronize();
printf("Here 10\n");
conditionally_zero_out_merged_interesting_remainder(mem_ptr->sub_streams_1,
gpu_indexes, gpu_count);
cudaDeviceSynchronize();
printf("Here 11\n");
// new_remainder
conditionally_zero_out_merged_new_remainder(mem_ptr->sub_streams_2,
gpu_indexes, gpu_count);
cudaDeviceSynchronize();
printf("Here 12\n");
// quotient
set_quotient_bit(mem_ptr->sub_streams_3, gpu_indexes, gpu_count);
cudaDeviceSynchronize();
printf("Here 13\n");
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
@@ -482,10 +527,14 @@ __host__ void host_unsigned_integer_div_rem_kb(
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
cudaDeviceSynchronize();
printf("Here 14\n");
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem_ptr->sub_streams_1, gpu_indexes, gpu_count, remainder, remainder,
bsks, ksks, ms_noise_reduction_key, mem_ptr->message_extract_lut_1,
num_blocks);
cudaDeviceSynchronize();
printf("Here 15\n");
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem_ptr->sub_streams_2, gpu_indexes, gpu_count, quotient, quotient, bsks,
ksks, ms_noise_reduction_key, mem_ptr->message_extract_lut_2, num_blocks);

View File

@@ -536,6 +536,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
printf("Active GPUs in lut univ: %d, gpu count: %d\n", active_gpu_count, gpu_count);
if (active_gpu_count == 1) {
execute_keyswitch_async<Torus>(
streams, gpu_indexes, 1, lwe_after_ks_vec[0],
@@ -2114,12 +2115,24 @@ void host_single_borrow_propagate(
streams[0], gpu_indexes[0], lwe_array, lwe_array, input_borrow, 1,
message_modulus, carry_modulus);
}
for (uint i = 0; i < gpu_count; i++) {
cuda_set_device(gpu_indexes[i]);
cudaDeviceSynchronize();
printf("Synchronize gpu %d\n", i);
check_cuda_error(cudaGetLastError());
}
// Step 1
host_compute_shifted_blocks_and_borrow_states<Torus>(
streams, gpu_indexes, gpu_count, lwe_array,
mem->shifted_blocks_borrow_state_mem, bsks, ksks, ms_noise_reduction_key,
lut_stride, num_many_lut);
for (uint i = 0; i < gpu_count; i++) {
cuda_set_device(gpu_indexes[i]);
cudaDeviceSynchronize();
printf("Synchronize gpu %d\n", i);
check_cuda_error(cudaGetLastError());
}
auto borrow_states = mem->shifted_blocks_borrow_state_mem->borrow_states;
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], mem->overflow_block, 0, 1, borrow_states,
@@ -2131,6 +2144,12 @@ void host_single_borrow_propagate(
mem->prop_simu_group_carries_mem, bsks, ksks, ms_noise_reduction_key,
num_radix_blocks, num_groups);
for (uint i = 0; i < gpu_count; i++) {
cuda_set_device(gpu_indexes[i]);
cudaDeviceSynchronize();
printf("Synchronize gpu %d\n", i);
check_cuda_error(cudaGetLastError());
}
auto shifted_blocks =
(Torus *)mem->shifted_blocks_borrow_state_mem->shifted_blocks->ptr;
auto prepared_blocks = mem->prop_simu_group_carries_mem->prepared_blocks;
@@ -2140,10 +2159,22 @@ void host_single_borrow_propagate(
(Torus *)prepared_blocks->ptr, shifted_blocks,
simulators, big_lwe_dimension, num_radix_blocks);
for (uint i = 0; i < gpu_count; i++) {
cuda_set_device(gpu_indexes[i]);
cudaDeviceSynchronize();
printf("Synchronize gpu %d\n", i);
check_cuda_error(cudaGetLastError());
}
host_integer_radix_add_scalar_one_inplace<Torus>(
streams, gpu_indexes, gpu_count, prepared_blocks, message_modulus,
carry_modulus);
for (uint i = 0; i < gpu_count; i++) {
cuda_set_device(gpu_indexes[i]);
cudaDeviceSynchronize();
printf("Synchronize gpu %d\n", i);
check_cuda_error(cudaGetLastError());
}
if (compute_overflow == outputFlag::FLAG_OVERFLOW) {
CudaRadixCiphertextFFI shifted_simulators;
as_radix_ciphertext_slice<Torus>(
@@ -2152,6 +2183,12 @@ void host_single_borrow_propagate(
host_addition<Torus>(streams[0], gpu_indexes[0], mem->overflow_block,
mem->overflow_block, &shifted_simulators, 1);
}
for (uint i = 0; i < gpu_count; i++) {
cuda_set_device(gpu_indexes[i]);
cudaDeviceSynchronize();
printf("Synchronize gpu %d\n", i);
check_cuda_error(cudaGetLastError());
}
CudaRadixCiphertextFFI resolved_borrows;
as_radix_ciphertext_slice<Torus>(
&resolved_borrows, mem->prop_simu_group_carries_mem->resolved_carries,
@@ -2165,49 +2202,60 @@ void host_single_borrow_propagate(
mem->overflow_block, &resolved_borrows, 1);
}
cuda_event_record(mem->incoming_events[0], streams[0], gpu_indexes[0]);
for (int j = 0; j < mem->active_gpu_count; j++) {
cuda_stream_wait_event(mem->sub_streams_1[j], mem->incoming_events[0],
gpu_indexes[j]);
cuda_stream_wait_event(mem->sub_streams_2[j], mem->incoming_events[0],
gpu_indexes[j]);
}
for (uint i = 0; i < gpu_count; i++) {
cuda_set_device(gpu_indexes[i]);
cudaDeviceSynchronize();
printf("Synchronize gpu %d\n", i);
check_cuda_error(cudaGetLastError());
}
if (compute_overflow == outputFlag::FLAG_OVERFLOW) {
auto borrow_flag = mem->lut_borrow_flag;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem->sub_streams_1, gpu_indexes, gpu_count, overflow_block,
streams, gpu_indexes, gpu_count, overflow_block,
mem->overflow_block, bsks, ksks, ms_noise_reduction_key, borrow_flag,
1);
}
for (int j = 0; j < mem->active_gpu_count; j++) {
cuda_event_record(mem->outgoing_events1[j], mem->sub_streams_1[j],
gpu_indexes[j]);
}
for (uint i = 0; i < gpu_count; i++) {
cuda_set_device(gpu_indexes[i]);
cudaDeviceSynchronize();
printf("Synchronize gpu %d\n", i);
check_cuda_error(cudaGetLastError());
}
// subtract borrow and cleanup prepared blocks
auto resolved_carries = mem->prop_simu_group_carries_mem->resolved_carries;
host_negation<Torus>(
mem->sub_streams_2[0], gpu_indexes[0], (Torus *)resolved_carries->ptr,
streams[0], gpu_indexes[0], (Torus *)resolved_carries->ptr,
(Torus *)resolved_carries->ptr, big_lwe_dimension, num_groups);
for (uint i = 0; i < gpu_count; i++) {
cuda_set_device(gpu_indexes[i]);
cudaDeviceSynchronize();
printf("Synchronize gpu %d\n", i);
check_cuda_error(cudaGetLastError());
}
host_radix_sum_in_groups<Torus>(
mem->sub_streams_2[0], gpu_indexes[0], prepared_blocks, prepared_blocks,
streams[0], gpu_indexes[0], prepared_blocks, prepared_blocks,
resolved_carries, num_radix_blocks, mem->group_size);
for (uint i = 0; i < gpu_count; i++) {
cuda_set_device(gpu_indexes[i]);
cudaDeviceSynchronize();
printf("Synchronize gpu %d\n", i);
check_cuda_error(cudaGetLastError());
}
auto message_extract = mem->lut_message_extract;
printf("lut blocks: %d, call with %d\n", message_extract->num_blocks, num_radix_blocks);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem->sub_streams_2, gpu_indexes, gpu_count, lwe_array, prepared_blocks,
streams, gpu_indexes, gpu_count, lwe_array, prepared_blocks,
bsks, ksks, ms_noise_reduction_key, message_extract, num_radix_blocks);
for (uint i = 0; i < gpu_count; i++) {
cuda_set_device(gpu_indexes[i]);
cudaDeviceSynchronize();
printf("Synchronize gpu %d\n", i);
check_cuda_error(cudaGetLastError());
}
for (int j = 0; j < mem->active_gpu_count; j++) {
cuda_event_record(mem->outgoing_events2[j], mem->sub_streams_2[j],
gpu_indexes[j]);
cuda_stream_wait_event(streams[0], mem->outgoing_events1[j],
gpu_indexes[0]);
cuda_stream_wait_event(streams[0], mem->outgoing_events2[j],
gpu_indexes[0]);
}
}
/// num_radix_blocks corresponds to the number of blocks on which to apply the

View File

@@ -165,10 +165,22 @@ __host__ void host_integer_overflowing_sub(
stream[0], gpu_indexes[0], output, input_left, input_right, num_blocks,
radix_params.message_modulus, radix_params.carry_modulus);
for (uint i = 0; i < gpu_count; i++) {
cuda_set_device(gpu_indexes[i]);
cudaDeviceSynchronize();
printf("Synchronize gpu %d\n", i);
check_cuda_error(cudaGetLastError());
}
host_single_borrow_propagate<Torus>(
streams, gpu_indexes, gpu_count, output, overflow_block, input_borrow,
(int_borrow_prop_memory<Torus> *)mem_ptr, bsks, (Torus **)(ksks),
ms_noise_reduction_key, num_groups, compute_overflow, uses_input_borrow);
for (uint i = 0; i < gpu_count; i++) {
cuda_set_device(gpu_indexes[i]);
cudaDeviceSynchronize();
printf("Synchronize gpu %d\n", i);
check_cuda_error(cudaGetLastError());
}
POP_RANGE()
}

View File

@@ -39,10 +39,12 @@ int32_t cuda_setup_multi_gpu(int device_0_id) {
}
/// Returns how many GPUs should actively participate for a workload of
/// `num_inputs` elements.
///
/// One GPU is assigned per chunk of up to `threshold_number_of_inputs`
/// inputs (rounded up), capped at the number of available GPUs. For any
/// `num_inputs` (including 0) the result is at least 1 when
/// `gpu_count >= 1`, so callers can safely index `streams[0]` /
/// `gpu_indexes[0]`.
int get_active_gpu_count(int num_inputs, int gpu_count) {
  // Spreading very small workloads across many GPUs costs more in
  // synchronization than it gains in parallelism, so require at least
  // this many inputs before enlisting an additional GPU.
  int threshold_number_of_inputs = 10;
  // Ceil-divide; clamp to 1 so an empty workload still maps to one GPU.
  int ceil_div_inputs =
      std::max(1, (num_inputs + threshold_number_of_inputs - 1) /
                      threshold_number_of_inputs);
  int active_gpu_count = std::min(ceil_div_inputs, gpu_count);
  return active_gpu_count;
}