Compare commits

...

4 Commits

Author SHA1 Message Date
Agnes Leroy
d9a57edc1c Continue restricting 2024-12-11 18:18:59 +01:00
Agnes Leroy
22773e1f37 Restrict num gpus further in div 2024-12-11 18:01:26 +01:00
Agnes Leroy
e0eab69e85 Restrict gpus to 1 in fdiv comparisons 2024-12-11 17:32:14 +01:00
Agnes Leroy
516ed11fad chore(gpu): use only 1 GPU in comparison if broadcast lut is called 2024-12-11 17:05:19 +01:00
3 changed files with 38 additions and 33 deletions

View File

@@ -3685,9 +3685,9 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
[shifted_mask](Torus x) -> Torus { return x & shifted_mask; };
masking_luts_1[i] = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, params, 1, 1, true);
streams, gpu_indexes, 1, params, 1, 1, true);
masking_luts_2[i] = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, params, 1, num_blocks, true);
streams, gpu_indexes, 1, params, 1, num_blocks, true);
int_radix_lut<Torus> *luts[2] = {masking_luts_1[i], masking_luts_2[i]};
@@ -3704,7 +3704,7 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
// both of them are equal but because they are used in two different
// executions in parallel we need two different pbs_buffers.
message_extract_lut_1 = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, params, 1, num_blocks, true);
streams, gpu_indexes, 1, params, 1, num_blocks, true);
message_extract_lut_2 = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, params, 1, num_blocks, true);
@@ -3816,16 +3816,16 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
this->params = params;
shift_mem_1 = new int_logical_scalar_shift_buffer<Torus>(
streams, gpu_indexes, gpu_count, SHIFT_OR_ROTATE_TYPE::LEFT_SHIFT,
streams, gpu_indexes, 1, SHIFT_OR_ROTATE_TYPE::LEFT_SHIFT,
params, 2 * num_blocks, true);
shift_mem_2 = new int_logical_scalar_shift_buffer<Torus>(
streams, gpu_indexes, gpu_count, SHIFT_OR_ROTATE_TYPE::LEFT_SHIFT,
streams, gpu_indexes, 1, SHIFT_OR_ROTATE_TYPE::LEFT_SHIFT,
params, 2 * num_blocks, true);
uint32_t compute_overflow = 1;
overflow_sub_mem = new int_borrow_prop_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_blocks, compute_overflow,
streams, gpu_indexes, 1, params, num_blocks, compute_overflow,
true);
uint32_t group_size = overflow_sub_mem->group_size;
bool use_seq = overflow_sub_mem->prop_simu_group_carries_mem
@@ -3834,7 +3834,7 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
group_size, use_seq);
comparison_buffer = new int_comparison_buffer<Torus>(
streams, gpu_indexes, gpu_count, COMPARISON_TYPE::NE, params,
streams, gpu_indexes, 1, COMPARISON_TYPE::NE, params,
num_blocks, false, true);
init_lookup_tables(streams, gpu_indexes, gpu_count, num_blocks);

View File

@@ -107,6 +107,7 @@ __host__ void are_all_comparisons_block_true(
// Selects a LUT
int_radix_lut<Torus> *lut;
auto broadcast_lut_should_be_called = false;
if (are_all_block_true_buffer->op == COMPARISON_TYPE::NE) {
// is_non_zero_lut_buffer LUT
lut = mem_ptr->eq_buffer->is_non_zero_lut;
@@ -129,7 +130,8 @@ __host__ void are_all_comparisons_block_true(
polynomial_size, message_modulus, carry_modulus,
is_equal_to_num_blocks_lut_f);
new_lut->broadcast_lut(streams, gpu_indexes, 0);
// new_lut->broadcast_lut(streams, gpu_indexes, 0);
broadcast_lut_should_be_called = true;
(*is_equal_to_num_blocks_map)[chunk_length] = new_lut;
lut = new_lut;
@@ -140,13 +142,18 @@ __host__ void are_all_comparisons_block_true(
if (remaining_blocks == 1) {
// In the last iteration we copy the output to the final address
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, accumulator, bsks,
ksks, 1, lut);
streams, gpu_indexes, 1, lwe_array_out, accumulator, bsks, ksks, 1,
lut);
return;
} else {
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, tmp_out, accumulator, bsks, ksks,
num_chunks, lut);
if (broadcast_lut_should_be_called)
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, 1, tmp_out, accumulator, bsks, ksks,
num_chunks, lut);
else
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, tmp_out, accumulator, bsks, ksks,
num_chunks, lut);
}
}
}
@@ -451,12 +458,10 @@ __host__ void tree_sign_reduction(
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], last_lut->get_lut(0, 0), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, f);
last_lut->broadcast_lut(streams, gpu_indexes, 0);
// Last leaf
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, y, bsks, ksks, 1,
last_lut);
streams, gpu_indexes, 1, lwe_array_out, y, bsks, ksks, 1, last_lut);
}
template <typename Torus>

View File

@@ -286,7 +286,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
uint32_t shifted_mask = full_message_mask >> shift_amount;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, interesting_divisor.last_block(),
streams, gpu_indexes, 1, interesting_divisor.last_block(),
interesting_divisor.last_block(), bsks, ksks, 1,
mem_ptr->masking_luts_1[shifted_mask]);
}; // trim_last_interesting_divisor_bits
@@ -315,7 +315,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
shifted_mask = shifted_mask & full_message_mask;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, divisor_ms_blocks.first_block(),
streams, gpu_indexes, 1, divisor_ms_blocks.first_block(),
divisor_ms_blocks.first_block(), bsks, ksks, 1,
mem_ptr->masking_luts_2[shifted_mask]);
}; // trim_first_divisor_ms_bits
@@ -340,7 +340,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
streams[0], gpu_indexes[0]);
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, interesting_remainder1.data, 1,
streams, gpu_indexes, 1, interesting_remainder1.data, 1,
mem_ptr->shift_mem_1, bsks, ksks, interesting_remainder1.len);
tmp_radix.clone_from(interesting_remainder1, 0,
@@ -370,13 +370,13 @@ __host__ void host_unsigned_integer_div_rem_kb(
uint32_t const *gpu_indexes,
uint32_t gpu_count) {
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, interesting_remainder2.data, 1,
streams, gpu_indexes, 1, interesting_remainder2.data, 1,
mem_ptr->shift_mem_2, bsks, ksks, interesting_remainder2.len);
}; // left_shift_interesting_remainder2
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
//for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
//}
// interesting_divisor
trim_last_interesting_divisor_bits(mem_ptr->sub_streams_1, gpu_indexes,
gpu_count);
@@ -389,12 +389,12 @@ __host__ void host_unsigned_integer_div_rem_kb(
// interesting_remainder2
left_shift_interesting_remainder2(mem_ptr->sub_streams_4, gpu_indexes,
gpu_count);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_4[j], gpu_indexes[j]);
}
// for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[0], gpu_indexes[0]);
cuda_synchronize_stream(mem_ptr->sub_streams_2[0], gpu_indexes[0]);
cuda_synchronize_stream(mem_ptr->sub_streams_3[0], gpu_indexes[0]);
cuda_synchronize_stream(mem_ptr->sub_streams_4[0], gpu_indexes[0]);
// }
// if interesting_remainder1 != 0 -> interesting_remainder2 == 0
// if interesting_remainder1 == 0 -> interesting_remainder2 != 0
@@ -438,7 +438,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
streams, gpu_indexes, first_indexes, second_indexes, scalar_indexes,
merged_interesting_remainder.len);
host_integer_overflowing_sub<uint64_t>(
streams, gpu_indexes, gpu_count, new_remainder.data,
streams, gpu_indexes, 1, new_remainder.data,
(uint64_t *)merged_interesting_remainder.data,
interesting_divisor.data, subtraction_overflowed.data,
(const Torus *)nullptr, mem_ptr->overflow_sub_mem, bsks, ksks,
@@ -460,7 +460,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
// But we are in the special case where scalar == 0
// So we can skip some stuff
host_compare_with_zero_equality<Torus>(
streams, gpu_indexes, gpu_count, tmp_1.data, trivial_blocks.data,
streams, gpu_indexes, 1, tmp_1.data, trivial_blocks.data,
mem_ptr->comparison_buffer, bsks, ksks, trivial_blocks.len,
mem_ptr->comparison_buffer->eq_buffer->is_non_zero_lut);
@@ -468,7 +468,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
ceil_div(trivial_blocks.len, message_modulus * carry_modulus - 1);
is_at_least_one_comparisons_block_true<Torus>(
streams, gpu_indexes, gpu_count,
streams, gpu_indexes, 1,
at_least_one_upper_block_is_non_zero.data, tmp_1.data,
mem_ptr->comparison_buffer, bsks, ksks, tmp_1.len);
}
@@ -482,7 +482,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
[&](cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count,
streams, gpu_indexes, 1,
cleaned_merged_interesting_remainder.data,
cleaned_merged_interesting_remainder.data, bsks, ksks,
cleaned_merged_interesting_remainder.len,