Mirror of https://github.com/zama-ai/tfhe-rs.git (synced 2026-01-11 07:38:08 -05:00)

Compare commits: al/bench_m...al/div_mul (4 commits)
| Author | SHA1 | Date |
|---|---|---|
|  | d9a57edc1c |  |
|  | 22773e1f37 |  |
|  | e0eab69e85 |  |
|  | 516ed11fad |  |
@@ -3685,9 +3685,9 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
         [shifted_mask](Torus x) -> Torus { return x & shifted_mask; };
 
     masking_luts_1[i] = new int_radix_lut<Torus>(
-        streams, gpu_indexes, gpu_count, params, 1, 1, true);
+        streams, gpu_indexes, 1, params, 1, 1, true);
     masking_luts_2[i] = new int_radix_lut<Torus>(
-        streams, gpu_indexes, gpu_count, params, 1, num_blocks, true);
+        streams, gpu_indexes, 1, params, 1, num_blocks, true);
 
     int_radix_lut<Torus> *luts[2] = {masking_luts_1[i], masking_luts_2[i]};
 
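Reviewer note: the lambda kept as context above, `x & shifted_mask`, is the function each masking LUT encodes, with one LUT per possible `shifted_mask` value; the change itself only shrinks the LUT allocation to a single active GPU. A minimal cleartext sketch of that masking function, for illustration only (the helper name and the `message_modulus` parameter are assumptions, not the on-device LUT generation):

```cpp
#include <cstdint>
#include <vector>

// Hypothetical helper: tabulate the masking function f(x) = x & shifted_mask
// over one message block, mirroring the lambda captured in the hunk above.
std::vector<uint64_t> masking_table(uint64_t message_modulus,
                                    uint64_t shifted_mask) {
  std::vector<uint64_t> table(message_modulus);
  for (uint64_t x = 0; x < message_modulus; ++x)
    table[x] = x & shifted_mask; // keep only the bits allowed by the mask
  return table;
}
```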
@@ -3704,7 +3704,7 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
     // both of them are equal but because they are used in two different
     // executions in parallel we need two different pbs_buffers.
     message_extract_lut_1 = new int_radix_lut<Torus>(
-        streams, gpu_indexes, gpu_count, params, 1, num_blocks, true);
+        streams, gpu_indexes, 1, params, 1, num_blocks, true);
     message_extract_lut_2 = new int_radix_lut<Torus>(
         streams, gpu_indexes, gpu_count, params, 1, num_blocks, true);
 
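The comment above explains why two identical LUTs are kept: each one owns its own PBS scratch buffer, so the two message-extraction passes can run on different streams at the same time without racing on temporary storage. A minimal CUDA sketch of that idea, assuming nothing about the tfhe-rs buffer layout (buffer names and sizes below are placeholders):

```cpp
#include <cuda_runtime.h>

// Two streams, two private scratch buffers: work submitted to s1 and s2 may
// overlap, so each execution gets its own temporary storage.
int main() {
  cudaStream_t s1, s2;
  cudaStreamCreate(&s1);
  cudaStreamCreate(&s2);

  void *scratch_1, *scratch_2;
  size_t scratch_bytes = 1 << 20; // placeholder size
  cudaMalloc(&scratch_1, scratch_bytes);
  cudaMalloc(&scratch_2, scratch_bytes); // deliberately not shared

  // Each stream touches only its own scratch, so the two executions can
  // proceed concurrently without data races on temporaries.
  cudaMemsetAsync(scratch_1, 0, scratch_bytes, s1);
  cudaMemsetAsync(scratch_2, 0, scratch_bytes, s2);

  cudaStreamSynchronize(s1);
  cudaStreamSynchronize(s2);

  cudaFree(scratch_1);
  cudaFree(scratch_2);
  cudaStreamDestroy(s1);
  cudaStreamDestroy(s2);
  return 0;
}
```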
@@ -3816,16 +3816,16 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
 
     this->params = params;
     shift_mem_1 = new int_logical_scalar_shift_buffer<Torus>(
-        streams, gpu_indexes, gpu_count, SHIFT_OR_ROTATE_TYPE::LEFT_SHIFT,
+        streams, gpu_indexes, 1, SHIFT_OR_ROTATE_TYPE::LEFT_SHIFT,
         params, 2 * num_blocks, true);
 
     shift_mem_2 = new int_logical_scalar_shift_buffer<Torus>(
-        streams, gpu_indexes, gpu_count, SHIFT_OR_ROTATE_TYPE::LEFT_SHIFT,
+        streams, gpu_indexes, 1, SHIFT_OR_ROTATE_TYPE::LEFT_SHIFT,
         params, 2 * num_blocks, true);
 
     uint32_t compute_overflow = 1;
     overflow_sub_mem = new int_borrow_prop_memory<Torus>(
-        streams, gpu_indexes, gpu_count, params, num_blocks, compute_overflow,
+        streams, gpu_indexes, 1, params, num_blocks, compute_overflow,
         true);
     uint32_t group_size = overflow_sub_mem->group_size;
     bool use_seq = overflow_sub_mem->prop_simu_group_carries_mem
@@ -3834,7 +3834,7 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
                                        group_size, use_seq);
 
     comparison_buffer = new int_comparison_buffer<Torus>(
-        streams, gpu_indexes, gpu_count, COMPARISON_TYPE::NE, params,
+        streams, gpu_indexes, 1, COMPARISON_TYPE::NE, params,
         num_blocks, false, true);
 
     init_lookup_tables(streams, gpu_indexes, gpu_count, num_blocks);
 
@@ -107,6 +107,7 @@ __host__ void are_all_comparisons_block_true(
 
   // Selects a LUT
   int_radix_lut<Torus> *lut;
+  auto broadcast_lut_should_be_called = false;
   if (are_all_block_true_buffer->op == COMPARISON_TYPE::NE) {
     // is_non_zero_lut_buffer LUT
     lut = mem_ptr->eq_buffer->is_non_zero_lut;
@@ -129,7 +130,8 @@ __host__ void are_all_comparisons_block_true(
           polynomial_size, message_modulus, carry_modulus,
           is_equal_to_num_blocks_lut_f);
 
-      new_lut->broadcast_lut(streams, gpu_indexes, 0);
+      // new_lut->broadcast_lut(streams, gpu_indexes, 0);
+      broadcast_lut_should_be_called = true;
 
       (*is_equal_to_num_blocks_map)[chunk_length] = new_lut;
       lut = new_lut;
@@ -140,13 +142,18 @@ __host__ void are_all_comparisons_block_true(
     if (remaining_blocks == 1) {
       // In the last iteration we copy the output to the final address
       integer_radix_apply_univariate_lookup_table_kb<Torus>(
-          streams, gpu_indexes, gpu_count, lwe_array_out, accumulator, bsks,
-          ksks, 1, lut);
+          streams, gpu_indexes, 1, lwe_array_out, accumulator, bsks, ksks, 1,
+          lut);
       return;
     } else {
-      integer_radix_apply_univariate_lookup_table_kb<Torus>(
-          streams, gpu_indexes, gpu_count, tmp_out, accumulator, bsks, ksks,
-          num_chunks, lut);
+      if (broadcast_lut_should_be_called)
+        integer_radix_apply_univariate_lookup_table_kb<Torus>(
+            streams, gpu_indexes, 1, tmp_out, accumulator, bsks, ksks,
+            num_chunks, lut);
+      else
+        integer_radix_apply_univariate_lookup_table_kb<Torus>(
+            streams, gpu_indexes, gpu_count, tmp_out, accumulator, bsks, ksks,
+            num_chunks, lut);
     }
   }
 }
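The two hunks above replace the eager `broadcast_lut` call with a `broadcast_lut_should_be_called` flag: a freshly generated LUT only exists on the first GPU until it is broadcast, so when the broadcast is skipped the subsequent apply is forced onto a single GPU, while LUTs taken from the cache keep the multi-GPU path. A small sketch of that selection logic, with hypothetical names (not the tfhe-rs API):

```cpp
#include <cstdint>

// Hypothetical illustration of the flag introduced in the diff: decide how
// many GPUs the next lookup-table application may use, depending on whether
// the table was broadcast to all devices or still lives only on device 0.
inline uint32_t effective_gpu_count(bool broadcast_was_skipped,
                                    uint32_t gpu_count) {
  // Skipped broadcast: only device 0 holds the accumulator, so run on 1 GPU.
  // Otherwise the cached, already-broadcast table can use every active GPU.
  return broadcast_was_skipped ? 1u : gpu_count;
}
```

Usage would mirror the `if (broadcast_lut_should_be_called)` branch in the hunk above, where the single-GPU call site passes `1` and the fallback keeps `gpu_count`.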
@@ -451,12 +458,10 @@ __host__ void tree_sign_reduction(
   generate_device_accumulator<Torus>(
       streams[0], gpu_indexes[0], last_lut->get_lut(0, 0), glwe_dimension,
       polynomial_size, message_modulus, carry_modulus, f);
-  last_lut->broadcast_lut(streams, gpu_indexes, 0);
 
   // Last leaf
   integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      streams, gpu_indexes, gpu_count, lwe_array_out, y, bsks, ksks, 1,
-      last_lut);
+      streams, gpu_indexes, 1, lwe_array_out, y, bsks, ksks, 1, last_lut);
 }
 
 template <typename Torus>
@@ -286,7 +286,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
     uint32_t shifted_mask = full_message_mask >> shift_amount;
 
     integer_radix_apply_univariate_lookup_table_kb<Torus>(
-        streams, gpu_indexes, gpu_count, interesting_divisor.last_block(),
+        streams, gpu_indexes, 1, interesting_divisor.last_block(),
         interesting_divisor.last_block(), bsks, ksks, 1,
         mem_ptr->masking_luts_1[shifted_mask]);
   }; // trim_last_interesting_divisor_bits
@@ -315,7 +315,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
     shifted_mask = shifted_mask & full_message_mask;
 
     integer_radix_apply_univariate_lookup_table_kb<Torus>(
-        streams, gpu_indexes, gpu_count, divisor_ms_blocks.first_block(),
+        streams, gpu_indexes, 1, divisor_ms_blocks.first_block(),
         divisor_ms_blocks.first_block(), bsks, ksks, 1,
         mem_ptr->masking_luts_2[shifted_mask]);
   }; // trim_first_divisor_ms_bits
@@ -340,7 +340,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
                                  streams[0], gpu_indexes[0]);
 
     host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
-        streams, gpu_indexes, gpu_count, interesting_remainder1.data, 1,
+        streams, gpu_indexes, 1, interesting_remainder1.data, 1,
         mem_ptr->shift_mem_1, bsks, ksks, interesting_remainder1.len);
 
     tmp_radix.clone_from(interesting_remainder1, 0,
@@ -370,13 +370,13 @@ __host__ void host_unsigned_integer_div_rem_kb(
                                             uint32_t const *gpu_indexes,
                                             uint32_t gpu_count) {
     host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
-        streams, gpu_indexes, gpu_count, interesting_remainder2.data, 1,
+        streams, gpu_indexes, 1, interesting_remainder2.data, 1,
         mem_ptr->shift_mem_2, bsks, ksks, interesting_remainder2.len);
   }; // left_shift_interesting_remainder2
 
-  for (uint j = 0; j < gpu_count; j++) {
-    cuda_synchronize_stream(streams[j], gpu_indexes[j]);
-  }
+  //for (uint j = 0; j < gpu_count; j++) {
+  cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+  //}
   // interesting_divisor
   trim_last_interesting_divisor_bits(mem_ptr->sub_streams_1, gpu_indexes,
                                      gpu_count);
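This hunk (and the next) narrows the synchronization around the sub-stream fork from every GPU's stream to index 0 only, consistent with the rest of the patch forcing the division onto one device. A hedged sketch of the two synchronization shapes, using only the CUDA runtime (the `single_gpu` switch and the function name are assumptions, not how the backend actually selects the path):

```cpp
#include <cstdint>
#include <cuda_runtime.h>

// Wait for previously enqueued work before forking onto sub-streams.
// Single-GPU path: everything so far was issued on streams[0], so one
// synchronization suffices. Multi-GPU path: wait on every active stream.
void sync_before_fork(cudaStream_t *streams, uint32_t gpu_count,
                      bool single_gpu) {
  if (single_gpu) {
    cudaStreamSynchronize(streams[0]);
  } else {
    for (uint32_t j = 0; j < gpu_count; j++)
      cudaStreamSynchronize(streams[j]);
  }
}
```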
@@ -389,12 +389,12 @@ __host__ void host_unsigned_integer_div_rem_kb(
   // interesting_remainder2
   left_shift_interesting_remainder2(mem_ptr->sub_streams_4, gpu_indexes,
                                     gpu_count);
-  for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
-    cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
-    cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
-    cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
-    cuda_synchronize_stream(mem_ptr->sub_streams_4[j], gpu_indexes[j]);
-  }
+  // for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
+  cuda_synchronize_stream(mem_ptr->sub_streams_1[0], gpu_indexes[0]);
+  cuda_synchronize_stream(mem_ptr->sub_streams_2[0], gpu_indexes[0]);
+  cuda_synchronize_stream(mem_ptr->sub_streams_3[0], gpu_indexes[0]);
+  cuda_synchronize_stream(mem_ptr->sub_streams_4[0], gpu_indexes[0]);
+  // }
 
   // if interesting_remainder1 != 0 -> interesting_remainder2 == 0
   // if interesting_remainder1 == 0 -> interesting_remainder2 != 0
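The two context comments above state the invariant that at most one of `interesting_remainder1` and `interesting_remainder2` is non-zero at this point. Assuming that invariant (the helper below is purely illustrative and not part of the patch), combining the two per block needs no carry handling, because the other operand is guaranteed to be zero:

```cpp
#include <cstdint>

// Illustrative only: with the invariant r1 == 0 || r2 == 0 from the comments
// above, a block-wise sum returns whichever remainder is actually set and can
// never produce a carry between blocks.
inline uint64_t merge_exclusive_blocks(uint64_t r1, uint64_t r2) {
  return r1 + r2; // equals r1 if r2 == 0, and r2 if r1 == 0
}
```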
@@ -438,7 +438,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
       streams, gpu_indexes, first_indexes, second_indexes, scalar_indexes,
       merged_interesting_remainder.len);
   host_integer_overflowing_sub<uint64_t>(
-      streams, gpu_indexes, gpu_count, new_remainder.data,
+      streams, gpu_indexes, 1, new_remainder.data,
       (uint64_t *)merged_interesting_remainder.data,
       interesting_divisor.data, subtraction_overflowed.data,
       (const Torus *)nullptr, mem_ptr->overflow_sub_mem, bsks, ksks,
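Here only the GPU-count argument of `host_integer_overflowing_sub` changes; the call still writes its borrow flag into `subtraction_overflowed`, which the division loop can then inspect. A plaintext analogue of an overflowing subtraction, as a sketch only (wrapping unsigned arithmetic stands in for the block-wise ciphertext computation; the helper name is hypothetical):

```cpp
#include <cstdint>

// Plaintext analogue of an overflowing (borrowing) subtraction: the flag
// records whether the subtrahend exceeded the minuend, i.e. whether a borrow
// propagated out of the most significant block; the difference itself wraps.
inline uint64_t overflowing_sub(uint64_t minuend, uint64_t subtrahend,
                                bool &overflowed) {
  overflowed = subtrahend > minuend;
  return minuend - subtrahend; // well-defined wrap-around for unsigned types
}
```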
@@ -460,7 +460,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
     // But we are in the special case where scalar == 0
     // So we can skip some stuff
     host_compare_with_zero_equality<Torus>(
-        streams, gpu_indexes, gpu_count, tmp_1.data, trivial_blocks.data,
+        streams, gpu_indexes, 1, tmp_1.data, trivial_blocks.data,
         mem_ptr->comparison_buffer, bsks, ksks, trivial_blocks.len,
         mem_ptr->comparison_buffer->eq_buffer->is_non_zero_lut);
 
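The two comments above note that the comparison degenerates because the scalar is zero, so the check reduces to testing whether any block of `trivial_blocks` is non-zero (see `is_non_zero_lut` in this hunk and `at_least_one_upper_block_is_non_zero` in the next). A cleartext sketch of that reduction, for illustration only:

```cpp
#include <cstdint>
#include <vector>

// Cleartext sketch: a radix integer equals zero exactly when no block is
// non-zero, so the scalar == 0 comparison needs only a per-block non-zero
// test followed by an "at least one is true" reduction.
bool at_least_one_block_is_non_zero(const std::vector<uint64_t> &blocks) {
  for (uint64_t block : blocks)
    if (block != 0)
      return true;
  return false;
}
```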
@@ -468,7 +468,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
         ceil_div(trivial_blocks.len, message_modulus * carry_modulus - 1);
 
     is_at_least_one_comparisons_block_true<Torus>(
-        streams, gpu_indexes, gpu_count,
+        streams, gpu_indexes, 1,
         at_least_one_upper_block_is_non_zero.data, tmp_1.data,
         mem_ptr->comparison_buffer, bsks, ksks, tmp_1.len);
   }
@@ -482,7 +482,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
       [&](cudaStream_t const *streams, uint32_t const *gpu_indexes,
           uint32_t gpu_count) {
         integer_radix_apply_univariate_lookup_table_kb<Torus>(
-            streams, gpu_indexes, gpu_count,
+            streams, gpu_indexes, 1,
             cleaned_merged_interesting_remainder.data,
             cleaned_merged_interesting_remainder.data, bsks, ksks,
             cleaned_merged_interesting_remainder.len,