Mirror of https://github.com/zama-ai/tfhe-rs.git, synced 2026-01-10 07:08:03 -05:00.
feat(gpu): div_rem_2_2_blocks
This commit is contained in:
@@ -4153,6 +4153,128 @@ template <typename Torus> struct int_comparison_buffer {
|
||||
}
|
||||
};
|
||||
|
||||
// Scratch memory for a radix subtraction implemented as lhs + (-rhs)
// followed by a single-carry propagation pass.
template <typename Torus> struct int_sub_and_propagate {

  int_radix_params params;
  bool allocate_gpu_memory;

  // Device buffer holding the negated right-hand side
  // (num_radix_blocks blocks wide).
  CudaRadixCiphertextFFI *neg_rhs_array;

  // Carry-propagation scratch used after adding the negated rhs.
  int_sc_prop_memory<Torus> *sc_prop_mem;

  // Allocates all scratch on the GPUs described by `gpu_indexes`;
  // every byte requested is accumulated into `size_tracker`.
  int_sub_and_propagate(cudaStream_t const *streams,
                        uint32_t const *gpu_indexes, uint32_t gpu_count,
                        const int_radix_params params,
                        uint32_t num_radix_blocks, uint32_t requested_flag_in,
                        bool allocate_gpu_memory, uint64_t &size_tracker) {

    this->params = params;
    this->allocate_gpu_memory = allocate_gpu_memory;

    this->sc_prop_mem = new int_sc_prop_memory<Torus>(
        streams, gpu_indexes, gpu_count, params, num_radix_blocks,
        requested_flag_in, (uint32_t)0, allocate_gpu_memory, size_tracker);

    this->neg_rhs_array = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams[0], gpu_indexes[0], neg_rhs_array, num_radix_blocks,
        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
  }

  // Frees all scratch; must be called before destruction.
  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
               uint32_t gpu_count) {

    sc_prop_mem->release(streams, gpu_indexes, gpu_count);
    delete sc_prop_mem;

    release_radix_ciphertext_async(streams[0], gpu_indexes[0], neg_rhs_array,
                                   allocate_gpu_memory);
    delete neg_rhs_array;
  }
};
|
||||
|
||||
// Scratch for homomorphic bitwise operations (AND/OR/XOR and their scalar
// variants). Owns the lookup table(s) used to evaluate the operation.
template <typename Torus> struct int_bitop_buffer {

  int_radix_params params;
  int_radix_lut<Torus> *lut;
  BITOP_TYPE op;
  bool gpu_memory_allocated;

  // Builds the LUT(s) for `op`. Ciphertext-ciphertext ops use one bivariate
  // LUT shared by all blocks; scalar ops build one univariate LUT per
  // possible clear rhs value (message_modulus of them).
  int_bitop_buffer(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                   uint32_t gpu_count, BITOP_TYPE op, int_radix_params params,
                   uint32_t num_radix_blocks, bool allocate_gpu_memory,
                   uint64_t &size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
    this->op = op;
    this->params = params;

    switch (op) {
    case BITAND:
    case BITOR:
    case BITXOR:
      // Ciphertext x ciphertext: a single bivariate LUT covers every block.
      lut = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
                                     num_radix_blocks, allocate_gpu_memory,
                                     size_tracker);
      {
        auto lut_bivariate_f = [op](Torus lhs, Torus rhs) -> Torus {
          if (op == BITOP_TYPE::BITAND) {
            // AND
            return lhs & rhs;
          } else if (op == BITOP_TYPE::BITOR) {
            // OR
            return lhs | rhs;
          } else {
            // XOR
            return lhs ^ rhs;
          }
        };

        generate_device_accumulator_bivariate<Torus>(
            streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
            lut->get_max_degree(0), params.glwe_dimension,
            params.polynomial_size, params.message_modulus,
            params.carry_modulus, lut_bivariate_f, gpu_memory_allocated);
        lut->broadcast_lut(streams, gpu_indexes);
      }
      break;
    default:
      // Scalar OP
      lut = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
                                     params.message_modulus, num_radix_blocks,
                                     allocate_gpu_memory, size_tracker);

      // Unsigned index to match message_modulus and avoid a
      // signed/unsigned comparison.
      for (uint32_t i = 0; i < params.message_modulus; i++) {
        auto rhs = i;

        auto lut_univariate_scalar_f = [op, rhs](Torus x) -> Torus {
          if (op == BITOP_TYPE::SCALAR_BITAND) {
            // AND
            return x & rhs;
          } else if (op == BITOP_TYPE::SCALAR_BITOR) {
            // OR
            return x | rhs;
          } else {
            // XOR
            return x ^ rhs;
          }
        };
        generate_device_accumulator<Torus>(
            streams[0], gpu_indexes[0], lut->get_lut(0, i), lut->get_degree(i),
            lut->get_max_degree(i), params.glwe_dimension,
            params.polynomial_size, params.message_modulus,
            params.carry_modulus, lut_univariate_scalar_f,
            gpu_memory_allocated);
      }
      // Broadcast once after all LUTs are generated; calling it inside the
      // loop re-copied every LUT message_modulus times.
      lut->broadcast_lut(streams, gpu_indexes);
    }
  }

  // Frees the owned LUT(s); must be called before destruction.
  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
               uint32_t gpu_count) {
    lut->release(streams, gpu_indexes, gpu_count);
    delete lut;
  }
};
|
||||
|
||||
template <typename Torus> struct unsigned_int_div_rem_memory {
|
||||
int_radix_params params;
|
||||
uint32_t active_gpu_count;
|
||||
@@ -4160,9 +4282,17 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
|
||||
// memory objects for other operations
|
||||
int_logical_scalar_shift_buffer<Torus> *shift_mem_1;
|
||||
int_logical_scalar_shift_buffer<Torus> *shift_mem_2;
|
||||
int_borrow_prop_memory<Torus> *overflow_sub_mem;
|
||||
int_borrow_prop_memory<Torus> *overflow_sub_mem_1;
|
||||
int_borrow_prop_memory<Torus> *overflow_sub_mem_2;
|
||||
int_borrow_prop_memory<Torus> *overflow_sub_mem_3;
|
||||
int_comparison_buffer<Torus> *comparison_buffer;
|
||||
|
||||
int_comparison_buffer<Torus> *comparison_buffer_1;
|
||||
int_comparison_buffer<Torus> *comparison_buffer_2;
|
||||
int_comparison_buffer<Torus> *comparison_buffer_3;
|
||||
int_sub_and_propagate<Torus> *sub_and_propagate_mem;
|
||||
int_bitop_buffer<Torus> *bitor_mem_1;
|
||||
int_bitop_buffer<Torus> *bitor_mem_2;
|
||||
int_bitop_buffer<Torus> *bitor_mem_3;
|
||||
// lookup tables
|
||||
int_radix_lut<Torus> **masking_luts_1;
|
||||
int_radix_lut<Torus> **masking_luts_2;
|
||||
@@ -4172,11 +4302,21 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
|
||||
int_radix_lut<Torus> **zero_out_if_overflow_happened;
|
||||
int_radix_lut<Torus> **merge_overflow_flags_luts;
|
||||
|
||||
// lookup tables for 2_2 blocks
|
||||
int_radix_lut<Torus> *zero_out_if_not_1_lut;
|
||||
int_radix_lut<Torus> *zero_out_if_not_2_lut;
|
||||
int_radix_lut<Torus> *quotient_lut_1;
|
||||
int_radix_lut<Torus> *quotient_lut_2;
|
||||
int_radix_lut<Torus> *quotient_lut_3;
|
||||
|
||||
// sub streams
|
||||
cudaStream_t *sub_streams_1;
|
||||
cudaStream_t *sub_streams_2;
|
||||
cudaStream_t *sub_streams_3;
|
||||
cudaStream_t *sub_streams_4;
|
||||
cudaStream_t *sub_streams_5;
|
||||
cudaStream_t *sub_streams_6;
|
||||
cudaStream_t *sub_streams_7;
|
||||
|
||||
// temporary device buffers
|
||||
CudaRadixCiphertextFFI *remainder1;
|
||||
@@ -4197,6 +4337,34 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
|
||||
CudaRadixCiphertextFFI *at_least_one_upper_block_is_non_zero;
|
||||
CudaRadixCiphertextFFI *cleaned_merged_interesting_remainder;
|
||||
|
||||
// temporary device buffers for 2_2 blocks
|
||||
CudaRadixCiphertextFFI *d1; // num_blocks + 1
|
||||
CudaRadixCiphertextFFI *d2; // num_blocks + 1
|
||||
CudaRadixCiphertextFFI *d3; // num_blocks + 1
|
||||
CudaRadixCiphertextFFI *low1; // num_blocks
|
||||
CudaRadixCiphertextFFI *low2; // num_blocks
|
||||
CudaRadixCiphertextFFI *low3; // num_blocks
|
||||
CudaRadixCiphertextFFI *rem; // num_blocks
|
||||
CudaRadixCiphertextFFI *sub_result_1; // num_blocks
|
||||
CudaRadixCiphertextFFI *sub_result_2; // num_blocks
|
||||
CudaRadixCiphertextFFI *sub_result_3; // num_blocks
|
||||
CudaRadixCiphertextFFI *sub_1_overflowed; // num_blocks
|
||||
CudaRadixCiphertextFFI *sub_2_overflowed; // num_blocks
|
||||
CudaRadixCiphertextFFI *sub_3_overflowed; // num_blocks
|
||||
CudaRadixCiphertextFFI *comparison_blocks_1; // num_blocks
|
||||
CudaRadixCiphertextFFI *comparison_blocks_2; // num_blocks
|
||||
CudaRadixCiphertextFFI *comparison_blocks_3; // num_blocks
|
||||
CudaRadixCiphertextFFI *cmp_1; // boolean block
|
||||
CudaRadixCiphertextFFI *cmp_2; // boolean block
|
||||
CudaRadixCiphertextFFI *cmp_3; // boolean block
|
||||
CudaRadixCiphertextFFI *c0; // single block
|
||||
CudaRadixCiphertextFFI *c1; // single block
|
||||
CudaRadixCiphertextFFI *c2; // single block
|
||||
CudaRadixCiphertextFFI *c3; // single block
|
||||
CudaRadixCiphertextFFI *q1; // single block
|
||||
CudaRadixCiphertextFFI *q2; // single block
|
||||
CudaRadixCiphertextFFI *q3; // single block
|
||||
|
||||
Torus **first_indexes_for_overflow_sub;
|
||||
Torus **second_indexes_for_overflow_sub;
|
||||
Torus **scalars_for_overflow_sub;
|
||||
@@ -4284,6 +4452,138 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], at_least_one_upper_block_is_non_zero, 1,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
// temporary arrays used in 2_2 blocks
|
||||
d1 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], d1, num_blocks + 1,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
d2 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], d2, num_blocks + 1,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
d3 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], d3, num_blocks + 1,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
low1 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], low1, num_blocks, params.big_lwe_dimension,
|
||||
size_tracker, allocate_gpu_memory);
|
||||
|
||||
low2 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], low2, num_blocks, params.big_lwe_dimension,
|
||||
size_tracker, allocate_gpu_memory);
|
||||
|
||||
low3 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], low3, num_blocks, params.big_lwe_dimension,
|
||||
size_tracker, allocate_gpu_memory);
|
||||
|
||||
rem = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], rem, num_blocks, params.big_lwe_dimension,
|
||||
size_tracker, allocate_gpu_memory);
|
||||
|
||||
sub_result_1 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], sub_result_1, num_blocks,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
sub_result_2 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], sub_result_2, num_blocks,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
sub_result_3 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], sub_result_3, num_blocks,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
sub_1_overflowed = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], sub_1_overflowed, 1,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
sub_2_overflowed = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], sub_2_overflowed, 1,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
sub_3_overflowed = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], sub_3_overflowed, 1,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
comparison_blocks_1 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], comparison_blocks_1, num_blocks,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
comparison_blocks_2 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], comparison_blocks_2, num_blocks,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
comparison_blocks_3 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], comparison_blocks_3, num_blocks,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
cmp_1 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], cmp_1, 1, params.big_lwe_dimension,
|
||||
size_tracker, allocate_gpu_memory);
|
||||
|
||||
cmp_2 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], cmp_2, 1, params.big_lwe_dimension,
|
||||
size_tracker, allocate_gpu_memory);
|
||||
|
||||
cmp_3 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], cmp_3, 1, params.big_lwe_dimension,
|
||||
size_tracker, allocate_gpu_memory);
|
||||
|
||||
c0 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], c0, 1, params.big_lwe_dimension,
|
||||
size_tracker, allocate_gpu_memory);
|
||||
|
||||
c1 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], c1, 1, params.big_lwe_dimension,
|
||||
size_tracker, allocate_gpu_memory);
|
||||
|
||||
c2 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], c2, 1, params.big_lwe_dimension,
|
||||
size_tracker, allocate_gpu_memory);
|
||||
|
||||
c3 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], c3, 1, params.big_lwe_dimension,
|
||||
size_tracker, allocate_gpu_memory);
|
||||
|
||||
q1 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], q1, 1, params.big_lwe_dimension,
|
||||
size_tracker, allocate_gpu_memory);
|
||||
|
||||
q2 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], q2, 1, params.big_lwe_dimension,
|
||||
size_tracker, allocate_gpu_memory);
|
||||
|
||||
q3 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], q3, 1, params.big_lwe_dimension,
|
||||
size_tracker, allocate_gpu_memory);
|
||||
|
||||
}
|
||||
|
||||
// initialize lookup tables for div_rem operation
|
||||
@@ -4332,12 +4632,44 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
|
||||
message_extract_lut_2 =
|
||||
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
|
||||
num_blocks, allocate_gpu_memory, size_tracker);
|
||||
zero_out_if_not_1_lut =
|
||||
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
|
||||
num_blocks, allocate_gpu_memory, size_tracker);
|
||||
zero_out_if_not_2_lut =
|
||||
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
|
||||
num_blocks, allocate_gpu_memory, size_tracker);
|
||||
quotient_lut_1 =
|
||||
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1, 1,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
quotient_lut_2 =
|
||||
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1, 1,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
quotient_lut_3 =
|
||||
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1, 1,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
|
||||
auto message_modulus = params.message_modulus;
|
||||
auto lut_f_message_extract = [message_modulus](Torus x) -> Torus {
|
||||
return x % message_modulus;
|
||||
};
|
||||
|
||||
auto zero_out_if_not_1_lut_f = [](Torus x) -> Torus {
|
||||
Torus block = x / 2;
|
||||
bool condition = (x & 1) == 1;
|
||||
return block * (Torus)condition;
|
||||
};
|
||||
auto zero_out_if_not_2_lut_f = [](Torus x) -> Torus {
|
||||
Torus block = x / 3;
|
||||
bool condition = (x % 3) == 2;
|
||||
return block * (Torus)condition;
|
||||
};
|
||||
auto quotient_lut_1_f = [](Torus cond) -> Torus {
|
||||
return (Torus)(cond == 2);
|
||||
};
|
||||
auto quotient_lut_2_f = [](Torus cond) -> Torus {
|
||||
return (Torus)((cond == 2) * 2);
|
||||
};
|
||||
auto quotient_lut_3_f = [](Torus cond) -> Torus { return cond * 3; };
|
||||
int_radix_lut<Torus> *luts[2] = {message_extract_lut_1,
|
||||
message_extract_lut_2};
|
||||
for (int j = 0; j < 2; j++) {
|
||||
@@ -4349,7 +4681,43 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
|
||||
luts[j]->broadcast_lut(streams, gpu_indexes);
|
||||
}
|
||||
|
||||
// Give name to closures to improve readability
|
||||
generate_device_accumulator<Torus>(
|
||||
streams[0], gpu_indexes[0], zero_out_if_not_1_lut->get_lut(0, 0),
|
||||
zero_out_if_not_1_lut->get_degree(0),
|
||||
zero_out_if_not_1_lut->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
zero_out_if_not_1_lut_f, gpu_memory_allocated);
|
||||
zero_out_if_not_1_lut->broadcast_lut(streams, gpu_indexes);
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams[0], gpu_indexes[0], zero_out_if_not_2_lut->get_lut(0, 0),
|
||||
zero_out_if_not_2_lut->get_degree(0),
|
||||
zero_out_if_not_2_lut->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
zero_out_if_not_2_lut_f, gpu_memory_allocated);
|
||||
zero_out_if_not_2_lut->broadcast_lut(streams, gpu_indexes);
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams[0], gpu_indexes[0], quotient_lut_1->get_lut(0, 0),
|
||||
quotient_lut_1->get_degree(0), quotient_lut_1->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, quotient_lut_1_f, gpu_memory_allocated);
|
||||
quotient_lut_1->broadcast_lut(streams, gpu_indexes);
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams[0], gpu_indexes[0], quotient_lut_2->get_lut(0, 0),
|
||||
quotient_lut_2->get_degree(0), quotient_lut_2->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, quotient_lut_2_f, gpu_memory_allocated);
|
||||
quotient_lut_2->broadcast_lut(streams, gpu_indexes);
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams[0], gpu_indexes[0], quotient_lut_3->get_lut(0, 0),
|
||||
quotient_lut_3->get_degree(0), quotient_lut_3->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, quotient_lut_3_f, gpu_memory_allocated);
|
||||
quotient_lut_3->broadcast_lut(streams, gpu_indexes);
|
||||
|
||||
auto overflow_happened = [](uint64_t overflow_sum) {
|
||||
return overflow_sum != 0;
|
||||
};
|
||||
@@ -4469,11 +4837,17 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
|
||||
params, 2 * num_blocks, allocate_gpu_memory, size_tracker);
|
||||
|
||||
uint32_t compute_overflow = 1;
|
||||
overflow_sub_mem = new int_borrow_prop_memory<Torus>(
|
||||
overflow_sub_mem_1 = new int_borrow_prop_memory<Torus>(
|
||||
streams, gpu_indexes, gpu_count, params, num_blocks, compute_overflow,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
uint32_t group_size = overflow_sub_mem->group_size;
|
||||
bool use_seq = overflow_sub_mem->prop_simu_group_carries_mem
|
||||
overflow_sub_mem_2 = new int_borrow_prop_memory<Torus>(
|
||||
streams, gpu_indexes, gpu_count, params, num_blocks, compute_overflow,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
overflow_sub_mem_3 = new int_borrow_prop_memory<Torus>(
|
||||
streams, gpu_indexes, gpu_count, params, num_blocks, compute_overflow,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
uint32_t group_size = overflow_sub_mem_1->group_size;
|
||||
bool use_seq = overflow_sub_mem_1->prop_simu_group_carries_mem
|
||||
->use_sequential_algorithm_to_resolve_group_carries;
|
||||
create_indexes_for_overflow_sub(streams, gpu_indexes, num_blocks,
|
||||
group_size, use_seq, allocate_gpu_memory,
|
||||
@@ -4483,6 +4857,32 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
|
||||
streams, gpu_indexes, gpu_count, COMPARISON_TYPE::NE, params,
|
||||
num_blocks, false, allocate_gpu_memory, size_tracker);
|
||||
|
||||
comparison_buffer_1 = new int_comparison_buffer<Torus>(
|
||||
streams, gpu_indexes, gpu_count, COMPARISON_TYPE::EQ, params,
|
||||
num_blocks, false, allocate_gpu_memory, size_tracker);
|
||||
|
||||
comparison_buffer_2 = new int_comparison_buffer<Torus>(
|
||||
streams, gpu_indexes, gpu_count, COMPARISON_TYPE::EQ, params,
|
||||
num_blocks, false, allocate_gpu_memory, size_tracker);
|
||||
|
||||
comparison_buffer_3 = new int_comparison_buffer<Torus>(
|
||||
streams, gpu_indexes, gpu_count, COMPARISON_TYPE::EQ, params,
|
||||
num_blocks, false, allocate_gpu_memory, size_tracker);
|
||||
|
||||
sub_and_propagate_mem = new int_sub_and_propagate<Torus>(
|
||||
streams, gpu_indexes, gpu_count, params, num_blocks + 1,
|
||||
outputFlag::FLAG_NONE, allocate_gpu_memory, size_tracker);
|
||||
|
||||
bitor_mem_1 = new int_bitop_buffer<Torus>(
|
||||
streams, gpu_indexes, gpu_count, BITOP_TYPE::BITOR, params, num_blocks,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
bitor_mem_2 = new int_bitop_buffer<Torus>(
|
||||
streams, gpu_indexes, gpu_count, BITOP_TYPE::BITOR, params, num_blocks,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
bitor_mem_3 = new int_bitop_buffer<Torus>(
|
||||
streams, gpu_indexes, gpu_count, BITOP_TYPE::BITOR, params, num_blocks,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
|
||||
init_lookup_tables(streams, gpu_indexes, gpu_count, num_blocks,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
init_temporary_buffers(streams, gpu_indexes, gpu_count, num_blocks,
|
||||
@@ -4496,11 +4896,20 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
|
||||
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
|
||||
sub_streams_4 =
|
||||
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
|
||||
sub_streams_5 =
|
||||
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
|
||||
sub_streams_6 =
|
||||
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
|
||||
sub_streams_7 =
|
||||
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
|
||||
for (uint j = 0; j < active_gpu_count; j++) {
|
||||
sub_streams_1[j] = cuda_create_stream(gpu_indexes[j]);
|
||||
sub_streams_2[j] = cuda_create_stream(gpu_indexes[j]);
|
||||
sub_streams_3[j] = cuda_create_stream(gpu_indexes[j]);
|
||||
sub_streams_4[j] = cuda_create_stream(gpu_indexes[j]);
|
||||
sub_streams_5[j] = cuda_create_stream(gpu_indexes[j]);
|
||||
sub_streams_6[j] = cuda_create_stream(gpu_indexes[j]);
|
||||
sub_streams_7[j] = cuda_create_stream(gpu_indexes[j]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4607,11 +5016,11 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
|
||||
// release and delete other operation memory objects
|
||||
shift_mem_1->release(streams, gpu_indexes, gpu_count);
|
||||
shift_mem_2->release(streams, gpu_indexes, gpu_count);
|
||||
overflow_sub_mem->release(streams, gpu_indexes, gpu_count);
|
||||
overflow_sub_mem_1->release(streams, gpu_indexes, gpu_count);
|
||||
comparison_buffer->release(streams, gpu_indexes, gpu_count);
|
||||
delete shift_mem_1;
|
||||
delete shift_mem_2;
|
||||
delete overflow_sub_mem;
|
||||
delete overflow_sub_mem_1;
|
||||
delete comparison_buffer;
|
||||
|
||||
// drop temporary buffers
|
||||
@@ -4750,89 +5159,6 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
|
||||
}
|
||||
};
|
||||
|
||||
// Buffer for bitwise operations between two ciphertexts (AND/OR/XOR) or
// between a ciphertext and a clear scalar (SCALAR_* variants).
template <typename Torus> struct int_bitop_buffer {

  int_radix_params params;
  int_radix_lut<Torus> *lut;
  BITOP_TYPE op;
  bool gpu_memory_allocated;

  // Prepares the lookup table(s) that implement `op` on radix blocks.
  int_bitop_buffer(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                   uint32_t gpu_count, BITOP_TYPE op, int_radix_params params,
                   uint32_t num_radix_blocks, bool allocate_gpu_memory,
                   uint64_t &size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
    this->op = op;
    this->params = params;

    const bool is_bivariate = (op == BITAND || op == BITOR || op == BITXOR);
    if (is_bivariate) {
      // Ciphertext x ciphertext: one bivariate LUT shared by all blocks.
      lut = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
                                     num_radix_blocks, allocate_gpu_memory,
                                     size_tracker);
      auto bitop_f = [op](Torus lhs, Torus rhs) -> Torus {
        switch (op) {
        case BITOP_TYPE::BITAND:
          return lhs & rhs;
        case BITOP_TYPE::BITOR:
          return lhs | rhs;
        default:
          return lhs ^ rhs;
        }
      };
      generate_device_accumulator_bivariate<Torus>(
          streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
          lut->get_max_degree(0), params.glwe_dimension,
          params.polynomial_size, params.message_modulus, params.carry_modulus,
          bitop_f, gpu_memory_allocated);
      lut->broadcast_lut(streams, gpu_indexes);
    } else {
      // Scalar OP: one univariate LUT per possible clear rhs value.
      lut = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
                                     params.message_modulus, num_radix_blocks,
                                     allocate_gpu_memory, size_tracker);
      for (int i = 0; i < params.message_modulus; i++) {
        auto rhs = i;
        auto scalar_bitop_f = [op, rhs](Torus x) -> Torus {
          switch (op) {
          case BITOP_TYPE::SCALAR_BITAND:
            return x & rhs;
          case BITOP_TYPE::SCALAR_BITOR:
            return x | rhs;
          default:
            return x ^ rhs;
          }
        };
        generate_device_accumulator<Torus>(
            streams[0], gpu_indexes[0], lut->get_lut(0, i), lut->get_degree(i),
            lut->get_max_degree(i), params.glwe_dimension,
            params.polynomial_size, params.message_modulus,
            params.carry_modulus, scalar_bitop_f, gpu_memory_allocated);
        lut->broadcast_lut(streams, gpu_indexes);
      }
    }
  }

  // Releases the owned LUT(s).
  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
               uint32_t gpu_count) {
    lut->release(streams, gpu_indexes, gpu_count);
    delete lut;
  }
};
|
||||
|
||||
template <typename Torus> struct int_scalar_mul_buffer {
|
||||
int_radix_params params;
|
||||
int_logical_scalar_shift_buffer<Torus> *logical_scalar_shift_buffer;
|
||||
@@ -5204,45 +5530,6 @@ template <typename Torus> struct int_scalar_mul_high_buffer {
|
||||
}
|
||||
};
|
||||
|
||||
// Working memory for subtraction done as addition of the negated operand,
// then resolving carries in a single propagation step.
template <typename Torus> struct int_sub_and_propagate {

  int_radix_params params;
  bool allocate_gpu_memory;

  // Buffer for -rhs, sized to num_radix_blocks blocks.
  CudaRadixCiphertextFFI *neg_rhs_array;

  // Scratch for the carry-propagation step.
  int_sc_prop_memory<Torus> *sc_prop_mem;

  // Allocates GPU scratch; requested bytes accumulate into size_tracker.
  int_sub_and_propagate(cudaStream_t const *streams,
                        uint32_t const *gpu_indexes, uint32_t gpu_count,
                        const int_radix_params params,
                        uint32_t num_radix_blocks, uint32_t requested_flag_in,
                        bool allocate_gpu_memory, uint64_t &size_tracker) {

    this->params = params;
    this->allocate_gpu_memory = allocate_gpu_memory;

    this->sc_prop_mem = new int_sc_prop_memory<Torus>(
        streams, gpu_indexes, gpu_count, params, num_radix_blocks,
        requested_flag_in, (uint32_t)0, allocate_gpu_memory, size_tracker);

    this->neg_rhs_array = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams[0], gpu_indexes[0], neg_rhs_array, num_radix_blocks,
        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
  }

  // Releases all owned scratch; call before destroying this object.
  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
               uint32_t gpu_count) {

    sc_prop_mem->release(streams, gpu_indexes, gpu_count);
    delete sc_prop_mem;

    release_radix_ciphertext_async(streams[0], gpu_indexes[0], neg_rhs_array,
                                   allocate_gpu_memory);
    delete neg_rhs_array;
  }
};
|
||||
|
||||
template <typename Torus> struct int_extend_radix_with_sign_msb_buffer {
|
||||
|
||||
int_radix_params params;
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
#include "crypto/keyswitch.cuh"
|
||||
#include "device.h"
|
||||
#include "integer/abs.cuh"
|
||||
#include "integer/cast.cuh"
|
||||
#include "integer/comparison.cuh"
|
||||
#include "integer/integer.cuh"
|
||||
#include "integer/integer_utilities.h"
|
||||
@@ -32,6 +33,412 @@ __host__ uint64_t scratch_cuda_integer_div_rem_kb(
|
||||
return size_tracker;
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *quotient,
|
||||
CudaRadixCiphertextFFI *remainder, CudaRadixCiphertextFFI const *numerator,
|
||||
CudaRadixCiphertextFFI const *divisor, void *const *bsks,
|
||||
uint64_t *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
unsigned_int_div_rem_memory<uint64_t> *mem_ptr) {
|
||||
|
||||
// alias
|
||||
auto radix_params = mem_ptr->params;
|
||||
auto num_blocks = quotient->num_radix_blocks;
|
||||
auto d1 = mem_ptr->d1;
|
||||
auto d2 = mem_ptr->d2;
|
||||
auto d3 = mem_ptr->d3;
|
||||
auto low1 = mem_ptr->low1;
|
||||
auto low2 = mem_ptr->low2;
|
||||
auto low3 = mem_ptr->low3;
|
||||
auto rem = mem_ptr->rem;
|
||||
auto sub_result_1 = mem_ptr->sub_result_1;
|
||||
auto sub_1_overflowed = mem_ptr->sub_1_overflowed;
|
||||
auto sub_result_2 = mem_ptr->sub_result_2;
|
||||
auto sub_2_overflowed = mem_ptr->sub_2_overflowed;
|
||||
auto sub_result_3 = mem_ptr->sub_result_3;
|
||||
auto sub_3_overflowed = mem_ptr->sub_3_overflowed;
|
||||
// auto d4 = mem_ptr->d4;
|
||||
|
||||
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], remainder,
|
||||
numerator);
|
||||
set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
|
||||
quotient, 0, num_blocks);
|
||||
quotient->num_radix_blocks = 0;
|
||||
// Computes 2*d by extending and shifting
|
||||
auto extend_2xd_f = [&](cudaStream_t const *streams,
|
||||
uint32_t const *gpu_indexes, uint32_t gpu_count) {
|
||||
// d2 is allocated with num_blocks + 1; so we extend with 1.
|
||||
host_extend_radix_with_trivial_zero_blocks_msb<Torus>(d2, divisor, streams,
|
||||
gpu_indexes);
|
||||
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
|
||||
streams, gpu_indexes, gpu_count, d2, 1, mem_ptr->shift_mem_1, bsks,
|
||||
ksks, ms_noise_reduction_key, d2->num_radix_blocks);
|
||||
};
|
||||
|
||||
// Computes 3*d = 4*d - d using block shift and subtraction
|
||||
auto extend_3xd_f = [&](cudaStream_t const *streams,
|
||||
uint32_t const *gpu_indexes, uint32_t gpu_count) {
|
||||
// d1 is allocated with num_blocks + 1; so we extend with 1.
|
||||
host_extend_radix_with_trivial_zero_blocks_msb<Torus>(d1, divisor, streams,
|
||||
gpu_indexes);
|
||||
host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count, d3,
|
||||
d1, 1, d1->num_radix_blocks);
|
||||
set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], d3,
|
||||
0, 1);
|
||||
host_sub_and_propagate_single_carry(
|
||||
streams, gpu_indexes, gpu_count, d3, d1, nullptr, nullptr,
|
||||
mem_ptr->sub_and_propagate_mem, bsks, ksks, ms_noise_reduction_key,
|
||||
outputFlag::FLAG_NONE, 0);
|
||||
// trim d1 by one msb block
|
||||
d1->num_radix_blocks -= 1;
|
||||
};
|
||||
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
|
||||
}
|
||||
|
||||
extend_2xd_f(mem_ptr->sub_streams_1, gpu_indexes, gpu_count);
|
||||
extend_3xd_f(mem_ptr->sub_streams_2, gpu_indexes, gpu_count);
|
||||
|
||||
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
|
||||
}
|
||||
|
||||
print_body<Torus>("remainder", (Torus *)remainder->ptr,
|
||||
remainder->num_radix_blocks, radix_params.big_lwe_dimension,
|
||||
576460752303423488ULL);
|
||||
|
||||
for (int block_index = num_blocks - 1; block_index >= 0; block_index--) {
|
||||
uint32_t slice_len = num_blocks - block_index;
|
||||
|
||||
low1->num_radix_blocks = slice_len;
|
||||
low2->num_radix_blocks = slice_len;
|
||||
low3->num_radix_blocks = slice_len;
|
||||
rem->num_radix_blocks = slice_len;
|
||||
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], low1,
|
||||
0, slice_len, d1, 0, slice_len);
|
||||
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], low2,
|
||||
0, slice_len, d2, 0, slice_len);
|
||||
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], low3,
|
||||
0, slice_len, d3, 0, slice_len);
|
||||
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], rem, 0,
|
||||
slice_len, remainder, block_index,
|
||||
num_blocks);
|
||||
|
||||
if (slice_len == 4) {
|
||||
print_body<Torus>("low1", (Torus *)low1->ptr, low1->num_radix_blocks,
|
||||
radix_params.big_lwe_dimension, 576460752303423488ULL);
|
||||
print_body<Torus>("low2", (Torus *)low2->ptr, low2->num_radix_blocks,
|
||||
radix_params.big_lwe_dimension, 576460752303423488ULL);
|
||||
print_body<Torus>("low3", (Torus *)low3->ptr, low3->num_radix_blocks,
|
||||
radix_params.big_lwe_dimension, 576460752303423488ULL);
|
||||
print_body<Torus>("rem", (Torus *)rem->ptr, rem->num_radix_blocks,
|
||||
radix_params.big_lwe_dimension, 576460752303423488ULL);
|
||||
}
|
||||
uint32_t compute_borrow = 1;
|
||||
uint32_t uses_input_borrow = 0;
|
||||
auto sub_result_f = [&](cudaStream_t const *streams,
|
||||
uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *sub_result,
|
||||
CudaRadixCiphertextFFI *sub_overflowed,
|
||||
int_borrow_prop_memory<Torus> *overflow_sub_mem,
|
||||
CudaRadixCiphertextFFI *low) {
|
||||
sub_result->num_radix_blocks = low->num_radix_blocks;
|
||||
host_integer_overflowing_sub<uint64_t>(
|
||||
streams, gpu_indexes, gpu_count, sub_result, rem, low, sub_overflowed,
|
||||
(const CudaRadixCiphertextFFI *)nullptr, overflow_sub_mem, bsks, ksks,
|
||||
ms_noise_reduction_key, compute_borrow, uses_input_borrow);
|
||||
};
|
||||
|
||||
auto cmp_f = [&](cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *out_boolean_block,
|
||||
CudaRadixCiphertextFFI *comparison_blocks,
|
||||
CudaRadixCiphertextFFI *d,
|
||||
int_comparison_buffer<Torus> *comparison_buffer) {
|
||||
CudaRadixCiphertextFFI *d_msb = new CudaRadixCiphertextFFI;
|
||||
uint32_t slice_start = num_blocks - block_index;
|
||||
uint32_t slice_end = d->num_radix_blocks;
|
||||
as_radix_ciphertext_slice<Torus>(d_msb, d, slice_start, slice_end);
|
||||
host_compare_blocks_with_zero<Torus>(
|
||||
streams, gpu_indexes, gpu_count, comparison_blocks, d_msb,
|
||||
comparison_buffer, bsks, ksks, ms_noise_reduction_key,
|
||||
d_msb->num_radix_blocks, comparison_buffer->is_zero_lut);
|
||||
are_all_comparisons_block_true(
|
||||
streams, gpu_indexes, gpu_count, out_boolean_block, comparison_blocks,
|
||||
comparison_buffer, bsks, ksks, ms_noise_reduction_key,
|
||||
comparison_blocks->num_radix_blocks);
|
||||
|
||||
host_negation<Torus>(
|
||||
streams[0], gpu_indexes[0], (Torus *)out_boolean_block->ptr,
|
||||
(Torus *)out_boolean_block->ptr, radix_params.big_lwe_dimension, 1);
|
||||
// we calculate encoding because this block works only for message_modulus
|
||||
// = 4 and carry_modulus = 4.
|
||||
const Torus encoded_scalar = 1ULL << (sizeof(Torus) * 8 - 5);
|
||||
host_addition_plaintext_scalar<Torus>(
|
||||
streams[0], gpu_indexes[0], (Torus *)out_boolean_block->ptr,
|
||||
(Torus *)out_boolean_block->ptr, encoded_scalar,
|
||||
radix_params.big_lwe_dimension, 1);
|
||||
delete d_msb;
|
||||
};
|
||||
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
|
||||
}
|
||||
sub_result_f(mem_ptr->sub_streams_1, gpu_indexes, gpu_count,
|
||||
mem_ptr->sub_result_1, mem_ptr->sub_1_overflowed,
|
||||
mem_ptr->overflow_sub_mem_1, low3);
|
||||
sub_result_f(mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
|
||||
mem_ptr->sub_result_2, mem_ptr->sub_2_overflowed,
|
||||
mem_ptr->overflow_sub_mem_2, low2);
|
||||
sub_result_f(mem_ptr->sub_streams_3, gpu_indexes, gpu_count,
|
||||
mem_ptr->sub_result_3, mem_ptr->sub_3_overflowed,
|
||||
mem_ptr->overflow_sub_mem_3, low1);
|
||||
cmp_f(mem_ptr->sub_streams_4, gpu_indexes, gpu_count, mem_ptr->cmp_1,
|
||||
mem_ptr->comparison_blocks_1, mem_ptr->d3,
|
||||
mem_ptr->comparison_buffer_1);
|
||||
cmp_f(mem_ptr->sub_streams_5, gpu_indexes, gpu_count, mem_ptr->cmp_2,
|
||||
mem_ptr->comparison_blocks_2, mem_ptr->d2,
|
||||
mem_ptr->comparison_buffer_2);
|
||||
cmp_f(mem_ptr->sub_streams_6, gpu_indexes, gpu_count, mem_ptr->cmp_3,
|
||||
mem_ptr->comparison_blocks_3, mem_ptr->d1,
|
||||
mem_ptr->comparison_buffer_3);
|
||||
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_4[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_5[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_6[j], gpu_indexes[j]);
|
||||
}
|
||||
|
||||
auto r1 = mem_ptr->sub_result_3;
|
||||
auto r2 = mem_ptr->sub_result_2;
|
||||
auto r3 = mem_ptr->sub_result_1;
|
||||
|
||||
auto o1 = mem_ptr->sub_3_overflowed;
|
||||
auto o2 = mem_ptr->sub_2_overflowed;
|
||||
auto o3 = mem_ptr->sub_1_overflowed;
|
||||
|
||||
print_body<Torus>("r1", (Torus *)r1->ptr, r1->num_radix_blocks,
|
||||
radix_params.big_lwe_dimension, 576460752303423488ULL);
|
||||
print_body<Torus>("r2", (Torus *)r2->ptr, r2->num_radix_blocks,
|
||||
radix_params.big_lwe_dimension, 576460752303423488ULL);
|
||||
print_body<Torus>("r3", (Torus *)r3->ptr, r3->num_radix_blocks,
|
||||
radix_params.big_lwe_dimension, 576460752303423488ULL);
|
||||
|
||||
// used as a bitor
|
||||
host_integer_radix_bitop_kb(mem_ptr->sub_streams_1, gpu_indexes, gpu_count,
|
||||
o3, o3, mem_ptr->cmp_1, mem_ptr->bitor_mem_1,
|
||||
bsks, ksks, ms_noise_reduction_key);
|
||||
// used as a bitor
|
||||
host_integer_radix_bitop_kb(mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
|
||||
o2, o2, mem_ptr->cmp_2, mem_ptr->bitor_mem_2,
|
||||
bsks, ksks, ms_noise_reduction_key);
|
||||
// used as a bitor
|
||||
host_integer_radix_bitop_kb(mem_ptr->sub_streams_3, gpu_indexes, gpu_count,
|
||||
o1, o1, mem_ptr->cmp_3, mem_ptr->bitor_mem_3,
|
||||
bsks, ksks, ms_noise_reduction_key);
|
||||
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
|
||||
}
|
||||
|
||||
print_body<Torus>("o1", (Torus *)o1->ptr, 1, radix_params.big_lwe_dimension,
|
||||
576460752303423488ULL);
|
||||
print_body<Torus>("o2", (Torus *)o2->ptr, 1, radix_params.big_lwe_dimension,
|
||||
576460752303423488ULL);
|
||||
print_body<Torus>("o3", (Torus *)o3->ptr, 1, radix_params.big_lwe_dimension,
|
||||
576460752303423488ULL);
|
||||
|
||||
print_body<Torus>("cmp1", (Torus *)mem_ptr->cmp_1->ptr, 1,
|
||||
radix_params.big_lwe_dimension, 576460752303423488ULL);
|
||||
print_body<Torus>("cmp2", (Torus *)mem_ptr->cmp_2->ptr, 1,
|
||||
radix_params.big_lwe_dimension, 576460752303423488ULL);
|
||||
print_body<Torus>("cmp3", (Torus *)mem_ptr->cmp_3->ptr, 1,
|
||||
radix_params.big_lwe_dimension, 576460752303423488ULL);
|
||||
|
||||
// The cx variables tell whether the corresponding result of the subtraction
|
||||
// should be kept, and what value the quotient block should have
|
||||
//
|
||||
// for c3, c0; the block values are in [0, 1]
|
||||
// for c2, c1; the block values are in [0, 1, 2], 2 meaning true; 0,1
|
||||
// meaning false
|
||||
|
||||
// c3 = !o3
|
||||
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
|
||||
mem_ptr->c3, 0, 1, o3, 0, 1);
|
||||
host_negation<Torus>(streams[0], gpu_indexes[0], (Torus *)mem_ptr->c3->ptr,
|
||||
(Torus *)mem_ptr->c3->ptr,
|
||||
radix_params.big_lwe_dimension, 1);
|
||||
const Torus encoded_scalar = 1ULL << (sizeof(Torus) * 8 - 5);
|
||||
host_addition_plaintext_scalar<Torus>(
|
||||
streams[0], gpu_indexes[0], (Torus *)mem_ptr->c3->ptr,
|
||||
(Torus *)mem_ptr->c3->ptr, encoded_scalar,
|
||||
radix_params.big_lwe_dimension, 1);
|
||||
|
||||
// c2 = !o2 + o3
|
||||
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
|
||||
mem_ptr->c2, 0, 1, o2, 0, 1);
|
||||
host_negation<Torus>(streams[0], gpu_indexes[0], (Torus *)mem_ptr->c2->ptr,
|
||||
(Torus *)mem_ptr->c2->ptr,
|
||||
radix_params.big_lwe_dimension, 1);
|
||||
host_addition_plaintext_scalar<Torus>(
|
||||
streams[0], gpu_indexes[0], (Torus *)mem_ptr->c2->ptr,
|
||||
(Torus *)mem_ptr->c2->ptr, encoded_scalar,
|
||||
radix_params.big_lwe_dimension, 1);
|
||||
|
||||
host_addition<Torus>(streams[0], gpu_indexes[0], mem_ptr->c2, mem_ptr->c2,
|
||||
o3, 1, 4, 4);
|
||||
|
||||
// c1 = !o1 + o2
|
||||
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
|
||||
mem_ptr->c1, 0, 1, o1, 0, 1);
|
||||
host_negation<Torus>(streams[0], gpu_indexes[0], (Torus *)mem_ptr->c1->ptr,
|
||||
(Torus *)mem_ptr->c1->ptr,
|
||||
radix_params.big_lwe_dimension, 1);
|
||||
host_addition_plaintext_scalar<Torus>(
|
||||
streams[0], gpu_indexes[0], (Torus *)mem_ptr->c1->ptr,
|
||||
(Torus *)mem_ptr->c1->ptr, encoded_scalar,
|
||||
radix_params.big_lwe_dimension, 1);
|
||||
host_addition<Torus>(streams[0], gpu_indexes[0], mem_ptr->c1, mem_ptr->c1,
|
||||
o2, 1, 4, 4);
|
||||
|
||||
// c0 = o1 (direct copy)
|
||||
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
|
||||
mem_ptr->c0, 0, 1, o1, 0, 1);
|
||||
|
||||
print_body<Torus>("c0", (Torus *)mem_ptr->c0->ptr, 1,
|
||||
radix_params.big_lwe_dimension, 576460752303423488ULL);
|
||||
print_body<Torus>("c1", (Torus *)mem_ptr->c1->ptr, 1,
|
||||
radix_params.big_lwe_dimension, 576460752303423488ULL);
|
||||
print_body<Torus>("c2", (Torus *)mem_ptr->c2->ptr, 1,
|
||||
radix_params.big_lwe_dimension, 576460752303423488ULL);
|
||||
print_body<Torus>("c3", (Torus *)mem_ptr->c3->ptr, 1,
|
||||
radix_params.big_lwe_dimension, 576460752303423488ULL);
|
||||
|
||||
auto conditional_update =
|
||||
[&](cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *cx,
|
||||
CudaRadixCiphertextFFI *rx, int_radix_lut<Torus> *lut,
|
||||
uint32_t factor) {
|
||||
// printf("rx->num_radix_blocks: %d\n", rx->num_radix_blocks);
|
||||
host_cleartext_multiplication<Torus>(
|
||||
streams[0], gpu_indexes[0], (Torus *)rx->ptr, (Torus *)rx->ptr, factor,
|
||||
radix_params.big_lwe_dimension, rx->num_radix_blocks);
|
||||
host_add_the_same_block_to_all_blocks<Torus>(streams[0], gpu_indexes[0], rx,
|
||||
rx, cx, 4, 4);
|
||||
|
||||
// print_body<Torus>("gpu_after_add_rem", (Torus *)rx->ptr, rx->num_radix_blocks,
|
||||
// radix_params.big_lwe_dimension, 576460752303423488ULL);
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, rx, rx, bsks, ksks,
|
||||
ms_noise_reduction_key, lut, rx->num_radix_blocks);
|
||||
|
||||
// print_body<Torus>("gpu_after_pbs_rem", (Torus *)rx->ptr, rx->num_radix_blocks,
|
||||
// radix_params.big_lwe_dimension, 576460752303423488ULL);
|
||||
};
|
||||
|
||||
auto calculate_quotient_bits =
|
||||
[&](cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *q,
|
||||
CudaRadixCiphertextFFI *c, int_radix_lut<Torus> *lut) {
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, q, c, bsks, ksks,
|
||||
ms_noise_reduction_key, lut, 1);
|
||||
};
|
||||
|
||||
// print_body<Torus>("gpu_before_rem", (Torus *)rem->ptr, rem->num_radix_blocks,
|
||||
// radix_params.big_lwe_dimension, 576460752303423488ULL);
|
||||
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
|
||||
}
|
||||
conditional_update(mem_ptr->sub_streams_1, gpu_indexes, gpu_count,
|
||||
mem_ptr->c3, r3, mem_ptr->zero_out_if_not_1_lut, 2);
|
||||
conditional_update(mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
|
||||
mem_ptr->c2, r2, mem_ptr->zero_out_if_not_2_lut, 3);
|
||||
conditional_update(mem_ptr->sub_streams_3, gpu_indexes, gpu_count,
|
||||
mem_ptr->c1, r1, mem_ptr->zero_out_if_not_2_lut, 3);
|
||||
conditional_update(mem_ptr->sub_streams_4, gpu_indexes, gpu_count,
|
||||
mem_ptr->c0, rem, mem_ptr->zero_out_if_not_1_lut, 2);
|
||||
|
||||
calculate_quotient_bits(mem_ptr->sub_streams_5, gpu_indexes, 1, mem_ptr->q1,
|
||||
mem_ptr->c1, mem_ptr->quotient_lut_1);
|
||||
calculate_quotient_bits(mem_ptr->sub_streams_6, gpu_indexes, 1, mem_ptr->q2,
|
||||
mem_ptr->c2, mem_ptr->quotient_lut_2);
|
||||
calculate_quotient_bits(mem_ptr->sub_streams_7, gpu_indexes, 1, mem_ptr->q3,
|
||||
mem_ptr->c3, mem_ptr->quotient_lut_3);
|
||||
|
||||
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_4[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_5[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_6[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_7[j], gpu_indexes[j]);
|
||||
}
|
||||
|
||||
print_body<Torus>("gpu_after_r1", (Torus *)r1->ptr, r1->num_radix_blocks,
|
||||
radix_params.big_lwe_dimension, 576460752303423488ULL);
|
||||
print_body<Torus>("gpu_after_r2", (Torus *)r2->ptr, r2->num_radix_blocks,
|
||||
radix_params.big_lwe_dimension, 576460752303423488ULL);
|
||||
print_body<Torus>("gpu_after_r3", (Torus *)r3->ptr, r3->num_radix_blocks,
|
||||
radix_params.big_lwe_dimension, 576460752303423488ULL);
|
||||
print_body<Torus>("gpu_after_rem", (Torus *)rem->ptr, rem->num_radix_blocks,
|
||||
radix_params.big_lwe_dimension, 576460752303423488ULL);
|
||||
|
||||
print_body<Torus>("gpu_after_q1", (Torus *)mem_ptr->q1->ptr, mem_ptr->q1->num_radix_blocks,
|
||||
radix_params.big_lwe_dimension, 576460752303423488ULL);
|
||||
print_body<Torus>("gpu_after_q2", (Torus *)mem_ptr->q2->ptr, mem_ptr->q2->num_radix_blocks,
|
||||
radix_params.big_lwe_dimension, 576460752303423488ULL);
|
||||
print_body<Torus>("gpu_after_q3", (Torus *)mem_ptr->q3->ptr, mem_ptr->q3->num_radix_blocks,
|
||||
radix_params.big_lwe_dimension, 576460752303423488ULL);
|
||||
|
||||
host_addition<Torus>(streams[0], gpu_indexes[0], rem, rem,
|
||||
r3, rem->num_radix_blocks, 4, 4);
|
||||
host_addition<Torus>(streams[0], gpu_indexes[0], rem, rem,
|
||||
r2, rem->num_radix_blocks, 4, 4);
|
||||
host_addition<Torus>(streams[0], gpu_indexes[0], rem, rem,
|
||||
r1, rem->num_radix_blocks, 4, 4);
|
||||
|
||||
host_addition<Torus>(streams[0], gpu_indexes[0], mem_ptr->q1, mem_ptr->q1,
|
||||
mem_ptr->q2, 1, 4, 4);
|
||||
host_addition<Torus>(streams[0], gpu_indexes[0], mem_ptr->q1, mem_ptr->q1,
|
||||
mem_ptr->q3, 1, 4, 4);
|
||||
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
|
||||
}
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
mem_ptr->sub_streams_1, gpu_indexes, gpu_count, rem, rem, bsks, ksks,
|
||||
ms_noise_reduction_key, mem_ptr->message_extract_lut_1, rem->num_radix_blocks);
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
mem_ptr->sub_streams_2, gpu_indexes, gpu_count, mem_ptr->q1, mem_ptr->q1, bsks, ksks,
|
||||
ms_noise_reduction_key, mem_ptr->message_extract_lut_2, 1);
|
||||
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
|
||||
}
|
||||
|
||||
size_t tmp_rem_size = rem->num_radix_blocks;
|
||||
rem->num_radix_blocks = remainder->num_radix_blocks;
|
||||
copy_radix_ciphertext_slice_async<Torus>(
|
||||
streams[0], gpu_indexes[0],
|
||||
remainder, block_index, rem->num_radix_blocks,
|
||||
rem, 0, tmp_rem_size);
|
||||
rem->num_radix_blocks = tmp_rem_size;
|
||||
|
||||
insert_block_in_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], mem_ptr->q1,
|
||||
quotient, 0);
|
||||
}
|
||||
}
|
||||
template <typename Torus>
|
||||
__host__ void host_unsigned_integer_div_rem_kb(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
@@ -50,6 +457,13 @@ __host__ void host_unsigned_integer_div_rem_kb(
|
||||
remainder->lwe_dimension != divisor->lwe_dimension ||
|
||||
remainder->lwe_dimension != quotient->lwe_dimension)
|
||||
PANIC("Cuda error: input and output lwe dimension must be equal")
|
||||
|
||||
if (mem_ptr->params.message_modulus == 4 &&
|
||||
mem_ptr->params.carry_modulus == 4) {
|
||||
host_unsigned_integer_div_rem_kb_block_by_block_2_2<Torus>(
|
||||
streams, gpu_indexes, gpu_count, quotient, remainder, numerator,
|
||||
divisor, bsks, ksks, ms_noise_reduction_key, mem_ptr);
|
||||
}
|
||||
auto radix_params = mem_ptr->params;
|
||||
auto num_blocks = quotient->num_radix_blocks;
|
||||
|
||||
@@ -310,14 +724,14 @@ __host__ void host_unsigned_integer_div_rem_kb(
|
||||
auto scalar_indexes =
|
||||
mem_ptr->scalars_for_overflow_sub
|
||||
[merged_interesting_remainder->num_radix_blocks - 1];
|
||||
mem_ptr->overflow_sub_mem->update_lut_indexes(
|
||||
mem_ptr->overflow_sub_mem_1->update_lut_indexes(
|
||||
streams, gpu_indexes, first_indexes, second_indexes, scalar_indexes,
|
||||
merged_interesting_remainder->num_radix_blocks);
|
||||
host_integer_overflowing_sub<uint64_t>(
|
||||
streams, gpu_indexes, gpu_count, new_remainder,
|
||||
merged_interesting_remainder, interesting_divisor,
|
||||
subtraction_overflowed, (const CudaRadixCiphertextFFI *)nullptr,
|
||||
mem_ptr->overflow_sub_mem, bsks, ksks, ms_noise_reduction_key,
|
||||
mem_ptr->overflow_sub_mem_1, bsks, ksks, ms_noise_reduction_key,
|
||||
compute_borrow, uses_input_borrow);
|
||||
};
|
||||
|
||||
|
||||
@@ -174,7 +174,29 @@ impl ServerKey {
|
||||
let low2 = RadixCiphertext::from(d2.blocks[..num_blocks - block_index].to_vec());
|
||||
let low3 = RadixCiphertext::from(d3.blocks[..num_blocks - block_index].to_vec());
|
||||
let mut rem = RadixCiphertext::from(remainder.blocks[block_index..].to_vec());
|
||||
println!("low1");
|
||||
for block in &low1.blocks {
|
||||
println!("{:?}", block.ct.get_body().data);
|
||||
}
|
||||
println!();
|
||||
|
||||
println!("low2");
|
||||
for block in &low2.blocks {
|
||||
println!("{:?}", block.ct.get_body().data);
|
||||
}
|
||||
println!();
|
||||
|
||||
println!("low3");
|
||||
for block in &low3.blocks {
|
||||
println!("{:?}", block.ct.get_body().data);
|
||||
}
|
||||
println!();
|
||||
|
||||
println!("rem");
|
||||
for block in &rem.blocks {
|
||||
println!("{:?}", block.ct.get_body().data);
|
||||
}
|
||||
println!();
|
||||
let (mut sub_results, cmps) = rayon::join(
|
||||
|| {
|
||||
[&low3, &low2, &low1]
|
||||
@@ -202,6 +224,24 @@ impl ServerKey {
|
||||
let (mut r2, mut o2) = sub_results.pop().unwrap();
|
||||
let (mut r3, mut o3) = sub_results.pop().unwrap();
|
||||
|
||||
println!("r1");
|
||||
for block in &r1.blocks {
|
||||
println!("{:?}", block.ct.get_body().data);
|
||||
}
|
||||
println!();
|
||||
|
||||
println!("r2");
|
||||
for block in &r2.blocks {
|
||||
println!("{:?}", block.ct.get_body().data);
|
||||
}
|
||||
println!();
|
||||
|
||||
println!("r3");
|
||||
for block in &r3.blocks {
|
||||
println!("{:?}", block.ct.get_body().data);
|
||||
}
|
||||
println!();
|
||||
|
||||
[&mut o3, &mut o2, &mut o1]
|
||||
.into_par_iter()
|
||||
.zip(cmps.par_iter())
|
||||
@@ -209,6 +249,13 @@ impl ServerKey {
|
||||
self.boolean_bitor_assign(ox, cmpx);
|
||||
});
|
||||
|
||||
println!("o1: {:?}", o1.0.ct.get_body().data);
|
||||
println!("o2: {:?}", o2.0.ct.get_body().data);
|
||||
println!("o3: {:?}", o3.0.ct.get_body().data);
|
||||
|
||||
println!("cmp1: {:?}", cmps[0].0.ct.get_body().data);
|
||||
println!("cmp2: {:?}", cmps[1].0.ct.get_body().data);
|
||||
println!("cmp3: {:?}", cmps[2].0.ct.get_body().data);
|
||||
// The cx variables tell whether the corresponding result of the subtraction
|
||||
// should be kept, and what value the quotient block should have
|
||||
//
|
||||
@@ -227,10 +274,20 @@ impl ServerKey {
|
||||
};
|
||||
let c0 = o1.0;
|
||||
|
||||
println!("c0: {:?}", c0.ct.get_body().data);
|
||||
println!("c1: {:?}", c1.ct.get_body().data);
|
||||
println!("c2: {:?}", c2.ct.get_body().data);
|
||||
println!("c3: {:?}", c3.ct.get_body().data);
|
||||
|
||||
// println!("cpu_before_rem");
|
||||
// for block in &rem.blocks {
|
||||
// println!("{:?}", block.ct.get_body().data);
|
||||
// }
|
||||
// println!();
|
||||
let (_, [q1, q2, q3]) = rayon::join(
|
||||
|| {
|
||||
[&c3, &c2, &c1, &c0]
|
||||
.into_par_iter()
|
||||
.into_iter()
|
||||
.zip([&mut r3, &mut r2, &mut r1, &mut rem])
|
||||
.zip([
|
||||
&zero_out_if_not_1_lut,
|
||||
@@ -241,9 +298,25 @@ impl ServerKey {
|
||||
.for_each(|((cx, rx), (lut, factor))| {
|
||||
// Manual zero_out_if to avoid noise problems
|
||||
rx.blocks.par_iter_mut().for_each(|block| {
|
||||
// println!("cpu_before_scalar_mul_rem");
|
||||
// println!("{:?}", block.ct.get_body().data);
|
||||
// println!();
|
||||
self.key.unchecked_scalar_mul_assign(block, *factor);
|
||||
|
||||
// println!("factor: {:?}", factor);
|
||||
// println!("cpu_after_scalar_mul_rem");
|
||||
// println!("{:?}", block.ct.get_body().data);
|
||||
// println!();
|
||||
|
||||
self.key.unchecked_add_assign(block, cx);
|
||||
// println!("cpu_after_add_rem");
|
||||
// println!("{:?}", block.ct.get_body().data);
|
||||
// println!();
|
||||
|
||||
self.key.apply_lookup_table_assign(block, lut);
|
||||
// println!("cpu_after_pbs_rem");
|
||||
// println!("{:?}", block.ct.get_body().data);
|
||||
// println!();
|
||||
});
|
||||
});
|
||||
},
|
||||
@@ -258,6 +331,33 @@ impl ServerKey {
|
||||
},
|
||||
);
|
||||
|
||||
println!("cpu_after_r1");
|
||||
for block in &r1.blocks {
|
||||
println!("{:?}", block.ct.get_body().data);
|
||||
}
|
||||
println!();
|
||||
|
||||
println!("cpu_after_r2");
|
||||
for block in &r2.blocks {
|
||||
println!("{:?}", block.ct.get_body().data);
|
||||
}
|
||||
println!();
|
||||
|
||||
println!("cpu_after_r3");
|
||||
for block in &r3.blocks {
|
||||
println!("{:?}", block.ct.get_body().data);
|
||||
}
|
||||
println!();
|
||||
|
||||
println!("cpu_after_rem");
|
||||
for block in &rem.blocks {
|
||||
println!("{:?}", block.ct.get_body().data);
|
||||
}
|
||||
println!();
|
||||
|
||||
println!("cpu_after_q1: {:?}", q1.ct.get_body().data);
|
||||
println!("cpu_after_q2: {:?}", q2.ct.get_body().data);
|
||||
println!("cpu_after_q3: {:?}", q3.ct.get_body().data);
|
||||
// Only one of rx and rem is not zero
|
||||
for rx in [&r3, &r2, &r1] {
|
||||
self.unchecked_add_assign(&mut rem, rx);
|
||||
|
||||
Reference in New Issue
Block a user