feat(gpu): div_rem_2_2_blocks

commit 6cfe535508 (parent 71f427de9e)
Author: Beka Barbakadze
Date: 2025-08-01 17:37:59 +04:00
3 changed files with 934 additions and 133 deletions


@@ -4153,6 +4153,128 @@ template <typename Torus> struct int_comparison_buffer {
}
};
template <typename Torus> struct int_sub_and_propagate {
int_radix_params params;
bool allocate_gpu_memory;
CudaRadixCiphertextFFI *neg_rhs_array;
int_sc_prop_memory<Torus> *sc_prop_mem;
int_sub_and_propagate(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
const int_radix_params params,
uint32_t num_radix_blocks, uint32_t requested_flag_in,
bool allocate_gpu_memory, uint64_t &size_tracker) {
this->params = params;
this->allocate_gpu_memory = allocate_gpu_memory;
this->sc_prop_mem = new int_sc_prop_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
requested_flag_in, (uint32_t)0, allocate_gpu_memory, size_tracker);
this->neg_rhs_array = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], neg_rhs_array, num_radix_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
sc_prop_mem->release(streams, gpu_indexes, gpu_count);
delete sc_prop_mem;
release_radix_ciphertext_async(streams[0], gpu_indexes[0], neg_rhs_array,
allocate_gpu_memory);
delete neg_rhs_array;
}
};
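int_sub_and_propagate holds a buffer for the negated rhs (neg_rhs_array) and a single-carry propagation memory, which suggests lhs - rhs is computed as lhs + (-rhs) followed by carry propagation. A minimal cleartext sketch of the block arithmetic this realizes, written borrow-style for readability; the message_modulus = 4 layout and names are illustrative assumptions, not the GPU implementation:

#include <cstdint>
#include <vector>

// Cleartext model of radix subtraction with propagation (LSB-first blocks,
// message_modulus = 4). The GPU buffer instead negates rhs and reuses the
// carry-propagating adder, but the resulting blocks are the same.
std::vector<uint64_t> sub_and_propagate(const std::vector<uint64_t> &lhs,
                                        const std::vector<uint64_t> &rhs) {
  const uint64_t msg_mod = 4;
  std::vector<uint64_t> out(lhs.size());
  uint64_t borrow = 0;
  for (size_t i = 0; i < lhs.size(); ++i) {
    uint64_t v = lhs[i] + msg_mod - rhs[i] - borrow; // stays non-negative
    out[i] = v % msg_mod;
    borrow = (v < msg_mod) ? 1 : 0; // borrow into the next block
  }
  return out; // lhs - rhs mod 4^n
}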
template <typename Torus> struct int_bitop_buffer {
int_radix_params params;
int_radix_lut<Torus> *lut;
BITOP_TYPE op;
bool gpu_memory_allocated;
int_bitop_buffer(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, BITOP_TYPE op, int_radix_params params,
uint32_t num_radix_blocks, bool allocate_gpu_memory,
uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
this->op = op;
this->params = params;
switch (op) {
case BITAND:
case BITOR:
case BITXOR:
lut = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
num_radix_blocks, allocate_gpu_memory,
size_tracker);
{
auto lut_bivariate_f = [op](Torus lhs, Torus rhs) -> Torus {
if (op == BITOP_TYPE::BITAND) {
// AND
return lhs & rhs;
} else if (op == BITOP_TYPE::BITOR) {
// OR
return lhs | rhs;
} else {
// XOR
return lhs ^ rhs;
}
};
generate_device_accumulator_bivariate<Torus>(
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_bivariate_f, gpu_memory_allocated);
lut->broadcast_lut(streams, gpu_indexes);
}
break;
default:
// Scalar OP
lut = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
params.message_modulus, num_radix_blocks,
allocate_gpu_memory, size_tracker);
for (int i = 0; i < params.message_modulus; i++) {
auto rhs = i;
auto lut_univariate_scalar_f = [op, rhs](Torus x) -> Torus {
if (op == BITOP_TYPE::SCALAR_BITAND) {
// AND
return x & rhs;
} else if (op == BITOP_TYPE::SCALAR_BITOR) {
// OR
return x | rhs;
} else {
// XOR
return x ^ rhs;
}
};
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], lut->get_lut(0, i), lut->get_degree(i),
lut->get_max_degree(i), params.glwe_dimension,
params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_univariate_scalar_f,
gpu_memory_allocated);
lut->broadcast_lut(streams, gpu_indexes);
}
}
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
lut->release(streams, gpu_indexes, gpu_count);
delete lut;
}
};
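The bivariate LUTs above receive both operands packed into a single block. A sketch of the table such a closure produces, assuming the usual packing lhs * message_modulus + rhs (the packing convention is an assumption here; it actually lives inside generate_device_accumulator_bivariate):

#include <cstdint>
#include <functional>
#include <vector>

// Build the cleartext table behind a bivariate bitop LUT, assuming inputs
// arrive packed as lhs * message_modulus + rhs.
std::vector<uint64_t>
build_bitop_table(uint64_t message_modulus,
                  const std::function<uint64_t(uint64_t, uint64_t)> &f) {
  std::vector<uint64_t> table(message_modulus * message_modulus);
  for (uint64_t lhs = 0; lhs < message_modulus; ++lhs)
    for (uint64_t rhs = 0; rhs < message_modulus; ++rhs)
      table[lhs * message_modulus + rhs] = f(lhs, rhs) % message_modulus;
  return table;
}
// build_bitop_table(4, [](uint64_t a, uint64_t b) { return a | b; }) gives
// the BITOR table used by the bitor_mem_* buffers in the division below.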
template <typename Torus> struct unsigned_int_div_rem_memory {
int_radix_params params;
uint32_t active_gpu_count;
@@ -4160,9 +4282,17 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
// memory objects for other operations
int_logical_scalar_shift_buffer<Torus> *shift_mem_1;
int_logical_scalar_shift_buffer<Torus> *shift_mem_2;
int_borrow_prop_memory<Torus> *overflow_sub_mem_1;
int_borrow_prop_memory<Torus> *overflow_sub_mem_2;
int_borrow_prop_memory<Torus> *overflow_sub_mem_3;
int_comparison_buffer<Torus> *comparison_buffer;
int_comparison_buffer<Torus> *comparison_buffer_1;
int_comparison_buffer<Torus> *comparison_buffer_2;
int_comparison_buffer<Torus> *comparison_buffer_3;
int_sub_and_propagate<Torus> *sub_and_propagate_mem;
int_bitop_buffer<Torus> *bitor_mem_1;
int_bitop_buffer<Torus> *bitor_mem_2;
int_bitop_buffer<Torus> *bitor_mem_3;
// lookup tables
int_radix_lut<Torus> **masking_luts_1;
int_radix_lut<Torus> **masking_luts_2;
@@ -4172,11 +4302,21 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
int_radix_lut<Torus> **zero_out_if_overflow_happened;
int_radix_lut<Torus> **merge_overflow_flags_luts;
// lookup tables for 2_2 blocks
int_radix_lut<Torus> *zero_out_if_not_1_lut;
int_radix_lut<Torus> *zero_out_if_not_2_lut;
int_radix_lut<Torus> *quotient_lut_1;
int_radix_lut<Torus> *quotient_lut_2;
int_radix_lut<Torus> *quotient_lut_3;
// sub streams
cudaStream_t *sub_streams_1;
cudaStream_t *sub_streams_2;
cudaStream_t *sub_streams_3;
cudaStream_t *sub_streams_4;
cudaStream_t *sub_streams_5;
cudaStream_t *sub_streams_6;
cudaStream_t *sub_streams_7;
// temporary device buffers
CudaRadixCiphertextFFI *remainder1;
@@ -4197,6 +4337,34 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
CudaRadixCiphertextFFI *at_least_one_upper_block_is_non_zero;
CudaRadixCiphertextFFI *cleaned_merged_interesting_remainder;
// temporary device buffers for 2_2 blocks
CudaRadixCiphertextFFI *d1; // num_blocks + 1
CudaRadixCiphertextFFI *d2; // num_blocks + 1
CudaRadixCiphertextFFI *d3; // num_blocks + 1
CudaRadixCiphertextFFI *low1; // num_blocks
CudaRadixCiphertextFFI *low2; // num_blocks
CudaRadixCiphertextFFI *low3; // num_blocks
CudaRadixCiphertextFFI *rem; // num_blocks
CudaRadixCiphertextFFI *sub_result_1; // num_blocks
CudaRadixCiphertextFFI *sub_result_2; // num_blocks
CudaRadixCiphertextFFI *sub_result_3; // num_blocks
CudaRadixCiphertextFFI *sub_1_overflowed; // num_blocks
CudaRadixCiphertextFFI *sub_2_overflowed; // num_blocks
CudaRadixCiphertextFFI *sub_3_overflowed; // num_blocks
CudaRadixCiphertextFFI *comparison_blocks_1; // num_blocks
CudaRadixCiphertextFFI *comparison_blocks_2; // num_blocks
CudaRadixCiphertextFFI *comparison_blocks_3; // num_blocks
CudaRadixCiphertextFFI *cmp_1; // boolean block
CudaRadixCiphertextFFI *cmp_2; // boolean block
CudaRadixCiphertextFFI *cmp_3; // boolean block
CudaRadixCiphertextFFI *c0; // single block
CudaRadixCiphertextFFI *c1; // single block
CudaRadixCiphertextFFI *c2; // single block
CudaRadixCiphertextFFI *c3; // single block
CudaRadixCiphertextFFI *q1; // single block
CudaRadixCiphertextFFI *q2; // single block
CudaRadixCiphertextFFI *q3; // single block
Torus **first_indexes_for_overflow_sub;
Torus **second_indexes_for_overflow_sub;
Torus **scalars_for_overflow_sub;
@@ -4284,6 +4452,138 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], at_least_one_upper_block_is_non_zero, 1,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
// temporary arrays used in 2_2 blocks
d1 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], d1, num_blocks + 1,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
d2 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], d2, num_blocks + 1,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
d3 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], d3, num_blocks + 1,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
low1 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], low1, num_blocks, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
low2 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], low2, num_blocks, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
low3 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], low3, num_blocks, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
rem = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], rem, num_blocks, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
sub_result_1 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], sub_result_1, num_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
sub_result_2 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], sub_result_2, num_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
sub_result_3 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], sub_result_3, num_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
sub_1_overflowed = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], sub_1_overflowed, 1,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
sub_2_overflowed = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], sub_2_overflowed, 1,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
sub_3_overflowed = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], sub_3_overflowed, 1,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
comparison_blocks_1 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], comparison_blocks_1, num_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
comparison_blocks_2 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], comparison_blocks_2, num_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
comparison_blocks_3 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], comparison_blocks_3, num_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
cmp_1 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], cmp_1, 1, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
cmp_2 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], cmp_2, 1, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
cmp_3 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], cmp_3, 1, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
c0 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], c0, 1, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
c1 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], c1, 1, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
c2 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], c2, 1, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
c3 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], c3, 1, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
q1 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], q1, 1, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
q2 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], q2, 1, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
q3 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], q3, 1, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
}
// initialize lookup tables for div_rem operation
@@ -4332,12 +4632,44 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
message_extract_lut_2 =
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
num_blocks, allocate_gpu_memory, size_tracker);
zero_out_if_not_1_lut =
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
num_blocks, allocate_gpu_memory, size_tracker);
zero_out_if_not_2_lut =
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
num_blocks, allocate_gpu_memory, size_tracker);
quotient_lut_1 =
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1, 1,
allocate_gpu_memory, size_tracker);
quotient_lut_2 =
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1, 1,
allocate_gpu_memory, size_tracker);
quotient_lut_3 =
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1, 1,
allocate_gpu_memory, size_tracker);
auto message_modulus = params.message_modulus;
auto lut_f_message_extract = [message_modulus](Torus x) -> Torus {
return x % message_modulus;
};
auto zero_out_if_not_1_lut_f = [](Torus x) -> Torus {
Torus block = x / 2;
bool condition = (x & 1) == 1;
return block * (Torus)condition;
};
auto zero_out_if_not_2_lut_f = [](Torus x) -> Torus {
Torus block = x / 3;
bool condition = (x % 3) == 2;
return block * (Torus)condition;
};
auto quotient_lut_1_f = [](Torus cond) -> Torus {
return (Torus)(cond == 2);
};
auto quotient_lut_2_f = [](Torus cond) -> Torus {
return (Torus)((cond == 2) * 2);
};
auto quotient_lut_3_f = [](Torus cond) -> Torus { return cond * 3; };
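A cleartext check of the five 2_2 closures above. The zero_out LUTs expect a message block packed together with a selector as block * factor + cond (factor 2 with cond in {0, 1}, factor 3 with cond in {0, 1, 2}), matching the factors passed to conditional_update further down in this commit; the quotient LUTs turn a selector into a base-4 quotient digit:

#include <cassert>
#include <cstdint>

int main() {
  auto zero_out_if_not_1 = [](uint64_t x) { return (x & 1) ? x / 2 : 0; };
  auto zero_out_if_not_2 = [](uint64_t x) { return (x % 3 == 2) ? x / 3 : 0; };
  // block = 3 is kept only when the packed selector says so
  assert(zero_out_if_not_1(3 * 2 + 1) == 3 && zero_out_if_not_1(3 * 2) == 0);
  assert(zero_out_if_not_2(3 * 3 + 2) == 3 && zero_out_if_not_2(3 * 3 + 1) == 0);
  // quotient digits: q1 = (c1 == 2), q2 = 2 * (c2 == 2), q3 = 3 * c3
  auto q1 = [](uint64_t c) { return uint64_t(c == 2); };
  auto q2 = [](uint64_t c) { return uint64_t((c == 2) * 2); };
  auto q3 = [](uint64_t c) { return c * 3; };
  assert(q1(2) == 1 && q1(1) == 0);
  assert(q2(2) == 2 && q2(0) == 0);
  assert(q3(1) == 3 && q3(0) == 0);
  return 0;
}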
int_radix_lut<Torus> *luts[2] = {message_extract_lut_1,
message_extract_lut_2};
for (int j = 0; j < 2; j++) {
@@ -4349,7 +4681,43 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
luts[j]->broadcast_lut(streams, gpu_indexes);
}
// Give name to closures to improve readability
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], zero_out_if_not_1_lut->get_lut(0, 0),
zero_out_if_not_1_lut->get_degree(0),
zero_out_if_not_1_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
zero_out_if_not_1_lut_f, gpu_memory_allocated);
zero_out_if_not_1_lut->broadcast_lut(streams, gpu_indexes);
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], zero_out_if_not_2_lut->get_lut(0, 0),
zero_out_if_not_2_lut->get_degree(0),
zero_out_if_not_2_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
zero_out_if_not_2_lut_f, gpu_memory_allocated);
zero_out_if_not_2_lut->broadcast_lut(streams, gpu_indexes);
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], quotient_lut_1->get_lut(0, 0),
quotient_lut_1->get_degree(0), quotient_lut_1->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, quotient_lut_1_f, gpu_memory_allocated);
quotient_lut_1->broadcast_lut(streams, gpu_indexes);
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], quotient_lut_2->get_lut(0, 0),
quotient_lut_2->get_degree(0), quotient_lut_2->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, quotient_lut_2_f, gpu_memory_allocated);
quotient_lut_2->broadcast_lut(streams, gpu_indexes);
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], quotient_lut_3->get_lut(0, 0),
quotient_lut_3->get_degree(0), quotient_lut_3->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, quotient_lut_3_f, gpu_memory_allocated);
quotient_lut_3->broadcast_lut(streams, gpu_indexes);
auto overflow_happened = [](uint64_t overflow_sum) {
return overflow_sum != 0;
};
@@ -4469,11 +4837,17 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
params, 2 * num_blocks, allocate_gpu_memory, size_tracker);
uint32_t compute_overflow = 1;
overflow_sub_mem_1 = new int_borrow_prop_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_blocks, compute_overflow,
allocate_gpu_memory, size_tracker);
overflow_sub_mem_2 = new int_borrow_prop_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_blocks, compute_overflow,
allocate_gpu_memory, size_tracker);
overflow_sub_mem_3 = new int_borrow_prop_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_blocks, compute_overflow,
allocate_gpu_memory, size_tracker);
uint32_t group_size = overflow_sub_mem_1->group_size;
bool use_seq = overflow_sub_mem_1->prop_simu_group_carries_mem
->use_sequential_algorithm_to_resolve_group_carries;
create_indexes_for_overflow_sub(streams, gpu_indexes, num_blocks,
group_size, use_seq, allocate_gpu_memory,
@@ -4483,6 +4857,32 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
streams, gpu_indexes, gpu_count, COMPARISON_TYPE::NE, params,
num_blocks, false, allocate_gpu_memory, size_tracker);
comparison_buffer_1 = new int_comparison_buffer<Torus>(
streams, gpu_indexes, gpu_count, COMPARISON_TYPE::EQ, params,
num_blocks, false, allocate_gpu_memory, size_tracker);
comparison_buffer_2 = new int_comparison_buffer<Torus>(
streams, gpu_indexes, gpu_count, COMPARISON_TYPE::EQ, params,
num_blocks, false, allocate_gpu_memory, size_tracker);
comparison_buffer_3 = new int_comparison_buffer<Torus>(
streams, gpu_indexes, gpu_count, COMPARISON_TYPE::EQ, params,
num_blocks, false, allocate_gpu_memory, size_tracker);
sub_and_propagate_mem = new int_sub_and_propagate<Torus>(
streams, gpu_indexes, gpu_count, params, num_blocks + 1,
outputFlag::FLAG_NONE, allocate_gpu_memory, size_tracker);
bitor_mem_1 = new int_bitop_buffer<Torus>(
streams, gpu_indexes, gpu_count, BITOP_TYPE::BITOR, params, num_blocks,
allocate_gpu_memory, size_tracker);
bitor_mem_2 = new int_bitop_buffer<Torus>(
streams, gpu_indexes, gpu_count, BITOP_TYPE::BITOR, params, num_blocks,
allocate_gpu_memory, size_tracker);
bitor_mem_3 = new int_bitop_buffer<Torus>(
streams, gpu_indexes, gpu_count, BITOP_TYPE::BITOR, params, num_blocks,
allocate_gpu_memory, size_tracker);
init_lookup_tables(streams, gpu_indexes, gpu_count, num_blocks,
allocate_gpu_memory, size_tracker);
init_temporary_buffers(streams, gpu_indexes, gpu_count, num_blocks,
@@ -4496,11 +4896,20 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
sub_streams_4 =
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
sub_streams_5 =
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
sub_streams_6 =
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
sub_streams_7 =
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
for (uint j = 0; j < active_gpu_count; j++) {
sub_streams_1[j] = cuda_create_stream(gpu_indexes[j]);
sub_streams_2[j] = cuda_create_stream(gpu_indexes[j]);
sub_streams_3[j] = cuda_create_stream(gpu_indexes[j]);
sub_streams_4[j] = cuda_create_stream(gpu_indexes[j]);
sub_streams_5[j] = cuda_create_stream(gpu_indexes[j]);
sub_streams_6[j] = cuda_create_stream(gpu_indexes[j]);
sub_streams_7[j] = cuda_create_stream(gpu_indexes[j]);
}
}
@@ -4607,11 +5016,11 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
// release and delete other operation memory objects
shift_mem_1->release(streams, gpu_indexes, gpu_count);
shift_mem_2->release(streams, gpu_indexes, gpu_count);
overflow_sub_mem_1->release(streams, gpu_indexes, gpu_count);
comparison_buffer->release(streams, gpu_indexes, gpu_count);
delete shift_mem_1;
delete shift_mem_2;
delete overflow_sub_mem_1;
delete comparison_buffer;
// drop temporary buffers
@@ -4750,89 +5159,6 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
}
};
template <typename Torus> struct int_bitop_buffer {
int_radix_params params;
int_radix_lut<Torus> *lut;
BITOP_TYPE op;
bool gpu_memory_allocated;
int_bitop_buffer(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, BITOP_TYPE op, int_radix_params params,
uint32_t num_radix_blocks, bool allocate_gpu_memory,
uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
this->op = op;
this->params = params;
switch (op) {
case BITAND:
case BITOR:
case BITXOR:
lut = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
num_radix_blocks, allocate_gpu_memory,
size_tracker);
{
auto lut_bivariate_f = [op](Torus lhs, Torus rhs) -> Torus {
if (op == BITOP_TYPE::BITAND) {
// AND
return lhs & rhs;
} else if (op == BITOP_TYPE::BITOR) {
// OR
return lhs | rhs;
} else {
// XOR
return lhs ^ rhs;
}
};
generate_device_accumulator_bivariate<Torus>(
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_bivariate_f, gpu_memory_allocated);
lut->broadcast_lut(streams, gpu_indexes);
}
break;
default:
// Scalar OP
lut = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
params.message_modulus, num_radix_blocks,
allocate_gpu_memory, size_tracker);
for (int i = 0; i < params.message_modulus; i++) {
auto rhs = i;
auto lut_univariate_scalar_f = [op, rhs](Torus x) -> Torus {
if (op == BITOP_TYPE::SCALAR_BITAND) {
// AND
return x & rhs;
} else if (op == BITOP_TYPE::SCALAR_BITOR) {
// OR
return x | rhs;
} else {
// XOR
return x ^ rhs;
}
};
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], lut->get_lut(0, i), lut->get_degree(i),
lut->get_max_degree(i), params.glwe_dimension,
params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_univariate_scalar_f,
gpu_memory_allocated);
lut->broadcast_lut(streams, gpu_indexes);
}
}
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
lut->release(streams, gpu_indexes, gpu_count);
delete lut;
}
};
template <typename Torus> struct int_scalar_mul_buffer {
int_radix_params params;
int_logical_scalar_shift_buffer<Torus> *logical_scalar_shift_buffer;
@@ -5204,45 +5530,6 @@ template <typename Torus> struct int_scalar_mul_high_buffer {
}
};
template <typename Torus> struct int_sub_and_propagate {
int_radix_params params;
bool allocate_gpu_memory;
CudaRadixCiphertextFFI *neg_rhs_array;
int_sc_prop_memory<Torus> *sc_prop_mem;
int_sub_and_propagate(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
const int_radix_params params,
uint32_t num_radix_blocks, uint32_t requested_flag_in,
bool allocate_gpu_memory, uint64_t &size_tracker) {
this->params = params;
this->allocate_gpu_memory = allocate_gpu_memory;
this->sc_prop_mem = new int_sc_prop_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
requested_flag_in, (uint32_t)0, allocate_gpu_memory, size_tracker);
this->neg_rhs_array = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], neg_rhs_array, num_radix_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
sc_prop_mem->release(streams, gpu_indexes, gpu_count);
delete sc_prop_mem;
release_radix_ciphertext_async(streams[0], gpu_indexes[0], neg_rhs_array,
allocate_gpu_memory);
delete neg_rhs_array;
}
};
template <typename Torus> struct int_extend_radix_with_sign_msb_buffer {
int_radix_params params;


@@ -4,6 +4,7 @@
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer/abs.cuh"
#include "integer/cast.cuh"
#include "integer/comparison.cuh"
#include "integer/integer.cuh"
#include "integer/integer_utilities.h"
@@ -32,6 +33,412 @@ __host__ uint64_t scratch_cuda_integer_div_rem_kb(
return size_tracker;
}
template <typename Torus>
__host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *quotient,
CudaRadixCiphertextFFI *remainder, CudaRadixCiphertextFFI const *numerator,
CudaRadixCiphertextFFI const *divisor, void *const *bsks,
uint64_t *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
unsigned_int_div_rem_memory<uint64_t> *mem_ptr) {
// alias
auto radix_params = mem_ptr->params;
auto num_blocks = quotient->num_radix_blocks;
auto d1 = mem_ptr->d1;
auto d2 = mem_ptr->d2;
auto d3 = mem_ptr->d3;
auto low1 = mem_ptr->low1;
auto low2 = mem_ptr->low2;
auto low3 = mem_ptr->low3;
auto rem = mem_ptr->rem;
auto sub_result_1 = mem_ptr->sub_result_1;
auto sub_1_overflowed = mem_ptr->sub_1_overflowed;
auto sub_result_2 = mem_ptr->sub_result_2;
auto sub_2_overflowed = mem_ptr->sub_2_overflowed;
auto sub_result_3 = mem_ptr->sub_result_3;
auto sub_3_overflowed = mem_ptr->sub_3_overflowed;
// auto d4 = mem_ptr->d4;
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], remainder,
numerator);
set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
quotient, 0, num_blocks);
quotient->num_radix_blocks = 0;
// Computes 2*d by extending and shifting
auto extend_2xd_f = [&](cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count) {
// d2 is allocated with num_blocks + 1 blocks, so extend the divisor by one
// trivial zero MSB block before shifting.
host_extend_radix_with_trivial_zero_blocks_msb<Torus>(d2, divisor, streams,
gpu_indexes);
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, d2, 1, mem_ptr->shift_mem_1, bsks,
ksks, ms_noise_reduction_key, d2->num_radix_blocks);
};
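A cleartext sketch of what extend_2xd_f computes: a logical left shift by one bit across 2-bit message blocks doubles the value, with the appended MSB block absorbing the final carry. The block layout is an illustrative assumption:

#include <cstdint>
#include <vector>

std::vector<uint64_t> times_two(const std::vector<uint64_t> &d /* LSB first */) {
  std::vector<uint64_t> out(d.size() + 1, 0); // one extra MSB block
  uint64_t carry = 0;
  for (size_t i = 0; i < d.size(); ++i) {
    uint64_t v = (d[i] << 1) | carry; // shift the 2-bit block, pull in carry
    out[i] = v & 3;
    carry = v >> 2;
  }
  out[d.size()] = carry; // carry out of the top block
  return out;            // 2 * d over num_blocks + 1 blocks
}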
// Computes 3*d = 4*d - d using block shift and subtraction
auto extend_3xd_f = [&](cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count) {
// d1 is allocated with num_blocks + 1 blocks, so extend the divisor by one
// trivial zero MSB block.
host_extend_radix_with_trivial_zero_blocks_msb<Torus>(d1, divisor, streams,
gpu_indexes);
host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count, d3,
d1, 1, d1->num_radix_blocks);
set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], d3,
0, 1);
host_sub_and_propagate_single_carry(
streams, gpu_indexes, gpu_count, d3, d1, nullptr, nullptr,
mem_ptr->sub_and_propagate_mem, bsks, ksks, ms_noise_reduction_key,
outputFlag::FLAG_NONE, 0);
// trim d1 by one msb block
d1->num_radix_blocks -= 1;
};
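Likewise for extend_3xd_f: rotating the blocks up by one position multiplies by 4 (each block holds 2 bits), so 3*d is obtained as 4*d - d with no multiplication circuit. A cleartext sketch, with the borrow loop standing in for host_sub_and_propagate_single_carry:

#include <cstdint>
#include <vector>

std::vector<uint64_t> times_three(const std::vector<uint64_t> &d /* LSB first */) {
  const uint64_t msg_mod = 4;
  std::vector<uint64_t> four_d(d.size() + 1, 0);
  for (size_t i = 0; i < d.size(); ++i)
    four_d[i + 1] = d[i]; // block shift up = multiply by msg_mod
  std::vector<uint64_t> out(four_d.size());
  uint64_t borrow = 0;
  for (size_t i = 0; i < four_d.size(); ++i) {
    uint64_t rhs = (i < d.size()) ? d[i] : 0;
    uint64_t v = four_d[i] + msg_mod - rhs - borrow;
    out[i] = v % msg_mod;
    borrow = (v < msg_mod) ? 1 : 0;
  }
  return out; // 3 * d over num_blocks + 1 blocks
}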
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
extend_2xd_f(mem_ptr->sub_streams_1, gpu_indexes, gpu_count);
extend_3xd_f(mem_ptr->sub_streams_2, gpu_indexes, gpu_count);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
}
print_body<Torus>("remainder", (Torus *)remainder->ptr,
remainder->num_radix_blocks, radix_params.big_lwe_dimension,
576460752303423488ULL);
for (int block_index = num_blocks - 1; block_index >= 0; block_index--) {
uint32_t slice_len = num_blocks - block_index;
low1->num_radix_blocks = slice_len;
low2->num_radix_blocks = slice_len;
low3->num_radix_blocks = slice_len;
rem->num_radix_blocks = slice_len;
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], low1,
0, slice_len, d1, 0, slice_len);
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], low2,
0, slice_len, d2, 0, slice_len);
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], low3,
0, slice_len, d3, 0, slice_len);
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], rem, 0,
slice_len, remainder, block_index,
num_blocks);
if (slice_len == 4) {
print_body<Torus>("low1", (Torus *)low1->ptr, low1->num_radix_blocks,
radix_params.big_lwe_dimension, 576460752303423488ULL);
print_body<Torus>("low2", (Torus *)low2->ptr, low2->num_radix_blocks,
radix_params.big_lwe_dimension, 576460752303423488ULL);
print_body<Torus>("low3", (Torus *)low3->ptr, low3->num_radix_blocks,
radix_params.big_lwe_dimension, 576460752303423488ULL);
print_body<Torus>("rem", (Torus *)rem->ptr, rem->num_radix_blocks,
radix_params.big_lwe_dimension, 576460752303423488ULL);
}
uint32_t compute_borrow = 1;
uint32_t uses_input_borrow = 0;
auto sub_result_f = [&](cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *sub_result,
CudaRadixCiphertextFFI *sub_overflowed,
int_borrow_prop_memory<Torus> *overflow_sub_mem,
CudaRadixCiphertextFFI *low) {
sub_result->num_radix_blocks = low->num_radix_blocks;
host_integer_overflowing_sub<uint64_t>(
streams, gpu_indexes, gpu_count, sub_result, rem, low, sub_overflowed,
(const CudaRadixCiphertextFFI *)nullptr, overflow_sub_mem, bsks, ksks,
ms_noise_reduction_key, compute_borrow, uses_input_borrow);
};
auto cmp_f = [&](cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count,
CudaRadixCiphertextFFI *out_boolean_block,
CudaRadixCiphertextFFI *comparison_blocks,
CudaRadixCiphertextFFI *d,
int_comparison_buffer<Torus> *comparison_buffer) {
CudaRadixCiphertextFFI *d_msb = new CudaRadixCiphertextFFI;
uint32_t slice_start = num_blocks - block_index;
uint32_t slice_end = d->num_radix_blocks;
as_radix_ciphertext_slice<Torus>(d_msb, d, slice_start, slice_end);
host_compare_blocks_with_zero<Torus>(
streams, gpu_indexes, gpu_count, comparison_blocks, d_msb,
comparison_buffer, bsks, ksks, ms_noise_reduction_key,
d_msb->num_radix_blocks, comparison_buffer->is_zero_lut);
are_all_comparisons_block_true(
streams, gpu_indexes, gpu_count, out_boolean_block, comparison_blocks,
comparison_buffer, bsks, ksks, ms_noise_reduction_key,
comparison_blocks->num_radix_blocks);
host_negation<Torus>(
streams[0], gpu_indexes[0], (Torus *)out_boolean_block->ptr,
(Torus *)out_boolean_block->ptr, radix_params.big_lwe_dimension, 1);
// Encode the scalar 1 manually: this path only supports message_modulus = 4
// and carry_modulus = 4, for which delta = 2^59.
const Torus encoded_scalar = 1ULL << (sizeof(Torus) * 8 - 5);
host_addition_plaintext_scalar<Torus>(
streams[0], gpu_indexes[0], (Torus *)out_boolean_block->ptr,
(Torus *)out_boolean_block->ptr, encoded_scalar,
radix_params.big_lwe_dimension, 1);
delete d_msb;
};
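The negate-then-add-1 pair at the end of cmp_f is a boolean NOT: for a block b in {0, 1}, 1 - b flips it, and the 1 must be added already scaled by the plaintext encoding. A sketch of where 1ULL << (sizeof(Torus) * 8 - 5) and the constant 576460752303423488ULL (= 2^59) in the print_body calls come from, assuming the usual encoding with one padding bit:

#include <cstdint>

// delta places a cleartext in the top bits; with a padding bit the
// plaintext space is 2 * message_modulus * carry_modulus (a power of two).
constexpr uint64_t delta(uint64_t message_modulus, uint64_t carry_modulus) {
  const uint64_t plaintext_space = 2 * message_modulus * carry_modulus;
  return (1ULL << 63) / plaintext_space * 2; // == 2^64 / plaintext_space
}
static_assert(delta(4, 4) == 1ULL << 59, "2_2 parameters encode at 2^59");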
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
sub_result_f(mem_ptr->sub_streams_1, gpu_indexes, gpu_count,
mem_ptr->sub_result_1, mem_ptr->sub_1_overflowed,
mem_ptr->overflow_sub_mem_1, low3);
sub_result_f(mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
mem_ptr->sub_result_2, mem_ptr->sub_2_overflowed,
mem_ptr->overflow_sub_mem_2, low2);
sub_result_f(mem_ptr->sub_streams_3, gpu_indexes, gpu_count,
mem_ptr->sub_result_3, mem_ptr->sub_3_overflowed,
mem_ptr->overflow_sub_mem_3, low1);
cmp_f(mem_ptr->sub_streams_4, gpu_indexes, gpu_count, mem_ptr->cmp_1,
mem_ptr->comparison_blocks_1, mem_ptr->d3,
mem_ptr->comparison_buffer_1);
cmp_f(mem_ptr->sub_streams_5, gpu_indexes, gpu_count, mem_ptr->cmp_2,
mem_ptr->comparison_blocks_2, mem_ptr->d2,
mem_ptr->comparison_buffer_2);
cmp_f(mem_ptr->sub_streams_6, gpu_indexes, gpu_count, mem_ptr->cmp_3,
mem_ptr->comparison_blocks_3, mem_ptr->d1,
mem_ptr->comparison_buffer_3);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_4[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_5[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_6[j], gpu_indexes[j]);
}
auto r1 = mem_ptr->sub_result_3;
auto r2 = mem_ptr->sub_result_2;
auto r3 = mem_ptr->sub_result_1;
auto o1 = mem_ptr->sub_3_overflowed;
auto o2 = mem_ptr->sub_2_overflowed;
auto o3 = mem_ptr->sub_1_overflowed;
print_body<Torus>("r1", (Torus *)r1->ptr, r1->num_radix_blocks,
radix_params.big_lwe_dimension, 576460752303423488ULL);
print_body<Torus>("r2", (Torus *)r2->ptr, r2->num_radix_blocks,
radix_params.big_lwe_dimension, 576460752303423488ULL);
print_body<Torus>("r3", (Torus *)r3->ptr, r3->num_radix_blocks,
radix_params.big_lwe_dimension, 576460752303423488ULL);
// o3 |= cmp_1: reject rem - 3*d if the subtraction borrowed or 3*d
// overflows the slice
host_integer_radix_bitop_kb(mem_ptr->sub_streams_1, gpu_indexes, gpu_count,
o3, o3, mem_ptr->cmp_1, mem_ptr->bitor_mem_1,
bsks, ksks, ms_noise_reduction_key);
// o2 |= cmp_2: likewise for rem - 2*d
host_integer_radix_bitop_kb(mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
o2, o2, mem_ptr->cmp_2, mem_ptr->bitor_mem_2,
bsks, ksks, ms_noise_reduction_key);
// o1 |= cmp_3: likewise for rem - d
host_integer_radix_bitop_kb(mem_ptr->sub_streams_3, gpu_indexes, gpu_count,
o1, o1, mem_ptr->cmp_3, mem_ptr->bitor_mem_3,
bsks, ksks, ms_noise_reduction_key);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
}
print_body<Torus>("o1", (Torus *)o1->ptr, 1, radix_params.big_lwe_dimension,
576460752303423488ULL);
print_body<Torus>("o2", (Torus *)o2->ptr, 1, radix_params.big_lwe_dimension,
576460752303423488ULL);
print_body<Torus>("o3", (Torus *)o3->ptr, 1, radix_params.big_lwe_dimension,
576460752303423488ULL);
print_body<Torus>("cmp1", (Torus *)mem_ptr->cmp_1->ptr, 1,
radix_params.big_lwe_dimension, 576460752303423488ULL);
print_body<Torus>("cmp2", (Torus *)mem_ptr->cmp_2->ptr, 1,
radix_params.big_lwe_dimension, 576460752303423488ULL);
print_body<Torus>("cmp3", (Torus *)mem_ptr->cmp_3->ptr, 1,
radix_params.big_lwe_dimension, 576460752303423488ULL);
// The cx variables tell whether the corresponding result of the subtraction
// should be kept, and what value the quotient block should have
//
// for c3 and c0, the block value is in {0, 1};
// for c2 and c1, it is in {0, 1, 2}, where 2 means true and 0 or 1 means
// false
// c3 = !o3
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
mem_ptr->c3, 0, 1, o3, 0, 1);
host_negation<Torus>(streams[0], gpu_indexes[0], (Torus *)mem_ptr->c3->ptr,
(Torus *)mem_ptr->c3->ptr,
radix_params.big_lwe_dimension, 1);
const Torus encoded_scalar = 1ULL << (sizeof(Torus) * 8 - 5);
host_addition_plaintext_scalar<Torus>(
streams[0], gpu_indexes[0], (Torus *)mem_ptr->c3->ptr,
(Torus *)mem_ptr->c3->ptr, encoded_scalar,
radix_params.big_lwe_dimension, 1);
// c2 = !o2 + o3
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
mem_ptr->c2, 0, 1, o2, 0, 1);
host_negation<Torus>(streams[0], gpu_indexes[0], (Torus *)mem_ptr->c2->ptr,
(Torus *)mem_ptr->c2->ptr,
radix_params.big_lwe_dimension, 1);
host_addition_plaintext_scalar<Torus>(
streams[0], gpu_indexes[0], (Torus *)mem_ptr->c2->ptr,
(Torus *)mem_ptr->c2->ptr, encoded_scalar,
radix_params.big_lwe_dimension, 1);
host_addition<Torus>(streams[0], gpu_indexes[0], mem_ptr->c2, mem_ptr->c2,
o3, 1, 4, 4);
// c1 = !o1 + o2
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
mem_ptr->c1, 0, 1, o1, 0, 1);
host_negation<Torus>(streams[0], gpu_indexes[0], (Torus *)mem_ptr->c1->ptr,
(Torus *)mem_ptr->c1->ptr,
radix_params.big_lwe_dimension, 1);
host_addition_plaintext_scalar<Torus>(
streams[0], gpu_indexes[0], (Torus *)mem_ptr->c1->ptr,
(Torus *)mem_ptr->c1->ptr, encoded_scalar,
radix_params.big_lwe_dimension, 1);
host_addition<Torus>(streams[0], gpu_indexes[0], mem_ptr->c1, mem_ptr->c1,
o2, 1, 4, 4);
// c0 = o1 (direct copy)
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
mem_ptr->c0, 0, 1, o1, 0, 1);
print_body<Torus>("c0", (Torus *)mem_ptr->c0->ptr, 1,
radix_params.big_lwe_dimension, 576460752303423488ULL);
print_body<Torus>("c1", (Torus *)mem_ptr->c1->ptr, 1,
radix_params.big_lwe_dimension, 576460752303423488ULL);
print_body<Torus>("c2", (Torus *)mem_ptr->c2->ptr, 1,
radix_params.big_lwe_dimension, 576460752303423488ULL);
print_body<Torus>("c3", (Torus *)mem_ptr->c3->ptr, 1,
radix_params.big_lwe_dimension, 576460752303423488ULL);
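A cleartext model of the c0..c3 selectors, computed from the merged overflow flags exactly as above (o = 1 means the candidate did not fit). Exactly one of the four candidates rem - 3*d, rem - 2*d, rem - d, rem survives:

#include <cassert>
#include <cstdint>

int main() {
  // example: d <= rem < 2*d, so rem - 3*d and rem - 2*d overflow
  uint64_t o3 = 1, o2 = 1, o1 = 0;
  uint64_t c3 = 1 - o3;        // keep rem - 3*d, value in {0, 1}
  uint64_t c2 = (1 - o2) + o3; // keep rem - 2*d iff c2 == 2
  uint64_t c1 = (1 - o1) + o2; // keep rem - d   iff c1 == 2
  uint64_t c0 = o1;            // keep rem, value in {0, 1}
  assert(c3 == 0 && c2 == 1 && c1 == 2 && c0 == 0);
  // base-4 quotient digit for this block, as produced by the quotient LUTs
  assert(uint64_t(c1 == 2) + uint64_t(c2 == 2) * 2 + c3 * 3 == 1);
  return 0;
}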
auto conditional_update =
[&](cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *cx,
CudaRadixCiphertextFFI *rx, int_radix_lut<Torus> *lut,
uint32_t factor) {
// printf("rx->num_radix_blocks: %d\n", rx->num_radix_blocks);
host_cleartext_multiplication<Torus>(
streams[0], gpu_indexes[0], (Torus *)rx->ptr, (Torus *)rx->ptr, factor,
radix_params.big_lwe_dimension, rx->num_radix_blocks);
host_add_the_same_block_to_all_blocks<Torus>(streams[0], gpu_indexes[0], rx,
rx, cx, 4, 4);
// print_body<Torus>("gpu_after_add_rem", (Torus *)rx->ptr, rx->num_radix_blocks,
// radix_params.big_lwe_dimension, 576460752303423488ULL);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, rx, rx, bsks, ksks,
ms_noise_reduction_key, lut, rx->num_radix_blocks);
// print_body<Torus>("gpu_after_pbs_rem", (Torus *)rx->ptr, rx->num_radix_blocks,
// radix_params.big_lwe_dimension, 576460752303423488ULL);
};
auto calculate_quotient_bits =
[&](cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *q,
CudaRadixCiphertextFFI *c, int_radix_lut<Torus> *lut) {
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, q, c, bsks, ksks,
ms_noise_reduction_key, lut, 1);
};
// print_body<Torus>("gpu_before_rem", (Torus *)rem->ptr, rem->num_radix_blocks,
// radix_params.big_lwe_dimension, 576460752303423488ULL);
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
conditional_update(mem_ptr->sub_streams_1, gpu_indexes, gpu_count,
mem_ptr->c3, r3, mem_ptr->zero_out_if_not_1_lut, 2);
conditional_update(mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
mem_ptr->c2, r2, mem_ptr->zero_out_if_not_2_lut, 3);
conditional_update(mem_ptr->sub_streams_3, gpu_indexes, gpu_count,
mem_ptr->c1, r1, mem_ptr->zero_out_if_not_2_lut, 3);
conditional_update(mem_ptr->sub_streams_4, gpu_indexes, gpu_count,
mem_ptr->c0, rem, mem_ptr->zero_out_if_not_1_lut, 2);
calculate_quotient_bits(mem_ptr->sub_streams_5, gpu_indexes, 1, mem_ptr->q1,
mem_ptr->c1, mem_ptr->quotient_lut_1);
calculate_quotient_bits(mem_ptr->sub_streams_6, gpu_indexes, 1, mem_ptr->q2,
mem_ptr->c2, mem_ptr->quotient_lut_2);
calculate_quotient_bits(mem_ptr->sub_streams_7, gpu_indexes, 1, mem_ptr->q3,
mem_ptr->c3, mem_ptr->quotient_lut_3);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_4[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_5[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_6[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_7[j], gpu_indexes[j]);
}
print_body<Torus>("gpu_after_r1", (Torus *)r1->ptr, r1->num_radix_blocks,
radix_params.big_lwe_dimension, 576460752303423488ULL);
print_body<Torus>("gpu_after_r2", (Torus *)r2->ptr, r2->num_radix_blocks,
radix_params.big_lwe_dimension, 576460752303423488ULL);
print_body<Torus>("gpu_after_r3", (Torus *)r3->ptr, r3->num_radix_blocks,
radix_params.big_lwe_dimension, 576460752303423488ULL);
print_body<Torus>("gpu_after_rem", (Torus *)rem->ptr, rem->num_radix_blocks,
radix_params.big_lwe_dimension, 576460752303423488ULL);
print_body<Torus>("gpu_after_q1", (Torus *)mem_ptr->q1->ptr, mem_ptr->q1->num_radix_blocks,
radix_params.big_lwe_dimension, 576460752303423488ULL);
print_body<Torus>("gpu_after_q2", (Torus *)mem_ptr->q2->ptr, mem_ptr->q2->num_radix_blocks,
radix_params.big_lwe_dimension, 576460752303423488ULL);
print_body<Torus>("gpu_after_q3", (Torus *)mem_ptr->q3->ptr, mem_ptr->q3->num_radix_blocks,
radix_params.big_lwe_dimension, 576460752303423488ULL);
host_addition<Torus>(streams[0], gpu_indexes[0], rem, rem,
r3, rem->num_radix_blocks, 4, 4);
host_addition<Torus>(streams[0], gpu_indexes[0], rem, rem,
r2, rem->num_radix_blocks, 4, 4);
host_addition<Torus>(streams[0], gpu_indexes[0], rem, rem,
r1, rem->num_radix_blocks, 4, 4);
host_addition<Torus>(streams[0], gpu_indexes[0], mem_ptr->q1, mem_ptr->q1,
mem_ptr->q2, 1, 4, 4);
host_addition<Torus>(streams[0], gpu_indexes[0], mem_ptr->q1, mem_ptr->q1,
mem_ptr->q3, 1, 4, 4);
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem_ptr->sub_streams_1, gpu_indexes, gpu_count, rem, rem, bsks, ksks,
ms_noise_reduction_key, mem_ptr->message_extract_lut_1, rem->num_radix_blocks);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem_ptr->sub_streams_2, gpu_indexes, gpu_count, mem_ptr->q1, mem_ptr->q1, bsks, ksks,
ms_noise_reduction_key, mem_ptr->message_extract_lut_2, 1);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
}
size_t tmp_rem_size = rem->num_radix_blocks;
rem->num_radix_blocks = remainder->num_radix_blocks;
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0],
remainder, block_index, rem->num_radix_blocks,
rem, 0, tmp_rem_size);
rem->num_radix_blocks = tmp_rem_size;
insert_block_in_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], mem_ptr->q1,
quotient, 0);
}
}
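For reference, the loop above is base-4 schoolbook division: each iteration brings down one 2-bit block, tries the candidates rem - k*d for k = 3, 2, 1 in parallel, and emits one base-4 quotient digit. A cleartext model of the arithmetic it parallelizes (assumes divisor > 0; not the FHE code):

#include <cstdint>
#include <utility>

std::pair<uint64_t, uint64_t> div_rem_base4(uint64_t numerator,
                                            uint64_t divisor,
                                            uint32_t num_blocks) {
  uint64_t quotient = 0, rem = 0;
  for (int block = num_blocks - 1; block >= 0; --block) {
    rem = (rem << 2) | ((numerator >> (2 * block)) & 3); // bring down a block
    uint64_t q_digit = 0; // largest k in {3, 2, 1} with k * divisor <= rem
    for (uint64_t k = 3; k >= 1; --k)
      if (k * divisor <= rem) { q_digit = k; break; }
    rem -= q_digit * divisor;
    quotient = (quotient << 2) | q_digit;
  }
  return {quotient, rem}; // e.g. div_rem_base4(23, 5, 4) == {4, 3}
}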
template <typename Torus>
__host__ void host_unsigned_integer_div_rem_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -50,6 +457,13 @@ __host__ void host_unsigned_integer_div_rem_kb(
remainder->lwe_dimension != divisor->lwe_dimension ||
remainder->lwe_dimension != quotient->lwe_dimension)
PANIC("Cuda error: input and output lwe dimension must be equal")
if (mem_ptr->params.message_modulus == 4 &&
mem_ptr->params.carry_modulus == 4) {
host_unsigned_integer_div_rem_kb_block_by_block_2_2<Torus>(
streams, gpu_indexes, gpu_count, quotient, remainder, numerator,
divisor, bsks, ksks, ms_noise_reduction_key, mem_ptr);
return;
}
auto radix_params = mem_ptr->params;
auto num_blocks = quotient->num_radix_blocks;
@@ -310,14 +724,14 @@ __host__ void host_unsigned_integer_div_rem_kb(
auto scalar_indexes =
mem_ptr->scalars_for_overflow_sub
[merged_interesting_remainder->num_radix_blocks - 1];
mem_ptr->overflow_sub_mem_1->update_lut_indexes(
streams, gpu_indexes, first_indexes, second_indexes, scalar_indexes,
merged_interesting_remainder->num_radix_blocks);
host_integer_overflowing_sub<uint64_t>(
streams, gpu_indexes, gpu_count, new_remainder,
merged_interesting_remainder, interesting_divisor,
subtraction_overflowed, (const CudaRadixCiphertextFFI *)nullptr,
mem_ptr->overflow_sub_mem_1, bsks, ksks, ms_noise_reduction_key,
compute_borrow, uses_input_borrow);
};


@@ -174,7 +174,29 @@ impl ServerKey {
let low2 = RadixCiphertext::from(d2.blocks[..num_blocks - block_index].to_vec());
let low3 = RadixCiphertext::from(d3.blocks[..num_blocks - block_index].to_vec());
let mut rem = RadixCiphertext::from(remainder.blocks[block_index..].to_vec());
println!("low1");
for block in &low1.blocks {
println!("{:?}", block.ct.get_body().data);
}
println!();
println!("low2");
for block in &low2.blocks {
println!("{:?}", block.ct.get_body().data);
}
println!();
println!("low3");
for block in &low3.blocks {
println!("{:?}", block.ct.get_body().data);
}
println!();
println!("rem");
for block in &rem.blocks {
println!("{:?}", block.ct.get_body().data);
}
println!();
let (mut sub_results, cmps) = rayon::join(
|| {
[&low3, &low2, &low1]
@@ -202,6 +224,24 @@ impl ServerKey {
let (mut r2, mut o2) = sub_results.pop().unwrap();
let (mut r3, mut o3) = sub_results.pop().unwrap();
println!("r1");
for block in &r1.blocks {
println!("{:?}", block.ct.get_body().data);
}
println!();
println!("r2");
for block in &r2.blocks {
println!("{:?}", block.ct.get_body().data);
}
println!();
println!("r3");
for block in &r3.blocks {
println!("{:?}", block.ct.get_body().data);
}
println!();
[&mut o3, &mut o2, &mut o1]
.into_par_iter()
.zip(cmps.par_iter())
@@ -209,6 +249,13 @@ impl ServerKey {
self.boolean_bitor_assign(ox, cmpx);
});
println!("o1: {:?}", o1.0.ct.get_body().data);
println!("o2: {:?}", o2.0.ct.get_body().data);
println!("o3: {:?}", o3.0.ct.get_body().data);
println!("cmp1: {:?}", cmps[0].0.ct.get_body().data);
println!("cmp2: {:?}", cmps[1].0.ct.get_body().data);
println!("cmp3: {:?}", cmps[2].0.ct.get_body().data);
// The cx variables tell whether the corresponding result of the subtraction
// should be kept, and what value the quotient block should have
//
@@ -227,10 +274,20 @@ impl ServerKey {
};
let c0 = o1.0;
println!("c0: {:?}", c0.ct.get_body().data);
println!("c1: {:?}", c1.ct.get_body().data);
println!("c2: {:?}", c2.ct.get_body().data);
println!("c3: {:?}", c3.ct.get_body().data);
// println!("cpu_before_rem");
// for block in &rem.blocks {
// println!("{:?}", block.ct.get_body().data);
// }
// println!();
let (_, [q1, q2, q3]) = rayon::join(
|| {
[&c3, &c2, &c1, &c0]
.into_iter()
.zip([&mut r3, &mut r2, &mut r1, &mut rem])
.zip([
&zero_out_if_not_1_lut,
@@ -241,9 +298,25 @@ impl ServerKey {
.for_each(|((cx, rx), (lut, factor))| {
// Manual zero_out_if to avoid noise problems
rx.blocks.par_iter_mut().for_each(|block| {
// println!("cpu_before_scalar_mul_rem");
// println!("{:?}", block.ct.get_body().data);
// println!();
self.key.unchecked_scalar_mul_assign(block, *factor);
// println!("factor: {:?}", factor);
// println!("cpu_after_scalar_mul_rem");
// println!("{:?}", block.ct.get_body().data);
// println!();
self.key.unchecked_add_assign(block, cx);
// println!("cpu_after_add_rem");
// println!("{:?}", block.ct.get_body().data);
// println!();
self.key.apply_lookup_table_assign(block, lut);
// println!("cpu_after_pbs_rem");
// println!("{:?}", block.ct.get_body().data);
// println!();
});
});
},
@@ -258,6 +331,33 @@ impl ServerKey {
},
);
println!("cpu_after_r1");
for block in &r1.blocks {
println!("{:?}", block.ct.get_body().data);
}
println!();
println!("cpu_after_r2");
for block in &r2.blocks {
println!("{:?}", block.ct.get_body().data);
}
println!();
println!("cpu_after_r3");
for block in &r3.blocks {
println!("{:?}", block.ct.get_body().data);
}
println!();
println!("cpu_after_rem");
for block in &rem.blocks {
println!("{:?}", block.ct.get_body().data);
}
println!();
println!("cpu_after_q1: {:?}", q1.ct.get_body().data);
println!("cpu_after_q2: {:?}", q2.ct.get_body().data);
println!("cpu_after_q3: {:?}", q3.ct.get_body().data);
// Only the selected candidate among r3, r2, r1, and rem is non-zero, so
// summing them recovers the new remainder
for rx in [&r3, &r2, &r1] {
self.unchecked_add_assign(&mut rem, rx);