mirror of
https://github.com/zama-ai/tfhe-rs.git
synced 2026-04-28 03:01:21 -04:00
Compare commits
15 Commits
pa/feat/zk
...
as/lut_cac
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b4ea48165b | ||
|
|
0a6b62627d | ||
|
|
6deeb66bf8 | ||
|
|
17022dae69 | ||
|
|
09802dd5ee | ||
|
|
e3fe433a35 | ||
|
|
2bea35a3b5 | ||
|
|
e2bf226276 | ||
|
|
c66f1c6d8b | ||
|
|
9bfe190ad3 | ||
|
|
e40070db0e | ||
|
|
e8d5ceac68 | ||
|
|
f1526b29d8 | ||
|
|
602e0c5a19 | ||
|
|
163c1eeffb |
@@ -29,15 +29,13 @@ template <typename Torus> struct int_aes_lut_buffers {
|
||||
allocate_gpu_memory, size_tracker);
|
||||
std::function<Torus(Torus, Torus)> and_lambda =
|
||||
[](Torus a, Torus b) -> Torus { return a & b; };
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), this->and_lut->get_lut(0, 0),
|
||||
this->and_lut->get_degree(0), this->and_lut->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, and_lambda, allocate_gpu_memory);
|
||||
|
||||
auto active_streams_and_lut = streams.active_gpu_subset(
|
||||
SBOX_MAX_AND_GATES * num_aes_inputs * sbox_parallelism,
|
||||
params.pbs_type);
|
||||
this->and_lut->broadcast_lut(active_streams_and_lut);
|
||||
this->and_lut->generate_and_broadcast_bivariate_lut(
|
||||
active_streams_and_lut, {0}, {and_lambda}, allocate_gpu_memory);
|
||||
|
||||
this->and_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
|
||||
|
||||
this->flush_lut = new int_radix_lut<Torus>(
|
||||
@@ -46,14 +44,11 @@ template <typename Torus> struct int_aes_lut_buffers {
|
||||
std::function<Torus(Torus)> flush_lambda = [](Torus x) -> Torus {
|
||||
return x & 1;
|
||||
};
|
||||
generate_device_accumulator(
|
||||
streams.stream(0), streams.gpu_index(0), this->flush_lut->get_lut(0, 0),
|
||||
this->flush_lut->get_degree(0), this->flush_lut->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, flush_lambda, allocate_gpu_memory);
|
||||
|
||||
auto active_streams_flush_lut = streams.active_gpu_subset(
|
||||
AES_STATE_BITS * num_aes_inputs, params.pbs_type);
|
||||
this->flush_lut->broadcast_lut(active_streams_flush_lut);
|
||||
this->flush_lut->generate_and_broadcast_lut(
|
||||
active_streams_flush_lut, {0}, {flush_lambda}, allocate_gpu_memory);
|
||||
this->flush_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
|
||||
|
||||
this->carry_lut = new int_radix_lut<Torus>(
|
||||
@@ -61,14 +56,11 @@ template <typename Torus> struct int_aes_lut_buffers {
|
||||
std::function<Torus(Torus)> carry_lambda = [](Torus x) -> Torus {
|
||||
return (x >> 1) & 1;
|
||||
};
|
||||
generate_device_accumulator(
|
||||
streams.stream(0), streams.gpu_index(0), this->carry_lut->get_lut(0, 0),
|
||||
this->carry_lut->get_degree(0), this->carry_lut->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, carry_lambda, allocate_gpu_memory);
|
||||
|
||||
auto active_streams_carry_lut =
|
||||
streams.active_gpu_subset(num_aes_inputs, params.pbs_type);
|
||||
this->carry_lut->broadcast_lut(active_streams_carry_lut);
|
||||
this->carry_lut->generate_and_broadcast_lut(
|
||||
active_streams_carry_lut, {0}, {carry_lambda}, allocate_gpu_memory);
|
||||
this->carry_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
|
||||
}
|
||||
|
||||
|
||||
@@ -65,14 +65,8 @@ template <typename Torus> struct boolean_bitop_buffer {
|
||||
return x % params.message_modulus;
|
||||
};
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
message_extract_lut->get_lut(0, 0),
|
||||
message_extract_lut->get_degree(0),
|
||||
message_extract_lut->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
lut_f_message_extract, gpu_memory_allocated);
|
||||
message_extract_lut->broadcast_lut(active_streams);
|
||||
message_extract_lut->generate_and_broadcast_lut(
|
||||
active_streams, {0}, {lut_f_message_extract}, gpu_memory_allocated);
|
||||
}
|
||||
tmp_lwe_left = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
@@ -142,12 +136,8 @@ template <typename Torus> struct int_bitop_buffer {
|
||||
}
|
||||
};
|
||||
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
|
||||
lut->get_degree(0), lut->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, lut_bivariate_f, gpu_memory_allocated);
|
||||
lut->broadcast_lut(active_streams);
|
||||
lut->generate_and_broadcast_bivariate_lut(
|
||||
active_streams, {0}, {lut_bivariate_f}, gpu_memory_allocated);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
@@ -156,6 +146,8 @@ template <typename Torus> struct int_bitop_buffer {
|
||||
num_radix_blocks, allocate_gpu_memory,
|
||||
size_tracker);
|
||||
|
||||
std::vector<std::function<Torus(Torus)>> lut_funcs;
|
||||
std::vector<uint32_t> lut_indices;
|
||||
for (int i = 0; i < params.message_modulus; i++) {
|
||||
auto rhs = i;
|
||||
|
||||
@@ -171,14 +163,13 @@ template <typename Torus> struct int_bitop_buffer {
|
||||
return x ^ rhs;
|
||||
}
|
||||
};
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, i),
|
||||
lut->get_degree(i), lut->get_max_degree(i), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, lut_univariate_scalar_f,
|
||||
gpu_memory_allocated);
|
||||
lut->broadcast_lut(active_streams);
|
||||
|
||||
lut_funcs.push_back(lut_univariate_scalar_f);
|
||||
lut_indices.push_back(i);
|
||||
}
|
||||
|
||||
lut->generate_and_broadcast_lut(active_streams, lut_indices, lut_funcs,
|
||||
gpu_memory_allocated);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -211,16 +202,11 @@ template <typename Torus> struct boolean_bitnot_buffer {
|
||||
return x % message_modulus;
|
||||
};
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
message_extract_lut->get_lut(0, 0),
|
||||
message_extract_lut->get_degree(0),
|
||||
message_extract_lut->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
lut_f_message_extract, gpu_memory_allocated);
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(lwe_ciphertext_count, params.pbs_type);
|
||||
message_extract_lut->broadcast_lut(active_streams);
|
||||
|
||||
message_extract_lut->generate_and_broadcast_lut(
|
||||
active_streams, {0}, {lut_f_message_extract}, gpu_memory_allocated);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -28,21 +28,17 @@ template <typename Torus> struct int_extend_radix_with_sign_msb_buffer {
|
||||
uint32_t bits_per_block = std::log2(params.message_modulus);
|
||||
uint32_t msg_modulus = params.message_modulus;
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
|
||||
lut->get_degree(0), lut->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
[msg_modulus, bits_per_block](Torus x) {
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
|
||||
|
||||
lut->generate_and_broadcast_lut(
|
||||
active_streams, {0}, {[msg_modulus, bits_per_block](Torus x) {
|
||||
const auto xm = x % msg_modulus;
|
||||
const auto sign_bit = (xm >> (bits_per_block - 1)) & 1;
|
||||
return (Torus)((msg_modulus - 1) * sign_bit);
|
||||
},
|
||||
}},
|
||||
allocate_gpu_memory);
|
||||
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
|
||||
lut->broadcast_lut(active_streams);
|
||||
|
||||
this->last_block = new CudaRadixCiphertextFFI;
|
||||
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
|
||||
@@ -85,24 +85,6 @@ template <typename Torus> struct int_cmux_buffer {
|
||||
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), predicate_lut->get_lut(0, 0),
|
||||
predicate_lut->get_degree(0), predicate_lut->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, inverted_lut_f, gpu_memory_allocated);
|
||||
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), predicate_lut->get_lut(0, 1),
|
||||
predicate_lut->get_degree(1), predicate_lut->get_max_degree(1),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, lut_f, gpu_memory_allocated);
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
message_extract_lut->get_lut(0, 0), message_extract_lut->get_degree(0),
|
||||
message_extract_lut->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
message_extract_lut_f, gpu_memory_allocated);
|
||||
Torus *h_lut_indexes = predicate_lut->h_lut_indexes;
|
||||
for (int index = 0; index < 2 * num_radix_blocks; index++) {
|
||||
if (index < num_radix_blocks) {
|
||||
@@ -115,12 +97,18 @@ template <typename Torus> struct int_cmux_buffer {
|
||||
predicate_lut->get_lut_indexes(0, 0), h_lut_indexes,
|
||||
2 * num_radix_blocks * sizeof(Torus), streams.stream(0),
|
||||
streams.gpu_index(0), allocate_gpu_memory);
|
||||
|
||||
auto active_streams_pred =
|
||||
streams.active_gpu_subset(2 * num_radix_blocks, params.pbs_type);
|
||||
predicate_lut->broadcast_lut(active_streams_pred);
|
||||
predicate_lut->generate_and_broadcast_bivariate_lut(
|
||||
active_streams_pred, {0, 1}, {inverted_lut_f, lut_f},
|
||||
gpu_memory_allocated);
|
||||
|
||||
auto active_streams_msg =
|
||||
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
|
||||
message_extract_lut->broadcast_lut(active_streams_msg);
|
||||
|
||||
message_extract_lut->generate_and_broadcast_lut(
|
||||
active_streams_msg, {0}, {message_extract_lut_f}, gpu_memory_allocated);
|
||||
}
|
||||
|
||||
void release(CudaStreams streams) {
|
||||
|
||||
@@ -39,22 +39,21 @@ template <typename Torus> struct int_are_all_block_true_buffer {
|
||||
max_chunks, params.big_lwe_dimension, size_tracker,
|
||||
allocate_gpu_memory);
|
||||
|
||||
is_max_value = new int_radix_lut<Torus>(streams, params, 2, max_chunks,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
auto is_max_value_f = [max_value](Torus x) -> Torus {
|
||||
return x == max_value;
|
||||
};
|
||||
preallocated_h_lut = (Torus *)malloc(
|
||||
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus));
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), is_max_value->get_lut(0, 0),
|
||||
is_max_value->get_degree(0), is_max_value->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, is_max_value_f, gpu_memory_allocated);
|
||||
|
||||
is_max_value = new int_radix_lut<Torus>(streams, params, 2, max_chunks,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(max_chunks, params.pbs_type);
|
||||
is_max_value->broadcast_lut(active_streams);
|
||||
|
||||
auto is_max_value_f = [max_value](Torus x) -> Torus {
|
||||
return x == max_value;
|
||||
};
|
||||
|
||||
is_max_value->generate_and_broadcast_lut(
|
||||
active_streams, {0}, {is_max_value_f}, gpu_memory_allocated);
|
||||
}
|
||||
|
||||
void release(CudaStreams streams) {
|
||||
@@ -103,15 +102,10 @@ template <typename Torus> struct int_comparison_eq_buffer {
|
||||
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), is_non_zero_lut->get_lut(0, 0),
|
||||
is_non_zero_lut->get_degree(0), is_non_zero_lut->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, is_non_zero_lut_f, gpu_memory_allocated);
|
||||
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
|
||||
is_non_zero_lut->broadcast_lut(active_streams);
|
||||
is_non_zero_lut->generate_and_broadcast_lut(
|
||||
active_streams, {0}, {is_non_zero_lut_f}, gpu_memory_allocated);
|
||||
|
||||
// Scalar may have up to num_radix_blocks blocks
|
||||
scalar_comparison_luts = new int_radix_lut<Torus>(
|
||||
@@ -129,32 +123,28 @@ template <typename Torus> struct int_comparison_eq_buffer {
|
||||
return (lhs == rhs);
|
||||
}
|
||||
};
|
||||
|
||||
std::vector<std::function<Torus(Torus)>> lut_funcs;
|
||||
std::vector<uint32_t> lut_indices;
|
||||
for (int i = 0; i < total_modulus; i++) {
|
||||
auto lut_f = [i, operator_f](Torus x) -> Torus {
|
||||
return operator_f(i, x);
|
||||
};
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
scalar_comparison_luts->get_lut(0, i),
|
||||
scalar_comparison_luts->get_degree(i),
|
||||
scalar_comparison_luts->get_max_degree(i), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
lut_f, gpu_memory_allocated);
|
||||
lut_funcs.push_back(lut_f);
|
||||
lut_indices.push_back(i);
|
||||
}
|
||||
scalar_comparison_luts->broadcast_lut(active_streams);
|
||||
|
||||
scalar_comparison_luts->generate_and_broadcast_lut(
|
||||
active_streams, lut_indices, lut_funcs, gpu_memory_allocated);
|
||||
|
||||
if (op == COMPARISON_TYPE::EQ || op == COMPARISON_TYPE::NE) {
|
||||
operator_lut =
|
||||
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), operator_lut->get_lut(0, 0),
|
||||
operator_lut->get_degree(0), operator_lut->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, operator_f, gpu_memory_allocated);
|
||||
|
||||
operator_lut->broadcast_lut(active_streams);
|
||||
operator_lut->generate_and_broadcast_bivariate_lut(
|
||||
active_streams, {0}, {operator_f}, gpu_memory_allocated);
|
||||
// operator_lut->broadcast_lut(active_streams);
|
||||
} else {
|
||||
operator_lut = nullptr;
|
||||
}
|
||||
@@ -221,9 +211,6 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
|
||||
streams.stream(0), streams.gpu_index(0), tmp_y, num_radix_blocks,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
// LUTs
|
||||
tree_inner_leaf_lut =
|
||||
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
|
||||
tree_last_leaf_lut = new int_radix_lut<Torus>(
|
||||
streams, params, 1, 1, allocate_gpu_memory, size_tracker);
|
||||
@@ -234,15 +221,14 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
|
||||
tree_last_leaf_scalar_lut = new int_radix_lut<Torus>(
|
||||
streams, params, 1, 1, allocate_gpu_memory, size_tracker);
|
||||
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
tree_inner_leaf_lut->get_lut(0, 0), tree_inner_leaf_lut->get_degree(0),
|
||||
tree_inner_leaf_lut->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
block_selector_f, gpu_memory_allocated);
|
||||
tree_inner_leaf_lut =
|
||||
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
|
||||
tree_inner_leaf_lut->broadcast_lut(active_streams);
|
||||
tree_inner_leaf_lut->generate_and_broadcast_bivariate_lut(
|
||||
active_streams, {0}, {block_selector_f}, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void release(CudaStreams streams) {
|
||||
@@ -426,12 +412,8 @@ template <typename Torus> struct int_comparison_buffer {
|
||||
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), identity_lut->get_lut(0, 0),
|
||||
identity_lut->get_degree(0), identity_lut->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, identity_lut_f, gpu_memory_allocated);
|
||||
identity_lut->broadcast_lut(active_streams);
|
||||
identity_lut->generate_and_broadcast_lut(
|
||||
active_streams, {0}, {identity_lut_f}, gpu_memory_allocated);
|
||||
|
||||
uint32_t total_modulus = params.message_modulus * params.carry_modulus;
|
||||
auto is_zero_f = [total_modulus](Torus x) -> Torus {
|
||||
@@ -441,13 +423,8 @@ template <typename Torus> struct int_comparison_buffer {
|
||||
is_zero_lut = new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), is_zero_lut->get_lut(0, 0),
|
||||
is_zero_lut->get_degree(0), is_zero_lut->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, is_zero_f, gpu_memory_allocated);
|
||||
|
||||
is_zero_lut->broadcast_lut(active_streams);
|
||||
is_zero_lut->generate_and_broadcast_lut(active_streams, {0}, {is_zero_f},
|
||||
gpu_memory_allocated);
|
||||
|
||||
switch (op) {
|
||||
case COMPARISON_TYPE::MAX:
|
||||
@@ -522,13 +499,9 @@ template <typename Torus> struct int_comparison_buffer {
|
||||
PANIC("Cuda error: sign_lut creation failed due to wrong function.")
|
||||
};
|
||||
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), signed_lut->get_lut(0, 0),
|
||||
signed_lut->get_degree(0), signed_lut->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, signed_lut_f, gpu_memory_allocated);
|
||||
auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
|
||||
signed_lut->broadcast_lut(active_streams);
|
||||
signed_lut->generate_and_broadcast_bivariate_lut(
|
||||
active_streams, {0}, {signed_lut_f}, gpu_memory_allocated);
|
||||
}
|
||||
preallocated_h_lut = (Torus *)malloc(
|
||||
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus));
|
||||
|
||||
@@ -283,12 +283,9 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
|
||||
zero_out_if_not_1_lut_2};
|
||||
size_t lut_gpu_indexes[2] = {0, 3};
|
||||
for (int j = 0; j < 2; j++) {
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(lut_gpu_indexes[j]),
|
||||
streams.gpu_index(lut_gpu_indexes[j]), luts[j]->get_lut(0, 0),
|
||||
luts[j]->get_degree(0), luts[j]->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, zero_out_if_not_1_lut_f, gpu_memory_allocated);
|
||||
luts[j]->generate_and_broadcast_lut(streams.get_ith(lut_gpu_indexes[j]),
|
||||
{0}, {zero_out_if_not_1_lut_f},
|
||||
gpu_memory_allocated);
|
||||
}
|
||||
|
||||
luts[0] = zero_out_if_not_2_lut_1;
|
||||
@@ -296,12 +293,9 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
|
||||
lut_gpu_indexes[0] = 1;
|
||||
lut_gpu_indexes[1] = 2;
|
||||
for (int j = 0; j < 2; j++) {
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(lut_gpu_indexes[j]),
|
||||
streams.gpu_index(lut_gpu_indexes[j]), luts[j]->get_lut(0, 0),
|
||||
luts[j]->get_degree(0), luts[j]->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, zero_out_if_not_2_lut_f, gpu_memory_allocated);
|
||||
luts[j]->generate_and_broadcast_lut(streams.get_ith(lut_gpu_indexes[j]),
|
||||
{0}, {zero_out_if_not_2_lut_f},
|
||||
gpu_memory_allocated);
|
||||
}
|
||||
|
||||
quotient_lut_1 =
|
||||
@@ -321,21 +315,12 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
|
||||
};
|
||||
auto quotient_lut_3_f = [](Torus cond) -> Torus { return cond * 3; };
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(2), streams.gpu_index(2), quotient_lut_1->get_lut(0, 0),
|
||||
quotient_lut_1->get_degree(0), quotient_lut_1->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, quotient_lut_1_f, gpu_memory_allocated);
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(1), streams.gpu_index(1), quotient_lut_2->get_lut(0, 0),
|
||||
quotient_lut_2->get_degree(0), quotient_lut_2->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, quotient_lut_2_f, gpu_memory_allocated);
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), quotient_lut_3->get_lut(0, 0),
|
||||
quotient_lut_3->get_degree(0), quotient_lut_3->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, quotient_lut_3_f, gpu_memory_allocated);
|
||||
quotient_lut_1->generate_and_broadcast_lut(
|
||||
streams.get_ith(2), {0}, {quotient_lut_1_f}, gpu_memory_allocated);
|
||||
quotient_lut_2->generate_and_broadcast_lut(
|
||||
streams.get_ith(1), {0}, {quotient_lut_2_f}, gpu_memory_allocated);
|
||||
quotient_lut_3->generate_and_broadcast_lut(
|
||||
streams.get_ith(0), {0}, {quotient_lut_3_f}, gpu_memory_allocated);
|
||||
|
||||
message_extract_lut_1 = new int_radix_lut<Torus>(
|
||||
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
|
||||
@@ -350,15 +335,12 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
|
||||
luts[0] = message_extract_lut_1;
|
||||
luts[1] = message_extract_lut_2;
|
||||
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(num_blocks, params.pbs_type);
|
||||
|
||||
for (int j = 0; j < 2; j++) {
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), luts[j]->get_lut(0, 0),
|
||||
luts[j]->get_degree(0), luts[j]->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, lut_f_message_extract, gpu_memory_allocated);
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(num_blocks, params.pbs_type);
|
||||
luts[j]->broadcast_lut(active_streams);
|
||||
luts[j]->generate_and_broadcast_lut(
|
||||
active_streams, {0}, {lut_f_message_extract}, gpu_memory_allocated);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1007,24 +989,14 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
|
||||
masking_luts_2[i] = new int_radix_lut<Torus>(
|
||||
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
masking_luts_1[i]->get_lut(0, 0), masking_luts_1[i]->get_degree(0),
|
||||
masking_luts_1[i]->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
lut_f_masking, gpu_memory_allocated);
|
||||
auto active_streams_1 = streams.active_gpu_subset(1, params.pbs_type);
|
||||
masking_luts_1[i]->broadcast_lut(active_streams_1);
|
||||
masking_luts_1[i]->generate_and_broadcast_lut(
|
||||
active_streams_1, {0}, {lut_f_masking}, gpu_memory_allocated);
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
masking_luts_2[i]->get_lut(0, 0), masking_luts_2[i]->get_degree(0),
|
||||
masking_luts_2[i]->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
lut_f_masking, gpu_memory_allocated);
|
||||
auto active_streams_2 =
|
||||
streams.active_gpu_subset(num_blocks, params.pbs_type);
|
||||
masking_luts_2[i]->broadcast_lut(active_streams_2);
|
||||
masking_luts_2[i]->generate_and_broadcast_lut(
|
||||
active_streams_2, {0}, {lut_f_masking}, gpu_memory_allocated);
|
||||
}
|
||||
|
||||
// create and generate message_extract_lut_1 and message_extract_lut_2
|
||||
@@ -1042,15 +1014,12 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
|
||||
|
||||
int_radix_lut<Torus> *luts[2] = {message_extract_lut_1,
|
||||
message_extract_lut_2};
|
||||
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(num_blocks, params.pbs_type);
|
||||
for (int j = 0; j < 2; j++) {
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), luts[j]->get_lut(0, 0),
|
||||
luts[j]->get_degree(0), luts[j]->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, lut_f_message_extract, gpu_memory_allocated);
|
||||
luts[j]->broadcast_lut(active_streams);
|
||||
luts[j]->generate_and_broadcast_lut(
|
||||
active_streams, {0}, {lut_f_message_extract}, gpu_memory_allocated);
|
||||
}
|
||||
|
||||
// Give name to closures to improve readability
|
||||
@@ -1141,14 +1110,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
|
||||
merge_overflow_flags_luts[i] = new int_radix_lut<Torus>(
|
||||
streams, params, 1, 1, allocate_gpu_memory, size_tracker);
|
||||
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
merge_overflow_flags_luts[i]->get_lut(0, 0),
|
||||
merge_overflow_flags_luts[i]->get_degree(0),
|
||||
merge_overflow_flags_luts[i]->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, lut_f_bit, gpu_memory_allocated);
|
||||
merge_overflow_flags_luts[i]->broadcast_lut(active_gpu_count_for_bits);
|
||||
merge_overflow_flags_luts[i]->generate_and_broadcast_bivariate_lut(
|
||||
active_gpu_count_for_bits, {0}, {lut_f_bit}, gpu_memory_allocated);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1557,16 +1520,12 @@ template <typename Torus> struct int_div_rem_memory {
|
||||
compare_signed_bits_lut = new int_radix_lut<Torus>(
|
||||
streams, params, 1, 1, allocate_gpu_memory, size_tracker);
|
||||
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
compare_signed_bits_lut->get_lut(0, 0),
|
||||
compare_signed_bits_lut->get_degree(0),
|
||||
compare_signed_bits_lut->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
f_compare_extracted_signed_bits, gpu_memory_allocated);
|
||||
auto active_gpu_count_cmp =
|
||||
streams.active_gpu_subset(1, params.pbs_type); // only 1 block needed
|
||||
compare_signed_bits_lut->broadcast_lut(active_gpu_count_cmp);
|
||||
|
||||
compare_signed_bits_lut->generate_and_broadcast_bivariate_lut(
|
||||
active_gpu_count_cmp, {0}, {f_compare_extracted_signed_bits},
|
||||
gpu_memory_allocated);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -53,13 +53,8 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
|
||||
return count;
|
||||
};
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), univ_lut_mem->get_lut(0, 0),
|
||||
univ_lut_mem->get_degree(0), univ_lut_mem->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, generate_uni_lut_lambda, allocate_gpu_memory);
|
||||
|
||||
univ_lut_mem->broadcast_lut(active_streams);
|
||||
univ_lut_mem->generate_and_broadcast_lut(
|
||||
active_streams, {0}, {generate_uni_lut_lambda}, allocate_gpu_memory);
|
||||
|
||||
auto generate_bi_lut_lambda =
|
||||
[num_bits](Torus block_num_bit_count,
|
||||
@@ -70,13 +65,8 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
|
||||
return 0;
|
||||
};
|
||||
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), biv_lut_mem->get_lut(0, 0),
|
||||
biv_lut_mem->get_degree(0), biv_lut_mem->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, generate_bi_lut_lambda, allocate_gpu_memory);
|
||||
|
||||
biv_lut_mem->broadcast_lut(active_streams);
|
||||
biv_lut_mem->generate_and_broadcast_bivariate_lut(
|
||||
active_streams, {0}, {generate_bi_lut_lambda}, allocate_gpu_memory);
|
||||
|
||||
this->tmp_ct = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
@@ -232,7 +222,7 @@ template <typename Torus> struct int_ilog2_buffer {
|
||||
this->sum_output_not_propagated, counter_num_blocks,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
this->lut_message_not =
|
||||
lut_message_not =
|
||||
new int_radix_lut<Torus>(streams, params, 1, counter_num_blocks,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
std::function<Torus(Torus)> lut_message_lambda =
|
||||
@@ -240,16 +230,11 @@ template <typename Torus> struct int_ilog2_buffer {
|
||||
uint64_t message = x % this->params.message_modulus;
|
||||
return (~message) % this->params.message_modulus;
|
||||
};
|
||||
generate_device_accumulator(streams.stream(0), streams.gpu_index(0),
|
||||
this->lut_message_not->get_lut(0, 0),
|
||||
this->lut_message_not->get_degree(0),
|
||||
this->lut_message_not->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size,
|
||||
params.message_modulus, params.carry_modulus,
|
||||
lut_message_lambda, allocate_gpu_memory);
|
||||
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(counter_num_blocks, params.pbs_type);
|
||||
lut_message_not->broadcast_lut(active_streams);
|
||||
lut_message_not->generate_and_broadcast_lut(
|
||||
active_streams, {0}, {lut_message_lambda}, allocate_gpu_memory);
|
||||
|
||||
this->lut_carry_not =
|
||||
new int_radix_lut<Torus>(streams, params, 1, counter_num_blocks,
|
||||
@@ -259,13 +244,8 @@ template <typename Torus> struct int_ilog2_buffer {
|
||||
uint64_t carry = x / this->params.message_modulus;
|
||||
return (~carry) % this->params.message_modulus;
|
||||
};
|
||||
generate_device_accumulator(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
this->lut_carry_not->get_lut(0, 0), this->lut_carry_not->get_degree(0),
|
||||
this->lut_carry_not->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
lut_carry_lambda, allocate_gpu_memory);
|
||||
lut_carry_not->broadcast_lut(active_streams);
|
||||
lut_carry_not->generate_and_broadcast_lut(
|
||||
active_streams, {0}, {lut_carry_lambda}, allocate_gpu_memory);
|
||||
|
||||
this->message_blocks_not = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
#include "utils/helper_multi_gpu.cuh"
|
||||
#include <cmath>
|
||||
#include <functional>
|
||||
#include <map>
|
||||
#include <queue>
|
||||
|
||||
#include <stdio.h>
|
||||
@@ -835,6 +836,56 @@ struct int_radix_lut_custom_input_output {
|
||||
}
|
||||
}
|
||||
|
||||
// Generates one univariate LUT accumulator per entry of `lut_indexes`.
//
// For each position i, generate_device_accumulator<OutputTorus> is called
// with f[i]; it is given get_lut(0, lut_indexes[i]) together with
// get_degree / get_max_degree for that same LUT index, and the
// glwe_dimension, polynomial_size, message_modulus and carry_modulus taken
// from `params`. Every call uses streams.stream(0) / streams.gpu_index(0).
//
// The loop is bounded by lut_indexes.size() only and f is indexed in
// lockstep, so `f` must hold at least as many functions as `lut_indexes`
// holds indexes (no size check is performed here).
//
// NOTE(review): despite the method name, the trailing broadcast_lut(streams)
// call is commented out, so this method issues no broadcast as written.
// Confirm whether callers are expected to broadcast themselves.
void generate_and_broadcast_lut(
|
||||
const CudaStreams &streams, std::vector<uint32_t> lut_indexes,
|
||||
std::vector<std::function<OutputTorus(OutputTorus)>> f,
|
||||
bool gpu_memory_allocated) {
|
||||
// streams should be a subset of active_streams
|
||||
|
||||
for (uint32_t i = 0; i < lut_indexes.size(); ++i) {
|
||||
generate_device_accumulator<OutputTorus>(
|
||||
streams.stream(0), streams.gpu_index(0), get_lut(0, lut_indexes[i]),
|
||||
get_degree(lut_indexes[i]), get_max_degree(lut_indexes[i]),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, f[i], gpu_memory_allocated);
|
||||
}
|
||||
//broadcast_lut(streams);
|
||||
}
|
||||
|
||||
void generate_and_broadcast_bivariate_lut(
|
||||
const CudaStreams &streams, std::vector<uint32_t> lut_indexes,
|
||||
std::vector<std::function<OutputTorus(OutputTorus, OutputTorus)>> f,
|
||||
bool gpu_memory_allocated) {
|
||||
// streams should be a subset of active_streams
|
||||
|
||||
/* for (int fidx = 0; fidx < f.size(); ++fidx) {
|
||||
__int128_t f_hash = 0;
|
||||
uint32_t bits_per_lut_val = 5;
|
||||
uint32_t input_modulus_sup =
|
||||
params.message_modulus * params.carry_modulus;
|
||||
for (uint32_t i = 0; i < input_modulus_sup; ++i) {
|
||||
OutputTorus f_eval =
|
||||
f[fidx](i / params.message_modulus, i % params.message_modulus);
|
||||
GPU_ASSERT(f_eval < (1 << bits_per_lut_val),
|
||||
"LUT value expected bitwidth overflow");
|
||||
f_hash |= f_eval;
|
||||
f_hash <<= bits_per_lut_val;
|
||||
}
|
||||
printf("%016llX%016llX\n",
|
||||
(unsigned long long)((f_hash >> 64) & 0xFFFFFFFFFFFFFFFF),
|
||||
(unsigned long long)(f_hash & 0xFFFFFFFFFFFFFFFF));
|
||||
}
|
||||
*/
|
||||
for (uint32_t i = 0; i < lut_indexes.size(); ++i) {
|
||||
generate_device_accumulator_bivariate<InputTorus>(
|
||||
streams.stream(0), streams.gpu_index(0), get_lut(0, lut_indexes[i]),
|
||||
get_degree(lut_indexes[i]), get_max_degree(lut_indexes[i]),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, f[i], gpu_memory_allocated);
|
||||
}
|
||||
//broadcast_lut(streams);
|
||||
}
|
||||
|
||||
void release(CudaStreams streams) {
|
||||
PANIC_IF_FALSE(lut_indexes_vec.size() == lut_vec.size(),
|
||||
"Lut vec and Lut vec indexes must have the same size");
|
||||
@@ -985,18 +1036,15 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
|
||||
bits_per_block * num_radix_blocks,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
|
||||
std::vector<std::function<Torus(Torus)>> lut_funs;
|
||||
std::vector<uint32_t> lut_indices;
|
||||
for (int i = 0; i < bits_per_block; i++) {
|
||||
|
||||
auto operator_f = [i, final_offset](Torus x) -> Torus {
|
||||
Torus y = (x >> i) & 1;
|
||||
return y << final_offset;
|
||||
};
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, i),
|
||||
lut->get_degree(i), lut->get_max_degree(i), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
operator_f, gpu_memory_allocated);
|
||||
lut_funs.push_back(operator_f);
|
||||
lut_indices.push_back(i);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -1015,7 +1063,10 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
|
||||
|
||||
auto active_streams = streams.active_gpu_subset(
|
||||
bits_per_block * num_radix_blocks, params.pbs_type);
|
||||
lut->broadcast_lut(active_streams);
|
||||
|
||||
lut->generate_and_broadcast_lut(active_streams, lut_indices, lut_funs,
|
||||
gpu_memory_allocated);
|
||||
// lut->broadcast_lut(active_streams);
|
||||
|
||||
/**
|
||||
* the input indexes should take the first bits_per_block PBS to target
|
||||
@@ -1091,24 +1142,6 @@ template <typename Torus> struct int_fullprop_buffer {
|
||||
};
|
||||
|
||||
//
|
||||
Torus *lut_buffer_message = lut->get_lut(0, 0);
|
||||
uint64_t *message_degree = lut->get_degree(0);
|
||||
uint64_t *message_max_degree = lut->get_max_degree(0);
|
||||
Torus *lut_buffer_carry = lut->get_lut(0, 1);
|
||||
uint64_t *carry_degree = lut->get_degree(1);
|
||||
uint64_t *carry_max_degree = lut->get_max_degree(1);
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), lut_buffer_message,
|
||||
message_degree, message_max_degree, params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
lut_f_message, gpu_memory_allocated);
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), lut_buffer_carry, carry_degree,
|
||||
carry_max_degree, params.glwe_dimension, params.polynomial_size,
|
||||
params.message_modulus, params.carry_modulus, lut_f_carry,
|
||||
gpu_memory_allocated);
|
||||
|
||||
uint64_t lwe_indexes_size = 2 * sizeof(Torus);
|
||||
Torus *h_lwe_indexes = (Torus *)malloc(lwe_indexes_size);
|
||||
@@ -1118,9 +1151,15 @@ template <typename Torus> struct int_fullprop_buffer {
|
||||
cuda_memcpy_with_size_tracking_async_to_gpu(
|
||||
lwe_indexes, h_lwe_indexes, lwe_indexes_size, streams.stream(0),
|
||||
streams.gpu_index(0), allocate_gpu_memory);
|
||||
|
||||
//
|
||||
// No broadcast is needed because full prop is done on 1 single GPU.
|
||||
// By passing a single-GPU CudaStreams with streams.get_ith(0) the LUT is
|
||||
// not broadcast.
|
||||
//
|
||||
lut->generate_and_broadcast_lut(streams.get_ith(0), {0, 1},
|
||||
{lut_f_message, lut_f_carry},
|
||||
gpu_memory_allocated);
|
||||
|
||||
tmp_small_lwe_vector = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
@@ -1238,9 +1277,10 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
|
||||
if (total_ciphertexts > 0 ||
|
||||
reduce_degrees_for_single_carry_propagation) {
|
||||
uint64_t size_tracker = 0;
|
||||
allocated_luts_message_carry = true;
|
||||
luts_message_carry = new int_radix_lut<Torus>(
|
||||
streams, params, 2, pbs_count, true, size_tracker);
|
||||
allocated_luts_message_carry = true;
|
||||
|
||||
uint64_t message_modulus_bits =
|
||||
(uint64_t)std::log2(params.message_modulus);
|
||||
uint64_t carry_modulus_bits = (uint64_t)std::log2(params.carry_modulus);
|
||||
@@ -1256,7 +1296,9 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
|
||||
streams, upper_bound_num_blocks, size_tracker, true);
|
||||
}
|
||||
}
|
||||
|
||||
if (allocated_luts_message_carry) {
|
||||
|
||||
auto message_acc = luts_message_carry->get_lut(0, 0);
|
||||
auto carry_acc = luts_message_carry->get_lut(0, 1);
|
||||
|
||||
@@ -1268,22 +1310,11 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
|
||||
return x / message_modulus;
|
||||
};
|
||||
|
||||
// generate accumulators
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), message_acc,
|
||||
luts_message_carry->get_degree(0),
|
||||
luts_message_carry->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, message_modulus, params.carry_modulus,
|
||||
lut_f_message, gpu_memory_allocated);
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), carry_acc,
|
||||
luts_message_carry->get_degree(1),
|
||||
luts_message_carry->get_max_degree(1), params.glwe_dimension,
|
||||
params.polynomial_size, message_modulus, params.carry_modulus,
|
||||
lut_f_carry, gpu_memory_allocated);
|
||||
auto active_gpu_count_mc =
|
||||
streams.active_gpu_subset(pbs_count, params.pbs_type);
|
||||
luts_message_carry->broadcast_lut(active_gpu_count_mc);
|
||||
luts_message_carry->generate_and_broadcast_lut(
|
||||
active_gpu_count_mc, {0, 1}, {lut_f_message, lut_f_carry},
|
||||
gpu_memory_allocated);
|
||||
}
|
||||
}
|
||||
int_sum_ciphertexts_vec_memory(
|
||||
@@ -1418,10 +1449,6 @@ template <typename Torus> struct int_seq_group_prop_memory {
|
||||
uint32_t group_size, uint32_t big_lwe_size_bytes,
|
||||
bool allocate_gpu_memory, uint64_t &size_tracker) {
|
||||
gpu_memory_allocated = allocate_gpu_memory;
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
auto polynomial_size = params.polynomial_size;
|
||||
auto message_modulus = params.message_modulus;
|
||||
auto carry_modulus = params.carry_modulus;
|
||||
|
||||
grouping_size = group_size;
|
||||
group_resolved_carries = new CudaRadixCiphertextFFI;
|
||||
@@ -1431,22 +1458,20 @@ template <typename Torus> struct int_seq_group_prop_memory {
|
||||
allocate_gpu_memory);
|
||||
|
||||
int num_seq_luts = grouping_size - 1;
|
||||
Torus *h_seq_lut_indexes = (Torus *)malloc(num_seq_luts * sizeof(Torus));
|
||||
lut_sequential_algorithm =
|
||||
new int_radix_lut<Torus>(streams, params, num_seq_luts, num_seq_luts,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
std::vector<std::function<Torus(Torus)>> lut_funcs;
|
||||
std::vector<uint32_t> lut_indices;
|
||||
Torus *h_seq_lut_indexes = (Torus *)malloc(num_seq_luts * sizeof(Torus));
|
||||
|
||||
for (int index = 0; index < num_seq_luts; index++) {
|
||||
auto f_lut_sequential = [index](Torus propa_cum_sum_block) {
|
||||
return (propa_cum_sum_block >> (index + 1)) & 1;
|
||||
};
|
||||
auto seq_lut = lut_sequential_algorithm->get_lut(0, index);
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), seq_lut,
|
||||
lut_sequential_algorithm->get_degree(index),
|
||||
lut_sequential_algorithm->get_max_degree(index), glwe_dimension,
|
||||
polynomial_size, message_modulus, carry_modulus, f_lut_sequential,
|
||||
gpu_memory_allocated);
|
||||
lut_funcs.push_back(f_lut_sequential);
|
||||
h_seq_lut_indexes[index] = index;
|
||||
lut_indices.push_back(index);
|
||||
}
|
||||
Torus *seq_lut_indexes = lut_sequential_algorithm->get_lut_indexes(0, 0);
|
||||
cuda_memcpy_with_size_tracking_async_to_gpu(
|
||||
@@ -1454,9 +1479,12 @@ template <typename Torus> struct int_seq_group_prop_memory {
|
||||
streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(num_seq_luts, params.pbs_type);
|
||||
lut_sequential_algorithm->broadcast_lut(active_streams);
|
||||
lut_sequential_algorithm->generate_and_broadcast_lut(
|
||||
active_streams, lut_indices, lut_funcs, gpu_memory_allocated);
|
||||
// lut_sequential_algorithm->broadcast_lut(active_streams);
|
||||
free(h_seq_lut_indexes);
|
||||
};
|
||||
}
|
||||
|
||||
void release(CudaStreams streams) {
|
||||
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
||||
group_resolved_carries,
|
||||
@@ -1478,10 +1506,6 @@ template <typename Torus> struct int_hs_group_prop_memory {
|
||||
uint32_t num_groups, uint32_t big_lwe_size_bytes,
|
||||
bool allocate_gpu_memory, uint64_t &size_tracker) {
|
||||
gpu_memory_allocated = allocate_gpu_memory;
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
auto polynomial_size = params.polynomial_size;
|
||||
auto message_modulus = params.message_modulus;
|
||||
auto carry_modulus = params.carry_modulus;
|
||||
|
||||
auto f_lut_hillis_steele = [](Torus msb, Torus lsb) -> Torus {
|
||||
if (msb == 2) {
|
||||
@@ -1501,16 +1525,11 @@ template <typename Torus> struct int_hs_group_prop_memory {
|
||||
lut_hillis_steele = new int_radix_lut<Torus>(
|
||||
streams, params, 1, num_groups, allocate_gpu_memory, size_tracker);
|
||||
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
lut_hillis_steele->get_lut(0, 0), lut_hillis_steele->get_degree(0),
|
||||
lut_hillis_steele->get_max_degree(0), glwe_dimension, polynomial_size,
|
||||
message_modulus, carry_modulus, f_lut_hillis_steele,
|
||||
gpu_memory_allocated);
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(num_groups, params.pbs_type);
|
||||
lut_hillis_steele->broadcast_lut(active_streams);
|
||||
};
|
||||
lut_hillis_steele->generate_and_broadcast_bivariate_lut(
|
||||
active_streams, {0}, {f_lut_hillis_steele}, gpu_memory_allocated);
|
||||
}
|
||||
void release(CudaStreams streams) {
|
||||
|
||||
lut_hillis_steele->release(streams);
|
||||
@@ -1800,112 +1819,6 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
|
||||
num_extra_luts = 1;
|
||||
}
|
||||
|
||||
uint32_t num_luts_second_step = 2 * grouping_size + num_extra_luts;
|
||||
luts_array_second_step = new int_radix_lut<Torus>(
|
||||
streams, params, num_luts_second_step, num_radix_blocks,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
|
||||
// luts for first group inner propagation
|
||||
for (int lut_id = 0; lut_id < grouping_size - 1; lut_id++) {
|
||||
auto f_first_grouping_inner_propagation =
|
||||
[lut_id](Torus propa_cum_sum_block) -> Torus {
|
||||
uint64_t carry = (propa_cum_sum_block >> lut_id) & 1;
|
||||
|
||||
if (carry != 0) {
|
||||
return 2ull; // Generates Carry
|
||||
} else {
|
||||
return 0ull; // Does not generate carry
|
||||
}
|
||||
};
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
luts_array_second_step->get_lut(0, lut_id),
|
||||
luts_array_second_step->get_degree(lut_id),
|
||||
luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
|
||||
polynomial_size, message_modulus, carry_modulus,
|
||||
f_first_grouping_inner_propagation, gpu_memory_allocated);
|
||||
}
|
||||
|
||||
auto f_first_grouping_outer_propagation =
|
||||
[num_bits_in_block](Torus block) -> Torus {
|
||||
return (block >> (num_bits_in_block - 1)) & 1;
|
||||
};
|
||||
|
||||
int lut_id = grouping_size - 1;
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
luts_array_second_step->get_lut(0, lut_id),
|
||||
luts_array_second_step->get_degree(lut_id),
|
||||
luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
|
||||
polynomial_size, message_modulus, carry_modulus,
|
||||
f_first_grouping_outer_propagation, gpu_memory_allocated);
|
||||
|
||||
// for other groupings inner propagation
|
||||
for (int index = 0; index < grouping_size; index++) {
|
||||
uint32_t lut_id = index + grouping_size;
|
||||
|
||||
auto f_other_groupings_inner_propagation =
|
||||
[index](Torus propa_cum_sum_block) -> Torus {
|
||||
uint64_t mask = (2 << index) - 1;
|
||||
if (propa_cum_sum_block >= (2 << index)) {
|
||||
return 2ull; // Generates
|
||||
} else if ((propa_cum_sum_block & mask) == mask) {
|
||||
return 1ull; // Propagate
|
||||
} else {
|
||||
return 0ull; // Nothing
|
||||
}
|
||||
};
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
luts_array_second_step->get_lut(0, lut_id),
|
||||
luts_array_second_step->get_degree(lut_id),
|
||||
luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
|
||||
polynomial_size, message_modulus, carry_modulus,
|
||||
f_other_groupings_inner_propagation, gpu_memory_allocated);
|
||||
}
|
||||
|
||||
if (use_sequential_algorithm_to_resolve_group_carries) {
|
||||
for (int index = 0; index < grouping_size - 1; index++) {
|
||||
uint32_t lut_id = index + 2 * grouping_size;
|
||||
|
||||
auto f_group_propagation = [index, block_modulus,
|
||||
num_bits_in_block](Torus block) -> Torus {
|
||||
if (block == (block_modulus - 1)) {
|
||||
return 0ull;
|
||||
} else {
|
||||
return ((UINT64_MAX << index) % (1ull << (num_bits_in_block + 1)));
|
||||
}
|
||||
};
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
luts_array_second_step->get_lut(0, lut_id),
|
||||
luts_array_second_step->get_degree(lut_id),
|
||||
luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
|
||||
polynomial_size, message_modulus, carry_modulus,
|
||||
f_group_propagation, gpu_memory_allocated);
|
||||
}
|
||||
} else {
|
||||
uint32_t lut_id = 2 * grouping_size;
|
||||
auto f_group_propagation = [block_modulus](Torus block) {
|
||||
if (block == (block_modulus - 1)) {
|
||||
return 2ull;
|
||||
} else {
|
||||
return UINT64_MAX % (block_modulus * 2ull);
|
||||
}
|
||||
};
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
luts_array_second_step->get_lut(0, lut_id),
|
||||
luts_array_second_step->get_degree(lut_id),
|
||||
luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
|
||||
polynomial_size, message_modulus, carry_modulus, f_group_propagation,
|
||||
gpu_memory_allocated);
|
||||
}
|
||||
|
||||
Torus *h_second_lut_indexes = (Torus *)malloc(lut_indexes_size);
|
||||
|
||||
for (int index = 0; index < num_radix_blocks; index++) {
|
||||
@@ -1941,6 +1854,11 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t num_luts_second_step = 2 * grouping_size + num_extra_luts;
|
||||
luts_array_second_step = new int_radix_lut<Torus>(
|
||||
streams, params, num_luts_second_step, num_radix_blocks,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
|
||||
// copy the indexes to the gpu
|
||||
Torus *second_lut_indexes = luts_array_second_step->get_lut_indexes(0, 0);
|
||||
cuda_memcpy_with_size_tracking_async_to_gpu(
|
||||
@@ -1951,9 +1869,92 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
|
||||
scalar_array_cum_sum, h_scalar_array_cum_sum,
|
||||
num_radix_blocks * sizeof(Torus), streams.stream(0),
|
||||
streams.gpu_index(0), allocate_gpu_memory);
|
||||
|
||||
std::vector<std::function<Torus(Torus)>> lut_funcs;
|
||||
std::vector<uint32_t> lut_ids;
|
||||
|
||||
// luts for first group inner propagation
|
||||
for (int lut_id = 0; lut_id < grouping_size - 1; lut_id++) {
|
||||
auto f_first_grouping_inner_propagation =
|
||||
[lut_id](Torus propa_cum_sum_block) -> Torus {
|
||||
uint64_t carry = (propa_cum_sum_block >> lut_id) & 1;
|
||||
|
||||
if (carry != 0) {
|
||||
return 2ull; // Generates Carry
|
||||
} else {
|
||||
return 0ull; // Does not generate carry
|
||||
}
|
||||
};
|
||||
lut_funcs.push_back(f_first_grouping_inner_propagation);
|
||||
lut_ids.push_back(lut_id);
|
||||
}
|
||||
|
||||
auto f_first_grouping_outer_propagation =
|
||||
[num_bits_in_block](Torus block) -> Torus {
|
||||
return (block >> (num_bits_in_block - 1)) & 1;
|
||||
};
|
||||
|
||||
int lut_id = grouping_size - 1;
|
||||
|
||||
lut_funcs.push_back(f_first_grouping_outer_propagation);
|
||||
lut_ids.push_back(lut_id);
|
||||
|
||||
// for other groupings inner propagation
|
||||
for (int index = 0; index < grouping_size; index++) {
|
||||
uint32_t lut_id = index + grouping_size;
|
||||
|
||||
auto f_other_groupings_inner_propagation =
|
||||
[index](Torus propa_cum_sum_block) -> Torus {
|
||||
uint64_t mask = (2 << index) - 1;
|
||||
if (propa_cum_sum_block >= (2 << index)) {
|
||||
return 2ull; // Generates
|
||||
} else if ((propa_cum_sum_block & mask) == mask) {
|
||||
return 1ull; // Propagate
|
||||
} else {
|
||||
return 0ull; // Nothing
|
||||
}
|
||||
};
|
||||
|
||||
lut_funcs.push_back(f_other_groupings_inner_propagation);
|
||||
lut_ids.push_back(lut_id);
|
||||
}
|
||||
|
||||
if (use_sequential_algorithm_to_resolve_group_carries) {
|
||||
for (int index = 0; index < grouping_size - 1; index++) {
|
||||
uint32_t lut_id = index + 2 * grouping_size;
|
||||
|
||||
auto f_group_propagation = [index, block_modulus,
|
||||
num_bits_in_block](Torus block) -> Torus {
|
||||
if (block == (block_modulus - 1)) {
|
||||
return 0ull;
|
||||
} else {
|
||||
return ((UINT64_MAX << index) % (1ull << (num_bits_in_block + 1)));
|
||||
}
|
||||
};
|
||||
|
||||
lut_funcs.push_back(f_group_propagation);
|
||||
lut_ids.push_back(lut_id);
|
||||
}
|
||||
} else {
|
||||
uint32_t lut_id = 2 * grouping_size;
|
||||
auto f_group_propagation = [block_modulus](Torus block) {
|
||||
if (block == (block_modulus - 1)) {
|
||||
return 2ull;
|
||||
} else {
|
||||
return UINT64_MAX % (block_modulus * 2ull);
|
||||
}
|
||||
};
|
||||
|
||||
lut_funcs.push_back(f_group_propagation);
|
||||
lut_ids.push_back(lut_id);
|
||||
}
|
||||
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
|
||||
luts_array_second_step->broadcast_lut(active_streams);
|
||||
luts_array_second_step->generate_and_broadcast_lut(
|
||||
active_streams, lut_ids, lut_funcs, gpu_memory_allocated);
|
||||
|
||||
// luts_array_second_step->broadcast_lut(active_streams);
|
||||
|
||||
if (use_sequential_algorithm_to_resolve_group_carries) {
|
||||
|
||||
@@ -2041,12 +2042,28 @@ template <typename Torus> struct int_sc_prop_memory {
|
||||
uint32_t requested_flag;
|
||||
bool gpu_memory_allocated;
|
||||
|
||||
// Fills the host-side LUT index table of lut_message_extract so that the
// first `num_radix_blocks` entries select LUT 0 and the one extra trailing
// entry selects LUT 1, then enqueues the copy of those
// `num_radix_blocks + 1` entries to the device index buffer on
// stream(0) / gpu_index(0).
void setup_message_extract_indices_for_carry_async(CudaStreams streams,
                                                   uint32_t num_radix_blocks,
                                                   bool allocate_gpu_memory) {
  const uint32_t total_entries = num_radix_blocks + 1;
  Torus *host_indexes = lut_message_extract->h_lut_indexes;

  for (uint32_t entry = 0; entry < total_entries; ++entry) {
    // Every regular block uses LUT 0; only the trailing entry uses LUT 1.
    host_indexes[entry] = (entry < num_radix_blocks) ? 0 : 1;
  }

  cuda_memcpy_with_size_tracking_async_to_gpu(
      lut_message_extract->get_lut_indexes(0, 0), host_indexes,
      total_entries * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
      allocate_gpu_memory);
}
|
||||
|
||||
int_sc_prop_memory(CudaStreams streams, int_radix_params params,
|
||||
uint32_t num_radix_blocks, uint32_t requested_flag_in,
|
||||
bool allocate_gpu_memory, uint64_t &size_tracker) {
|
||||
gpu_memory_allocated = allocate_gpu_memory;
|
||||
this->params = params;
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
auto polynomial_size = params.polynomial_size;
|
||||
auto message_modulus = params.message_modulus;
|
||||
auto carry_modulus = params.carry_modulus;
|
||||
@@ -2069,24 +2086,6 @@ template <typename Torus> struct int_sc_prop_memory {
|
||||
streams, params, num_radix_blocks, grouping_size, num_groups,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
|
||||
// Step 3 elements
|
||||
int num_luts_message_extract =
|
||||
requested_flag == outputFlag::FLAG_NONE ? 1 : 2;
|
||||
lut_message_extract = new int_radix_lut<Torus>(
|
||||
streams, params, num_luts_message_extract, num_radix_blocks + 1,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
// lut for the first block in the first grouping
|
||||
auto f_message_extract = [message_modulus](Torus block) -> Torus {
|
||||
return (block >> 1) % message_modulus;
|
||||
};
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
lut_message_extract->get_lut(0, 0), lut_message_extract->get_degree(0),
|
||||
lut_message_extract->get_max_degree(0), glwe_dimension, polynomial_size,
|
||||
message_modulus, carry_modulus, f_message_extract,
|
||||
gpu_memory_allocated);
|
||||
|
||||
// This stores a single block that will be used to store the overflow or
|
||||
// carry results
|
||||
output_flag = new CudaRadixCiphertextFFI;
|
||||
@@ -2137,22 +2136,30 @@ template <typename Torus> struct int_sc_prop_memory {
|
||||
return output1 << 3 | output2 << 2;
|
||||
};
|
||||
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
lut_overflow_flag_prep->get_lut(0, 0),
|
||||
lut_overflow_flag_prep->get_degree(0),
|
||||
lut_overflow_flag_prep->get_max_degree(0), glwe_dimension,
|
||||
polynomial_size, message_modulus, carry_modulus, f_overflow_fp,
|
||||
gpu_memory_allocated);
|
||||
|
||||
auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
|
||||
lut_overflow_flag_prep->broadcast_lut(active_streams);
|
||||
lut_overflow_flag_prep->generate_and_broadcast_bivariate_lut(
|
||||
active_streams, {0}, {f_overflow_fp}, gpu_memory_allocated);
|
||||
}
|
||||
|
||||
// Step 3 elements
|
||||
int num_luts_message_extract =
|
||||
requested_flag == outputFlag::FLAG_NONE ? 1 : 2;
|
||||
lut_message_extract = new int_radix_lut<Torus>(
|
||||
streams, params, num_luts_message_extract, num_radix_blocks + 1,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
// lut for the first block in the first grouping
|
||||
auto f_message_extract = [message_modulus](Torus block) -> Torus {
|
||||
return (block >> 1) % message_modulus;
|
||||
};
|
||||
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(num_radix_blocks + 1, params.pbs_type);
|
||||
|
||||
// For the final cleanup in case of overflow or carry (it seems that I can)
|
||||
// It seems that this lut could be applied together with the other one but for
|
||||
// now we won't do it
|
||||
if (requested_flag == outputFlag::FLAG_OVERFLOW) { // Overflow case
|
||||
switch (requested_flag) {
|
||||
case outputFlag::FLAG_OVERFLOW: { // Overflow case
|
||||
auto f_overflow_last = [num_radix_blocks,
|
||||
requested_flag_in](Torus block) -> Torus {
|
||||
uint32_t position = (num_radix_blocks == 1 &&
|
||||
@@ -2164,62 +2171,38 @@ template <typename Torus> struct int_sc_prop_memory {
|
||||
Torus does_overflow_if_carry_is_0 = (block >> 2) & 1;
|
||||
if (input_carry == outputFlag::FLAG_OVERFLOW) {
|
||||
return does_overflow_if_carry_is_1;
|
||||
} else {
|
||||
return does_overflow_if_carry_is_0;
|
||||
}
|
||||
return does_overflow_if_carry_is_0;
|
||||
};
|
||||
setup_message_extract_indices_for_carry_async(streams, num_radix_blocks,
|
||||
allocate_gpu_memory);
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
lut_message_extract->get_lut(0, 1),
|
||||
lut_message_extract->get_degree(1),
|
||||
lut_message_extract->get_max_degree(1), glwe_dimension,
|
||||
polynomial_size, message_modulus, carry_modulus, f_overflow_last,
|
||||
lut_message_extract->generate_and_broadcast_lut(
|
||||
active_streams, {0, 1}, {f_message_extract, f_overflow_last},
|
||||
gpu_memory_allocated);
|
||||
|
||||
Torus *h_lut_indexes = lut_message_extract->h_lut_indexes;
|
||||
for (int index = 0; index < num_radix_blocks + 1; index++) {
|
||||
if (index < num_radix_blocks) {
|
||||
h_lut_indexes[index] = 0;
|
||||
} else {
|
||||
h_lut_indexes[index] = 1;
|
||||
}
|
||||
}
|
||||
cuda_memcpy_with_size_tracking_async_to_gpu(
|
||||
lut_message_extract->get_lut_indexes(0, 0), h_lut_indexes,
|
||||
(num_radix_blocks + 1) * sizeof(Torus), streams.stream(0),
|
||||
streams.gpu_index(0), allocate_gpu_memory);
|
||||
break;
|
||||
}
|
||||
if (requested_flag == outputFlag::FLAG_CARRY) { // Carry case
|
||||
case outputFlag::FLAG_CARRY: { // Carry case
|
||||
|
||||
setup_message_extract_indices_for_carry_async(streams, num_radix_blocks,
|
||||
allocate_gpu_memory);
|
||||
|
||||
auto f_carry_last = [](Torus block) -> Torus {
|
||||
return ((block >> 2) & 1);
|
||||
};
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
lut_message_extract->get_lut(0, 1),
|
||||
lut_message_extract->get_degree(1),
|
||||
lut_message_extract->get_max_degree(1), glwe_dimension,
|
||||
polynomial_size, message_modulus, carry_modulus, f_carry_last,
|
||||
lut_message_extract->generate_and_broadcast_lut(
|
||||
active_streams, {0, 1}, {f_message_extract, f_carry_last},
|
||||
gpu_memory_allocated);
|
||||
|
||||
Torus *h_lut_indexes = lut_message_extract->h_lut_indexes;
|
||||
for (int index = 0; index < num_radix_blocks + 1; index++) {
|
||||
if (index < num_radix_blocks) {
|
||||
h_lut_indexes[index] = 0;
|
||||
} else {
|
||||
h_lut_indexes[index] = 1;
|
||||
}
|
||||
}
|
||||
cuda_memcpy_with_size_tracking_async_to_gpu(
|
||||
lut_message_extract->get_lut_indexes(0, 0), h_lut_indexes,
|
||||
(num_radix_blocks + 1) * sizeof(Torus), streams.stream(0),
|
||||
streams.gpu_index(0), allocate_gpu_memory);
|
||||
break;
|
||||
}
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(num_radix_blocks + 1, params.pbs_type);
|
||||
lut_message_extract->broadcast_lut(active_streams);
|
||||
default:
|
||||
lut_message_extract->generate_and_broadcast_lut(
|
||||
active_streams, {0}, {f_message_extract}, gpu_memory_allocated);
|
||||
break;
|
||||
}
|
||||
|
||||
// lut_message_extract->broadcast_lut(active_streams);
|
||||
};
|
||||
|
||||
void release(CudaStreams streams) {
|
||||
@@ -2517,16 +2500,11 @@ template <typename Torus> struct int_borrow_prop_memory {
|
||||
return (block >> 1) % message_modulus;
|
||||
};
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
lut_message_extract->get_lut(0, 0), lut_message_extract->get_degree(0),
|
||||
lut_message_extract->get_max_degree(0), glwe_dimension, polynomial_size,
|
||||
message_modulus, carry_modulus, f_message_extract,
|
||||
gpu_memory_allocated);
|
||||
active_streams =
|
||||
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
|
||||
|
||||
lut_message_extract->broadcast_lut(active_streams);
|
||||
lut_message_extract->generate_and_broadcast_lut(
|
||||
active_streams, {0}, {f_message_extract}, gpu_memory_allocated);
|
||||
|
||||
if (compute_overflow) {
|
||||
lut_borrow_flag =
|
||||
@@ -2537,12 +2515,8 @@ template <typename Torus> struct int_borrow_prop_memory {
|
||||
return ((block >> 2) & 1);
|
||||
};
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
lut_borrow_flag->get_lut(0, 0), lut_borrow_flag->get_degree(0),
|
||||
lut_borrow_flag->get_max_degree(0), glwe_dimension, polynomial_size,
|
||||
message_modulus, carry_modulus, f_borrow_flag, gpu_memory_allocated);
|
||||
lut_borrow_flag->broadcast_lut(active_streams);
|
||||
lut_borrow_flag->generate_and_broadcast_lut(
|
||||
active_streams, {0}, {f_borrow_flag}, gpu_memory_allocated);
|
||||
}
|
||||
|
||||
active_streams =
|
||||
|
||||
@@ -37,17 +37,14 @@ template <typename Torus> struct int_mul_memory {
|
||||
zero_out_predicate_lut =
|
||||
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
zero_out_predicate_lut->get_lut(0, 0),
|
||||
zero_out_predicate_lut->get_degree(0),
|
||||
zero_out_predicate_lut->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
zero_out_predicate_lut_f, gpu_memory_allocated);
|
||||
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
|
||||
zero_out_predicate_lut->broadcast_lut(active_streams);
|
||||
zero_out_predicate_lut->generate_and_broadcast_bivariate_lut(
|
||||
active_streams, {0}, {zero_out_predicate_lut_f},
|
||||
gpu_memory_allocated);
|
||||
|
||||
// zero_out_predicate_lut->broadcast_lut(active_streams);
|
||||
|
||||
zero_out_mem = new int_zero_out_if_buffer<Torus>(
|
||||
streams, params, num_radix_blocks, allocate_gpu_memory, size_tracker);
|
||||
@@ -55,10 +52,7 @@ template <typename Torus> struct int_mul_memory {
|
||||
return;
|
||||
}
|
||||
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
auto polynomial_size = params.polynomial_size;
|
||||
auto message_modulus = params.message_modulus;
|
||||
auto carry_modulus = params.carry_modulus;
|
||||
|
||||
// 'vector_result_lsb' contains blocks from all possible shifts of
|
||||
// radix_lwe_left excluding zero ciphertext blocks
|
||||
@@ -102,18 +96,6 @@ template <typename Torus> struct int_mul_memory {
|
||||
return (x * y) / message_modulus;
|
||||
};
|
||||
|
||||
// generate accumulators
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), lsb_acc,
|
||||
luts_array->get_degree(0), luts_array->get_max_degree(0),
|
||||
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
|
||||
lut_f_lsb, gpu_memory_allocated);
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), msb_acc,
|
||||
luts_array->get_degree(1), luts_array->get_max_degree(1),
|
||||
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
|
||||
lut_f_msb, gpu_memory_allocated);
|
||||
|
||||
// lut_indexes_vec for luts_array should be reinitialized
|
||||
// first lsb_vector_block_count value should reference to lsb_acc
|
||||
// last msb_vector_block_count values should reference to msb_acc
|
||||
@@ -123,9 +105,12 @@ template <typename Torus> struct int_mul_memory {
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
luts_array->get_lut_indexes(0, lsb_vector_block_count), 1,
|
||||
msb_vector_block_count);
|
||||
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(total_block_count, params.pbs_type);
|
||||
luts_array->broadcast_lut(active_streams);
|
||||
luts_array->generate_and_broadcast_bivariate_lut(
|
||||
active_streams, {0, 1}, {lut_f_lsb, lut_f_msb}, gpu_memory_allocated);
|
||||
|
||||
// create memory object for sum ciphertexts
|
||||
sum_ciphertexts_mem = new int_sum_ciphertexts_vec_memory<Torus>(
|
||||
streams, params, num_radix_blocks, 2 * num_radix_blocks,
|
||||
|
||||
@@ -85,15 +85,11 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
|
||||
}
|
||||
|
||||
// right shift
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
cur_lut_bivariate->get_lut(0, 0), cur_lut_bivariate->get_degree(0),
|
||||
cur_lut_bivariate->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
shift_lut_f, gpu_memory_allocated);
|
||||
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
|
||||
cur_lut_bivariate->broadcast_lut(active_streams);
|
||||
cur_lut_bivariate->generate_and_broadcast_bivariate_lut(
|
||||
active_streams, {0}, {shift_lut_f}, gpu_memory_allocated);
|
||||
|
||||
lut_buffers_bivariate.push_back(cur_lut_bivariate);
|
||||
}
|
||||
@@ -172,16 +168,10 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
|
||||
}
|
||||
|
||||
// right shift
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
cur_lut_bivariate->get_lut(0, 0), cur_lut_bivariate->get_degree(0),
|
||||
cur_lut_bivariate->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
shift_lut_f, gpu_memory_allocated);
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
|
||||
cur_lut_bivariate->broadcast_lut(active_streams);
|
||||
|
||||
cur_lut_bivariate->generate_and_broadcast_bivariate_lut(
|
||||
active_streams, {0}, {shift_lut_f}, gpu_memory_allocated);
|
||||
lut_buffers_bivariate.push_back(cur_lut_bivariate);
|
||||
}
|
||||
}
|
||||
@@ -271,16 +261,11 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
|
||||
return shifted | padding;
|
||||
};
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
shift_last_block_lut_univariate->get_lut(0, 0),
|
||||
shift_last_block_lut_univariate->get_degree(0),
|
||||
shift_last_block_lut_univariate->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, last_block_lut_f, gpu_memory_allocated);
|
||||
auto active_streams_shift_last =
|
||||
streams.active_gpu_subset(1, params.pbs_type);
|
||||
shift_last_block_lut_univariate->broadcast_lut(active_streams_shift_last);
|
||||
shift_last_block_lut_univariate->generate_and_broadcast_lut(
|
||||
active_streams_shift_last, {0}, {last_block_lut_f},
|
||||
gpu_memory_allocated);
|
||||
|
||||
lut_buffers_univariate.push_back(shift_last_block_lut_univariate);
|
||||
}
|
||||
@@ -298,15 +283,8 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
|
||||
return (params.message_modulus - 1) * x_sign_bit;
|
||||
};
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
padding_block_lut_univariate->get_lut(0, 0),
|
||||
padding_block_lut_univariate->get_degree(0),
|
||||
padding_block_lut_univariate->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
padding_block_lut_f, gpu_memory_allocated);
|
||||
// auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
|
||||
padding_block_lut_univariate->broadcast_lut(active_streams);
|
||||
padding_block_lut_univariate->generate_and_broadcast_lut(
|
||||
active_streams, {0}, {padding_block_lut_f}, gpu_memory_allocated);
|
||||
|
||||
lut_buffers_univariate.push_back(padding_block_lut_univariate);
|
||||
|
||||
@@ -339,16 +317,11 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
|
||||
return message_of_current_block + carry_of_previous_block;
|
||||
};
|
||||
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
shift_blocks_lut_bivariate->get_lut(0, 0),
|
||||
shift_blocks_lut_bivariate->get_degree(0),
|
||||
shift_blocks_lut_bivariate->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
blocks_lut_f, gpu_memory_allocated);
|
||||
auto active_streams_shift_blocks =
|
||||
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
|
||||
shift_blocks_lut_bivariate->broadcast_lut(active_streams_shift_blocks);
|
||||
shift_blocks_lut_bivariate->generate_and_broadcast_bivariate_lut(
|
||||
active_streams_shift_blocks, {0}, {blocks_lut_f},
|
||||
gpu_memory_allocated);
|
||||
|
||||
lut_buffers_bivariate.push_back(shift_blocks_lut_bivariate);
|
||||
}
|
||||
|
||||
@@ -113,27 +113,21 @@ template <typename Torus> struct int_shift_and_rotate_buffer {
|
||||
else
|
||||
return current_bit;
|
||||
};
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), mux_lut->get_lut(0, 0),
|
||||
mux_lut->get_degree(0), mux_lut->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, mux_lut_f, gpu_memory_allocated);
|
||||
;
|
||||
auto active_gpu_count_mux = streams.active_gpu_subset(
|
||||
bits_per_block * num_radix_blocks, params.pbs_type);
|
||||
mux_lut->broadcast_lut(active_gpu_count_mux);
|
||||
|
||||
mux_lut->generate_and_broadcast_lut(active_gpu_count_mux, {0}, {mux_lut_f},
|
||||
gpu_memory_allocated);
|
||||
|
||||
auto cleaning_lut_f = [params](Torus x) -> Torus {
|
||||
return x % params.message_modulus;
|
||||
};
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), cleaning_lut->get_lut(0, 0),
|
||||
cleaning_lut->get_degree(0), cleaning_lut->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, cleaning_lut_f, gpu_memory_allocated);
|
||||
|
||||
auto active_gpu_count_cleaning =
|
||||
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
|
||||
cleaning_lut->broadcast_lut(active_gpu_count_cleaning);
|
||||
cleaning_lut->generate_and_broadcast_lut(
|
||||
active_gpu_count_cleaning, {0}, {cleaning_lut_f}, gpu_memory_allocated);
|
||||
}
|
||||
|
||||
void release(CudaStreams streams) {
|
||||
|
||||
@@ -74,45 +74,26 @@ template <typename Torus> struct int_overflowing_sub_memory {
|
||||
luts_array, size_tracker,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
|
||||
auto lut_does_block_generate_carry = luts_array->get_lut(0, 0);
|
||||
auto lut_does_block_generate_or_propagate = luts_array->get_lut(0, 1);
|
||||
|
||||
// generate luts (aka accumulators)
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), lut_does_block_generate_carry,
|
||||
luts_array->get_degree(0), luts_array->get_max_degree(0),
|
||||
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
|
||||
f_lut_does_block_generate_carry, gpu_memory_allocated);
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
lut_does_block_generate_or_propagate, luts_array->get_degree(1),
|
||||
luts_array->get_max_degree(1), glwe_dimension, polynomial_size,
|
||||
message_modulus, carry_modulus, f_lut_does_block_generate_or_propagate,
|
||||
gpu_memory_allocated);
|
||||
if (allocate_gpu_memory)
|
||||
cuda_set_value_async<Torus>(streams.stream(0), streams.gpu_index(0),
|
||||
luts_array->get_lut_indexes(0, 1), 1,
|
||||
num_radix_blocks - 1);
|
||||
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
luts_borrow_propagation_sum->get_lut(0, 0),
|
||||
luts_borrow_propagation_sum->get_degree(0),
|
||||
luts_borrow_propagation_sum->get_max_degree(0), glwe_dimension,
|
||||
polynomial_size, message_modulus, carry_modulus,
|
||||
f_luts_borrow_propagation_sum, gpu_memory_allocated);
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), message_acc->get_lut(0, 0),
|
||||
message_acc->get_degree(0), message_acc->get_max_degree(0),
|
||||
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
|
||||
f_message_acc, gpu_memory_allocated);
|
||||
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
|
||||
luts_array->broadcast_lut(active_streams);
|
||||
luts_borrow_propagation_sum->broadcast_lut(active_streams);
|
||||
message_acc->broadcast_lut(active_streams);
|
||||
luts_borrow_propagation_sum->generate_and_broadcast_bivariate_lut(
|
||||
active_streams, {0}, {f_luts_borrow_propagation_sum},
|
||||
gpu_memory_allocated);
|
||||
|
||||
luts_array->generate_and_broadcast_lut(
|
||||
active_streams, {0, 1},
|
||||
{f_lut_does_block_generate_carry,
|
||||
f_lut_does_block_generate_or_propagate},
|
||||
gpu_memory_allocated);
|
||||
// generate luts (aka accumulators)
|
||||
|
||||
message_acc->generate_and_broadcast_lut(
|
||||
active_streams, {0}, {f_message_acc}, gpu_memory_allocated);
|
||||
}
|
||||
|
||||
void release(CudaStreams streams) {
|
||||
|
||||
@@ -298,14 +298,10 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {
|
||||
int_radix_lut<Torus> *lut = new int_radix_lut<Torus>(
|
||||
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
|
||||
lut->get_degree(0), lut->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
id_fn, allocate_gpu_memory);
|
||||
lut->generate_and_broadcast_lut(
|
||||
streams.active_gpu_subset(num_blocks, params.pbs_type), {0}, {id_fn},
|
||||
allocate_gpu_memory);
|
||||
|
||||
lut->broadcast_lut(
|
||||
streams.active_gpu_subset(num_blocks, params.pbs_type));
|
||||
this->stream_identity_luts[i] = lut;
|
||||
}
|
||||
|
||||
@@ -318,27 +314,17 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {
|
||||
|
||||
this->message_extract_lut = new int_radix_lut<Torus>(
|
||||
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
this->message_extract_lut->get_lut(0, 0),
|
||||
this->message_extract_lut->get_degree(0),
|
||||
this->message_extract_lut->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
msg_fn, allocate_gpu_memory);
|
||||
this->message_extract_lut->broadcast_lut(
|
||||
streams.active_gpu_subset(num_blocks, params.pbs_type));
|
||||
|
||||
this->message_extract_lut->generate_and_broadcast_lut(
|
||||
streams.active_gpu_subset(num_blocks, params.pbs_type), {0}, {msg_fn},
|
||||
allocate_gpu_memory);
|
||||
|
||||
this->carry_extract_lut = new int_radix_lut<Torus>(
|
||||
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
this->carry_extract_lut->get_lut(0, 0),
|
||||
this->carry_extract_lut->get_degree(0),
|
||||
this->carry_extract_lut->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
carry_fn, allocate_gpu_memory);
|
||||
this->carry_extract_lut->broadcast_lut(
|
||||
streams.active_gpu_subset(num_blocks, params.pbs_type));
|
||||
|
||||
this->carry_extract_lut->generate_and_broadcast_lut(
|
||||
streams.active_gpu_subset(num_blocks, params.pbs_type), {0}, {carry_fn},
|
||||
allocate_gpu_memory);
|
||||
|
||||
this->partial_aggregated_vectors =
|
||||
new CudaRadixCiphertextFFI *[num_streams];
|
||||
@@ -1185,15 +1171,9 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
|
||||
this->prefix_sum_lut = new int_radix_lut<Torus>(
|
||||
streams, params, 2, num_inputs, allocate_gpu_memory, size_tracker);
|
||||
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
this->prefix_sum_lut->get_lut(0, 0),
|
||||
this->prefix_sum_lut->get_degree(0),
|
||||
this->prefix_sum_lut->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
prefix_sum_fn, allocate_gpu_memory);
|
||||
this->prefix_sum_lut->broadcast_lut(
|
||||
streams.active_gpu_subset(num_inputs, params.pbs_type));
|
||||
this->prefix_sum_lut->generate_and_broadcast_bivariate_lut(
|
||||
streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
|
||||
{prefix_sum_fn}, allocate_gpu_memory);
|
||||
|
||||
auto cleanup_fn = [ALREADY_SEEN, params](Torus x) -> Torus {
|
||||
Torus val = x % params.message_modulus;
|
||||
@@ -1203,14 +1183,9 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
|
||||
};
|
||||
this->cleanup_lut = new int_radix_lut<Torus>(
|
||||
streams, params, 1, num_inputs, allocate_gpu_memory, size_tracker);
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
this->cleanup_lut->get_lut(0, 0), this->cleanup_lut->get_degree(0),
|
||||
this->cleanup_lut->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
cleanup_fn, allocate_gpu_memory);
|
||||
this->cleanup_lut->broadcast_lut(
|
||||
streams.active_gpu_subset(num_inputs, params.pbs_type));
|
||||
this->cleanup_lut->generate_and_broadcast_lut(
|
||||
streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
|
||||
{cleanup_fn}, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void release(CudaStreams streams) {
|
||||
@@ -1376,15 +1351,9 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
|
||||
this->prefix_sum_lut = new int_radix_lut<Torus>(
|
||||
streams, params, 2, num_inputs, allocate_gpu_memory, size_tracker);
|
||||
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
this->prefix_sum_lut->get_lut(0, 0),
|
||||
this->prefix_sum_lut->get_degree(0),
|
||||
this->prefix_sum_lut->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
prefix_sum_fn, allocate_gpu_memory);
|
||||
this->prefix_sum_lut->broadcast_lut(
|
||||
streams.active_gpu_subset(num_inputs, params.pbs_type));
|
||||
this->prefix_sum_lut->generate_and_broadcast_bivariate_lut(
|
||||
streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
|
||||
{prefix_sum_fn}, allocate_gpu_memory);
|
||||
|
||||
auto cleanup_fn = [ALREADY_SEEN, params](Torus x) -> Torus {
|
||||
Torus val = x % params.message_modulus;
|
||||
@@ -1394,14 +1363,9 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
|
||||
};
|
||||
this->cleanup_lut = new int_radix_lut<Torus>(
|
||||
streams, params, 1, num_inputs, allocate_gpu_memory, size_tracker);
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
this->cleanup_lut->get_lut(0, 0), this->cleanup_lut->get_degree(0),
|
||||
this->cleanup_lut->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
cleanup_fn, allocate_gpu_memory);
|
||||
this->cleanup_lut->broadcast_lut(
|
||||
streams.active_gpu_subset(num_inputs, params.pbs_type));
|
||||
this->cleanup_lut->generate_and_broadcast_lut(
|
||||
streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
|
||||
{cleanup_fn}, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void release(CudaStreams streams) {
|
||||
|
||||
@@ -30,15 +30,10 @@ template <typename Torus> struct int_trivium_lut_buffers {
|
||||
std::function<Torus(Torus, Torus)> and_lambda =
|
||||
[](Torus a, Torus b) -> Torus { return (a & 1) & (b & 1); };
|
||||
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), this->and_lut->get_lut(0, 0),
|
||||
this->and_lut->get_degree(0), this->and_lut->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, and_lambda, allocate_gpu_memory);
|
||||
|
||||
auto active_streams_and =
|
||||
streams.active_gpu_subset(total_lut_ops, params.pbs_type);
|
||||
this->and_lut->broadcast_lut(active_streams_and);
|
||||
this->and_lut->generate_and_broadcast_bivariate_lut(
|
||||
active_streams_and, {0}, {and_lambda}, allocate_gpu_memory);
|
||||
this->and_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
|
||||
|
||||
uint32_t total_flush_ops = num_trivium_inputs * BATCH_SIZE * 4;
|
||||
@@ -50,15 +45,10 @@ template <typename Torus> struct int_trivium_lut_buffers {
|
||||
return x & 1;
|
||||
};
|
||||
|
||||
generate_device_accumulator(
|
||||
streams.stream(0), streams.gpu_index(0), this->flush_lut->get_lut(0, 0),
|
||||
this->flush_lut->get_degree(0), this->flush_lut->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, flush_lambda, allocate_gpu_memory);
|
||||
|
||||
auto active_streams_flush =
|
||||
streams.active_gpu_subset(total_flush_ops, params.pbs_type);
|
||||
this->flush_lut->broadcast_lut(active_streams_flush);
|
||||
this->flush_lut->generate_and_broadcast_lut(
|
||||
active_streams_flush, {0}, {flush_lambda}, allocate_gpu_memory);
|
||||
this->flush_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
|
||||
}
|
||||
|
||||
|
||||
@@ -174,40 +174,6 @@ template <typename Torus> struct zk_expand_mem {
|
||||
message_and_carry_extract_luts = new int_radix_lut<Torus>(
|
||||
streams, params, 4, 2 * num_lwes, allocate_gpu_memory, size_tracker);
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
message_and_carry_extract_luts->get_lut(0, 0),
|
||||
message_and_carry_extract_luts->get_degree(0),
|
||||
message_and_carry_extract_luts->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, message_extract_lut_f, gpu_memory_allocated);
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
message_and_carry_extract_luts->get_lut(0, 1),
|
||||
message_and_carry_extract_luts->get_degree(1),
|
||||
message_and_carry_extract_luts->get_max_degree(1),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, carry_extract_lut_f, gpu_memory_allocated);
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
message_and_carry_extract_luts->get_lut(0, 2),
|
||||
message_and_carry_extract_luts->get_degree(2),
|
||||
message_and_carry_extract_luts->get_max_degree(2),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, message_extract_and_sanitize_bool_lut_f,
|
||||
gpu_memory_allocated);
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
message_and_carry_extract_luts->get_lut(0, 3),
|
||||
message_and_carry_extract_luts->get_degree(3),
|
||||
message_and_carry_extract_luts->get_max_degree(3),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, carry_extract_and_sanitize_bool_lut_f,
|
||||
gpu_memory_allocated);
|
||||
|
||||
// We are always packing two LWEs. We just need to be sure we have enough
|
||||
// space in the carry part to store a message of the same size as is in the
|
||||
// message part.
|
||||
@@ -292,7 +258,13 @@ template <typename Torus> struct zk_expand_mem {
|
||||
|
||||
auto active_streams =
|
||||
streams.active_gpu_subset(2 * num_lwes, params.pbs_type);
|
||||
message_and_carry_extract_luts->broadcast_lut(active_streams);
|
||||
|
||||
message_and_carry_extract_luts->generate_and_broadcast_lut(
|
||||
active_streams, {0, 1, 2, 3},
|
||||
{message_extract_lut_f, carry_extract_lut_f,
|
||||
message_extract_and_sanitize_bool_lut_f,
|
||||
carry_extract_and_sanitize_bool_lut_f},
|
||||
gpu_memory_allocated);
|
||||
|
||||
message_and_carry_extract_luts->allocate_lwe_vector_for_non_trivial_indexes(
|
||||
active_streams, 2 * num_lwes, size_tracker, allocate_gpu_memory);
|
||||
|
||||
@@ -1067,6 +1067,85 @@ void generate_device_accumulator_bivariate(
|
||||
POP_RANGE()
|
||||
}
|
||||
|
||||
template <typename Torus> struct int_lut_cache {
|
||||
int_lut_cache() {}
|
||||
|
||||
Torus *get_cached_univariate_lut(std::function<Torus(Torus)> &f, uint64_t *degree,
|
||||
uint64_t *max_degree, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t input_message_modulus,
|
||||
uint32_t input_carry_modulus,
|
||||
uint32_t output_message_modulus,
|
||||
uint32_t output_carry_modulus) {
|
||||
/*__int128_t f_hash = 0;
|
||||
uint32_t bits_per_lut_val = 5;
|
||||
uint32_t input_modulus_sup = input_message_modulus * input_carry_modulus;
|
||||
for (uint32_t i = 0; i < input_modulus_sup; ++i) {
|
||||
Torus f_eval = f(i);
|
||||
GPU_ASSERT(f_eval < (1 << bits_per_lut_val),
|
||||
"LUT value expected bitwidth overflow");
|
||||
f_hash |= f_eval;
|
||||
f_hash <<= bits_per_lut_val;
|
||||
}
|
||||
|
||||
std::lock_guard cache_lock(_mutex);
|
||||
if (_lut_cache.find(f_hash) != _lut_cache.end()) {
|
||||
lut_ptr &ptr = _lut_cache[f_hash];
|
||||
GPU_ASSERT(ptr.output_message_modulus == output_message_modulus,
|
||||
"Error modulus");
|
||||
GPU_ASSERT(ptr.input_message_modulus == input_message_modulus,
|
||||
"Error modulus");
|
||||
GPU_ASSERT(ptr.glwe_dimension == glwe_dimension, "Error modulus");
|
||||
*max_degree = ptr.max_degree;
|
||||
*degree = ptr.degree;
|
||||
return ptr.ptr;
|
||||
}*/
|
||||
|
||||
// host lut
|
||||
Torus *h_lut =
|
||||
(Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
|
||||
|
||||
*max_degree = input_message_modulus * input_carry_modulus - 1;
|
||||
*degree = generate_lookup_table_with_encoding<Torus>(
|
||||
h_lut, glwe_dimension, polynomial_size, input_message_modulus,
|
||||
input_carry_modulus, output_message_modulus, output_carry_modulus, f);
|
||||
|
||||
/*lut_ptr new_ptr = {h_lut,
|
||||
glwe_dimension,
|
||||
input_message_modulus,
|
||||
input_carry_modulus,
|
||||
output_message_modulus,
|
||||
output_carry_modulus,
|
||||
*max_degree,
|
||||
*degree};*/
|
||||
//_lut_cache[f_hash] = new_ptr;
|
||||
return h_lut;
|
||||
}
|
||||
|
||||
~int_lut_cache() {
|
||||
std::lock_guard cache_lock(_mutex);
|
||||
for (auto v : _lut_cache) {
|
||||
free(v.second.ptr);
|
||||
}
|
||||
_lut_cache.clear();
|
||||
}
|
||||
|
||||
private:
|
||||
struct lut_ptr {
|
||||
Torus *ptr;
|
||||
uint32_t glwe_dimension;
|
||||
uint32_t input_message_modulus;
|
||||
uint32_t input_carry_modulus;
|
||||
uint32_t output_message_modulus;
|
||||
uint32_t output_carry_modulus;
|
||||
uint64_t max_degree;
|
||||
uint64_t degree;
|
||||
};
|
||||
std::map<__int128_t, lut_ptr> _lut_cache;
|
||||
std::mutex _mutex;
|
||||
};
|
||||
// Cache instance used for lookup tables whose Torus type is uint64_t.
// NOTE(review): `static` gives this object internal linkage; if this file is a
// header included from several translation units, each one gets its own
// separate cache -- confirm that is intended.
static int_lut_cache<uint64_t> g_LutCache64;
|
||||
|
||||
/*
|
||||
* generate bivariate accumulator with factor scaling for device pointer
|
||||
* v_stream - cuda stream
|
||||
@@ -1098,8 +1177,8 @@ void generate_device_accumulator_bivariate_with_factor(
|
||||
(glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream, gpu_index,
|
||||
gpu_memory_allocated);
|
||||
|
||||
cuda_synchronize_stream(stream, gpu_index);
|
||||
free(h_lut);
|
||||
// cuda_synchronize_stream(stream, gpu_index);
|
||||
// free(h_lut);
|
||||
}
|
||||
/*
|
||||
* generate bivariate accumulator for device pointer
|
||||
@@ -1145,23 +1224,36 @@ void generate_device_accumulator_with_encoding(
|
||||
uint32_t output_message_modulus, uint32_t output_carry_modulus,
|
||||
std::function<Torus(Torus)> f, bool gpu_memory_allocated) {
|
||||
|
||||
static constexpr auto is_u64 = std::is_same_v<Torus, uint64_t>;
|
||||
Torus *h_lut = nullptr;
|
||||
// host lut
|
||||
Torus *h_lut =
|
||||
(Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
|
||||
|
||||
*max_degree = input_message_modulus * input_carry_modulus - 1;
|
||||
// fill accumulator
|
||||
*degree = generate_lookup_table_with_encoding<Torus>(
|
||||
h_lut, glwe_dimension, polynomial_size, input_message_modulus,
|
||||
input_carry_modulus, output_message_modulus, output_carry_modulus, f);
|
||||
if constexpr (is_u64) {
|
||||
h_lut = g_LutCache64.get_cached_univariate_lut(
|
||||
f, degree, max_degree, glwe_dimension, polynomial_size,
|
||||
input_message_modulus, input_carry_modulus, output_message_modulus,
|
||||
output_carry_modulus);
|
||||
} else {
|
||||
h_lut =
|
||||
(Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
|
||||
|
||||
*max_degree = input_message_modulus * input_carry_modulus - 1;
|
||||
// fill accumulator
|
||||
*degree = generate_lookup_table_with_encoding<Torus>(
|
||||
h_lut, glwe_dimension, polynomial_size, input_message_modulus,
|
||||
input_carry_modulus, output_message_modulus, output_carry_modulus, f);
|
||||
}
|
||||
/*
|
||||
// copy host lut and lut_indexes_vec to device
|
||||
cuda_memcpy_with_size_tracking_async_to_gpu(
|
||||
acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
|
||||
stream, gpu_index, gpu_memory_allocated);
|
||||
cuda_synchronize_stream(stream, gpu_index);
|
||||
free(h_lut);
|
||||
*/
|
||||
if (!std::is_same_v<Torus, uint64_t>) {
|
||||
cuda_synchronize_stream(stream, gpu_index);
|
||||
free(h_lut);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
void generate_device_accumulator_with_encoding_with_cpu_prealloc(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus *acc, uint64_t *degree,
|
||||
@@ -1264,8 +1356,8 @@ void generate_many_lut_device_accumulator(
|
||||
acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
|
||||
stream, gpu_index, gpu_memory_allocated);
|
||||
|
||||
cuda_synchronize_stream(stream, gpu_index);
|
||||
free(h_lut);
|
||||
//cuda_synchronize_stream(stream, gpu_index);
|
||||
//free(h_lut);
|
||||
POP_RANGE()
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user