Compare commits

...

15 Commits

Author SHA1 Message Date
Andrei Stoian
b4ea48165b fix(gpu): disable cache 2026-01-23 15:50:37 +01:00
Andrei Stoian
0a6b62627d fix(gpu): remove broadcast 2026-01-23 13:00:44 +01:00
Andrei Stoian
6deeb66bf8 fix(gpu): test remove sync on lut create 2026-01-23 11:27:47 +01:00
Andrei Stoian
17022dae69 feat(gpu): lut cache univariate 2026-01-22 16:56:14 +01:00
Andrei Stoian
09802dd5ee feat(gpu): lut cache 2026-01-22 10:04:39 +01:00
Andrei Stoian
e3fe433a35 fix(gpu): univariate fix 2026-01-21 17:24:16 +01:00
Andrei Stoian
2bea35a3b5 fix(gpu): finish bivariate 2026-01-21 16:13:21 +01:00
Andrei Stoian
e2bf226276 fix(gpu): start bivariate, fix all univariate 2026-01-21 15:24:51 +01:00
Andrei Stoian
c66f1c6d8b fix(gpu): all univariate luts 2026-01-21 12:06:21 +01:00
Andrei Stoian
9bfe190ad3 fix(gpu): sc prop fix 2026-01-21 11:48:21 +01:00
Andrei Stoian
e40070db0e fix(gpu): sc prop encapsulate lut 2026-01-21 10:35:21 +01:00
Andrei Stoian
e8d5ceac68 fix(gpu): more lut encaps 2026-01-20 15:43:24 +01:00
Andrei Stoian
f1526b29d8 fix(gpu): more lut 2026-01-19 17:55:45 +01:00
Andrei Stoian
602e0c5a19 fix(gpu): more lut encaps 2026-01-19 15:13:07 +01:00
Andrei Stoian
163c1eeffb chore(gpu): refactor lut generation 2026-01-16 16:02:17 +01:00
16 changed files with 531 additions and 732 deletions

View File

@@ -29,15 +29,13 @@ template <typename Torus> struct int_aes_lut_buffers {
allocate_gpu_memory, size_tracker);
std::function<Torus(Torus, Torus)> and_lambda =
[](Torus a, Torus b) -> Torus { return a & b; };
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), this->and_lut->get_lut(0, 0),
this->and_lut->get_degree(0), this->and_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, and_lambda, allocate_gpu_memory);
auto active_streams_and_lut = streams.active_gpu_subset(
SBOX_MAX_AND_GATES * num_aes_inputs * sbox_parallelism,
params.pbs_type);
this->and_lut->broadcast_lut(active_streams_and_lut);
this->and_lut->generate_and_broadcast_bivariate_lut(
active_streams_and_lut, {0}, {and_lambda}, allocate_gpu_memory);
this->and_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
this->flush_lut = new int_radix_lut<Torus>(
@@ -46,14 +44,11 @@ template <typename Torus> struct int_aes_lut_buffers {
std::function<Torus(Torus)> flush_lambda = [](Torus x) -> Torus {
return x & 1;
};
generate_device_accumulator(
streams.stream(0), streams.gpu_index(0), this->flush_lut->get_lut(0, 0),
this->flush_lut->get_degree(0), this->flush_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, flush_lambda, allocate_gpu_memory);
auto active_streams_flush_lut = streams.active_gpu_subset(
AES_STATE_BITS * num_aes_inputs, params.pbs_type);
this->flush_lut->broadcast_lut(active_streams_flush_lut);
this->flush_lut->generate_and_broadcast_lut(
active_streams_flush_lut, {0}, {flush_lambda}, allocate_gpu_memory);
this->flush_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
this->carry_lut = new int_radix_lut<Torus>(
@@ -61,14 +56,11 @@ template <typename Torus> struct int_aes_lut_buffers {
std::function<Torus(Torus)> carry_lambda = [](Torus x) -> Torus {
return (x >> 1) & 1;
};
generate_device_accumulator(
streams.stream(0), streams.gpu_index(0), this->carry_lut->get_lut(0, 0),
this->carry_lut->get_degree(0), this->carry_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, carry_lambda, allocate_gpu_memory);
auto active_streams_carry_lut =
streams.active_gpu_subset(num_aes_inputs, params.pbs_type);
this->carry_lut->broadcast_lut(active_streams_carry_lut);
this->carry_lut->generate_and_broadcast_lut(
active_streams_carry_lut, {0}, {carry_lambda}, allocate_gpu_memory);
this->carry_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
}

View File

@@ -65,14 +65,8 @@ template <typename Torus> struct boolean_bitop_buffer {
return x % params.message_modulus;
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
message_extract_lut->get_lut(0, 0),
message_extract_lut->get_degree(0),
message_extract_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f_message_extract, gpu_memory_allocated);
message_extract_lut->broadcast_lut(active_streams);
message_extract_lut->generate_and_broadcast_lut(
active_streams, {0}, {lut_f_message_extract}, gpu_memory_allocated);
}
tmp_lwe_left = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
@@ -142,12 +136,8 @@ template <typename Torus> struct int_bitop_buffer {
}
};
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
lut->get_degree(0), lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_bivariate_f, gpu_memory_allocated);
lut->broadcast_lut(active_streams);
lut->generate_and_broadcast_bivariate_lut(
active_streams, {0}, {lut_bivariate_f}, gpu_memory_allocated);
}
break;
default:
@@ -156,6 +146,8 @@ template <typename Torus> struct int_bitop_buffer {
num_radix_blocks, allocate_gpu_memory,
size_tracker);
std::vector<std::function<Torus(Torus)>> lut_funcs;
std::vector<uint32_t> lut_indices;
for (int i = 0; i < params.message_modulus; i++) {
auto rhs = i;
@@ -171,14 +163,13 @@ template <typename Torus> struct int_bitop_buffer {
return x ^ rhs;
}
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, i),
lut->get_degree(i), lut->get_max_degree(i), params.glwe_dimension,
params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_univariate_scalar_f,
gpu_memory_allocated);
lut->broadcast_lut(active_streams);
lut_funcs.push_back(lut_univariate_scalar_f);
lut_indices.push_back(i);
}
lut->generate_and_broadcast_lut(active_streams, lut_indices, lut_funcs,
gpu_memory_allocated);
}
}
@@ -211,16 +202,11 @@ template <typename Torus> struct boolean_bitnot_buffer {
return x % message_modulus;
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
message_extract_lut->get_lut(0, 0),
message_extract_lut->get_degree(0),
message_extract_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f_message_extract, gpu_memory_allocated);
auto active_streams =
streams.active_gpu_subset(lwe_ciphertext_count, params.pbs_type);
message_extract_lut->broadcast_lut(active_streams);
message_extract_lut->generate_and_broadcast_lut(
active_streams, {0}, {lut_f_message_extract}, gpu_memory_allocated);
}
}

View File

@@ -28,21 +28,17 @@ template <typename Torus> struct int_extend_radix_with_sign_msb_buffer {
uint32_t bits_per_block = std::log2(params.message_modulus);
uint32_t msg_modulus = params.message_modulus;
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
lut->get_degree(0), lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
[msg_modulus, bits_per_block](Torus x) {
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
lut->generate_and_broadcast_lut(
active_streams, {0}, {[msg_modulus, bits_per_block](Torus x) {
const auto xm = x % msg_modulus;
const auto sign_bit = (xm >> (bits_per_block - 1)) & 1;
return (Torus)((msg_modulus - 1) * sign_bit);
},
}},
allocate_gpu_memory);
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
lut->broadcast_lut(active_streams);
this->last_block = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(

View File

@@ -85,24 +85,6 @@ template <typename Torus> struct int_cmux_buffer {
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), predicate_lut->get_lut(0, 0),
predicate_lut->get_degree(0), predicate_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, inverted_lut_f, gpu_memory_allocated);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), predicate_lut->get_lut(0, 1),
predicate_lut->get_degree(1), predicate_lut->get_max_degree(1),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_f, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
message_extract_lut->get_lut(0, 0), message_extract_lut->get_degree(0),
message_extract_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
message_extract_lut_f, gpu_memory_allocated);
Torus *h_lut_indexes = predicate_lut->h_lut_indexes;
for (int index = 0; index < 2 * num_radix_blocks; index++) {
if (index < num_radix_blocks) {
@@ -115,12 +97,18 @@ template <typename Torus> struct int_cmux_buffer {
predicate_lut->get_lut_indexes(0, 0), h_lut_indexes,
2 * num_radix_blocks * sizeof(Torus), streams.stream(0),
streams.gpu_index(0), allocate_gpu_memory);
auto active_streams_pred =
streams.active_gpu_subset(2 * num_radix_blocks, params.pbs_type);
predicate_lut->broadcast_lut(active_streams_pred);
predicate_lut->generate_and_broadcast_bivariate_lut(
active_streams_pred, {0, 1}, {inverted_lut_f, lut_f},
gpu_memory_allocated);
auto active_streams_msg =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
message_extract_lut->broadcast_lut(active_streams_msg);
message_extract_lut->generate_and_broadcast_lut(
active_streams_msg, {0}, {message_extract_lut_f}, gpu_memory_allocated);
}
void release(CudaStreams streams) {

View File

@@ -39,22 +39,21 @@ template <typename Torus> struct int_are_all_block_true_buffer {
max_chunks, params.big_lwe_dimension, size_tracker,
allocate_gpu_memory);
is_max_value = new int_radix_lut<Torus>(streams, params, 2, max_chunks,
allocate_gpu_memory, size_tracker);
auto is_max_value_f = [max_value](Torus x) -> Torus {
return x == max_value;
};
preallocated_h_lut = (Torus *)malloc(
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus));
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), is_max_value->get_lut(0, 0),
is_max_value->get_degree(0), is_max_value->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, is_max_value_f, gpu_memory_allocated);
is_max_value = new int_radix_lut<Torus>(streams, params, 2, max_chunks,
allocate_gpu_memory, size_tracker);
auto active_streams =
streams.active_gpu_subset(max_chunks, params.pbs_type);
is_max_value->broadcast_lut(active_streams);
auto is_max_value_f = [max_value](Torus x) -> Torus {
return x == max_value;
};
is_max_value->generate_and_broadcast_lut(
active_streams, {0}, {is_max_value_f}, gpu_memory_allocated);
}
void release(CudaStreams streams) {
@@ -103,15 +102,10 @@ template <typename Torus> struct int_comparison_eq_buffer {
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), is_non_zero_lut->get_lut(0, 0),
is_non_zero_lut->get_degree(0), is_non_zero_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, is_non_zero_lut_f, gpu_memory_allocated);
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
is_non_zero_lut->broadcast_lut(active_streams);
is_non_zero_lut->generate_and_broadcast_lut(
active_streams, {0}, {is_non_zero_lut_f}, gpu_memory_allocated);
// Scalar may have up to num_radix_blocks blocks
scalar_comparison_luts = new int_radix_lut<Torus>(
@@ -129,32 +123,28 @@ template <typename Torus> struct int_comparison_eq_buffer {
return (lhs == rhs);
}
};
std::vector<std::function<Torus(Torus)>> lut_funcs;
std::vector<uint32_t> lut_indices;
for (int i = 0; i < total_modulus; i++) {
auto lut_f = [i, operator_f](Torus x) -> Torus {
return operator_f(i, x);
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
scalar_comparison_luts->get_lut(0, i),
scalar_comparison_luts->get_degree(i),
scalar_comparison_luts->get_max_degree(i), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f, gpu_memory_allocated);
lut_funcs.push_back(lut_f);
lut_indices.push_back(i);
}
scalar_comparison_luts->broadcast_lut(active_streams);
scalar_comparison_luts->generate_and_broadcast_lut(
active_streams, lut_indices, lut_funcs, gpu_memory_allocated);
if (op == COMPARISON_TYPE::EQ || op == COMPARISON_TYPE::NE) {
operator_lut =
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), operator_lut->get_lut(0, 0),
operator_lut->get_degree(0), operator_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, operator_f, gpu_memory_allocated);
operator_lut->broadcast_lut(active_streams);
operator_lut->generate_and_broadcast_bivariate_lut(
active_streams, {0}, {operator_f}, gpu_memory_allocated);
// operator_lut->broadcast_lut(active_streams);
} else {
operator_lut = nullptr;
}
@@ -221,9 +211,6 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
streams.stream(0), streams.gpu_index(0), tmp_y, num_radix_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
// LUTs
tree_inner_leaf_lut =
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
tree_last_leaf_lut = new int_radix_lut<Torus>(
streams, params, 1, 1, allocate_gpu_memory, size_tracker);
@@ -234,15 +221,14 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
tree_last_leaf_scalar_lut = new int_radix_lut<Torus>(
streams, params, 1, 1, allocate_gpu_memory, size_tracker);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
tree_inner_leaf_lut->get_lut(0, 0), tree_inner_leaf_lut->get_degree(0),
tree_inner_leaf_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
block_selector_f, gpu_memory_allocated);
tree_inner_leaf_lut =
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
tree_inner_leaf_lut->broadcast_lut(active_streams);
tree_inner_leaf_lut->generate_and_broadcast_bivariate_lut(
active_streams, {0}, {block_selector_f}, allocate_gpu_memory);
}
void release(CudaStreams streams) {
@@ -426,12 +412,8 @@ template <typename Torus> struct int_comparison_buffer {
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), identity_lut->get_lut(0, 0),
identity_lut->get_degree(0), identity_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, identity_lut_f, gpu_memory_allocated);
identity_lut->broadcast_lut(active_streams);
identity_lut->generate_and_broadcast_lut(
active_streams, {0}, {identity_lut_f}, gpu_memory_allocated);
uint32_t total_modulus = params.message_modulus * params.carry_modulus;
auto is_zero_f = [total_modulus](Torus x) -> Torus {
@@ -441,13 +423,8 @@ template <typename Torus> struct int_comparison_buffer {
is_zero_lut = new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), is_zero_lut->get_lut(0, 0),
is_zero_lut->get_degree(0), is_zero_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, is_zero_f, gpu_memory_allocated);
is_zero_lut->broadcast_lut(active_streams);
is_zero_lut->generate_and_broadcast_lut(active_streams, {0}, {is_zero_f},
gpu_memory_allocated);
switch (op) {
case COMPARISON_TYPE::MAX:
@@ -522,13 +499,9 @@ template <typename Torus> struct int_comparison_buffer {
PANIC("Cuda error: sign_lut creation failed due to wrong function.")
};
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), signed_lut->get_lut(0, 0),
signed_lut->get_degree(0), signed_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, signed_lut_f, gpu_memory_allocated);
auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
signed_lut->broadcast_lut(active_streams);
signed_lut->generate_and_broadcast_bivariate_lut(
active_streams, {0}, {signed_lut_f}, gpu_memory_allocated);
}
preallocated_h_lut = (Torus *)malloc(
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus));

View File

@@ -283,12 +283,9 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
zero_out_if_not_1_lut_2};
size_t lut_gpu_indexes[2] = {0, 3};
for (int j = 0; j < 2; j++) {
generate_device_accumulator<Torus>(
streams.stream(lut_gpu_indexes[j]),
streams.gpu_index(lut_gpu_indexes[j]), luts[j]->get_lut(0, 0),
luts[j]->get_degree(0), luts[j]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, zero_out_if_not_1_lut_f, gpu_memory_allocated);
luts[j]->generate_and_broadcast_lut(streams.get_ith(lut_gpu_indexes[j]),
{0}, {zero_out_if_not_1_lut_f},
gpu_memory_allocated);
}
luts[0] = zero_out_if_not_2_lut_1;
@@ -296,12 +293,9 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
lut_gpu_indexes[0] = 1;
lut_gpu_indexes[1] = 2;
for (int j = 0; j < 2; j++) {
generate_device_accumulator<Torus>(
streams.stream(lut_gpu_indexes[j]),
streams.gpu_index(lut_gpu_indexes[j]), luts[j]->get_lut(0, 0),
luts[j]->get_degree(0), luts[j]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, zero_out_if_not_2_lut_f, gpu_memory_allocated);
luts[j]->generate_and_broadcast_lut(streams.get_ith(lut_gpu_indexes[j]),
{0}, {zero_out_if_not_2_lut_f},
gpu_memory_allocated);
}
quotient_lut_1 =
@@ -321,21 +315,12 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
};
auto quotient_lut_3_f = [](Torus cond) -> Torus { return cond * 3; };
generate_device_accumulator<Torus>(
streams.stream(2), streams.gpu_index(2), quotient_lut_1->get_lut(0, 0),
quotient_lut_1->get_degree(0), quotient_lut_1->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, quotient_lut_1_f, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(1), streams.gpu_index(1), quotient_lut_2->get_lut(0, 0),
quotient_lut_2->get_degree(0), quotient_lut_2->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, quotient_lut_2_f, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), quotient_lut_3->get_lut(0, 0),
quotient_lut_3->get_degree(0), quotient_lut_3->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, quotient_lut_3_f, gpu_memory_allocated);
quotient_lut_1->generate_and_broadcast_lut(
streams.get_ith(2), {0}, {quotient_lut_1_f}, gpu_memory_allocated);
quotient_lut_2->generate_and_broadcast_lut(
streams.get_ith(1), {0}, {quotient_lut_2_f}, gpu_memory_allocated);
quotient_lut_3->generate_and_broadcast_lut(
streams.get_ith(0), {0}, {quotient_lut_3_f}, gpu_memory_allocated);
message_extract_lut_1 = new int_radix_lut<Torus>(
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
@@ -350,15 +335,12 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
luts[0] = message_extract_lut_1;
luts[1] = message_extract_lut_2;
auto active_streams =
streams.active_gpu_subset(num_blocks, params.pbs_type);
for (int j = 0; j < 2; j++) {
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), luts[j]->get_lut(0, 0),
luts[j]->get_degree(0), luts[j]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_f_message_extract, gpu_memory_allocated);
auto active_streams =
streams.active_gpu_subset(num_blocks, params.pbs_type);
luts[j]->broadcast_lut(active_streams);
luts[j]->generate_and_broadcast_lut(
active_streams, {0}, {lut_f_message_extract}, gpu_memory_allocated);
}
}
@@ -1007,24 +989,14 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
masking_luts_2[i] = new int_radix_lut<Torus>(
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
masking_luts_1[i]->get_lut(0, 0), masking_luts_1[i]->get_degree(0),
masking_luts_1[i]->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f_masking, gpu_memory_allocated);
auto active_streams_1 = streams.active_gpu_subset(1, params.pbs_type);
masking_luts_1[i]->broadcast_lut(active_streams_1);
masking_luts_1[i]->generate_and_broadcast_lut(
active_streams_1, {0}, {lut_f_masking}, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
masking_luts_2[i]->get_lut(0, 0), masking_luts_2[i]->get_degree(0),
masking_luts_2[i]->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f_masking, gpu_memory_allocated);
auto active_streams_2 =
streams.active_gpu_subset(num_blocks, params.pbs_type);
masking_luts_2[i]->broadcast_lut(active_streams_2);
masking_luts_2[i]->generate_and_broadcast_lut(
active_streams_2, {0}, {lut_f_masking}, gpu_memory_allocated);
}
// create and generate message_extract_lut_1 and message_extract_lut_2
@@ -1042,15 +1014,12 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
int_radix_lut<Torus> *luts[2] = {message_extract_lut_1,
message_extract_lut_2};
auto active_streams =
streams.active_gpu_subset(num_blocks, params.pbs_type);
for (int j = 0; j < 2; j++) {
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), luts[j]->get_lut(0, 0),
luts[j]->get_degree(0), luts[j]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_f_message_extract, gpu_memory_allocated);
luts[j]->broadcast_lut(active_streams);
luts[j]->generate_and_broadcast_lut(
active_streams, {0}, {lut_f_message_extract}, gpu_memory_allocated);
}
// Give name to closures to improve readability
@@ -1141,14 +1110,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
merge_overflow_flags_luts[i] = new int_radix_lut<Torus>(
streams, params, 1, 1, allocate_gpu_memory, size_tracker);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
merge_overflow_flags_luts[i]->get_lut(0, 0),
merge_overflow_flags_luts[i]->get_degree(0),
merge_overflow_flags_luts[i]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_f_bit, gpu_memory_allocated);
merge_overflow_flags_luts[i]->broadcast_lut(active_gpu_count_for_bits);
merge_overflow_flags_luts[i]->generate_and_broadcast_bivariate_lut(
active_gpu_count_for_bits, {0}, {lut_f_bit}, gpu_memory_allocated);
}
}
@@ -1557,16 +1520,12 @@ template <typename Torus> struct int_div_rem_memory {
compare_signed_bits_lut = new int_radix_lut<Torus>(
streams, params, 1, 1, allocate_gpu_memory, size_tracker);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
compare_signed_bits_lut->get_lut(0, 0),
compare_signed_bits_lut->get_degree(0),
compare_signed_bits_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
f_compare_extracted_signed_bits, gpu_memory_allocated);
auto active_gpu_count_cmp =
streams.active_gpu_subset(1, params.pbs_type); // only 1 block needed
compare_signed_bits_lut->broadcast_lut(active_gpu_count_cmp);
compare_signed_bits_lut->generate_and_broadcast_bivariate_lut(
active_gpu_count_cmp, {0}, {f_compare_extracted_signed_bits},
gpu_memory_allocated);
}
}

View File

@@ -53,13 +53,8 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
return count;
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), univ_lut_mem->get_lut(0, 0),
univ_lut_mem->get_degree(0), univ_lut_mem->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, generate_uni_lut_lambda, allocate_gpu_memory);
univ_lut_mem->broadcast_lut(active_streams);
univ_lut_mem->generate_and_broadcast_lut(
active_streams, {0}, {generate_uni_lut_lambda}, allocate_gpu_memory);
auto generate_bi_lut_lambda =
[num_bits](Torus block_num_bit_count,
@@ -70,13 +65,8 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
return 0;
};
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), biv_lut_mem->get_lut(0, 0),
biv_lut_mem->get_degree(0), biv_lut_mem->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, generate_bi_lut_lambda, allocate_gpu_memory);
biv_lut_mem->broadcast_lut(active_streams);
biv_lut_mem->generate_and_broadcast_bivariate_lut(
active_streams, {0}, {generate_bi_lut_lambda}, allocate_gpu_memory);
this->tmp_ct = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
@@ -232,7 +222,7 @@ template <typename Torus> struct int_ilog2_buffer {
this->sum_output_not_propagated, counter_num_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
this->lut_message_not =
lut_message_not =
new int_radix_lut<Torus>(streams, params, 1, counter_num_blocks,
allocate_gpu_memory, size_tracker);
std::function<Torus(Torus)> lut_message_lambda =
@@ -240,16 +230,11 @@ template <typename Torus> struct int_ilog2_buffer {
uint64_t message = x % this->params.message_modulus;
return (~message) % this->params.message_modulus;
};
generate_device_accumulator(streams.stream(0), streams.gpu_index(0),
this->lut_message_not->get_lut(0, 0),
this->lut_message_not->get_degree(0),
this->lut_message_not->get_max_degree(0),
params.glwe_dimension, params.polynomial_size,
params.message_modulus, params.carry_modulus,
lut_message_lambda, allocate_gpu_memory);
auto active_streams =
streams.active_gpu_subset(counter_num_blocks, params.pbs_type);
lut_message_not->broadcast_lut(active_streams);
lut_message_not->generate_and_broadcast_lut(
active_streams, {0}, {lut_message_lambda}, allocate_gpu_memory);
this->lut_carry_not =
new int_radix_lut<Torus>(streams, params, 1, counter_num_blocks,
@@ -259,13 +244,8 @@ template <typename Torus> struct int_ilog2_buffer {
uint64_t carry = x / this->params.message_modulus;
return (~carry) % this->params.message_modulus;
};
generate_device_accumulator(
streams.stream(0), streams.gpu_index(0),
this->lut_carry_not->get_lut(0, 0), this->lut_carry_not->get_degree(0),
this->lut_carry_not->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_carry_lambda, allocate_gpu_memory);
lut_carry_not->broadcast_lut(active_streams);
lut_carry_not->generate_and_broadcast_lut(
active_streams, {0}, {lut_carry_lambda}, allocate_gpu_memory);
this->message_blocks_not = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(

View File

@@ -9,6 +9,7 @@
#include "utils/helper_multi_gpu.cuh"
#include <cmath>
#include <functional>
#include <map>
#include <queue>
#include <stdio.h>
@@ -835,6 +836,56 @@ struct int_radix_lut_custom_input_output {
}
}
void generate_and_broadcast_lut(
const CudaStreams &streams, std::vector<uint32_t> lut_indexes,
std::vector<std::function<OutputTorus(OutputTorus)>> f,
bool gpu_memory_allocated) {
// streams should be a subset of active_streams
for (uint32_t i = 0; i < lut_indexes.size(); ++i) {
generate_device_accumulator<OutputTorus>(
streams.stream(0), streams.gpu_index(0), get_lut(0, lut_indexes[i]),
get_degree(lut_indexes[i]), get_max_degree(lut_indexes[i]),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, f[i], gpu_memory_allocated);
}
//broadcast_lut(streams);
}
void generate_and_broadcast_bivariate_lut(
const CudaStreams &streams, std::vector<uint32_t> lut_indexes,
std::vector<std::function<OutputTorus(OutputTorus, OutputTorus)>> f,
bool gpu_memory_allocated) {
// streams should be a subset of active_streams
/* for (int fidx = 0; fidx < f.size(); ++fidx) {
__int128_t f_hash = 0;
uint32_t bits_per_lut_val = 5;
uint32_t input_modulus_sup =
params.message_modulus * params.carry_modulus;
for (uint32_t i = 0; i < input_modulus_sup; ++i) {
OutputTorus f_eval =
f[fidx](i / params.message_modulus, i % params.message_modulus);
GPU_ASSERT(f_eval < (1 << bits_per_lut_val),
"LUT value expected bitwidth overflow");
f_hash |= f_eval;
f_hash <<= bits_per_lut_val;
}
printf("%016llX%016llX\n",
(unsigned long long)((f_hash >> 64) & 0xFFFFFFFFFFFFFFFF),
(unsigned long long)(f_hash & 0xFFFFFFFFFFFFFFFF));
}
*/
for (uint32_t i = 0; i < lut_indexes.size(); ++i) {
generate_device_accumulator_bivariate<InputTorus>(
streams.stream(0), streams.gpu_index(0), get_lut(0, lut_indexes[i]),
get_degree(lut_indexes[i]), get_max_degree(lut_indexes[i]),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, f[i], gpu_memory_allocated);
}
//broadcast_lut(streams);
}
void release(CudaStreams streams) {
PANIC_IF_FALSE(lut_indexes_vec.size() == lut_vec.size(),
"Lut vec and Lut vec indexes must have the same size");
@@ -985,18 +1036,15 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
bits_per_block * num_radix_blocks,
allocate_gpu_memory, size_tracker);
std::vector<std::function<Torus(Torus)>> lut_funs;
std::vector<uint32_t> lut_indices;
for (int i = 0; i < bits_per_block; i++) {
auto operator_f = [i, final_offset](Torus x) -> Torus {
Torus y = (x >> i) & 1;
return y << final_offset;
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, i),
lut->get_degree(i), lut->get_max_degree(i), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
operator_f, gpu_memory_allocated);
lut_funs.push_back(operator_f);
lut_indices.push_back(i);
}
/**
@@ -1015,7 +1063,10 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
auto active_streams = streams.active_gpu_subset(
bits_per_block * num_radix_blocks, params.pbs_type);
lut->broadcast_lut(active_streams);
lut->generate_and_broadcast_lut(active_streams, lut_indices, lut_funs,
gpu_memory_allocated);
// lut->broadcast_lut(active_streams);
/**
* the input indexes should take the first bits_per_block PBS to target
@@ -1091,24 +1142,6 @@ template <typename Torus> struct int_fullprop_buffer {
};
//
Torus *lut_buffer_message = lut->get_lut(0, 0);
uint64_t *message_degree = lut->get_degree(0);
uint64_t *message_max_degree = lut->get_max_degree(0);
Torus *lut_buffer_carry = lut->get_lut(0, 1);
uint64_t *carry_degree = lut->get_degree(1);
uint64_t *carry_max_degree = lut->get_max_degree(1);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), lut_buffer_message,
message_degree, message_max_degree, params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f_message, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), lut_buffer_carry, carry_degree,
carry_max_degree, params.glwe_dimension, params.polynomial_size,
params.message_modulus, params.carry_modulus, lut_f_carry,
gpu_memory_allocated);
uint64_t lwe_indexes_size = 2 * sizeof(Torus);
Torus *h_lwe_indexes = (Torus *)malloc(lwe_indexes_size);
@@ -1118,9 +1151,15 @@ template <typename Torus> struct int_fullprop_buffer {
cuda_memcpy_with_size_tracking_async_to_gpu(
lwe_indexes, h_lwe_indexes, lwe_indexes_size, streams.stream(0),
streams.gpu_index(0), allocate_gpu_memory);
//
// No broadcast is needed because full prop is done on 1 single GPU.
// By passing a single-GPU CudaStreams with streams.get_ith(0) the LUT is
// not broadcast.
//
lut->generate_and_broadcast_lut(streams.get_ith(0), {0, 1},
{lut_f_message, lut_f_carry},
gpu_memory_allocated);
tmp_small_lwe_vector = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
@@ -1238,9 +1277,10 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
if (total_ciphertexts > 0 ||
reduce_degrees_for_single_carry_propagation) {
uint64_t size_tracker = 0;
allocated_luts_message_carry = true;
luts_message_carry = new int_radix_lut<Torus>(
streams, params, 2, pbs_count, true, size_tracker);
allocated_luts_message_carry = true;
uint64_t message_modulus_bits =
(uint64_t)std::log2(params.message_modulus);
uint64_t carry_modulus_bits = (uint64_t)std::log2(params.carry_modulus);
@@ -1256,7 +1296,9 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
streams, upper_bound_num_blocks, size_tracker, true);
}
}
if (allocated_luts_message_carry) {
auto message_acc = luts_message_carry->get_lut(0, 0);
auto carry_acc = luts_message_carry->get_lut(0, 1);
@@ -1268,22 +1310,11 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
return x / message_modulus;
};
// generate accumulators
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), message_acc,
luts_message_carry->get_degree(0),
luts_message_carry->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, message_modulus, params.carry_modulus,
lut_f_message, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), carry_acc,
luts_message_carry->get_degree(1),
luts_message_carry->get_max_degree(1), params.glwe_dimension,
params.polynomial_size, message_modulus, params.carry_modulus,
lut_f_carry, gpu_memory_allocated);
auto active_gpu_count_mc =
streams.active_gpu_subset(pbs_count, params.pbs_type);
luts_message_carry->broadcast_lut(active_gpu_count_mc);
luts_message_carry->generate_and_broadcast_lut(
active_gpu_count_mc, {0, 1}, {lut_f_message, lut_f_carry},
gpu_memory_allocated);
}
}
int_sum_ciphertexts_vec_memory(
@@ -1418,10 +1449,6 @@ template <typename Torus> struct int_seq_group_prop_memory {
uint32_t group_size, uint32_t big_lwe_size_bytes,
bool allocate_gpu_memory, uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto message_modulus = params.message_modulus;
auto carry_modulus = params.carry_modulus;
grouping_size = group_size;
group_resolved_carries = new CudaRadixCiphertextFFI;
@@ -1431,22 +1458,20 @@ template <typename Torus> struct int_seq_group_prop_memory {
allocate_gpu_memory);
int num_seq_luts = grouping_size - 1;
Torus *h_seq_lut_indexes = (Torus *)malloc(num_seq_luts * sizeof(Torus));
lut_sequential_algorithm =
new int_radix_lut<Torus>(streams, params, num_seq_luts, num_seq_luts,
allocate_gpu_memory, size_tracker);
std::vector<std::function<Torus(Torus)>> lut_funcs;
std::vector<uint32_t> lut_indices;
Torus *h_seq_lut_indexes = (Torus *)malloc(num_seq_luts * sizeof(Torus));
for (int index = 0; index < num_seq_luts; index++) {
auto f_lut_sequential = [index](Torus propa_cum_sum_block) {
return (propa_cum_sum_block >> (index + 1)) & 1;
};
auto seq_lut = lut_sequential_algorithm->get_lut(0, index);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), seq_lut,
lut_sequential_algorithm->get_degree(index),
lut_sequential_algorithm->get_max_degree(index), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, f_lut_sequential,
gpu_memory_allocated);
lut_funcs.push_back(f_lut_sequential);
h_seq_lut_indexes[index] = index;
lut_indices.push_back(index);
}
Torus *seq_lut_indexes = lut_sequential_algorithm->get_lut_indexes(0, 0);
cuda_memcpy_with_size_tracking_async_to_gpu(
@@ -1454,9 +1479,12 @@ template <typename Torus> struct int_seq_group_prop_memory {
streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
auto active_streams =
streams.active_gpu_subset(num_seq_luts, params.pbs_type);
lut_sequential_algorithm->broadcast_lut(active_streams);
lut_sequential_algorithm->generate_and_broadcast_lut(
active_streams, lut_indices, lut_funcs, gpu_memory_allocated);
// lut_sequential_algorithm->broadcast_lut(active_streams);
free(h_seq_lut_indexes);
};
}
void release(CudaStreams streams) {
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
group_resolved_carries,
@@ -1478,10 +1506,6 @@ template <typename Torus> struct int_hs_group_prop_memory {
uint32_t num_groups, uint32_t big_lwe_size_bytes,
bool allocate_gpu_memory, uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto message_modulus = params.message_modulus;
auto carry_modulus = params.carry_modulus;
auto f_lut_hillis_steele = [](Torus msb, Torus lsb) -> Torus {
if (msb == 2) {
@@ -1501,16 +1525,11 @@ template <typename Torus> struct int_hs_group_prop_memory {
lut_hillis_steele = new int_radix_lut<Torus>(
streams, params, 1, num_groups, allocate_gpu_memory, size_tracker);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
lut_hillis_steele->get_lut(0, 0), lut_hillis_steele->get_degree(0),
lut_hillis_steele->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f_lut_hillis_steele,
gpu_memory_allocated);
auto active_streams =
streams.active_gpu_subset(num_groups, params.pbs_type);
lut_hillis_steele->broadcast_lut(active_streams);
};
lut_hillis_steele->generate_and_broadcast_bivariate_lut(
active_streams, {0}, {f_lut_hillis_steele}, gpu_memory_allocated);
}
void release(CudaStreams streams) {
lut_hillis_steele->release(streams);
@@ -1800,112 +1819,6 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
num_extra_luts = 1;
}
uint32_t num_luts_second_step = 2 * grouping_size + num_extra_luts;
luts_array_second_step = new int_radix_lut<Torus>(
streams, params, num_luts_second_step, num_radix_blocks,
allocate_gpu_memory, size_tracker);
// luts for first group inner propagation
for (int lut_id = 0; lut_id < grouping_size - 1; lut_id++) {
auto f_first_grouping_inner_propagation =
[lut_id](Torus propa_cum_sum_block) -> Torus {
uint64_t carry = (propa_cum_sum_block >> lut_id) & 1;
if (carry != 0) {
return 2ull; // Generates Carry
} else {
return 0ull; // Does not generate carry
}
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
luts_array_second_step->get_lut(0, lut_id),
luts_array_second_step->get_degree(lut_id),
luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
polynomial_size, message_modulus, carry_modulus,
f_first_grouping_inner_propagation, gpu_memory_allocated);
}
auto f_first_grouping_outer_propagation =
[num_bits_in_block](Torus block) -> Torus {
return (block >> (num_bits_in_block - 1)) & 1;
};
int lut_id = grouping_size - 1;
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
luts_array_second_step->get_lut(0, lut_id),
luts_array_second_step->get_degree(lut_id),
luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
polynomial_size, message_modulus, carry_modulus,
f_first_grouping_outer_propagation, gpu_memory_allocated);
// for other groupings inner propagation
for (int index = 0; index < grouping_size; index++) {
uint32_t lut_id = index + grouping_size;
auto f_other_groupings_inner_propagation =
[index](Torus propa_cum_sum_block) -> Torus {
uint64_t mask = (2 << index) - 1;
if (propa_cum_sum_block >= (2 << index)) {
return 2ull; // Generates
} else if ((propa_cum_sum_block & mask) == mask) {
return 1ull; // Propagate
} else {
return 0ull; // Nothing
}
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
luts_array_second_step->get_lut(0, lut_id),
luts_array_second_step->get_degree(lut_id),
luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
polynomial_size, message_modulus, carry_modulus,
f_other_groupings_inner_propagation, gpu_memory_allocated);
}
if (use_sequential_algorithm_to_resolve_group_carries) {
for (int index = 0; index < grouping_size - 1; index++) {
uint32_t lut_id = index + 2 * grouping_size;
auto f_group_propagation = [index, block_modulus,
num_bits_in_block](Torus block) -> Torus {
if (block == (block_modulus - 1)) {
return 0ull;
} else {
return ((UINT64_MAX << index) % (1ull << (num_bits_in_block + 1)));
}
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
luts_array_second_step->get_lut(0, lut_id),
luts_array_second_step->get_degree(lut_id),
luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
polynomial_size, message_modulus, carry_modulus,
f_group_propagation, gpu_memory_allocated);
}
} else {
uint32_t lut_id = 2 * grouping_size;
auto f_group_propagation = [block_modulus](Torus block) {
if (block == (block_modulus - 1)) {
return 2ull;
} else {
return UINT64_MAX % (block_modulus * 2ull);
}
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
luts_array_second_step->get_lut(0, lut_id),
luts_array_second_step->get_degree(lut_id),
luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, f_group_propagation,
gpu_memory_allocated);
}
Torus *h_second_lut_indexes = (Torus *)malloc(lut_indexes_size);
for (int index = 0; index < num_radix_blocks; index++) {
@@ -1941,6 +1854,11 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
}
}
uint32_t num_luts_second_step = 2 * grouping_size + num_extra_luts;
luts_array_second_step = new int_radix_lut<Torus>(
streams, params, num_luts_second_step, num_radix_blocks,
allocate_gpu_memory, size_tracker);
// copy the indexes to the gpu
Torus *second_lut_indexes = luts_array_second_step->get_lut_indexes(0, 0);
cuda_memcpy_with_size_tracking_async_to_gpu(
@@ -1951,9 +1869,92 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
scalar_array_cum_sum, h_scalar_array_cum_sum,
num_radix_blocks * sizeof(Torus), streams.stream(0),
streams.gpu_index(0), allocate_gpu_memory);
std::vector<std::function<Torus(Torus)>> lut_funcs;
std::vector<uint32_t> lut_ids;
// luts for first group inner propagation
for (int lut_id = 0; lut_id < grouping_size - 1; lut_id++) {
auto f_first_grouping_inner_propagation =
[lut_id](Torus propa_cum_sum_block) -> Torus {
uint64_t carry = (propa_cum_sum_block >> lut_id) & 1;
if (carry != 0) {
return 2ull; // Generates Carry
} else {
return 0ull; // Does not generate carry
}
};
lut_funcs.push_back(f_first_grouping_inner_propagation);
lut_ids.push_back(lut_id);
}
auto f_first_grouping_outer_propagation =
[num_bits_in_block](Torus block) -> Torus {
return (block >> (num_bits_in_block - 1)) & 1;
};
int lut_id = grouping_size - 1;
lut_funcs.push_back(f_first_grouping_outer_propagation);
lut_ids.push_back(lut_id);
// for other groupings inner propagation
for (int index = 0; index < grouping_size; index++) {
uint32_t lut_id = index + grouping_size;
auto f_other_groupings_inner_propagation =
[index](Torus propa_cum_sum_block) -> Torus {
uint64_t mask = (2 << index) - 1;
if (propa_cum_sum_block >= (2 << index)) {
return 2ull; // Generates
} else if ((propa_cum_sum_block & mask) == mask) {
return 1ull; // Propagate
} else {
return 0ull; // Nothing
}
};
lut_funcs.push_back(f_other_groupings_inner_propagation);
lut_ids.push_back(lut_id);
}
if (use_sequential_algorithm_to_resolve_group_carries) {
for (int index = 0; index < grouping_size - 1; index++) {
uint32_t lut_id = index + 2 * grouping_size;
auto f_group_propagation = [index, block_modulus,
num_bits_in_block](Torus block) -> Torus {
if (block == (block_modulus - 1)) {
return 0ull;
} else {
return ((UINT64_MAX << index) % (1ull << (num_bits_in_block + 1)));
}
};
lut_funcs.push_back(f_group_propagation);
lut_ids.push_back(lut_id);
}
} else {
uint32_t lut_id = 2 * grouping_size;
auto f_group_propagation = [block_modulus](Torus block) {
if (block == (block_modulus - 1)) {
return 2ull;
} else {
return UINT64_MAX % (block_modulus * 2ull);
}
};
lut_funcs.push_back(f_group_propagation);
lut_ids.push_back(lut_id);
}
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
luts_array_second_step->broadcast_lut(active_streams);
luts_array_second_step->generate_and_broadcast_lut(
active_streams, lut_ids, lut_funcs, gpu_memory_allocated);
// luts_array_second_step->broadcast_lut(active_streams);
if (use_sequential_algorithm_to_resolve_group_carries) {
@@ -2041,12 +2042,28 @@ template <typename Torus> struct int_sc_prop_memory {
uint32_t requested_flag;
bool gpu_memory_allocated;
// Fills the host-side LUT index table of `lut_message_extract` and queues its
// upload to the device.
//
// The first `num_radix_blocks` entries are set to 0 and the single trailing
// entry is set to 1, so every regular block selects LUT 0 while the extra
// block appended after them selects LUT 1. The table is then copied
// asynchronously into the buffer returned by get_lut_indexes(0, 0), using
// streams.stream(0) / streams.gpu_index(0).
//
// `allocate_gpu_memory` is forwarded unchanged to the size-tracking copy
// helper. NOTE(review): presumably it gates whether the copy is actually
// performed -- confirm against the helper's definition.
void setup_message_extract_indices_for_carry_async(CudaStreams streams,
                                                   uint32_t num_radix_blocks,
                                                   bool allocate_gpu_memory) {
  // One selector per radix block plus one for the trailing flag block.
  const uint32_t num_entries = num_radix_blocks + 1;
  Torus *host_indexes = lut_message_extract->h_lut_indexes;

  for (uint32_t pos = 0; pos < num_entries; ++pos) {
    host_indexes[pos] = (pos < num_radix_blocks) ? 0 : 1;
  }

  // The copy is asynchronous: the host table is read when the stream reaches
  // this operation, not necessarily before this function returns.
  cuda_memcpy_with_size_tracking_async_to_gpu(
      lut_message_extract->get_lut_indexes(0, 0), host_indexes,
      num_entries * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
      allocate_gpu_memory);
}
int_sc_prop_memory(CudaStreams streams, int_radix_params params,
uint32_t num_radix_blocks, uint32_t requested_flag_in,
bool allocate_gpu_memory, uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
this->params = params;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto message_modulus = params.message_modulus;
auto carry_modulus = params.carry_modulus;
@@ -2069,24 +2086,6 @@ template <typename Torus> struct int_sc_prop_memory {
streams, params, num_radix_blocks, grouping_size, num_groups,
allocate_gpu_memory, size_tracker);
// Step 3 elements
int num_luts_message_extract =
requested_flag == outputFlag::FLAG_NONE ? 1 : 2;
lut_message_extract = new int_radix_lut<Torus>(
streams, params, num_luts_message_extract, num_radix_blocks + 1,
allocate_gpu_memory, size_tracker);
// lut for the first block in the first grouping
auto f_message_extract = [message_modulus](Torus block) -> Torus {
return (block >> 1) % message_modulus;
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
lut_message_extract->get_lut(0, 0), lut_message_extract->get_degree(0),
lut_message_extract->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f_message_extract,
gpu_memory_allocated);
// This stores a single block that will be used to hold the overflow or
// carry results
output_flag = new CudaRadixCiphertextFFI;
@@ -2137,22 +2136,30 @@ template <typename Torus> struct int_sc_prop_memory {
return output1 << 3 | output2 << 2;
};
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
lut_overflow_flag_prep->get_lut(0, 0),
lut_overflow_flag_prep->get_degree(0),
lut_overflow_flag_prep->get_max_degree(0), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, f_overflow_fp,
gpu_memory_allocated);
auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
lut_overflow_flag_prep->broadcast_lut(active_streams);
lut_overflow_flag_prep->generate_and_broadcast_bivariate_lut(
active_streams, {0}, {f_overflow_fp}, gpu_memory_allocated);
}
// Step 3 elements
int num_luts_message_extract =
requested_flag == outputFlag::FLAG_NONE ? 1 : 2;
lut_message_extract = new int_radix_lut<Torus>(
streams, params, num_luts_message_extract, num_radix_blocks + 1,
allocate_gpu_memory, size_tracker);
// lut for the first block in the first grouping
auto f_message_extract = [message_modulus](Torus block) -> Torus {
return (block >> 1) % message_modulus;
};
auto active_streams =
streams.active_gpu_subset(num_radix_blocks + 1, params.pbs_type);
// LUT for the final cleanup in case of overflow or carry.
// It seems that this LUT could be applied together with the other one, but
// for now we won't do it
if (requested_flag == outputFlag::FLAG_OVERFLOW) { // Overflow case
switch (requested_flag) {
case outputFlag::FLAG_OVERFLOW: { // Overflow case
auto f_overflow_last = [num_radix_blocks,
requested_flag_in](Torus block) -> Torus {
uint32_t position = (num_radix_blocks == 1 &&
@@ -2164,62 +2171,38 @@ template <typename Torus> struct int_sc_prop_memory {
Torus does_overflow_if_carry_is_0 = (block >> 2) & 1;
if (input_carry == outputFlag::FLAG_OVERFLOW) {
return does_overflow_if_carry_is_1;
} else {
return does_overflow_if_carry_is_0;
}
return does_overflow_if_carry_is_0;
};
setup_message_extract_indices_for_carry_async(streams, num_radix_blocks,
allocate_gpu_memory);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
lut_message_extract->get_lut(0, 1),
lut_message_extract->get_degree(1),
lut_message_extract->get_max_degree(1), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, f_overflow_last,
lut_message_extract->generate_and_broadcast_lut(
active_streams, {0, 1}, {f_message_extract, f_overflow_last},
gpu_memory_allocated);
Torus *h_lut_indexes = lut_message_extract->h_lut_indexes;
for (int index = 0; index < num_radix_blocks + 1; index++) {
if (index < num_radix_blocks) {
h_lut_indexes[index] = 0;
} else {
h_lut_indexes[index] = 1;
}
}
cuda_memcpy_with_size_tracking_async_to_gpu(
lut_message_extract->get_lut_indexes(0, 0), h_lut_indexes,
(num_radix_blocks + 1) * sizeof(Torus), streams.stream(0),
streams.gpu_index(0), allocate_gpu_memory);
break;
}
if (requested_flag == outputFlag::FLAG_CARRY) { // Carry case
case outputFlag::FLAG_CARRY: { // Carry case
setup_message_extract_indices_for_carry_async(streams, num_radix_blocks,
allocate_gpu_memory);
auto f_carry_last = [](Torus block) -> Torus {
return ((block >> 2) & 1);
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
lut_message_extract->get_lut(0, 1),
lut_message_extract->get_degree(1),
lut_message_extract->get_max_degree(1), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, f_carry_last,
lut_message_extract->generate_and_broadcast_lut(
active_streams, {0, 1}, {f_message_extract, f_carry_last},
gpu_memory_allocated);
Torus *h_lut_indexes = lut_message_extract->h_lut_indexes;
for (int index = 0; index < num_radix_blocks + 1; index++) {
if (index < num_radix_blocks) {
h_lut_indexes[index] = 0;
} else {
h_lut_indexes[index] = 1;
}
}
cuda_memcpy_with_size_tracking_async_to_gpu(
lut_message_extract->get_lut_indexes(0, 0), h_lut_indexes,
(num_radix_blocks + 1) * sizeof(Torus), streams.stream(0),
streams.gpu_index(0), allocate_gpu_memory);
break;
}
auto active_streams =
streams.active_gpu_subset(num_radix_blocks + 1, params.pbs_type);
lut_message_extract->broadcast_lut(active_streams);
default:
lut_message_extract->generate_and_broadcast_lut(
active_streams, {0}, {f_message_extract}, gpu_memory_allocated);
break;
}
// lut_message_extract->broadcast_lut(active_streams);
};
void release(CudaStreams streams) {
@@ -2517,16 +2500,11 @@ template <typename Torus> struct int_borrow_prop_memory {
return (block >> 1) % message_modulus;
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
lut_message_extract->get_lut(0, 0), lut_message_extract->get_degree(0),
lut_message_extract->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f_message_extract,
gpu_memory_allocated);
active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
lut_message_extract->broadcast_lut(active_streams);
lut_message_extract->generate_and_broadcast_lut(
active_streams, {0}, {f_message_extract}, gpu_memory_allocated);
if (compute_overflow) {
lut_borrow_flag =
@@ -2537,12 +2515,8 @@ template <typename Torus> struct int_borrow_prop_memory {
return ((block >> 2) & 1);
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
lut_borrow_flag->get_lut(0, 0), lut_borrow_flag->get_degree(0),
lut_borrow_flag->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f_borrow_flag, gpu_memory_allocated);
lut_borrow_flag->broadcast_lut(active_streams);
lut_borrow_flag->generate_and_broadcast_lut(
active_streams, {0}, {f_borrow_flag}, gpu_memory_allocated);
}
active_streams =

View File

@@ -37,17 +37,14 @@ template <typename Torus> struct int_mul_memory {
zero_out_predicate_lut =
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
zero_out_predicate_lut->get_lut(0, 0),
zero_out_predicate_lut->get_degree(0),
zero_out_predicate_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
zero_out_predicate_lut_f, gpu_memory_allocated);
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
zero_out_predicate_lut->broadcast_lut(active_streams);
zero_out_predicate_lut->generate_and_broadcast_bivariate_lut(
active_streams, {0}, {zero_out_predicate_lut_f},
gpu_memory_allocated);
// zero_out_predicate_lut->broadcast_lut(active_streams);
zero_out_mem = new int_zero_out_if_buffer<Torus>(
streams, params, num_radix_blocks, allocate_gpu_memory, size_tracker);
@@ -55,10 +52,7 @@ template <typename Torus> struct int_mul_memory {
return;
}
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto message_modulus = params.message_modulus;
auto carry_modulus = params.carry_modulus;
// 'vector_result_lsb' contains blocks from all possible shifts of
// radix_lwe_left excluding zero ciphertext blocks
@@ -102,18 +96,6 @@ template <typename Torus> struct int_mul_memory {
return (x * y) / message_modulus;
};
// generate accumulators
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), lsb_acc,
luts_array->get_degree(0), luts_array->get_max_degree(0),
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
lut_f_lsb, gpu_memory_allocated);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), msb_acc,
luts_array->get_degree(1), luts_array->get_max_degree(1),
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
lut_f_msb, gpu_memory_allocated);
// lut_indexes_vec for luts_array should be reinitialized:
// the first lsb_vector_block_count values should reference lsb_acc,
// the last msb_vector_block_count values should reference msb_acc
@@ -123,9 +105,12 @@ template <typename Torus> struct int_mul_memory {
streams.stream(0), streams.gpu_index(0),
luts_array->get_lut_indexes(0, lsb_vector_block_count), 1,
msb_vector_block_count);
auto active_streams =
streams.active_gpu_subset(total_block_count, params.pbs_type);
luts_array->broadcast_lut(active_streams);
luts_array->generate_and_broadcast_bivariate_lut(
active_streams, {0, 1}, {lut_f_lsb, lut_f_msb}, gpu_memory_allocated);
// create memory object for sum ciphertexts
sum_ciphertexts_mem = new int_sum_ciphertexts_vec_memory<Torus>(
streams, params, num_radix_blocks, 2 * num_radix_blocks,

View File

@@ -85,15 +85,11 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
}
// right shift
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
cur_lut_bivariate->get_lut(0, 0), cur_lut_bivariate->get_degree(0),
cur_lut_bivariate->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
shift_lut_f, gpu_memory_allocated);
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
cur_lut_bivariate->broadcast_lut(active_streams);
cur_lut_bivariate->generate_and_broadcast_bivariate_lut(
active_streams, {0}, {shift_lut_f}, gpu_memory_allocated);
lut_buffers_bivariate.push_back(cur_lut_bivariate);
}
@@ -172,16 +168,10 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
}
// right shift
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
cur_lut_bivariate->get_lut(0, 0), cur_lut_bivariate->get_degree(0),
cur_lut_bivariate->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
shift_lut_f, gpu_memory_allocated);
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
cur_lut_bivariate->broadcast_lut(active_streams);
cur_lut_bivariate->generate_and_broadcast_bivariate_lut(
active_streams, {0}, {shift_lut_f}, gpu_memory_allocated);
lut_buffers_bivariate.push_back(cur_lut_bivariate);
}
}
@@ -271,16 +261,11 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
return shifted | padding;
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
shift_last_block_lut_univariate->get_lut(0, 0),
shift_last_block_lut_univariate->get_degree(0),
shift_last_block_lut_univariate->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, last_block_lut_f, gpu_memory_allocated);
auto active_streams_shift_last =
streams.active_gpu_subset(1, params.pbs_type);
shift_last_block_lut_univariate->broadcast_lut(active_streams_shift_last);
shift_last_block_lut_univariate->generate_and_broadcast_lut(
active_streams_shift_last, {0}, {last_block_lut_f},
gpu_memory_allocated);
lut_buffers_univariate.push_back(shift_last_block_lut_univariate);
}
@@ -298,15 +283,8 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
return (params.message_modulus - 1) * x_sign_bit;
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
padding_block_lut_univariate->get_lut(0, 0),
padding_block_lut_univariate->get_degree(0),
padding_block_lut_univariate->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
padding_block_lut_f, gpu_memory_allocated);
// auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
padding_block_lut_univariate->broadcast_lut(active_streams);
padding_block_lut_univariate->generate_and_broadcast_lut(
active_streams, {0}, {padding_block_lut_f}, gpu_memory_allocated);
lut_buffers_univariate.push_back(padding_block_lut_univariate);
@@ -339,16 +317,11 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
return message_of_current_block + carry_of_previous_block;
};
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
shift_blocks_lut_bivariate->get_lut(0, 0),
shift_blocks_lut_bivariate->get_degree(0),
shift_blocks_lut_bivariate->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
blocks_lut_f, gpu_memory_allocated);
auto active_streams_shift_blocks =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
shift_blocks_lut_bivariate->broadcast_lut(active_streams_shift_blocks);
shift_blocks_lut_bivariate->generate_and_broadcast_bivariate_lut(
active_streams_shift_blocks, {0}, {blocks_lut_f},
gpu_memory_allocated);
lut_buffers_bivariate.push_back(shift_blocks_lut_bivariate);
}

View File

@@ -113,27 +113,21 @@ template <typename Torus> struct int_shift_and_rotate_buffer {
else
return current_bit;
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), mux_lut->get_lut(0, 0),
mux_lut->get_degree(0), mux_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, mux_lut_f, gpu_memory_allocated);
;
auto active_gpu_count_mux = streams.active_gpu_subset(
bits_per_block * num_radix_blocks, params.pbs_type);
mux_lut->broadcast_lut(active_gpu_count_mux);
mux_lut->generate_and_broadcast_lut(active_gpu_count_mux, {0}, {mux_lut_f},
gpu_memory_allocated);
auto cleaning_lut_f = [params](Torus x) -> Torus {
return x % params.message_modulus;
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), cleaning_lut->get_lut(0, 0),
cleaning_lut->get_degree(0), cleaning_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, cleaning_lut_f, gpu_memory_allocated);
auto active_gpu_count_cleaning =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
cleaning_lut->broadcast_lut(active_gpu_count_cleaning);
cleaning_lut->generate_and_broadcast_lut(
active_gpu_count_cleaning, {0}, {cleaning_lut_f}, gpu_memory_allocated);
}
void release(CudaStreams streams) {

View File

@@ -74,45 +74,26 @@ template <typename Torus> struct int_overflowing_sub_memory {
luts_array, size_tracker,
allocate_gpu_memory, size_tracker);
auto lut_does_block_generate_carry = luts_array->get_lut(0, 0);
auto lut_does_block_generate_or_propagate = luts_array->get_lut(0, 1);
// generate luts (aka accumulators)
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), lut_does_block_generate_carry,
luts_array->get_degree(0), luts_array->get_max_degree(0),
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
f_lut_does_block_generate_carry, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
lut_does_block_generate_or_propagate, luts_array->get_degree(1),
luts_array->get_max_degree(1), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f_lut_does_block_generate_or_propagate,
gpu_memory_allocated);
if (allocate_gpu_memory)
cuda_set_value_async<Torus>(streams.stream(0), streams.gpu_index(0),
luts_array->get_lut_indexes(0, 1), 1,
num_radix_blocks - 1);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
luts_borrow_propagation_sum->get_lut(0, 0),
luts_borrow_propagation_sum->get_degree(0),
luts_borrow_propagation_sum->get_max_degree(0), glwe_dimension,
polynomial_size, message_modulus, carry_modulus,
f_luts_borrow_propagation_sum, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), message_acc->get_lut(0, 0),
message_acc->get_degree(0), message_acc->get_max_degree(0),
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
f_message_acc, gpu_memory_allocated);
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
luts_array->broadcast_lut(active_streams);
luts_borrow_propagation_sum->broadcast_lut(active_streams);
message_acc->broadcast_lut(active_streams);
luts_borrow_propagation_sum->generate_and_broadcast_bivariate_lut(
active_streams, {0}, {f_luts_borrow_propagation_sum},
gpu_memory_allocated);
luts_array->generate_and_broadcast_lut(
active_streams, {0, 1},
{f_lut_does_block_generate_carry,
f_lut_does_block_generate_or_propagate},
gpu_memory_allocated);
// generate luts (aka accumulators)
message_acc->generate_and_broadcast_lut(
active_streams, {0}, {f_message_acc}, gpu_memory_allocated);
}
void release(CudaStreams streams) {

View File

@@ -298,14 +298,10 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {
int_radix_lut<Torus> *lut = new int_radix_lut<Torus>(
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
lut->get_degree(0), lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
id_fn, allocate_gpu_memory);
lut->generate_and_broadcast_lut(
streams.active_gpu_subset(num_blocks, params.pbs_type), {0}, {id_fn},
allocate_gpu_memory);
lut->broadcast_lut(
streams.active_gpu_subset(num_blocks, params.pbs_type));
this->stream_identity_luts[i] = lut;
}
@@ -318,27 +314,17 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {
this->message_extract_lut = new int_radix_lut<Torus>(
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
this->message_extract_lut->get_lut(0, 0),
this->message_extract_lut->get_degree(0),
this->message_extract_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
msg_fn, allocate_gpu_memory);
this->message_extract_lut->broadcast_lut(
streams.active_gpu_subset(num_blocks, params.pbs_type));
this->message_extract_lut->generate_and_broadcast_lut(
streams.active_gpu_subset(num_blocks, params.pbs_type), {0}, {msg_fn},
allocate_gpu_memory);
this->carry_extract_lut = new int_radix_lut<Torus>(
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
this->carry_extract_lut->get_lut(0, 0),
this->carry_extract_lut->get_degree(0),
this->carry_extract_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
carry_fn, allocate_gpu_memory);
this->carry_extract_lut->broadcast_lut(
streams.active_gpu_subset(num_blocks, params.pbs_type));
this->carry_extract_lut->generate_and_broadcast_lut(
streams.active_gpu_subset(num_blocks, params.pbs_type), {0}, {carry_fn},
allocate_gpu_memory);
this->partial_aggregated_vectors =
new CudaRadixCiphertextFFI *[num_streams];
@@ -1185,15 +1171,9 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
this->prefix_sum_lut = new int_radix_lut<Torus>(
streams, params, 2, num_inputs, allocate_gpu_memory, size_tracker);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
this->prefix_sum_lut->get_lut(0, 0),
this->prefix_sum_lut->get_degree(0),
this->prefix_sum_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
prefix_sum_fn, allocate_gpu_memory);
this->prefix_sum_lut->broadcast_lut(
streams.active_gpu_subset(num_inputs, params.pbs_type));
this->prefix_sum_lut->generate_and_broadcast_bivariate_lut(
streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
{prefix_sum_fn}, allocate_gpu_memory);
auto cleanup_fn = [ALREADY_SEEN, params](Torus x) -> Torus {
Torus val = x % params.message_modulus;
@@ -1203,14 +1183,9 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
};
this->cleanup_lut = new int_radix_lut<Torus>(
streams, params, 1, num_inputs, allocate_gpu_memory, size_tracker);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
this->cleanup_lut->get_lut(0, 0), this->cleanup_lut->get_degree(0),
this->cleanup_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
cleanup_fn, allocate_gpu_memory);
this->cleanup_lut->broadcast_lut(
streams.active_gpu_subset(num_inputs, params.pbs_type));
this->cleanup_lut->generate_and_broadcast_lut(
streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
{cleanup_fn}, allocate_gpu_memory);
}
void release(CudaStreams streams) {
@@ -1376,15 +1351,9 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
this->prefix_sum_lut = new int_radix_lut<Torus>(
streams, params, 2, num_inputs, allocate_gpu_memory, size_tracker);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
this->prefix_sum_lut->get_lut(0, 0),
this->prefix_sum_lut->get_degree(0),
this->prefix_sum_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
prefix_sum_fn, allocate_gpu_memory);
this->prefix_sum_lut->broadcast_lut(
streams.active_gpu_subset(num_inputs, params.pbs_type));
this->prefix_sum_lut->generate_and_broadcast_bivariate_lut(
streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
{prefix_sum_fn}, allocate_gpu_memory);
auto cleanup_fn = [ALREADY_SEEN, params](Torus x) -> Torus {
Torus val = x % params.message_modulus;
@@ -1394,14 +1363,9 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
};
this->cleanup_lut = new int_radix_lut<Torus>(
streams, params, 1, num_inputs, allocate_gpu_memory, size_tracker);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
this->cleanup_lut->get_lut(0, 0), this->cleanup_lut->get_degree(0),
this->cleanup_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
cleanup_fn, allocate_gpu_memory);
this->cleanup_lut->broadcast_lut(
streams.active_gpu_subset(num_inputs, params.pbs_type));
this->cleanup_lut->generate_and_broadcast_lut(
streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
{cleanup_fn}, allocate_gpu_memory);
}
void release(CudaStreams streams) {

View File

@@ -30,15 +30,10 @@ template <typename Torus> struct int_trivium_lut_buffers {
std::function<Torus(Torus, Torus)> and_lambda =
[](Torus a, Torus b) -> Torus { return (a & 1) & (b & 1); };
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), this->and_lut->get_lut(0, 0),
this->and_lut->get_degree(0), this->and_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, and_lambda, allocate_gpu_memory);
auto active_streams_and =
streams.active_gpu_subset(total_lut_ops, params.pbs_type);
this->and_lut->broadcast_lut(active_streams_and);
this->and_lut->generate_and_broadcast_bivariate_lut(
active_streams_and, {0}, {and_lambda}, allocate_gpu_memory);
this->and_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
uint32_t total_flush_ops = num_trivium_inputs * BATCH_SIZE * 4;
@@ -50,15 +45,10 @@ template <typename Torus> struct int_trivium_lut_buffers {
return x & 1;
};
generate_device_accumulator(
streams.stream(0), streams.gpu_index(0), this->flush_lut->get_lut(0, 0),
this->flush_lut->get_degree(0), this->flush_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, flush_lambda, allocate_gpu_memory);
auto active_streams_flush =
streams.active_gpu_subset(total_flush_ops, params.pbs_type);
this->flush_lut->broadcast_lut(active_streams_flush);
this->flush_lut->generate_and_broadcast_lut(
active_streams_flush, {0}, {flush_lambda}, allocate_gpu_memory);
this->flush_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
}

View File

@@ -174,40 +174,6 @@ template <typename Torus> struct zk_expand_mem {
message_and_carry_extract_luts = new int_radix_lut<Torus>(
streams, params, 4, 2 * num_lwes, allocate_gpu_memory, size_tracker);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
message_and_carry_extract_luts->get_lut(0, 0),
message_and_carry_extract_luts->get_degree(0),
message_and_carry_extract_luts->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, message_extract_lut_f, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
message_and_carry_extract_luts->get_lut(0, 1),
message_and_carry_extract_luts->get_degree(1),
message_and_carry_extract_luts->get_max_degree(1),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, carry_extract_lut_f, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
message_and_carry_extract_luts->get_lut(0, 2),
message_and_carry_extract_luts->get_degree(2),
message_and_carry_extract_luts->get_max_degree(2),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, message_extract_and_sanitize_bool_lut_f,
gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
message_and_carry_extract_luts->get_lut(0, 3),
message_and_carry_extract_luts->get_degree(3),
message_and_carry_extract_luts->get_max_degree(3),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, carry_extract_and_sanitize_bool_lut_f,
gpu_memory_allocated);
// We are always packing two LWEs. We just need to be sure we have enough
// space in the carry part to store a message of the same size as is in the
// message part.
@@ -292,7 +258,13 @@ template <typename Torus> struct zk_expand_mem {
auto active_streams =
streams.active_gpu_subset(2 * num_lwes, params.pbs_type);
message_and_carry_extract_luts->broadcast_lut(active_streams);
message_and_carry_extract_luts->generate_and_broadcast_lut(
active_streams, {0, 1, 2, 3},
{message_extract_lut_f, carry_extract_lut_f,
message_extract_and_sanitize_bool_lut_f,
carry_extract_and_sanitize_bool_lut_f},
gpu_memory_allocated);
message_and_carry_extract_luts->allocate_lwe_vector_for_non_trivial_indexes(
active_streams, 2 * num_lwes, size_tracker, allocate_gpu_memory);

View File

@@ -1067,6 +1067,85 @@ void generate_device_accumulator_bivariate(
POP_RANGE()
}
/*
 * Host-side store for generated lookup tables (LUTs), meant to let identical
 * univariate LUTs be generated once and reused.
 *
 * The lookup/insert logic is currently disabled (kept below as comments), so
 * every call to get_cached_univariate_lut() generates a fresh table and
 * nothing is ever inserted in _lut_cache.
 */
template <typename Torus> struct int_lut_cache {
  int_lut_cache() {}

  /*
   * Generates the host LUT that encodes the univariate function `f`.
   *
   *  f                                  - function evaluated by the LUT
   *  degree                             - output: value returned by
   *                                       generate_lookup_table_with_encoding
   *  max_degree                         - output: input_message_modulus *
   *                                       input_carry_modulus - 1
   *  glwe_dimension, polynomial_size    - the table holds
   *                                       (glwe_dimension + 1) *
   *                                       polynomial_size Torus elements
   *  input_* / output_* moduli          - forwarded unchanged to
   *                                       generate_lookup_table_with_encoding
   *
   * Returns a malloc'ed host buffer. While the cache is disabled the buffer
   * is NOT recorded in _lut_cache, so this object never frees it: the caller
   * is responsible for releasing it with free().
   */
  Torus *get_cached_univariate_lut(std::function<Torus(Torus)> &f,
                                   uint64_t *degree, uint64_t *max_degree,
                                   uint32_t glwe_dimension,
                                   uint32_t polynomial_size,
                                   uint32_t input_message_modulus,
                                   uint32_t input_carry_modulus,
                                   uint32_t output_message_modulus,
                                   uint32_t output_carry_modulus) {
    // NOTE(review): disabled cache lookup, kept for reference. Before
    // re-enabling, check the key construction: it packs 5 bits per input
    // value into a 128-bit integer and shifts once more after the last OR, so
    // only 25 input values fit before the earliest ones are shifted out. The
    // key also does not include polynomial_size or the carry moduli.
    /*__int128_t f_hash = 0;
    uint32_t bits_per_lut_val = 5;
    uint32_t input_modulus_sup = input_message_modulus * input_carry_modulus;
    for (uint32_t i = 0; i < input_modulus_sup; ++i) {
    Torus f_eval = f(i);
    GPU_ASSERT(f_eval < (1 << bits_per_lut_val),
    "LUT value expected bitwidth overflow");
    f_hash |= f_eval;
    f_hash <<= bits_per_lut_val;
    }
    std::lock_guard cache_lock(_mutex);
    if (_lut_cache.find(f_hash) != _lut_cache.end()) {
    lut_ptr &ptr = _lut_cache[f_hash];
    GPU_ASSERT(ptr.output_message_modulus == output_message_modulus,
    "Error modulus");
    GPU_ASSERT(ptr.input_message_modulus == input_message_modulus,
    "Error modulus");
    GPU_ASSERT(ptr.glwe_dimension == glwe_dimension, "Error modulus");
    *max_degree = ptr.max_degree;
    *degree = ptr.degree;
    return ptr.ptr;
    }*/

    // host lut: compute the element count in size_t so the product cannot
    // wrap in 32-bit arithmetic before being scaled by sizeof(Torus)
    const size_t lut_num_elems =
        static_cast<size_t>(glwe_dimension + 1) * polynomial_size;
    Torus *h_lut = (Torus *)malloc(lut_num_elems * sizeof(Torus));
    // The generator below writes through h_lut unconditionally, so a failed
    // allocation must be caught here rather than dereferenced.
    GPU_ASSERT(h_lut != nullptr, "Host LUT allocation failed");

    *max_degree = input_message_modulus * input_carry_modulus - 1;
    *degree = generate_lookup_table_with_encoding<Torus>(
        h_lut, glwe_dimension, polynomial_size, input_message_modulus,
        input_carry_modulus, output_message_modulus, output_carry_modulus, f);

    // Disabled cache insertion, kept for reference.
    /*lut_ptr new_ptr = {h_lut,
    glwe_dimension,
    input_message_modulus,
    input_carry_modulus,
    output_message_modulus,
    output_carry_modulus,
    *max_degree,
    *degree};*/
    //_lut_cache[f_hash] = new_ptr;
    return h_lut;
  }

  // Frees every host buffer recorded in the cache map. With insertion
  // disabled the map stays empty, so this currently releases nothing.
  ~int_lut_cache() {
    std::lock_guard cache_lock(_mutex);
    // Iterate by reference: the entries only need to be read, not copied.
    for (const auto &entry : _lut_cache) {
      free(entry.second.ptr);
    }
    _lut_cache.clear();
  }

private:
  // One cache record: the host buffer plus the parameters and degrees it was
  // generated with.
  struct lut_ptr {
    Torus *ptr;
    uint32_t glwe_dimension;
    uint32_t input_message_modulus;
    uint32_t input_carry_modulus;
    uint32_t output_message_modulus;
    uint32_t output_carry_modulus;
    uint64_t max_degree;
    uint64_t degree;
  };

  // Key type matches the f_hash value built in the disabled lookup code.
  std::map<__int128_t, lut_ptr> _lut_cache;
  // Locked by the destructor and by the disabled lookup path.
  std::mutex _mutex;
};
// LUT cache instance for 64-bit Torus values.
// NOTE(review): if this declaration sits at namespace scope in a header,
// `static` gives it internal linkage and every translation unit that includes
// the header gets its own separate instance — confirm that is intended.
static int_lut_cache<uint64_t> g_LutCache64;
/*
* generate bivariate accumulator with factor scaling for device pointer
* v_stream - cuda stream
@@ -1098,8 +1177,8 @@ void generate_device_accumulator_bivariate_with_factor(
(glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream, gpu_index,
gpu_memory_allocated);
cuda_synchronize_stream(stream, gpu_index);
free(h_lut);
// cuda_synchronize_stream(stream, gpu_index);
// free(h_lut);
}
/*
* generate bivariate accumulator for device pointer
@@ -1145,23 +1224,36 @@ void generate_device_accumulator_with_encoding(
uint32_t output_message_modulus, uint32_t output_carry_modulus,
std::function<Torus(Torus)> f, bool gpu_memory_allocated) {
static constexpr auto is_u64 = std::is_same_v<Torus, uint64_t>;
Torus *h_lut = nullptr;
// host lut
Torus *h_lut =
(Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
*max_degree = input_message_modulus * input_carry_modulus - 1;
// fill accumulator
*degree = generate_lookup_table_with_encoding<Torus>(
h_lut, glwe_dimension, polynomial_size, input_message_modulus,
input_carry_modulus, output_message_modulus, output_carry_modulus, f);
if constexpr (is_u64) {
h_lut = g_LutCache64.get_cached_univariate_lut(
f, degree, max_degree, glwe_dimension, polynomial_size,
input_message_modulus, input_carry_modulus, output_message_modulus,
output_carry_modulus);
} else {
h_lut =
(Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
*max_degree = input_message_modulus * input_carry_modulus - 1;
// fill accumulator
*degree = generate_lookup_table_with_encoding<Torus>(
h_lut, glwe_dimension, polynomial_size, input_message_modulus,
input_carry_modulus, output_message_modulus, output_carry_modulus, f);
}
/*
// copy host lut and lut_indexes_vec to device
cuda_memcpy_with_size_tracking_async_to_gpu(
acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
stream, gpu_index, gpu_memory_allocated);
cuda_synchronize_stream(stream, gpu_index);
free(h_lut);
*/
if (!std::is_same_v<Torus, uint64_t>) {
cuda_synchronize_stream(stream, gpu_index);
free(h_lut);
}
}
template <typename Torus>
void generate_device_accumulator_with_encoding_with_cpu_prealloc(
cudaStream_t stream, uint32_t gpu_index, Torus *acc, uint64_t *degree,
@@ -1264,8 +1356,8 @@ void generate_many_lut_device_accumulator(
acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
stream, gpu_index, gpu_memory_allocated);
cuda_synchronize_stream(stream, gpu_index);
free(h_lut);
//cuda_synchronize_stream(stream, gpu_index);
//free(h_lut);
POP_RANGE()
}