// Mirror of https://github.com/zama-ai/tfhe-rs.git
// Synced 2026-01-09 22:57:59 -05:00
// 348 lines, 14 KiB, C++
#include "integer_utilities.h"
|
|
|
|
template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
|
|
int_radix_params params;
|
|
bool allocate_gpu_memory;
|
|
|
|
int_radix_lut<Torus> *univ_lut_mem;
|
|
int_radix_lut<Torus> *biv_lut_mem;
|
|
|
|
Direction direction;
|
|
BitValue bit_value;
|
|
|
|
CudaRadixCiphertextFFI *tmp_ct;
|
|
|
|
int_prepare_count_of_consecutive_bits_buffer(
|
|
CudaStreams streams, const int_radix_params params,
|
|
uint32_t num_radix_blocks, Direction direction, BitValue bit_value,
|
|
const bool allocate_gpu_memory, uint64_t &size_tracker) {
|
|
this->params = params;
|
|
this->allocate_gpu_memory = allocate_gpu_memory;
|
|
this->direction = direction;
|
|
this->bit_value = bit_value;
|
|
auto active_streams =
|
|
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
|
|
this->univ_lut_mem =
|
|
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
|
|
allocate_gpu_memory, size_tracker);
|
|
this->biv_lut_mem =
|
|
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
|
|
allocate_gpu_memory, size_tracker);
|
|
|
|
const uint32_t num_bits = std::log2(this->params.message_modulus);
|
|
|
|
auto generate_uni_lut_lambda = [this, num_bits](Torus x) -> Torus {
|
|
x %= this->params.message_modulus;
|
|
uint64_t count = 0;
|
|
|
|
if (this->direction == Trailing) {
|
|
for (uint32_t i = 0; i < num_bits; ++i) {
|
|
if (((x >> i) & 1) != this->bit_value) {
|
|
break;
|
|
}
|
|
count++;
|
|
}
|
|
} else {
|
|
for (int32_t i = num_bits - 1; i >= 0; --i) {
|
|
if (((x >> i) & 1) != this->bit_value) {
|
|
break;
|
|
}
|
|
count++;
|
|
}
|
|
}
|
|
return count;
|
|
};
|
|
|
|
generate_device_accumulator<Torus>(
|
|
streams.stream(0), streams.gpu_index(0), univ_lut_mem->get_lut(0, 0),
|
|
univ_lut_mem->get_degree(0), univ_lut_mem->get_max_degree(0),
|
|
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
|
params.carry_modulus, generate_uni_lut_lambda, allocate_gpu_memory);
|
|
|
|
univ_lut_mem->broadcast_lut(active_streams);
|
|
|
|
auto generate_bi_lut_lambda =
|
|
[num_bits](Torus block_num_bit_count,
|
|
Torus more_significant_block_bit_count) -> Torus {
|
|
if (more_significant_block_bit_count == num_bits) {
|
|
return block_num_bit_count;
|
|
}
|
|
return 0;
|
|
};
|
|
|
|
generate_device_accumulator_bivariate<Torus>(
|
|
streams.stream(0), streams.gpu_index(0), biv_lut_mem->get_lut(0, 0),
|
|
biv_lut_mem->get_degree(0), biv_lut_mem->get_max_degree(0),
|
|
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
|
params.carry_modulus, generate_bi_lut_lambda, allocate_gpu_memory);
|
|
|
|
biv_lut_mem->broadcast_lut(active_streams);
|
|
|
|
this->tmp_ct = new CudaRadixCiphertextFFI;
|
|
create_zero_radix_ciphertext_async<Torus>(
|
|
streams.stream(0), streams.gpu_index(0), tmp_ct, num_radix_blocks,
|
|
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
|
}
|
|
|
|
void release(CudaStreams streams) {
|
|
univ_lut_mem->release(streams);
|
|
delete univ_lut_mem;
|
|
|
|
biv_lut_mem->release(streams);
|
|
delete biv_lut_mem;
|
|
|
|
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
|
tmp_ct, allocate_gpu_memory);
|
|
delete tmp_ct;
|
|
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
|
|
}
|
|
};
|
|
|
|
template <typename Torus> struct int_count_of_consecutive_bits_buffer {
|
|
int_radix_params params;
|
|
bool allocate_gpu_memory;
|
|
uint32_t counter_num_blocks;
|
|
|
|
int_prepare_count_of_consecutive_bits_buffer<Torus> *prepare_mem = nullptr;
|
|
CudaRadixCiphertextFFI *ct_prepared = nullptr;
|
|
|
|
int_sum_ciphertexts_vec_memory<Torus> *sum_mem = nullptr;
|
|
int_sc_prop_memory<Torus> *propagate_mem = nullptr;
|
|
CudaRadixCiphertextFFI *cts = nullptr;
|
|
|
|
int_count_of_consecutive_bits_buffer(CudaStreams streams,
|
|
const int_radix_params params,
|
|
uint32_t num_radix_blocks,
|
|
uint32_t counter_num_blocks,
|
|
Direction direction, BitValue bit_value,
|
|
const bool allocate_gpu_memory,
|
|
uint64_t &size_tracker) {
|
|
|
|
this->params = params;
|
|
this->allocate_gpu_memory = allocate_gpu_memory;
|
|
this->counter_num_blocks = counter_num_blocks;
|
|
|
|
this->ct_prepared = new CudaRadixCiphertextFFI;
|
|
create_zero_radix_ciphertext_async<Torus>(
|
|
streams.stream(0), streams.gpu_index(0), ct_prepared, num_radix_blocks,
|
|
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
|
|
|
this->prepare_mem = new int_prepare_count_of_consecutive_bits_buffer<Torus>(
|
|
streams, params, num_radix_blocks, direction, bit_value,
|
|
allocate_gpu_memory, size_tracker);
|
|
|
|
this->cts = new CudaRadixCiphertextFFI;
|
|
create_zero_radix_ciphertext_async<Torus>(
|
|
streams.stream(0), streams.gpu_index(0), cts,
|
|
counter_num_blocks * num_radix_blocks, params.big_lwe_dimension,
|
|
size_tracker, allocate_gpu_memory);
|
|
|
|
this->sum_mem = new int_sum_ciphertexts_vec_memory<Torus>(
|
|
streams, params, counter_num_blocks, num_radix_blocks, true,
|
|
allocate_gpu_memory, size_tracker);
|
|
|
|
this->propagate_mem = new int_sc_prop_memory<Torus>(
|
|
streams, params, counter_num_blocks, FLAG_NONE, allocate_gpu_memory,
|
|
size_tracker);
|
|
}
|
|
|
|
void release(CudaStreams streams) {
|
|
|
|
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
|
ct_prepared, allocate_gpu_memory);
|
|
delete ct_prepared;
|
|
ct_prepared = nullptr;
|
|
|
|
prepare_mem->release(streams);
|
|
delete prepare_mem;
|
|
prepare_mem = nullptr;
|
|
|
|
sum_mem->release(streams);
|
|
delete sum_mem;
|
|
sum_mem = nullptr;
|
|
|
|
propagate_mem->release(streams);
|
|
delete propagate_mem;
|
|
propagate_mem = nullptr;
|
|
|
|
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0), cts,
|
|
allocate_gpu_memory);
|
|
delete cts;
|
|
cts = nullptr;
|
|
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
|
|
}
|
|
};
|
|
template <typename Torus> struct int_ilog2_buffer {
|
|
int_radix_params params;
|
|
bool allocate_gpu_memory;
|
|
uint32_t input_num_blocks;
|
|
uint32_t counter_num_blocks;
|
|
uint32_t num_bits_in_ciphertext;
|
|
|
|
int_prepare_count_of_consecutive_bits_buffer<Torus> *prepare_mem;
|
|
int_sum_ciphertexts_vec_memory<Torus> *sum_mem;
|
|
int_fullprop_buffer<Torus> *final_propagate_mem;
|
|
|
|
CudaRadixCiphertextFFI *ct_in_buffer;
|
|
CudaRadixCiphertextFFI *sum_input_cts;
|
|
CudaRadixCiphertextFFI *sum_output_not_propagated;
|
|
CudaRadixCiphertextFFI *message_blocks_not;
|
|
CudaRadixCiphertextFFI *carry_blocks_not;
|
|
CudaRadixCiphertextFFI *rotated_carry_blocks;
|
|
|
|
int_radix_lut<Torus> *lut_message_not;
|
|
int_radix_lut<Torus> *lut_carry_not;
|
|
|
|
int_ilog2_buffer(CudaStreams streams, const int_radix_params params,
|
|
uint32_t input_num_blocks, uint32_t counter_num_blocks,
|
|
uint32_t num_bits_in_ciphertext,
|
|
const bool allocate_gpu_memory, uint64_t &size_tracker) {
|
|
|
|
this->params = params;
|
|
this->allocate_gpu_memory = allocate_gpu_memory;
|
|
this->input_num_blocks = input_num_blocks;
|
|
this->counter_num_blocks = counter_num_blocks;
|
|
this->num_bits_in_ciphertext = num_bits_in_ciphertext;
|
|
|
|
this->ct_in_buffer = new CudaRadixCiphertextFFI;
|
|
create_zero_radix_ciphertext_async<Torus>(
|
|
streams.stream(0), streams.gpu_index(0), this->ct_in_buffer,
|
|
input_num_blocks, params.big_lwe_dimension, size_tracker,
|
|
allocate_gpu_memory);
|
|
|
|
this->prepare_mem = new int_prepare_count_of_consecutive_bits_buffer<Torus>(
|
|
streams, params, input_num_blocks, Leading, Zero, allocate_gpu_memory,
|
|
size_tracker);
|
|
|
|
uint32_t sum_input_total_blocks =
|
|
(input_num_blocks + 1) * counter_num_blocks;
|
|
this->sum_input_cts = new CudaRadixCiphertextFFI;
|
|
create_zero_radix_ciphertext_async<Torus>(
|
|
streams.stream(0), streams.gpu_index(0), this->sum_input_cts,
|
|
sum_input_total_blocks, params.big_lwe_dimension, size_tracker,
|
|
allocate_gpu_memory);
|
|
|
|
this->sum_mem = new int_sum_ciphertexts_vec_memory<Torus>(
|
|
streams, params, counter_num_blocks, input_num_blocks + 1, false,
|
|
allocate_gpu_memory, size_tracker);
|
|
|
|
this->sum_output_not_propagated = new CudaRadixCiphertextFFI;
|
|
create_zero_radix_ciphertext_async<Torus>(
|
|
streams.stream(0), streams.gpu_index(0),
|
|
this->sum_output_not_propagated, counter_num_blocks,
|
|
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
|
|
|
this->lut_message_not =
|
|
new int_radix_lut<Torus>(streams, params, 1, counter_num_blocks,
|
|
allocate_gpu_memory, size_tracker);
|
|
std::function<Torus(Torus)> lut_message_lambda =
|
|
[this](uint64_t x) -> uint64_t {
|
|
uint64_t message = x % this->params.message_modulus;
|
|
return (~message) % this->params.message_modulus;
|
|
};
|
|
generate_device_accumulator(streams.stream(0), streams.gpu_index(0),
|
|
this->lut_message_not->get_lut(0, 0),
|
|
this->lut_message_not->get_degree(0),
|
|
this->lut_message_not->get_max_degree(0),
|
|
params.glwe_dimension, params.polynomial_size,
|
|
params.message_modulus, params.carry_modulus,
|
|
lut_message_lambda, allocate_gpu_memory);
|
|
auto active_streams =
|
|
streams.active_gpu_subset(counter_num_blocks, params.pbs_type);
|
|
lut_message_not->broadcast_lut(active_streams);
|
|
|
|
this->lut_carry_not =
|
|
new int_radix_lut<Torus>(streams, params, 1, counter_num_blocks,
|
|
allocate_gpu_memory, size_tracker);
|
|
std::function<Torus(Torus)> lut_carry_lambda =
|
|
[this](uint64_t x) -> uint64_t {
|
|
uint64_t carry = x / this->params.message_modulus;
|
|
return (~carry) % this->params.message_modulus;
|
|
};
|
|
generate_device_accumulator(
|
|
streams.stream(0), streams.gpu_index(0),
|
|
this->lut_carry_not->get_lut(0, 0), this->lut_carry_not->get_degree(0),
|
|
this->lut_carry_not->get_max_degree(0), params.glwe_dimension,
|
|
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
|
lut_carry_lambda, allocate_gpu_memory);
|
|
lut_carry_not->broadcast_lut(active_streams);
|
|
|
|
this->message_blocks_not = new CudaRadixCiphertextFFI;
|
|
create_zero_radix_ciphertext_async<Torus>(
|
|
streams.stream(0), streams.gpu_index(0), this->message_blocks_not,
|
|
counter_num_blocks, params.big_lwe_dimension, size_tracker,
|
|
allocate_gpu_memory);
|
|
|
|
this->carry_blocks_not = new CudaRadixCiphertextFFI;
|
|
create_zero_radix_ciphertext_async<Torus>(
|
|
streams.stream(0), streams.gpu_index(0), this->carry_blocks_not,
|
|
counter_num_blocks, params.big_lwe_dimension, size_tracker,
|
|
allocate_gpu_memory);
|
|
|
|
this->rotated_carry_blocks = new CudaRadixCiphertextFFI;
|
|
create_zero_radix_ciphertext_async<Torus>(
|
|
streams.stream(0), streams.gpu_index(0), this->rotated_carry_blocks,
|
|
counter_num_blocks, params.big_lwe_dimension, size_tracker,
|
|
allocate_gpu_memory);
|
|
|
|
this->final_propagate_mem = new int_fullprop_buffer<Torus>(
|
|
streams, params, allocate_gpu_memory, size_tracker);
|
|
}
|
|
|
|
void release(CudaStreams streams) {
|
|
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
|
this->ct_in_buffer, allocate_gpu_memory);
|
|
delete this->ct_in_buffer;
|
|
this->ct_in_buffer = nullptr;
|
|
|
|
this->prepare_mem->release(streams);
|
|
delete this->prepare_mem;
|
|
this->prepare_mem = nullptr;
|
|
|
|
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
|
this->sum_input_cts, allocate_gpu_memory);
|
|
delete this->sum_input_cts;
|
|
this->sum_input_cts = nullptr;
|
|
|
|
this->sum_mem->release(streams);
|
|
delete this->sum_mem;
|
|
this->sum_mem = nullptr;
|
|
|
|
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
|
this->sum_output_not_propagated,
|
|
allocate_gpu_memory);
|
|
delete this->sum_output_not_propagated;
|
|
this->sum_output_not_propagated = nullptr;
|
|
|
|
this->lut_message_not->release(streams);
|
|
delete this->lut_message_not;
|
|
this->lut_message_not = nullptr;
|
|
|
|
this->lut_carry_not->release(streams);
|
|
delete this->lut_carry_not;
|
|
this->lut_carry_not = nullptr;
|
|
|
|
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
|
this->message_blocks_not,
|
|
allocate_gpu_memory);
|
|
delete this->message_blocks_not;
|
|
this->message_blocks_not = nullptr;
|
|
|
|
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
|
this->carry_blocks_not, allocate_gpu_memory);
|
|
delete this->carry_blocks_not;
|
|
this->carry_blocks_not = nullptr;
|
|
|
|
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
|
this->rotated_carry_blocks,
|
|
allocate_gpu_memory);
|
|
delete this->rotated_carry_blocks;
|
|
this->rotated_carry_blocks = nullptr;
|
|
|
|
this->final_propagate_mem->release(streams);
|
|
delete this->final_propagate_mem;
|
|
this->final_propagate_mem = nullptr;
|
|
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
|
|
}
|
|
};
|