Mirror of https://github.com/zama-ai/tfhe-rs.git, synced 2026-01-11 15:48:20 -05:00.

Compare commits: al/backup...tm/array-f (1 commit, SHA1 92163c2646)

@@ -54,7 +54,8 @@ fn main() {
     }
 
     if cfg!(feature = "debug") {
-        cmake_config.define("CMAKE_BUILD_TYPE", "Debug");
+        cmake_config.define("CMAKE_BUILD_TYPE", "DEBUG");
+        cmake_config.define("CMAKE_CXX_FLAGS", "-Wuninitialized -O0");
     }
 
     // Build the CMake project
@@ -26,7 +26,6 @@ inline void cuda_error(cudaError_t code, const char *file, int line) {
     std::abort(); \
   }
 
-uint32_t cuda_get_device();
 void cuda_set_device(uint32_t gpu_index);
 
 cudaEvent_t cuda_create_event(uint32_t gpu_index);
@@ -124,7 +124,7 @@ template <typename Torus> struct int_decompression {
         encryption_params.carry_modulus, decompression_rescale_f,
         gpu_memory_allocated);
 
-    decompression_rescale_lut->broadcast_lut(streams, gpu_indexes);
+    decompression_rescale_lut->broadcast_lut(streams, gpu_indexes, 0);
   }
   void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                uint32_t gpu_count) {
@@ -20,36 +20,6 @@ public:
   static const uint64_t UNKNOWN = std::numeric_limits<uint64_t>::max();
 };
 
-#ifdef DEBUG
-#define CHECK_NOISE_LEVEL(noise_level_expr, msg_mod, carry_mod)               \
-  do {                                                                        \
-    if ((msg_mod) == 2 && (carry_mod) == 2) {                                 \
-      constexpr int max_noise_level = 3;                                      \
-      if ((noise_level_expr) > max_noise_level)                               \
-        PANIC("Cuda error: noise exceeds maximum authorized value for 1_1 "   \
-              "parameters");                                                  \
-    } else if ((msg_mod) == 4 && (carry_mod) == 4) {                          \
-      constexpr int max_noise_level = 5;                                      \
-      if ((noise_level_expr) > max_noise_level)                               \
-        PANIC("Cuda error: noise exceeds maximum authorized value for 2_2 "   \
-              "parameters");                                                  \
-    } else if ((msg_mod) == 8 && (carry_mod) == 8) {                          \
-      constexpr int max_noise_level = 9;                                      \
-      if ((noise_level_expr) > max_noise_level)                               \
-        PANIC("Cuda error: noise exceeds maximum authorized value for 3_3 "   \
-              "parameters");                                                  \
-    } else if ((msg_mod) == 0 && (carry_mod) == 0) {                          \
-      break;                                                                  \
-    } else {                                                                  \
-      PANIC("Invalid message modulus or carry modulus")                       \
-    }                                                                         \
-  } while (0)
-#else
-#define CHECK_NOISE_LEVEL(noise_level_expr, message_modulus, carry_modulus)   \
-  do {                                                                        \
-  } while (0)
-#endif
-
 template <typename Torus>
 __global__ void radix_blocks_rotate_right(Torus *dst, Torus *src,
                                           uint32_t value, uint32_t blocks_count,
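
Note on the removed macro: its hard-coded thresholds (3, 5, 9) are not arbitrary; each equals (msg_mod * carry_mod - 1) / (msg_mod - 1), the same quantity the sum-ciphertexts code later in this diff computes as chunk_size. A minimal sketch of that check, not part of the diff:

// Sketch (not from the diff): the removed thresholds match the formula.
constexpr int max_noise_level(int msg_mod, int carry_mod) {
  return (msg_mod * carry_mod - 1) / (msg_mod - 1);
}
static_assert(max_noise_level(2, 2) == 3, "1_1 parameters");
static_assert(max_noise_level(4, 4) == 5, "2_2 parameters");
static_assert(max_noise_level(8, 8) == 9, "3_3 parameters");
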
@@ -113,12 +83,8 @@ void generate_many_lut_device_accumulator(
     uint32_t message_modulus, uint32_t carry_modulus,
     std::vector<std::function<Torus(Torus)>> &f, bool gpu_memory_allocated);
 
-template <typename Torus> struct radix_columns {
-  std::vector<std::vector<Torus>> columns;
+struct radix_columns {
   std::vector<uint32_t> columns_counter;
-  std::vector<std::vector<Torus>> new_columns;
   std::vector<uint32_t> new_columns_counter;
 
   uint32_t num_blocks;
   uint32_t num_radix_in_vec;
   uint32_t chunk_size;
@@ -128,21 +94,14 @@ template <typename Torus> struct radix_columns {
       : num_blocks(num_blocks), num_radix_in_vec(num_radix_in_vec),
         chunk_size(chunk_size) {
     needs_processing = false;
-    columns.resize(num_blocks);
     columns_counter.resize(num_blocks, 0);
-    new_columns.resize(num_blocks);
     new_columns_counter.resize(num_blocks, 0);
-    for (uint32_t i = 0; i < num_blocks; ++i) {
-      new_columns[i].resize(num_radix_in_vec);
-    }
     for (uint32_t i = 0; i < num_radix_in_vec; ++i) {
       for (uint32_t j = 0; j < num_blocks; ++j) {
-        if (input_degrees[i * num_blocks + j]) {
-          columns[j].push_back(i * num_blocks + j);
-          columns_counter[j]++;
-        }
+        if (input_degrees[i * num_blocks + j])
+          columns_counter[j] += 1;
       }
     }
 
     for (uint32_t i = 0; i < num_blocks; ++i) {
       if (columns_counter[i] > chunk_size) {
         needs_processing = true;
@@ -151,96 +110,70 @@ template <typename Torus> struct radix_columns {
       }
     }
   }
 
-  void next_accumulation(Torus *h_indexes_in, Torus *h_indexes_out,
-                         Torus *h_lut_indexes, uint32_t &total_ciphertexts,
+  void next_accumulation(uint32_t &total_ciphertexts,
                          uint32_t &message_ciphertexts,
                          bool &needs_processing) {
     message_ciphertexts = 0;
     total_ciphertexts = 0;
     needs_processing = false;
-    uint32_t pbs_count = 0;
-    for (uint32_t c_id = 0; c_id < num_blocks; ++c_id) {
-      const uint32_t column_len = columns_counter[c_id];
-      new_columns_counter[c_id] = 0;
-      uint32_t ct_count = 0;
-      // add message cts into new columns
-      for (uint32_t i = 0; i + chunk_size <= column_len; i += chunk_size) {
-        const Torus in_index = columns[c_id][i];
-        new_columns[c_id][ct_count] = in_index;
-        if (h_indexes_in != nullptr)
-          h_indexes_in[pbs_count] = in_index;
-        if (h_indexes_out != nullptr)
-          h_indexes_out[pbs_count] = in_index;
-        if (h_lut_indexes != nullptr)
-          h_lut_indexes[pbs_count] = 0;
-        ++pbs_count;
-        ++ct_count;
-        ++message_ciphertexts;
-      }
-
-      // add carry cts into new columns
-      if (c_id > 0) {
-        const uint32_t prev_c_id = c_id - 1;
-        const uint32_t prev_column_len = columns_counter[prev_c_id];
-        for (uint32_t i = 0; i + chunk_size <= prev_column_len;
-             i += chunk_size) {
-          const Torus in_index = columns[prev_c_id][i];
-          const Torus out_index = columns[prev_c_id][i + 1];
-          new_columns[c_id][ct_count] = out_index;
-          if (h_indexes_in != nullptr)
-            h_indexes_in[pbs_count] = in_index;
-          if (h_indexes_out != nullptr)
-            h_indexes_out[pbs_count] = out_index;
-          if (h_lut_indexes != nullptr)
-            h_lut_indexes[pbs_count] = 1;
-          ++pbs_count;
-          ++ct_count;
-        }
-      }
-
-      // add remaining cts into new columns
-      const uint32_t start_index = column_len - column_len % chunk_size;
-      for (uint32_t i = start_index; i < column_len; ++i) {
-        new_columns[c_id][ct_count] = columns[c_id][i];
-        ++ct_count;
-      }
-      new_columns_counter[c_id] = ct_count;
-      if (ct_count > chunk_size) {
-        needs_processing = true;
-      }
-    }
-
-    total_ciphertexts = pbs_count;
-    swap(columns, new_columns);
-    swap(columns_counter, new_columns_counter);
+    for (int i = num_blocks - 1; i > 0; --i) {
+      uint32_t cur_count = columns_counter[i];
+      uint32_t prev_count = columns_counter[i - 1];
+      uint32_t new_count = 0;
+      // accumulated_blocks from current columns
+      new_count += cur_count / chunk_size;
+      // all accumulated message blocks needs pbs
+      message_ciphertexts += new_count;
+      // carry blocks from previous columns
+      new_count += prev_count / chunk_size;
+      // both carry and message blocks that needs pbs
+      total_ciphertexts += new_count;
+      // now add remaining non accumulated blocks that does not require pbs
+      new_count += cur_count % chunk_size;
+      columns_counter[i] = new_count;
+      if (new_count > chunk_size)
+        needs_processing = true;
+    }
+
+    // now do it for 0th block
+    uint32_t new_count = columns_counter[0] / chunk_size;
+    message_ciphertexts += new_count;
+    total_ciphertexts += new_count;
+    new_count += columns_counter[0] % chunk_size;
+    columns_counter[0] = new_count;
+
+    if (new_count > chunk_size) {
+      needs_processing = true;
+    }
   }
 };
 
 inline void calculate_final_degrees(uint64_t *const out_degrees,
                                     const uint64_t *const input_degrees,
-                                    size_t num_blocks, size_t num_radix_in_vec,
-                                    size_t chunk_size,
+                                    uint32_t num_blocks,
+                                    uint32_t num_radix_in_vec,
+                                    uint32_t chunk_size,
                                     uint64_t message_modulus) {
 
   auto get_degree = [message_modulus](uint64_t degree) -> uint64_t {
     return std::min(message_modulus - 1, degree);
   };
   std::vector<std::queue<uint64_t>> columns(num_blocks);
-  for (size_t i = 0; i < num_radix_in_vec; ++i) {
-    for (size_t j = 0; j < num_blocks; ++j) {
+  for (uint32_t i = 0; i < num_radix_in_vec; ++i) {
+    for (uint32_t j = 0; j < num_blocks; ++j) {
       if (input_degrees[i * num_blocks + j])
         columns[j].push(input_degrees[i * num_blocks + j]);
     }
   }
 
-  for (size_t i = 0; i < num_blocks; ++i) {
+  for (uint32_t i = 0; i < num_blocks; ++i) {
     auto &col = columns[i];
     while (col.size() > 1) {
       uint32_t cur_degree = 0;
-      size_t mn = std::min(chunk_size, col.size());
+      uint32_t mn = std::min(chunk_size, (uint32_t)col.size());
       for (int j = 0; j < mn; ++j) {
         cur_degree += col.front();
         col.pop();
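
The new count-only next_accumulation replaces the host-side index bookkeeping with pure arithmetic per column (the indexes themselves now come from the GPU kernels added at the end of this diff). A small worked sketch of that arithmetic, with hypothetical counts, not code from the diff:

// Sketch (not from the diff): one column's bookkeeping in next_accumulation.
uint32_t chunk_size = 5, cur_count = 12, prev_count = 7;
uint32_t new_count = cur_count / chunk_size;  // 2 accumulated message blocks
uint32_t message_ciphertexts = new_count;     // each needs a PBS
new_count += prev_count / chunk_size;         // +1 carry block from the column to the left
uint32_t total_ciphertexts = new_count;       // 3 PBS queued in total
new_count += cur_count % chunk_size;          // +2 leftover blocks, no PBS needed
// new_count == 5 == chunk_size: this column fits in one chunk, so it does
// not by itself force another round (needs_processing requires > chunk_size).
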
@@ -723,10 +656,8 @@ template <typename Torus> struct int_radix_lut {
   void set_lwe_indexes(cudaStream_t stream, uint32_t gpu_index,
                        Torus *h_indexes_in, Torus *h_indexes_out) {
 
-    if (h_indexes_in != h_lwe_indexes_in)
-      memcpy(h_lwe_indexes_in, h_indexes_in, num_blocks * sizeof(Torus));
-    if (h_indexes_out != h_lwe_indexes_out)
-      memcpy(h_lwe_indexes_out, h_indexes_out, num_blocks * sizeof(Torus));
+    memcpy(h_lwe_indexes_in, h_indexes_in, num_blocks * sizeof(Torus));
+    memcpy(h_lwe_indexes_out, h_indexes_out, num_blocks * sizeof(Torus));
 
     cuda_memcpy_with_size_tracking_async_to_gpu(
         lwe_indexes_in, h_lwe_indexes_in, num_blocks * sizeof(Torus), stream,
@@ -738,18 +669,17 @@ template <typename Torus> struct int_radix_lut {
     using_trivial_lwe_indexes = false;
   }
 
-  // Broadcast luts from device gpu_indexes[0] to all active gpus
-  void broadcast_lut(cudaStream_t const *streams, uint32_t const *gpu_indexes) {
-    int active_device = cuda_get_device();
-
+  // Broadcast luts from gpu src_gpu_idx to all active gpus
+  void broadcast_lut(cudaStream_t const *streams, uint32_t const *gpu_indexes,
+                     uint32_t src_gpu_idx) {
     Torus lut_size = (params.glwe_dimension + 1) * params.polynomial_size;
 
-    auto src_lut = lut_vec[0];
-    auto src_lut_indexes = lut_indexes_vec[0];
+    auto src_lut = lut_vec[src_gpu_idx];
+    auto src_lut_indexes = lut_indexes_vec[src_gpu_idx];
 
     cuda_synchronize_stream(streams[0], gpu_indexes[0]);
     for (uint i = 0; i < active_gpu_count; i++) {
-      if (gpu_indexes[i] != gpu_indexes[0]) {
+      if (i != src_gpu_idx) {
         auto dst_lut = lut_vec[i];
         auto dst_lut_indexes = lut_indexes_vec[i];
         cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
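
For reference, a hypothetical call site of the updated signature; every caller touched by this diff passes source index 0:

// Sketch (hypothetical call site): replicate the LUT held on the GPU at
// gpu_indexes[0] to every other active GPU.
lut->broadcast_lut(streams, gpu_indexes, /*src_gpu_idx=*/0);
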
@@ -760,9 +690,7 @@ template <typename Torus> struct int_radix_lut {
             streams[i], gpu_indexes[i], gpu_memory_allocated);
       }
     }
-    // Ensure the device set at the end of this method is the same as it was set
-    // at the beginning
-    cuda_set_device(active_device);
+    cuda_set_device(gpu_indexes[0]);
   }
 
   void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -1075,7 +1003,7 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
         lut->get_lut_indexes(0, 0), h_lut_indexes,
         num_radix_blocks * bits_per_block * sizeof(Torus), streams[0],
         gpu_indexes[0], allocate_gpu_memory);
-    lut->broadcast_lut(streams, gpu_indexes);
+    lut->broadcast_lut(streams, gpu_indexes, 0);
 
     /**
      * the input indexes should take the first bits_per_block PBS to target
@@ -1242,7 +1170,7 @@ template <typename Torus> struct int_shift_and_rotate_buffer {
         mux_lut->get_degree(0), mux_lut->get_max_degree(0),
         params.glwe_dimension, params.polynomial_size, params.message_modulus,
         params.carry_modulus, mux_lut_f, gpu_memory_allocated);
-    mux_lut->broadcast_lut(streams, gpu_indexes);
+    mux_lut->broadcast_lut(streams, gpu_indexes, 0);
 
     auto cleaning_lut_f = [params](Torus x) -> Torus {
       return x % params.message_modulus;
@@ -1252,7 +1180,7 @@ template <typename Torus> struct int_shift_and_rotate_buffer {
         cleaning_lut->get_degree(0), cleaning_lut->get_max_degree(0),
         params.glwe_dimension, params.polynomial_size, params.message_modulus,
         params.carry_modulus, cleaning_lut_f, gpu_memory_allocated);
-    cleaning_lut->broadcast_lut(streams, gpu_indexes);
+    cleaning_lut->broadcast_lut(streams, gpu_indexes, 0);
   }
 
   void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -1342,7 +1270,7 @@ template <typename Torus> struct int_fullprop_buffer {
         lwe_indexes, h_lwe_indexes, lwe_indexes_size, streams[0],
         gpu_indexes[0], allocate_gpu_memory);
 
-    lut->broadcast_lut(streams, gpu_indexes);
+    lut->broadcast_lut(streams, gpu_indexes, 0);
 
     tmp_small_lwe_vector = new CudaRadixCiphertextFFI;
     create_zero_radix_ciphertext_async<Torus>(
@@ -1477,9 +1405,9 @@ template <typename Torus> struct int_overflowing_sub_memory {
         glwe_dimension, polynomial_size, message_modulus, carry_modulus,
         f_message_acc, gpu_memory_allocated);
 
-    luts_array->broadcast_lut(streams, gpu_indexes);
-    luts_borrow_propagation_sum->broadcast_lut(streams, gpu_indexes);
-    message_acc->broadcast_lut(streams, gpu_indexes);
+    luts_array->broadcast_lut(streams, gpu_indexes, 0);
+    luts_borrow_propagation_sum->broadcast_lut(streams, gpu_indexes, 0);
+    message_acc->broadcast_lut(streams, gpu_indexes, 0);
   }
 
   void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -1526,6 +1454,7 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
   // lookup table for extracting message and carry
   int_radix_lut<Torus> *luts_message_carry;
 
   bool mem_reuse = false;
+  bool allocated_luts_message_carry;
 
   void setup_index_buffers(cudaStream_t const *streams,
@@ -1580,23 +1509,24 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
                           const uint64_t *const degrees) {
     uint32_t message_modulus = params.message_modulus;
     bool _needs_processing = false;
-    radix_columns<Torus> current_columns(degrees, num_blocks_in_radix,
-                                         num_radix_in_vec, chunk_size,
-                                         _needs_processing);
+    radix_columns current_columns(degrees, num_blocks_in_radix,
+                                  num_radix_in_vec, chunk_size,
+                                  _needs_processing);
     uint32_t total_ciphertexts = 0;
     uint32_t total_messages = 0;
 
-    current_columns.next_accumulation(nullptr, nullptr, nullptr,
-                                      total_ciphertexts, total_messages,
+    current_columns.next_accumulation(total_ciphertexts, total_messages,
                                       _needs_processing);
 
-    uint32_t pbs_count = std::max(total_ciphertexts, 2 * num_blocks_in_radix);
-    if (total_ciphertexts > 0 || reduce_degrees_for_single_carry_propagation) {
-      uint64_t size_tracker = 0;
-      luts_message_carry =
-          new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 2,
-                                   pbs_count, true, size_tracker);
-      allocated_luts_message_carry = true;
+    if (!mem_reuse) {
+      uint32_t pbs_count = std::max(total_ciphertexts, 2 * num_blocks_in_radix);
+      if (total_ciphertexts > 0 ||
+          reduce_degrees_for_single_carry_propagation) {
+        uint64_t size_tracker = 0;
+        luts_message_carry =
+            new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 2,
+                                     pbs_count, true, size_tracker);
+        allocated_luts_message_carry = true;
+      }
     }
     if (allocated_luts_message_carry) {
       auto message_acc = luts_message_carry->get_lut(0, 0);
@@ -1623,7 +1553,7 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
           luts_message_carry->get_max_degree(1), params.glwe_dimension,
           params.polynomial_size, message_modulus, params.carry_modulus,
           lut_f_carry, gpu_memory_allocated);
-      luts_message_carry->broadcast_lut(streams, gpu_indexes);
+      luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
     }
   }
   int_sum_ciphertexts_vec_memory(
@@ -1633,6 +1563,7 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
       bool reduce_degrees_for_single_carry_propagation,
       bool allocate_gpu_memory, uint64_t &size_tracker) {
     this->params = params;
+    this->mem_reuse = false;
     this->max_total_blocks_in_vec = num_blocks_in_radix * max_num_radix_in_vec;
     this->num_blocks_in_radix = num_blocks_in_radix;
     this->max_num_radix_in_vec = max_num_radix_in_vec;
@@ -1666,6 +1597,32 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
         params.small_lwe_dimension, size_tracker, allocate_gpu_memory);
   }
 
+  int_sum_ciphertexts_vec_memory(
+      cudaStream_t const *streams, uint32_t const *gpu_indexes,
+      uint32_t gpu_count, int_radix_params params, uint32_t num_blocks_in_radix,
+      uint32_t max_num_radix_in_vec, CudaRadixCiphertextFFI *current_blocks,
+      CudaRadixCiphertextFFI *small_lwe_vector,
+      int_radix_lut<Torus> *reused_lut,
+      bool reduce_degrees_for_single_carry_propagation,
+      bool allocate_gpu_memory, uint64_t &size_tracker) {
+    this->mem_reuse = true;
+    this->params = params;
+    this->max_total_blocks_in_vec = num_blocks_in_radix * max_num_radix_in_vec;
+    this->num_blocks_in_radix = num_blocks_in_radix;
+    this->max_num_radix_in_vec = max_num_radix_in_vec;
+    this->gpu_memory_allocated = allocate_gpu_memory;
+    this->chunk_size = (params.message_modulus * params.carry_modulus - 1) /
+                       (params.message_modulus - 1);
+    this->allocated_luts_message_carry = true;
+    this->reduce_degrees_for_single_carry_propagation =
+        reduce_degrees_for_single_carry_propagation;
+
+    this->current_blocks = current_blocks;
+    this->small_lwe_vector = small_lwe_vector;
+    this->luts_message_carry = reused_lut;
+    setup_index_buffers(streams, gpu_indexes, size_tracker);
+  }
+
   void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                uint32_t gpu_count) {
     cuda_drop_with_size_tracking_async(d_degrees, streams[0], gpu_indexes[0],
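
The chunk_size expression in the new constructor bounds how many blocks can be summed before the message/carry space saturates. A worked sketch with assumed 2_2 parameters, not code from the diff:

// Sketch (not from the diff): for 2_2 parameters, message_modulus = 4 and
// carry_modulus = 4, so a block saturates at degree 15 and each input block
// contributes at most degree message_modulus - 1 = 3.
uint32_t message_modulus = 4, carry_modulus = 4;
uint32_t chunk_size = (message_modulus * carry_modulus - 1) /
                      (message_modulus - 1);  // (16 - 1) / 3 == 5
// At most 5 blocks can be accumulated before a message/carry extraction PBS.
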
@@ -1684,16 +1641,18 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
     cuda_drop_with_size_tracking_async(d_new_columns, streams[0],
                                        gpu_indexes[0], gpu_memory_allocated);
 
-    release_radix_ciphertext_async(streams[0], gpu_indexes[0], current_blocks,
-                                   gpu_memory_allocated);
-    release_radix_ciphertext_async(streams[0], gpu_indexes[0], small_lwe_vector,
-                                   gpu_memory_allocated);
-    if (allocated_luts_message_carry) {
-      luts_message_carry->release(streams, gpu_indexes, gpu_count);
-      delete luts_message_carry;
+    if (!mem_reuse) {
+      release_radix_ciphertext_async(streams[0], gpu_indexes[0], current_blocks,
+                                     gpu_memory_allocated);
+      release_radix_ciphertext_async(streams[0], gpu_indexes[0],
+                                     small_lwe_vector, gpu_memory_allocated);
+      if (allocated_luts_message_carry) {
+        luts_message_carry->release(streams, gpu_indexes, gpu_count);
+        delete luts_message_carry;
+      }
+      delete current_blocks;
+      delete small_lwe_vector;
     }
-    delete current_blocks;
-    delete small_lwe_vector;
   }
 };
@@ -1745,7 +1704,7 @@ template <typename Torus> struct int_seq_group_prop_memory {
         seq_lut_indexes, h_seq_lut_indexes, num_seq_luts * sizeof(Torus),
         streams[0], gpu_indexes[0], allocate_gpu_memory);
 
-    lut_sequential_algorithm->broadcast_lut(streams, gpu_indexes);
+    lut_sequential_algorithm->broadcast_lut(streams, gpu_indexes, 0);
     free(h_seq_lut_indexes);
   };
   void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -1801,7 +1760,7 @@ template <typename Torus> struct int_hs_group_prop_memory {
         glwe_dimension, polynomial_size, message_modulus, carry_modulus,
         f_lut_hillis_steele, gpu_memory_allocated);
 
-    lut_hillis_steele->broadcast_lut(streams, gpu_indexes);
+    lut_hillis_steele->broadcast_lut(streams, gpu_indexes, 0);
   };
   void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                uint32_t gpu_count) {
@@ -1978,7 +1937,7 @@ template <typename Torus> struct int_shifted_blocks_and_states_memory {
         gpu_indexes[0], allocate_gpu_memory);
     // Do I need to do something else for the multi-gpu?
 
-    luts_array_first_step->broadcast_lut(streams, gpu_indexes);
+    luts_array_first_step->broadcast_lut(streams, gpu_indexes, 0);
   };
   void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                uint32_t gpu_count) {
@@ -2239,7 +2198,7 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
         scalar_array_cum_sum, h_scalar_array_cum_sum,
         num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0],
         allocate_gpu_memory);
-    luts_array_second_step->broadcast_lut(streams, gpu_indexes);
+    luts_array_second_step->broadcast_lut(streams, gpu_indexes, 0);
 
     if (use_sequential_algorithm_to_resolve_group_carries) {
@@ -2265,7 +2224,7 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
         lut_indexes, new_lut_indexes, new_num_blocks * sizeof(Torus),
         streams[0], gpu_indexes[0], gpu_memory_allocated);
 
-    luts_array_second_step->broadcast_lut(streams, gpu_indexes);
+    luts_array_second_step->broadcast_lut(streams, gpu_indexes, 0);
 
     cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
         scalar_array_cum_sum, new_scalars, new_num_blocks * sizeof(Torus),
@@ -2371,7 +2330,7 @@ template <typename Torus> struct int_sc_prop_memory {
         message_modulus, carry_modulus, f_message_extract,
         gpu_memory_allocated);
 
-    lut_message_extract->broadcast_lut(streams, gpu_indexes);
+    lut_message_extract->broadcast_lut(streams, gpu_indexes, 0);
 
     // This store a single block that with be used to store the overflow or
     // carry results
@@ -2430,7 +2389,7 @@ template <typename Torus> struct int_sc_prop_memory {
         polynomial_size, message_modulus, carry_modulus, f_overflow_fp,
         gpu_memory_allocated);
 
-    lut_overflow_flag_prep->broadcast_lut(streams, gpu_indexes);
+    lut_overflow_flag_prep->broadcast_lut(streams, gpu_indexes, 0);
   }
 
   // For the final cleanup in case of overflow or carry (it seems that I can)
@@ -2473,7 +2432,7 @@ template <typename Torus> struct int_sc_prop_memory {
           (num_radix_blocks + 1) * sizeof(Torus), streams[0], gpu_indexes[0],
           allocate_gpu_memory);
 
-      lut_message_extract->broadcast_lut(streams, gpu_indexes);
+      lut_message_extract->broadcast_lut(streams, gpu_indexes, 0);
     }
     if (requested_flag == outputFlag::FLAG_CARRY) { // Carry case
@@ -2501,7 +2460,7 @@ template <typename Torus> struct int_sc_prop_memory {
           (num_radix_blocks + 1) * sizeof(Torus), streams[0], gpu_indexes[0],
           allocate_gpu_memory);
 
-      lut_message_extract->broadcast_lut(streams, gpu_indexes);
+      lut_message_extract->broadcast_lut(streams, gpu_indexes, 0);
     }
   };
@@ -2698,7 +2657,7 @@ template <typename Torus> struct int_shifted_blocks_and_borrow_states_memory {
         gpu_indexes[0], allocate_gpu_memory);
     // Do I need to do something else for the multi-gpu?
 
-    luts_array_first_step->broadcast_lut(streams, gpu_indexes);
+    luts_array_first_step->broadcast_lut(streams, gpu_indexes, 0);
   };
 
   // needed for the division to update the lut indexes
@@ -2709,7 +2668,7 @@ template <typename Torus> struct int_shifted_blocks_and_borrow_states_memory {
     cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
         lut_indexes, new_lut_indexes, new_num_blocks * sizeof(Torus),
         streams[0], gpu_indexes[0], gpu_memory_allocated);
-    luts_array_first_step->broadcast_lut(streams, gpu_indexes);
+    luts_array_first_step->broadcast_lut(streams, gpu_indexes, 0);
   }
   void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                uint32_t gpu_count) {
@@ -2809,7 +2768,7 @@ template <typename Torus> struct int_borrow_prop_memory {
         message_modulus, carry_modulus, f_message_extract,
         gpu_memory_allocated);
 
-    lut_message_extract->broadcast_lut(streams, gpu_indexes);
+    lut_message_extract->broadcast_lut(streams, gpu_indexes, 0);
 
     if (compute_overflow) {
       lut_borrow_flag = new int_radix_lut<Torus>(
@@ -2826,7 +2785,7 @@ template <typename Torus> struct int_borrow_prop_memory {
           glwe_dimension, polynomial_size, message_modulus, carry_modulus,
           f_borrow_flag, gpu_memory_allocated);
 
-      lut_borrow_flag->broadcast_lut(streams, gpu_indexes);
+      lut_borrow_flag->broadcast_lut(streams, gpu_indexes, 0);
     }
 
     active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
@@ -2993,7 +2952,7 @@ template <typename Torus> struct int_mul_memory {
         zero_out_predicate_lut->get_max_degree(0), params.glwe_dimension,
         params.polynomial_size, params.message_modulus, params.carry_modulus,
         zero_out_predicate_lut_f, gpu_memory_allocated);
-    zero_out_predicate_lut->broadcast_lut(streams, gpu_indexes);
+    zero_out_predicate_lut->broadcast_lut(streams, gpu_indexes, 0);
 
     zero_out_mem = new int_zero_out_if_buffer<Torus>(
         streams, gpu_indexes, gpu_count, params, num_radix_blocks,
@@ -3067,12 +3026,12 @@ template <typename Torus> struct int_mul_memory {
         luts_array->get_lut_indexes(0, lsb_vector_block_count), 1,
         msb_vector_block_count);
 
-    luts_array->broadcast_lut(streams, gpu_indexes);
+    luts_array->broadcast_lut(streams, gpu_indexes, 0);
     // create memory object for sum ciphertexts
 
     sum_ciphertexts_mem = new int_sum_ciphertexts_vec_memory<Torus>(
         streams, gpu_indexes, gpu_count, params, num_radix_blocks,
-        2 * num_radix_blocks, true, allocate_gpu_memory, size_tracker);
+        2 * num_radix_blocks, vector_result_sb, small_lwe_vector, luts_array,
+        true, allocate_gpu_memory, size_tracker);
     uint32_t uses_carry = 0;
     uint32_t requested_flag = outputFlag::FLAG_NONE;
     sc_prop_mem = new int_sc_prop_memory<Torus>(
@@ -3199,7 +3158,7 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
           cur_lut_bivariate->get_max_degree(0), params.glwe_dimension,
           params.polynomial_size, params.message_modulus, params.carry_modulus,
           shift_lut_f, gpu_memory_allocated);
-      cur_lut_bivariate->broadcast_lut(streams, gpu_indexes);
+      cur_lut_bivariate->broadcast_lut(streams, gpu_indexes, 0);
 
       lut_buffers_bivariate.push_back(cur_lut_bivariate);
     }
@@ -3283,7 +3242,7 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
           cur_lut_bivariate->get_max_degree(0), params.glwe_dimension,
           params.polynomial_size, params.message_modulus, params.carry_modulus,
           shift_lut_f, gpu_memory_allocated);
-      cur_lut_bivariate->broadcast_lut(streams, gpu_indexes);
+      cur_lut_bivariate->broadcast_lut(streams, gpu_indexes, 0);
 
       lut_buffers_bivariate.push_back(cur_lut_bivariate);
     }
@@ -3387,7 +3346,7 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
           shift_last_block_lut_univariate->get_max_degree(0),
           params.glwe_dimension, params.polynomial_size, params.message_modulus,
           params.carry_modulus, last_block_lut_f, gpu_memory_allocated);
-      shift_last_block_lut_univariate->broadcast_lut(streams, gpu_indexes);
+      shift_last_block_lut_univariate->broadcast_lut(streams, gpu_indexes, 0);
 
       lut_buffers_univariate.push_back(shift_last_block_lut_univariate);
     }
@@ -3412,7 +3371,7 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
         padding_block_lut_univariate->get_max_degree(0), params.glwe_dimension,
         params.polynomial_size, params.message_modulus, params.carry_modulus,
         padding_block_lut_f, gpu_memory_allocated);
-    padding_block_lut_univariate->broadcast_lut(streams, gpu_indexes);
+    padding_block_lut_univariate->broadcast_lut(streams, gpu_indexes, 0);
 
     lut_buffers_univariate.push_back(padding_block_lut_univariate);
 
@@ -3451,7 +3410,7 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
           shift_blocks_lut_bivariate->get_max_degree(0), params.glwe_dimension,
           params.polynomial_size, params.message_modulus, params.carry_modulus,
           blocks_lut_f, gpu_memory_allocated);
-      shift_blocks_lut_bivariate->broadcast_lut(streams, gpu_indexes);
+      shift_blocks_lut_bivariate->broadcast_lut(streams, gpu_indexes, 0);
 
       lut_buffers_bivariate.push_back(shift_blocks_lut_bivariate);
     }
@@ -3566,8 +3525,8 @@ template <typename Torus> struct int_cmux_buffer {
         2 * num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0],
         allocate_gpu_memory);
 
-    predicate_lut->broadcast_lut(streams, gpu_indexes);
-    message_extract_lut->broadcast_lut(streams, gpu_indexes);
+    predicate_lut->broadcast_lut(streams, gpu_indexes, 0);
+    message_extract_lut->broadcast_lut(streams, gpu_indexes, 0);
   }
 
   void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -3639,7 +3598,7 @@ template <typename Torus> struct int_are_all_block_true_buffer {
         params.glwe_dimension, params.polynomial_size, params.message_modulus,
         params.carry_modulus, is_max_value_f, gpu_memory_allocated);
 
-    is_max_value->broadcast_lut(streams, gpu_indexes);
+    is_max_value->broadcast_lut(streams, gpu_indexes, 0);
   }
 
   void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -3699,7 +3658,7 @@ template <typename Torus> struct int_comparison_eq_buffer {
         params.glwe_dimension, params.polynomial_size, params.message_modulus,
         params.carry_modulus, operator_f, gpu_memory_allocated);
 
-    operator_lut->broadcast_lut(streams, gpu_indexes);
+    operator_lut->broadcast_lut(streams, gpu_indexes, 0);
 
     // f(x) -> x == 0
     Torus total_modulus = params.message_modulus * params.carry_modulus;
@@ -3717,7 +3676,7 @@ template <typename Torus> struct int_comparison_eq_buffer {
         params.glwe_dimension, params.polynomial_size, params.message_modulus,
         params.carry_modulus, is_non_zero_lut_f, gpu_memory_allocated);
 
-    is_non_zero_lut->broadcast_lut(streams, gpu_indexes);
+    is_non_zero_lut->broadcast_lut(streams, gpu_indexes, 0);
 
     // Scalar may have up to num_radix_blocks blocks
    scalar_comparison_luts = new int_radix_lut<Torus>(
@@ -3737,7 +3696,7 @@ template <typename Torus> struct int_comparison_eq_buffer {
           lut_f, gpu_memory_allocated);
     }
 
-    scalar_comparison_luts->broadcast_lut(streams, gpu_indexes);
+    scalar_comparison_luts->broadcast_lut(streams, gpu_indexes, 0);
   }
 
   void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -3811,7 +3770,7 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
         params.polynomial_size, params.message_modulus, params.carry_modulus,
         block_selector_f, gpu_memory_allocated);
 
-    tree_inner_leaf_lut->broadcast_lut(streams, gpu_indexes);
+    tree_inner_leaf_lut->broadcast_lut(streams, gpu_indexes, 0);
   }
 
   void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -4001,7 +3960,7 @@ template <typename Torus> struct int_comparison_buffer {
         params.glwe_dimension, params.polynomial_size, params.message_modulus,
         params.carry_modulus, identity_lut_f, gpu_memory_allocated);
 
-    identity_lut->broadcast_lut(streams, gpu_indexes);
+    identity_lut->broadcast_lut(streams, gpu_indexes, 0);
 
     uint32_t total_modulus = params.message_modulus * params.carry_modulus;
     auto is_zero_f = [total_modulus](Torus x) -> Torus {
@@ -4018,7 +3977,7 @@ template <typename Torus> struct int_comparison_buffer {
         params.glwe_dimension, params.polynomial_size, params.message_modulus,
         params.carry_modulus, is_zero_f, gpu_memory_allocated);
 
-    is_zero_lut->broadcast_lut(streams, gpu_indexes);
+    is_zero_lut->broadcast_lut(streams, gpu_indexes, 0);
 
     switch (op) {
     case COMPARISON_TYPE::MAX:
@@ -4101,7 +4060,7 @@ template <typename Torus> struct int_comparison_buffer {
           params.glwe_dimension, params.polynomial_size, params.message_modulus,
           params.carry_modulus, signed_lut_f, gpu_memory_allocated);
 
-      signed_lut->broadcast_lut(streams, gpu_indexes);
+      signed_lut->broadcast_lut(streams, gpu_indexes, 0);
     }
   }
@@ -4325,7 +4284,7 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
           params.glwe_dimension, params.polynomial_size,
           params.message_modulus, params.carry_modulus, lut_f_masking,
           gpu_memory_allocated);
-      luts[j]->broadcast_lut(streams, gpu_indexes);
+      luts[j]->broadcast_lut(streams, gpu_indexes, 0);
     }
   }
@@ -4352,7 +4311,7 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
           luts[j]->get_degree(0), luts[j]->get_max_degree(0),
           params.glwe_dimension, params.polynomial_size, params.message_modulus,
           params.carry_modulus, lut_f_message_extract, gpu_memory_allocated);
-      luts[j]->broadcast_lut(streams, gpu_indexes);
+      luts[j]->broadcast_lut(streams, gpu_indexes, 0);
     }
 
     // Give name to closures to improve readability
@@ -4388,7 +4347,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
         params.glwe_dimension, params.polynomial_size, params.message_modulus,
         params.carry_modulus, cur_lut_f, params.message_modulus - 2,
         gpu_memory_allocated);
-    zero_out_if_overflow_did_not_happen[0]->broadcast_lut(streams, gpu_indexes);
+    zero_out_if_overflow_did_not_happen[0]->broadcast_lut(streams, gpu_indexes,
+                                                          0);
     generate_device_accumulator_bivariate_with_factor<Torus>(
         streams[0], gpu_indexes[0],
         zero_out_if_overflow_did_not_happen[1]->get_lut(0, 0),
@@ -4397,7 +4357,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
         params.glwe_dimension, params.polynomial_size, params.message_modulus,
         params.carry_modulus, cur_lut_f, params.message_modulus - 1,
         gpu_memory_allocated);
-    zero_out_if_overflow_did_not_happen[1]->broadcast_lut(streams, gpu_indexes);
+    zero_out_if_overflow_did_not_happen[1]->broadcast_lut(streams, gpu_indexes,
+                                                          0);
 
     // create and generate zero_out_if_overflow_happened
     zero_out_if_overflow_happened = new int_radix_lut<Torus> *[2];
@@ -4424,7 +4385,7 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
         params.glwe_dimension, params.polynomial_size, params.message_modulus,
         params.carry_modulus, overflow_happened_f, params.message_modulus - 2,
         gpu_memory_allocated);
-    zero_out_if_overflow_happened[0]->broadcast_lut(streams, gpu_indexes);
+    zero_out_if_overflow_happened[0]->broadcast_lut(streams, gpu_indexes, 0);
     generate_device_accumulator_bivariate_with_factor<Torus>(
         streams[0], gpu_indexes[0],
         zero_out_if_overflow_happened[1]->get_lut(0, 0),
@@ -4433,7 +4394,7 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
         params.glwe_dimension, params.polynomial_size, params.message_modulus,
         params.carry_modulus, overflow_happened_f, params.message_modulus - 1,
         gpu_memory_allocated);
-    zero_out_if_overflow_happened[1]->broadcast_lut(streams, gpu_indexes);
+    zero_out_if_overflow_happened[1]->broadcast_lut(streams, gpu_indexes, 0);
 
     // merge_overflow_flags_luts
     merge_overflow_flags_luts = new int_radix_lut<Torus> *[num_bits_in_message];
@@ -4453,7 +4414,7 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
           merge_overflow_flags_luts[i]->get_max_degree(0),
           params.glwe_dimension, params.polynomial_size, params.message_modulus,
           params.carry_modulus, lut_f_bit, gpu_memory_allocated);
-      merge_overflow_flags_luts[i]->broadcast_lut(streams, gpu_indexes);
+      merge_overflow_flags_luts[i]->broadcast_lut(streams, gpu_indexes, 0);
     }
   }
@@ -4797,7 +4758,7 @@ template <typename Torus> struct int_bitop_buffer {
           lut->get_max_degree(0), params.glwe_dimension,
           params.polynomial_size, params.message_modulus,
           params.carry_modulus, lut_bivariate_f, gpu_memory_allocated);
-      lut->broadcast_lut(streams, gpu_indexes);
+      lut->broadcast_lut(streams, gpu_indexes, 0);
     }
     break;
   default:
@@ -4827,7 +4788,7 @@ template <typename Torus> struct int_bitop_buffer {
           params.polynomial_size, params.message_modulus,
           params.carry_modulus, lut_univariate_scalar_f,
           gpu_memory_allocated);
-      lut->broadcast_lut(streams, gpu_indexes);
+      lut->broadcast_lut(streams, gpu_indexes, 0);
     }
   }
 }
@@ -5108,7 +5069,7 @@ template <typename Torus> struct int_div_rem_memory {
         compare_signed_bits_lut->get_max_degree(0), params.glwe_dimension,
         params.polynomial_size, params.message_modulus, params.carry_modulus,
         f_compare_extracted_signed_bits, gpu_memory_allocated);
-    compare_signed_bits_lut->broadcast_lut(streams, gpu_indexes);
+    compare_signed_bits_lut->broadcast_lut(streams, gpu_indexes, 0);
   }
 }
@@ -232,7 +232,7 @@ template <typename Torus> struct zk_expand_mem {
         num_lwes * sizeof(uint32_t), streams[0], gpu_indexes[0],
         allocate_gpu_memory);
 
-    message_and_carry_extract_luts->broadcast_lut(streams, gpu_indexes);
+    message_and_carry_extract_luts->broadcast_lut(streams, gpu_indexes, 0);
 
     // The expanded LWEs will always be on the casting key format
     tmp_expanded_lwes = (Torus *)cuda_malloc_with_size_tracking_async(
@@ -2,12 +2,6 @@
 #include <cstdint>
 #include <cuda_runtime.h>
 
-uint32_t cuda_get_device() {
-  int device;
-  check_cuda_error(cudaGetDevice(&device));
-  return static_cast<uint32_t>(device);
-}
-
 void cuda_set_device(uint32_t gpu_index) {
   check_cuda_error(cudaSetDevice(gpu_index));
 }
@@ -53,8 +53,7 @@ __host__ void host_integer_abs_kb(
       streams, gpu_indexes, gpu_count, mask, num_bits_in_ciphertext - 1,
       mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key);
   host_addition<Torus>(streams[0], gpu_indexes[0], ct, mask, ct,
-                       ct->num_radix_blocks, mem_ptr->params.message_modulus,
-                       mem_ptr->params.carry_modulus);
+                       ct->num_radix_blocks);
 
   uint32_t requested_flag = outputFlag::FLAG_NONE;
   uint32_t uses_carry = 0;
@@ -84,8 +84,7 @@ __host__ void host_integer_radix_cmux_kb(
       num_radix_blocks, 2 * num_radix_blocks);
 
   host_addition<Torus>(streams[0], gpu_indexes[0], &mem_true, &mem_true,
-                       &mem_false, num_radix_blocks, params.message_modulus,
-                       params.carry_modulus);
+                       &mem_false, num_radix_blocks);
 
   integer_radix_apply_univariate_lookup_table_kb<Torus>(
       streams, gpu_indexes, gpu_count, lwe_array_out, &mem_true, bsks, ksks,
@@ -148,7 +148,7 @@ __host__ void are_all_comparisons_block_true(
       cuda_memcpy_async_to_gpu(is_max_value_lut->get_lut_indexes(0, 0),
                                h_lut_indexes, num_chunks * sizeof(Torus),
                                streams[0], gpu_indexes[0]);
-      is_max_value_lut->broadcast_lut(streams, gpu_indexes);
+      is_max_value_lut->broadcast_lut(streams, gpu_indexes, 0);
     }
     lut = is_max_value_lut;
   }
@@ -167,7 +167,7 @@ __host__ void are_all_comparisons_block_true(
                                is_max_value_lut->h_lut_indexes,
                                is_max_value_lut->num_blocks * sizeof(Torus),
                                streams[0], gpu_indexes[0]);
-      is_max_value_lut->broadcast_lut(streams, gpu_indexes);
+      is_max_value_lut->broadcast_lut(streams, gpu_indexes, 0);
       reset_radix_ciphertext_blocks(lwe_array_out, 1);
       return;
     } else {
@@ -499,7 +499,7 @@ __host__ void tree_sign_reduction(
       streams[0], gpu_indexes[0], last_lut->get_lut(0, 0),
       last_lut->get_degree(0), last_lut->get_max_degree(0), glwe_dimension,
       polynomial_size, message_modulus, carry_modulus, f, true);
-  last_lut->broadcast_lut(streams, gpu_indexes);
+  last_lut->broadcast_lut(streams, gpu_indexes, 0);
 
   // Last leaf
   integer_radix_apply_univariate_lookup_table_kb<Torus>(
||||
@@ -268,11 +268,10 @@ __host__ void host_unsigned_integer_div_rem_kb(
|
||||
// but in that position, interesting_remainder2 always has a 0
|
||||
auto merged_interesting_remainder = interesting_remainder1;
|
||||
|
||||
host_addition<Torus>(
|
||||
streams[0], gpu_indexes[0], merged_interesting_remainder,
|
||||
merged_interesting_remainder, interesting_remainder2,
|
||||
merged_interesting_remainder->num_radix_blocks,
|
||||
radix_params.message_modulus, radix_params.carry_modulus);
|
||||
host_addition<Torus>(streams[0], gpu_indexes[0],
|
||||
merged_interesting_remainder,
|
||||
merged_interesting_remainder, interesting_remainder2,
|
||||
merged_interesting_remainder->num_radix_blocks);
|
||||
|
||||
// after create_clean_version_of_merged_remainder
|
||||
// `merged_interesting_remainder` will be reused as
|
||||
@@ -383,10 +382,9 @@ __host__ void host_unsigned_integer_div_rem_kb(
     cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
   }
 
-  host_addition<Torus>(
-      streams[0], gpu_indexes[0], overflow_sum, subtraction_overflowed,
-      at_least_one_upper_block_is_non_zero, 1, radix_params.message_modulus,
-      radix_params.carry_modulus);
+  host_addition<Torus>(streams[0], gpu_indexes[0], overflow_sum,
+                       subtraction_overflowed,
+                       at_least_one_upper_block_is_non_zero, 1);
 
   auto message_modulus = radix_params.message_modulus;
   int factor = (i) ? message_modulus - 1 : message_modulus - 2;
@@ -436,9 +434,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
     as_radix_ciphertext_slice<Torus>(&quotient_block, quotient, block_of_bit,
                                      block_of_bit + 1);
     host_addition<Torus>(streams[0], gpu_indexes[0], &quotient_block,
-                         &quotient_block, mem_ptr->did_not_overflow, 1,
-                         radix_params.message_modulus,
-                         radix_params.carry_modulus);
+                         &quotient_block, mem_ptr->did_not_overflow, 1);
   };
 
   for (uint j = 0; j < gpu_count; j++) {
@@ -481,9 +477,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
   // Clean the quotient and remainder
   // as even though they have no carries, they are not at nominal noise level
   host_addition<Torus>(streams[0], gpu_indexes[0], remainder, remainder1,
-                       remainder2, remainder1->num_radix_blocks,
-                       radix_params.message_modulus,
-                       radix_params.carry_modulus);
+                       remainder2, remainder1->num_radix_blocks);
 
   for (uint j = 0; j < gpu_count; j++) {
     cuda_synchronize_stream(streams[j], gpu_indexes[j]);
@@ -409,8 +409,7 @@ __host__ void host_pack_bivariate_blocks(
     uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
     Torus const *lwe_indexes_out, CudaRadixCiphertextFFI const *lwe_array_1,
     CudaRadixCiphertextFFI const *lwe_array_2, Torus const *lwe_indexes_in,
-    uint32_t shift, uint32_t num_radix_blocks, uint32_t const message_modulus,
-    uint32_t const carry_modulus) {
+    uint32_t shift, uint32_t num_radix_blocks) {
 
   if (lwe_array_out->lwe_dimension != lwe_array_1->lwe_dimension ||
       lwe_array_out->lwe_dimension != lwe_array_2->lwe_dimension)
@@ -434,15 +433,6 @@ __host__ void host_pack_bivariate_blocks(
       (Torus *)lwe_array_1->ptr, (Torus *)lwe_array_2->ptr, lwe_indexes_in,
       lwe_dimension, shift, num_radix_blocks);
   check_cuda_error(cudaGetLastError());
-
-  for (uint i = 0; i < num_radix_blocks; i++) {
-    lwe_array_out->degrees[i] =
-        lwe_array_1->degrees[i] * shift + lwe_array_2->degrees[i];
-    lwe_array_out->noise_levels[i] =
-        lwe_array_1->noise_levels[i] * shift + lwe_array_2->noise_levels[i];
-    CHECK_NOISE_LEVEL(lwe_array_out->noise_levels[i], message_modulus,
-                      carry_modulus);
-  }
 }
 
 // polynomial_size threads
@@ -607,8 +597,6 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
     auto degrees_index = lut->h_lut_indexes[i];
     lwe_array_out->degrees[i] = lut->degrees[degrees_index];
     lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
-    CHECK_NOISE_LEVEL(lwe_array_out->noise_levels[i], params.message_modulus,
-                      params.carry_modulus);
   }
   POP_RANGE()
 }
@@ -714,8 +702,6 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
     auto degrees_index = lut->h_lut_indexes[i % lut->num_blocks];
     lwe_array_out->degrees[i] = lut->degrees[degrees_index];
     lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
-    CHECK_NOISE_LEVEL(lwe_array_out->noise_levels[i], params.message_modulus,
-                      params.carry_modulus);
   }
   POP_RANGE()
 }
@@ -764,7 +750,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
   host_pack_bivariate_blocks<Torus>(
       streams, gpu_indexes, gpu_count, lwe_array_pbs_in,
       lut->lwe_trivial_indexes, lwe_array_1, lwe_array_2, lut->lwe_indexes_in,
-      shift, num_radix_blocks, params.message_modulus, params.carry_modulus);
+      shift, num_radix_blocks);
   check_cuda_error(cudaGetLastError());
 
   /// For multi GPU execution we create vectors of pointers for inputs and
@@ -832,8 +818,6 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
     auto degrees_index = lut->h_lut_indexes[i];
     lwe_array_out->degrees[i] = lut->degrees[degrees_index];
     lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
-    CHECK_NOISE_LEVEL(lwe_array_out->noise_levels[i], params.message_modulus,
-                      params.carry_modulus);
   }
   POP_RANGE()
 }
@@ -1462,8 +1446,6 @@ void host_full_propagate_inplace(
     auto degrees_index = mem_ptr->lut->h_lut_indexes[0];
     input_blocks->degrees[i] = mem_ptr->lut->degrees[degrees_index];
     input_blocks->noise_levels[i] = NoiseLevel::NOMINAL;
-    CHECK_NOISE_LEVEL(input_blocks->noise_levels[i], params.message_modulus,
-                      params.carry_modulus);
 
     if (i < num_blocks - 1) {
       CudaRadixCiphertextFFI next_input_block;
@@ -1474,8 +1456,7 @@ void host_full_propagate_inplace(
           mem_ptr->tmp_big_lwe_vector, 1, 2);
 
       host_addition<Torus>(streams[0], gpu_indexes[0], &next_input_block,
-                           &next_input_block, &second_input, 1,
-                           params.message_modulus, params.carry_modulus);
+                           &next_input_block, &second_input, 1);
     }
   }
 }
@@ -1657,7 +1638,7 @@ __host__ void reduce_signs(
       streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
       lut->get_max_degree(0), glwe_dimension, polynomial_size,
       message_modulus, carry_modulus, reduce_two_orderings_function, true);
-  lut->broadcast_lut(streams, gpu_indexes);
+  lut->broadcast_lut(streams, gpu_indexes, 0);
 
   while (num_sign_blocks > 2) {
     pack_blocks<Torus>(streams[0], gpu_indexes[0], signs_b, signs_a,
@@ -1688,7 +1669,7 @@ __host__ void reduce_signs(
         streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
         lut->get_max_degree(0), glwe_dimension, polynomial_size,
         message_modulus, carry_modulus, final_lut_f, true);
-    lut->broadcast_lut(streams, gpu_indexes);
+    lut->broadcast_lut(streams, gpu_indexes, 0);
 
     pack_blocks<Torus>(streams[0], gpu_indexes[0], signs_b, signs_a,
                        num_sign_blocks, message_modulus);
@@ -1708,7 +1689,7 @@ __host__ void reduce_signs(
         streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
         lut->get_max_degree(0), glwe_dimension, polynomial_size,
         message_modulus, carry_modulus, final_lut_f, true);
-    lut->broadcast_lut(streams, gpu_indexes);
+    lut->broadcast_lut(streams, gpu_indexes, 0);
 
     integer_radix_apply_univariate_lookup_table_kb<Torus>(
         streams, gpu_indexes, gpu_count, signs_array_out, signs_a, bsks, ksks,
@@ -1734,7 +1715,7 @@ uint64_t scratch_cuda_apply_univariate_lut_kb(
       (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus),
       streams[0], gpu_indexes[0], allocate_gpu_memory);
   *(*mem_ptr)->get_degree(0) = lut_degree;
-  (*mem_ptr)->broadcast_lut(streams, gpu_indexes);
+  (*mem_ptr)->broadcast_lut(streams, gpu_indexes, 0);
   return size_tracker;
 }
@@ -1770,7 +1751,7 @@ uint64_t scratch_cuda_apply_many_univariate_lut_kb(
       (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus),
       streams[0], gpu_indexes[0], allocate_gpu_memory);
   *(*mem_ptr)->get_degree(0) = lut_degree;
-  (*mem_ptr)->broadcast_lut(streams, gpu_indexes);
+  (*mem_ptr)->broadcast_lut(streams, gpu_indexes, 0);
   return size_tracker;
 }
@@ -1806,7 +1787,7 @@ uint64_t scratch_cuda_apply_bivariate_lut_kb(
       (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus),
       streams[0], gpu_indexes[0], allocate_gpu_memory);
   *(*mem_ptr)->get_degree(0) = lut_degree;
-  (*mem_ptr)->broadcast_lut(streams, gpu_indexes);
+  (*mem_ptr)->broadcast_lut(streams, gpu_indexes, 0);
   return size_tracker;
 }
@@ -1866,8 +1847,7 @@ void host_propagate_single_carry(
           "pointer")
   if (uses_carry == 1) {
     host_addition<Torus>(streams[0], gpu_indexes[0], lwe_array, lwe_array,
-                         input_carries, 1, params.message_modulus,
-                         params.carry_modulus);
+                         input_carries, 1);
   }
 
   // Step 1
@@ -1893,8 +1873,7 @@ void host_propagate_single_carry(
   auto shifted_blocks = mem->shifted_blocks_state_mem->shifted_blocks;
   host_addition<Torus>(
       streams[0], gpu_indexes[0], prepared_blocks, shifted_blocks,
-      mem->prop_simu_group_carries_mem->simulators, num_radix_blocks,
-      params.message_modulus, params.carry_modulus);
+      mem->prop_simu_group_carries_mem->simulators, num_radix_blocks);
 
   if (requested_flag == outputFlag::FLAG_OVERFLOW ||
       requested_flag == outputFlag::FLAG_CARRY) {
@@ -1903,8 +1882,7 @@ void host_propagate_single_carry(
         &shifted_simulators, mem->prop_simu_group_carries_mem->simulators,
         num_radix_blocks - 1, num_radix_blocks);
     host_addition<Torus>(streams[0], gpu_indexes[0], &output_flag, &output_flag,
-                         &shifted_simulators, 1, params.message_modulus,
-                         params.carry_modulus);
+                         &shifted_simulators, 1);
   }
 
   host_radix_sum_in_groups<Torus>(
@@ -1918,8 +1896,7 @@ void host_propagate_single_carry(
         mem->prop_simu_group_carries_mem->resolved_carries, mem->num_groups - 1,
         mem->num_groups);
     host_addition<Torus>(streams[0], gpu_indexes[0], &output_flag, &output_flag,
-                         &shifted_resolved_carries, 1, params.message_modulus,
-                         params.carry_modulus);
+                         &shifted_resolved_carries, 1);
 
     copy_radix_ciphertext_slice_async<Torus>(
         streams[0], gpu_indexes[0], prepared_blocks, num_radix_blocks,
@@ -2000,13 +1977,11 @@ void host_add_and_propagate_single_carry(
   }
 
   host_addition<Torus>(streams[0], gpu_indexes[0], lhs_array, lhs_array,
-                       rhs_array, num_radix_blocks, params.message_modulus,
-                       params.carry_modulus);
+                       rhs_array, num_radix_blocks);
 
   if (uses_carry == 1) {
     host_addition<Torus>(streams[0], gpu_indexes[0], lhs_array, lhs_array,
-                         input_carries, 1, params.message_modulus,
-                         params.carry_modulus);
+                         input_carries, 1);
   }
   // Step 1
   host_compute_shifted_blocks_and_states<Torus>(
@@ -2037,8 +2012,7 @@ void host_add_and_propagate_single_carry(
   auto shifted_blocks = mem->shifted_blocks_state_mem->shifted_blocks;
   host_addition<Torus>(
       streams[0], gpu_indexes[0], prepared_blocks, shifted_blocks,
-      mem->prop_simu_group_carries_mem->simulators, num_radix_blocks,
-      params.message_modulus, params.carry_modulus);
+      mem->prop_simu_group_carries_mem->simulators, num_radix_blocks);
 
   if (requested_flag == outputFlag::FLAG_OVERFLOW ||
       requested_flag == outputFlag::FLAG_CARRY) {
@@ -2047,8 +2021,7 @@ void host_add_and_propagate_single_carry(
         &shifted_simulators, mem->prop_simu_group_carries_mem->simulators,
         num_radix_blocks - 1, num_radix_blocks);
     host_addition<Torus>(streams[0], gpu_indexes[0], &output_flag, &output_flag,
-                         &shifted_simulators, 1, params.message_modulus,
-                         params.carry_modulus);
+                         &shifted_simulators, 1);
   }
 
   // Step 3
@@ -2063,8 +2036,7 @@ void host_add_and_propagate_single_carry(
   if (num_radix_blocks == 1 && requested_flag == outputFlag::FLAG_OVERFLOW &&
       uses_carry == 1) {
     host_addition<Torus>(streams[0], gpu_indexes[0], &output_flag,
-                         &output_flag, input_carries, 1,
-                         params.message_modulus, params.carry_modulus);
+                         &output_flag, input_carries, 1);
 
   } else {
     CudaRadixCiphertextFFI shifted_resolved_carries;
@@ -2073,8 +2045,7 @@ void host_add_and_propagate_single_carry(
         mem->prop_simu_group_carries_mem->resolved_carries,
         mem->num_groups - 1, mem->num_groups);
     host_addition<Torus>(streams[0], gpu_indexes[0], &output_flag,
-                         &output_flag, &shifted_resolved_carries, 1,
-                         params.message_modulus, params.carry_modulus);
+                         &output_flag, &shifted_resolved_carries, 1);
   }
   copy_radix_ciphertext_slice_async<Torus>(
       streams[0], gpu_indexes[0], prepared_blocks, num_radix_blocks,
@@ -2179,8 +2150,7 @@ void host_single_borrow_propagate(
         &shifted_simulators, mem->prop_simu_group_carries_mem->simulators,
         num_radix_blocks - 1, num_radix_blocks);
     host_addition<Torus>(streams[0], gpu_indexes[0], mem->overflow_block,
-                         mem->overflow_block, &shifted_simulators, 1,
-                         params.message_modulus, params.carry_modulus);
+                         mem->overflow_block, &shifted_simulators, 1);
   }
   CudaRadixCiphertextFFI resolved_borrows;
   as_radix_ciphertext_slice<Torus>(
@@ -2192,8 +2162,7 @@ void host_single_borrow_propagate(
   // borrows
   if (compute_overflow == outputFlag::FLAG_OVERFLOW) {
     host_addition<Torus>(streams[0], gpu_indexes[0], mem->overflow_block,
-                         mem->overflow_block, &resolved_borrows, 1,
-                         params.message_modulus, params.carry_modulus);
+                         mem->overflow_block, &resolved_borrows, 1);
   }
 
   cuda_event_record(mem->incoming_events[0], streams[0], gpu_indexes[0]);
@@ -2343,8 +2312,6 @@ __host__ void integer_radix_apply_noise_squashing_kb(
   for (uint i = 0; i < lut->num_blocks; i++) {
     lwe_array_out->degrees[i] = lut->degrees[0];
     lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
-    CHECK_NOISE_LEVEL(lwe_array_out->noise_levels[i], params.message_modulus,
-                      params.carry_modulus);
   }
   POP_RANGE()
 }
@@ -95,10 +95,17 @@ __global__ inline void radix_vec_to_columns(uint32_t *const *const columns,
 }
 
 template <typename Torus>
-__global__ inline void prepare_new_columns(
+__global__ inline void prepare_new_columns_and_pbs_indexes(
     uint32_t *const *const new_columns, uint32_t *const new_columns_counter,
-    const uint32_t *const *const columns, const uint32_t *const columns_counter,
-    const uint32_t chunk_size) {
+    Torus *const pbs_indexes_in, Torus *const pbs_indexes_out,
+    Torus *const lut_indexes, const uint32_t *const *const columns,
+    const uint32_t *const columns_counter, const uint32_t chunk_size) {
+  __shared__ uint32_t counter;
+
+  if (threadIdx.x == 0) {
+    counter = 0;
+  }
+  __syncthreads();
 
   const uint32_t base_id = threadIdx.x;
   const uint32_t column_len = columns_counter[base_id];
@@ -109,6 +116,10 @@ __global__ inline void prepare_new_columns(
|
||||
// for message ciphertexts, the in and out indexes should be the same
const uint32_t in_index = columns[base_id][i];
new_columns[base_id][ct_count] = in_index;
const uint32_t pbs_index = atomicAdd(&counter, 1);
pbs_indexes_in[pbs_index] = in_index;
pbs_indexes_out[pbs_index] = in_index;
lut_indexes[pbs_index] = 0;
++ct_count;
}
__syncthreads();
@@ -124,6 +135,10 @@ __global__ inline void prepare_new_columns(
const uint32_t in_index = columns[prev_base_id][i];
const uint32_t out_index = columns[prev_base_id][i + 1];
new_columns[base_id][ct_count] = out_index;
const uint32_t pbs_index = atomicAdd(&counter, 1);
pbs_indexes_in[pbs_index] = in_index;
pbs_indexes_out[pbs_index] = out_index;
lut_indexes[pbs_index] = 1;
++ct_count;
}
}
@@ -137,6 +152,16 @@ __global__ inline void prepare_new_columns(
new_columns_counter[base_id] = ct_count;
}

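The shared-counter packing above is the key trick: whichever thread produces a ciphertext, its (in, out, lut) triple lands at the next free PBS slot, so the three index arrays stay aligned. A minimal single-threaded Rust model of that idea (all names hypothetical, not the crate's API):

```rust
use std::sync::atomic::{AtomicUsize, Ordering};

// Hypothetical model of the kernel's `atomicAdd(&counter, 1)`:
// each produced ciphertext claims the next free PBS slot.
fn claim_slot(counter: &AtomicUsize) -> usize {
    counter.fetch_add(1, Ordering::Relaxed)
}

fn main() {
    let counter = AtomicUsize::new(0);
    let mut pbs_indexes_in = vec![0usize; 4];
    let mut lut_indexes = vec![0usize; 4];

    // Two message ciphertexts (LUT 0) and two carries (LUT 1):
    for &(in_index, lut) in &[(5, 0), (7, 1), (6, 0), (8, 1)] {
        let slot = claim_slot(&counter);
        pbs_indexes_in[slot] = in_index;
        lut_indexes[slot] = lut;
    }
    assert_eq!(pbs_indexes_in, vec![5, 7, 6, 8]);
    assert_eq!(lut_indexes, vec![0, 1, 0, 1]);
}
```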
template <typename Torus>
__global__ inline void prepare_final_pbs_indexes(
Torus *const pbs_indexes_in, Torus *const pbs_indexes_out,
Torus *const lut_indexes, const uint32_t num_radix_blocks) {
int idx = threadIdx.x;
pbs_indexes_in[idx] = idx % num_radix_blocks;
pbs_indexes_out[idx] = idx + idx / num_radix_blocks;
lut_indexes[idx] = idx / num_radix_blocks;
}

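The arithmetic in `prepare_final_pbs_indexes` is worth spelling out: with `n = num_radix_blocks` and `2 * n` threads, thread `i` reads block `i % n`, selects LUT `i / n` (0 for message, 1 for carry), and writes to slot `i + i / n`, so carries land one block above their source. A host-side Rust sketch of the same mapping (hypothetical, for illustration only):

```rust
// Hypothetical mirror of the prepare_final_pbs_indexes kernel:
// returns (input block, output slot, LUT index) per thread.
fn final_pbs_indexes(n: usize) -> Vec<(usize, usize, usize)> {
    (0..2 * n).map(|i| (i % n, i + i / n, i / n)).collect()
}

fn main() {
    // n = 2: messages go to slots 0 and 1, carries to slots 3 and 4.
    assert_eq!(
        final_pbs_indexes(2),
        vec![(0, 0, 0), (1, 1, 0), (0, 3, 1), (1, 4, 1)]
    );
}
```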
template <typename Torus>
__global__ void calculate_chunks(Torus *const input_blocks,
const uint32_t *const *const columns,
@@ -321,9 +346,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
as_radix_ciphertext_slice<Torus>(&terms_slice, terms, num_radix_blocks,
2 * num_radix_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, terms,
&terms_slice, num_radix_blocks,
mem_ptr->params.message_modulus,
mem_ptr->params.carry_modulus);
&terms_slice, num_radix_blocks);
return;
}

@@ -342,9 +365,8 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
num_radix_in_vec);

bool needs_processing = false;
radix_columns<Torus> current_columns(current_blocks->degrees,
num_radix_blocks, num_radix_in_vec,
chunk_size, needs_processing);
radix_columns current_columns(current_blocks->degrees, num_radix_blocks,
num_radix_in_vec, chunk_size, needs_processing);
int number_of_threads = std::min(256, (int)mem_ptr->params.polynomial_size);
int part_count = (big_lwe_size + number_of_threads - 1) / number_of_threads;
const dim3 number_of_blocks_2d(num_radix_blocks, part_count, 1);
@@ -354,31 +376,22 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(

while (needs_processing) {
auto luts_message_carry = mem_ptr->luts_message_carry;
auto d_pbs_indexes_in = mem_ptr->luts_message_carry->lwe_indexes_in;
auto d_pbs_indexes_out = mem_ptr->luts_message_carry->lwe_indexes_out;
calculate_chunks<Torus>
<<<number_of_blocks_2d, number_of_threads, 0, streams[0]>>>(
(Torus *)(current_blocks->ptr), d_columns, d_columns_counter,
chunk_size, big_lwe_size);

prepare_new_columns<Torus><<<1, num_radix_blocks, 0, streams[0]>>>(
d_new_columns, d_new_columns_counter, d_columns, d_columns_counter,
chunk_size);
prepare_new_columns_and_pbs_indexes<<<1, num_radix_blocks, 0, streams[0]>>>(
d_new_columns, d_new_columns_counter, d_pbs_indexes_in,
d_pbs_indexes_out, luts_message_carry->get_lut_indexes(0, 0), d_columns,
d_columns_counter, chunk_size);

uint32_t total_ciphertexts = 0;
uint32_t total_messages = 0;
auto h_pbs_indexes_in = mem_ptr->luts_message_carry->h_lwe_indexes_in;
auto h_pbs_indexes_out = mem_ptr->luts_message_carry->h_lwe_indexes_out;
auto h_lut_indexes = mem_ptr->luts_message_carry->h_lut_indexes;
auto d_pbs_indexes_in = mem_ptr->luts_message_carry->lwe_indexes_in;
auto d_pbs_indexes_out = mem_ptr->luts_message_carry->lwe_indexes_out;
current_columns.next_accumulation(h_pbs_indexes_in, h_pbs_indexes_out,
h_lut_indexes, total_ciphertexts,
total_messages, needs_processing);
luts_message_carry->set_lwe_indexes(streams[0], gpu_indexes[0],
h_pbs_indexes_in, h_pbs_indexes_out);
cuda_memcpy_with_size_tracking_async_to_gpu(
luts_message_carry->get_lut_indexes(0, 0), h_lut_indexes,
total_ciphertexts * sizeof(Torus), streams[0], gpu_indexes[0], true);
luts_message_carry->broadcast_lut(streams, gpu_indexes);
uint32_t total_ciphertexts;
uint32_t total_messages;
current_columns.next_accumulation(total_ciphertexts, total_messages,
needs_processing);

auto active_gpu_count = get_active_gpu_count(total_ciphertexts, gpu_count);
if (active_gpu_count == 1) {
@@ -400,10 +413,36 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
total_ciphertexts, mem_ptr->params.pbs_type, num_many_lut,
lut_stride);
} else {
Torus *h_lwe_indexes_in_pinned;
Torus *h_lwe_indexes_out_pinned;
cudaMallocHost((void **)&h_lwe_indexes_in_pinned,
total_ciphertexts * sizeof(Torus));
cudaMallocHost((void **)&h_lwe_indexes_out_pinned,
total_ciphertexts * sizeof(Torus));
for (uint32_t i = 0; i < total_ciphertexts; i++) {
h_lwe_indexes_in_pinned[i] = luts_message_carry->h_lwe_indexes_in[i];
h_lwe_indexes_out_pinned[i] = luts_message_carry->h_lwe_indexes_out[i];
}
cuda_memcpy_async_to_cpu(
h_lwe_indexes_in_pinned, luts_message_carry->lwe_indexes_in,
total_ciphertexts * sizeof(Torus), streams[0], gpu_indexes[0]);
cuda_memcpy_async_to_cpu(
h_lwe_indexes_out_pinned, luts_message_carry->lwe_indexes_out,
total_ciphertexts * sizeof(Torus), streams[0], gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint32_t i = 0; i < total_ciphertexts; i++) {
luts_message_carry->h_lwe_indexes_in[i] = h_lwe_indexes_in_pinned[i];
luts_message_carry->h_lwe_indexes_out[i] = h_lwe_indexes_out_pinned[i];
}
cudaFreeHost(h_lwe_indexes_in_pinned);
cudaFreeHost(h_lwe_indexes_out_pinned);

luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
luts_message_carry->using_trivial_lwe_indexes = false;

integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, active_gpu_count, current_blocks,
current_blocks, bsks, ksks, ms_noise_reduction_key,
luts_message_carry, total_ciphertexts);
streams, gpu_indexes, gpu_count, current_blocks, current_blocks, bsks,
ksks, ms_noise_reduction_key, luts_message_carry, total_ciphertexts);
}
cuda_set_device(gpu_indexes[0]);
std::swap(d_columns, d_new_columns);
@@ -417,22 +456,12 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(

if (mem_ptr->reduce_degrees_for_single_carry_propagation) {
auto luts_message_carry = mem_ptr->luts_message_carry;
auto h_pbs_indexes_in = mem_ptr->luts_message_carry->h_lwe_indexes_in;
auto h_pbs_indexes_out = mem_ptr->luts_message_carry->h_lwe_indexes_out;
auto h_lut_indexes = mem_ptr->luts_message_carry->h_lut_indexes;
for (uint i = 0; i < 2 * num_radix_blocks; i++) {
h_pbs_indexes_in[i] = i % num_radix_blocks;
h_pbs_indexes_out[i] = i + i / num_radix_blocks;
h_lut_indexes[i] = i / num_radix_blocks;
}
mem_ptr->luts_message_carry->set_lwe_indexes(
streams[0], gpu_indexes[0], h_pbs_indexes_in, h_pbs_indexes_out);
cuda_memcpy_with_size_tracking_async_to_gpu(
luts_message_carry->get_lut_indexes(0, 0), h_lut_indexes,
2 * num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0], true);
luts_message_carry->broadcast_lut(streams, gpu_indexes);
auto d_pbs_indexes_in = mem_ptr->luts_message_carry->lwe_indexes_in;
auto d_pbs_indexes_out = mem_ptr->luts_message_carry->lwe_indexes_out;
prepare_final_pbs_indexes<Torus>
<<<1, 2 * num_radix_blocks, 0, streams[0]>>>(
d_pbs_indexes_in, d_pbs_indexes_out,
luts_message_carry->get_lut_indexes(0, 0), num_radix_blocks);

set_zero_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], current_blocks, num_radix_blocks,
@@ -459,10 +488,38 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
2 * num_radix_blocks, mem_ptr->params.pbs_type, num_many_lut,
lut_stride);
} else {
uint32_t num_blocks_in_apply_lut = 2 * num_radix_blocks;
Torus *h_lwe_indexes_in_pinned;
Torus *h_lwe_indexes_out_pinned;
cudaMallocHost((void **)&h_lwe_indexes_in_pinned,
num_blocks_in_apply_lut * sizeof(Torus));
cudaMallocHost((void **)&h_lwe_indexes_out_pinned,
num_blocks_in_apply_lut * sizeof(Torus));
for (uint32_t i = 0; i < num_blocks_in_apply_lut; i++) {
h_lwe_indexes_in_pinned[i] = luts_message_carry->h_lwe_indexes_in[i];
h_lwe_indexes_out_pinned[i] = luts_message_carry->h_lwe_indexes_out[i];
}
cuda_memcpy_async_to_cpu(
h_lwe_indexes_in_pinned, luts_message_carry->lwe_indexes_in,
num_blocks_in_apply_lut * sizeof(Torus), streams[0], gpu_indexes[0]);
cuda_memcpy_async_to_cpu(
h_lwe_indexes_out_pinned, luts_message_carry->lwe_indexes_out,
num_blocks_in_apply_lut * sizeof(Torus), streams[0], gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint32_t i = 0; i < num_blocks_in_apply_lut; i++) {
luts_message_carry->h_lwe_indexes_in[i] = h_lwe_indexes_in_pinned[i];
luts_message_carry->h_lwe_indexes_out[i] = h_lwe_indexes_out_pinned[i];
}
cudaFreeHost(h_lwe_indexes_in_pinned);
cudaFreeHost(h_lwe_indexes_out_pinned);

luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
luts_message_carry->using_trivial_lwe_indexes = false;

integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, active_gpu_count, current_blocks, radix_lwe_out,
bsks, ksks, ms_noise_reduction_key, luts_message_carry,
2 * num_radix_blocks);
num_blocks_in_apply_lut);
}
calculate_final_degrees(radix_lwe_out->degrees, terms->degrees,
num_radix_blocks, num_radix_in_vec, chunk_size,
@@ -474,8 +531,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(

host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out,
current_blocks, &current_blocks_slice,
num_radix_blocks, mem_ptr->params.message_modulus,
mem_ptr->params.carry_modulus);
num_radix_blocks);
}
}


@@ -106,8 +106,6 @@ __host__ void host_integer_radix_negation(

lwe_array_out->degrees[i] = z - static_cast<uint64_t>(zb);
lwe_array_out->noise_levels[i] = lwe_array_in->noise_levels[i];
CHECK_NOISE_LEVEL(lwe_array_out->noise_levels[i], message_modulus,
carry_modulus);
zb = z / message_modulus;
}
}

@@ -47,7 +47,7 @@ __host__ void host_integer_radix_scalar_bitop_kb(
cuda_memcpy_async_gpu_to_gpu(lut->get_lut_indexes(0, 0), clear_blocks,
num_clear_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
lut->broadcast_lut(streams, gpu_indexes);
lut->broadcast_lut(streams, gpu_indexes, 0);

integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, output, input, bsks, ksks,

@@ -154,7 +154,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
lut->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, scalar_last_leaf_lut_f, true);
lut->broadcast_lut(streams, gpu_indexes);
lut->broadcast_lut(streams, gpu_indexes, 0);

integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out,
@@ -253,7 +253,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
lut->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, scalar_bivariate_last_leaf_lut_f, true);
lut->broadcast_lut(streams, gpu_indexes);
lut->broadcast_lut(streams, gpu_indexes, 0);

integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out,
@@ -287,7 +287,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, one_block_lut_f, true);

one_block_lut->broadcast_lut(streams, gpu_indexes);
one_block_lut->broadcast_lut(streams, gpu_indexes, 0);

integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, bsks,
@@ -434,7 +434,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
lut->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, scalar_bivariate_last_leaf_lut_f, true);
lut->broadcast_lut(streams, gpu_indexes);
lut->broadcast_lut(streams, gpu_indexes, 0);

integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, are_all_msb_zeros,
@@ -540,7 +540,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
signed_msb_lut->get_degree(0), signed_msb_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_f, true);
signed_msb_lut->broadcast_lut(streams, gpu_indexes);
signed_msb_lut->broadcast_lut(streams, gpu_indexes, 0);

CudaRadixCiphertextFFI sign_block;
as_radix_ciphertext_slice<Torus>(
@@ -589,7 +589,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, one_block_lut_f, true);

one_block_lut->broadcast_lut(streams, gpu_indexes);
one_block_lut->broadcast_lut(streams, gpu_indexes, 0);

integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, bsks,
@@ -819,7 +819,7 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
num_halved_scalar_blocks * sizeof(Torus), lsb_streams[0],
gpu_indexes[0]);
}
scalar_comparison_luts->broadcast_lut(lsb_streams, gpu_indexes);
scalar_comparison_luts->broadcast_lut(lsb_streams, gpu_indexes, 0);

integer_radix_apply_univariate_lookup_table_kb<Torus>(
lsb_streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,

@@ -136,8 +136,7 @@ template <typename T>
__host__ void host_integer_small_scalar_mul_radix(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *output_lwe_array,
CudaRadixCiphertextFFI *input_lwe_array, T scalar,
const uint32_t message_modulus, const uint32_t carry_modulus) {
CudaRadixCiphertextFFI *input_lwe_array, T scalar) {

if (output_lwe_array->num_radix_blocks != input_lwe_array->num_radix_blocks)
PANIC("Cuda error: input and output num radix blocks must be the same")
@@ -167,8 +166,6 @@ __host__ void host_integer_small_scalar_mul_radix(
output_lwe_array->noise_levels[i] =
input_lwe_array->noise_levels[i] * scalar;
output_lwe_array->degrees[i] = input_lwe_array->degrees[i] * scalar;
CHECK_NOISE_LEVEL(output_lwe_array->noise_levels[i], message_modulus,
carry_modulus);
}
}


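The metadata rule visible in this hunk is linear scaling; a hedged Rust sketch (standalone, names hypothetical) of the per-block update performed by `host_integer_small_scalar_mul_radix`:

```rust
// Hypothetical standalone mirror of the per-block metadata update:
// multiplying a ciphertext by a clear scalar scales both its degree
// and its noise level by that scalar.
fn scalar_mul_meta(degree: u64, noise_level: u64, scalar: u64) -> (u64, u64) {
    (degree * scalar, noise_level * scalar)
}

fn main() {
    assert_eq!(scalar_mul_meta(3, 1, 2), (6, 2));
}
```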
@@ -159,13 +159,11 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
// control_bit|b|a
host_pack_bivariate_blocks<Torus>(
streams, gpu_indexes, gpu_count, mux_inputs, mux_lut->lwe_indexes_out,
rotated_input, input_bits_a, mux_lut->lwe_indexes_in, 2, total_nb_bits,
mem->params.message_modulus, mem->params.carry_modulus);
rotated_input, input_bits_a, mux_lut->lwe_indexes_in, 2, total_nb_bits);

// The shift bit is already properly aligned/positioned
host_add_the_same_block_to_all_blocks<Torus>(
streams[0], gpu_indexes[0], mux_inputs, mux_inputs, &shift_bit,
mem->params.message_modulus, mem->params.carry_modulus);
streams[0], gpu_indexes[0], mux_inputs, mux_inputs, &shift_bit);

// we have
// control_bit|b|a
@@ -185,9 +183,8 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(

// Bitshift and add the other bits
for (int i = bits_per_block - 2; i >= 0; i--) {
host_integer_small_scalar_mul_radix<Torus>(
streams, gpu_indexes, gpu_count, lwe_array, lwe_array, 2,
mem->params.message_modulus, mem->params.carry_modulus);
host_integer_small_scalar_mul_radix<Torus>(streams, gpu_indexes, gpu_count,
lwe_array, lwe_array, 2);
for (int j = 0; j < num_radix_blocks; j++) {
CudaRadixCiphertextFFI block;
CudaRadixCiphertextFFI bit_to_add;
@@ -196,8 +193,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
i + j * bits_per_block,
i + j * bits_per_block + 1);
host_addition<Torus>(streams[0], gpu_indexes[0], &block, &block,
&bit_to_add, 1, mem->params.message_modulus,
mem->params.carry_modulus);
&bit_to_add, 1);
}

// To give back a clean ciphertext

@@ -73,7 +73,6 @@ __host__ void host_integer_radix_subtraction(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in_2,
message_modulus, carry_modulus, num_radix_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], lwe_array_out, lwe_array_out,
lwe_array_in_1, num_radix_blocks, message_modulus,
carry_modulus);
lwe_array_in_1, num_radix_blocks);
}
#endif

@@ -10,7 +10,7 @@ void cuda_add_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
output->num_radix_blocks != input_2->num_radix_blocks)
PANIC("Cuda error: input and output num radix blocks must be the same")
host_addition<uint32_t>(static_cast<cudaStream_t>(stream), gpu_index, output,
input_1, input_2, output->num_radix_blocks, 0, 0);
input_1, input_2, output->num_radix_blocks);
}

/*
@@ -48,7 +48,7 @@ void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
output->num_radix_blocks != input_2->num_radix_blocks)
PANIC("Cuda error: input and output num radix blocks must be the same")
host_addition<uint64_t>(static_cast<cudaStream_t>(stream), gpu_index, output,
input_1, input_2, output->num_radix_blocks, 0, 0);
input_1, input_2, output->num_radix_blocks);
}

/*

@@ -9,7 +9,6 @@
#include "device.h"
#include "helper_multi_gpu.h"
#include "integer/integer.h"
#include "integer/integer_utilities.h"
#include "linear_algebra.h"
#include "utils/kernel_dimensions.cuh"
#include <stdio.h>
@@ -103,12 +102,11 @@ __global__ void addition(T *output, T const *input_1, T const *input_2,
// Coefficient-wise addition
// num_radix_blocks selects the amount of blocks to be added from the inputs
template <typename T>
__host__ void
host_addition(cudaStream_t stream, uint32_t gpu_index,
CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input_1,
CudaRadixCiphertextFFI const *input_2, uint32_t num_radix_blocks,
const uint32_t message_modulus, const uint32_t carry_modulus) {
__host__ void host_addition(cudaStream_t stream, uint32_t gpu_index,
CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input_1,
CudaRadixCiphertextFFI const *input_2,
uint32_t num_radix_blocks) {
if (output->lwe_dimension != input_1->lwe_dimension ||
output->lwe_dimension != input_2->lwe_dimension)
PANIC("Cuda error: input and output num radix blocks must be the same")
@@ -137,7 +135,6 @@ host_addition(cudaStream_t stream, uint32_t gpu_index,
output->degrees[i] = input_1->degrees[i] + input_2->degrees[i];
output->noise_levels[i] =
input_1->noise_levels[i] + input_2->noise_levels[i];
CHECK_NOISE_LEVEL(output->noise_levels[i], message_modulus, carry_modulus);
}
}

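The degree and noise-level bookkeeping in `host_addition` is additive per block; a small self-contained Rust sketch (types hypothetical) of the same rule:

```rust
// Hypothetical mirror of host_addition's per-block metadata update:
// both degree and noise level are simply summed.
#[derive(Clone, Copy, Debug, PartialEq)]
struct BlockMeta {
    degree: u64,
    noise_level: u64,
}

fn add_meta(a: BlockMeta, b: BlockMeta) -> BlockMeta {
    BlockMeta {
        degree: a.degree + b.degree,
        noise_level: a.noise_level + b.noise_level,
    }
}

fn main() {
    let x = BlockMeta { degree: 3, noise_level: 1 };
    let y = BlockMeta { degree: 1, noise_level: 1 };
    assert_eq!(add_meta(x, y), BlockMeta { degree: 4, noise_level: 2 });
}
```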
@@ -163,8 +160,7 @@ template <typename T>
__host__ void host_add_the_same_block_to_all_blocks(
cudaStream_t stream, uint32_t gpu_index, CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input_with_multiple_blocks,
CudaRadixCiphertextFFI const *input_with_single_block,
const uint32_t message_modulus, const uint32_t carry_modulus) {
CudaRadixCiphertextFFI const *input_with_single_block) {
if (output->num_radix_blocks != input_with_multiple_blocks->num_radix_blocks)
PANIC("Cuda error: input and output num radix blocks must be the same")
if (input_with_single_block->num_radix_blocks != 1)
@@ -196,7 +192,6 @@ __host__ void host_add_the_same_block_to_all_blocks(
input_with_single_block->degrees[0];
output->noise_levels[i] = input_with_multiple_blocks->noise_levels[i] +
input_with_single_block->noise_levels[0];
CHECK_NOISE_LEVEL(output->noise_levels[i], message_modulus, carry_modulus);
}
}

@@ -385,7 +380,6 @@ __host__ void host_unchecked_sub_with_correcting_term(
output->noise_levels[i] =
input_1->noise_levels[i] + input_2->noise_levels[i];
zb = z / message_modulus;
CHECK_NOISE_LEVEL(output->noise_levels[i], message_modulus, carry_modulus);
}
}


@@ -53,7 +53,6 @@
* [Zero-knowledge proofs](fhe-computation/advanced-features/zk-pok.md)
* [Multi-threading with Rayon crate](fhe-computation/advanced-features/rayon-crate.md)
* [Noise squashing](fhe-computation/advanced-features/noise-squashing.md)
* [Key upgrade](fhe-computation/advanced-features/upgrade-key-chain.md)
* [Tooling](fhe-computation/tooling/README.md)
* [PBS statistics](fhe-computation/tooling/pbs-stats.md)
* [Generic trait bounds](fhe-computation/tooling/trait-bounds.md)

@@ -1,88 +0,0 @@
# Upgrade Key Chain

This document describes how to use the `UpgradeKeyChain` to
upgrade a ciphertext encrypted under older parameters to newer parameters.

It is distinct from, and complementary to, the data versioning feature:
data versioning allows loading ciphertexts generated
with a previous TFHE-rs version when the ciphertext structure has changed.


The `UpgradeKeyChain` first needs to know about possible parameters; for that,
`add_key_set` should be called with all the different server keys.
Note that the `Tag` of the keys is used to differentiate them.

Then, the `UpgradeKeyChain` requires upgrade keys to be able to upgrade ciphertexts.
There are two types of these keys:
- `KeySwitchingKey` to upgrade a FheUint/FheInt/FheBool to another FheUint/FheInt/FheBool with different parameters
- `DecompressionUpgradeKey` to upgrade ciphertexts from a `CompressedCiphertextList` to FheUint/FheInt/FheBool with different parameters


```rust
use tfhe::shortint::parameters::{
COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
PARAM_KEYSWITCH_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
};
use tfhe::prelude::*;
use tfhe::{ConfigBuilder, set_server_key, ServerKey, ClientKey, FheUint32, KeySwitchingKey, Device};
use tfhe::upgrade::UpgradeKeyChain;

fn main() {
let compute_params = PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
let compression_parameters = COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;

let config = ConfigBuilder::with_custom_parameters(compute_params)
.enable_compression(compression_parameters)
.build();

let (cks_1, sks_1) = {
let mut ck = ClientKey::generate(config);
ck.tag_mut().set_u64(1);
let sk = ServerKey::new(&ck);
(ck, sk)
};

let (cks_2, sks_2) = {
let mut ck = ClientKey::generate(config);
ck.tag_mut().set_u64(2);
let sk = ServerKey::new(&ck);
(ck, sk)
};

// Create a ksk that upgrades from the first key to the second key
let ksk = KeySwitchingKey::with_parameters(
(&cks_1, &sks_1),
(&cks_2, &sks_2),
PARAM_KEYSWITCH_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
);

let mut upgrader = UpgradeKeyChain::default();
// First, add the server keys
// to register the different possible parameters
upgrader.add_key_set(&sks_1);
upgrader.add_key_set(&sks_2);
// Add our upgrade key
upgrader.add_upgrade_key(ksk).unwrap();


let clear_a = rand::random::<u32>();
let clear_b = rand::random::<u32>();

let a = FheUint32::encrypt(clear_a, &cks_1);
let b = FheUint32::encrypt(clear_b, &cks_1);

let upgraded_a = upgrader
.upgrade(&a, sks_2.tag(), Device::Cpu)
.unwrap();

let upgraded_b = upgrader
.upgrade(&b, sks_2.tag(), Device::Cpu)
.unwrap();

set_server_key(sks_2.clone());
let c = upgraded_a + upgraded_b;
let dc: u32 = c.decrypt(&cks_2);
assert_eq!(dc, clear_a.wrapping_add(clear_b));
}
```
@@ -33,7 +33,7 @@ macro_rules! izip {
(@ __closure @ ($a:expr, $b:expr, $c:expr, $d:expr)) => { |(((a, b), c), d)| (a, b, c, d) };
(@ __closure @ ($a:expr, $b:expr, $c:expr, $d:expr, $e: expr)) => { |((((a, b), c), d), e)| (a, b, c, d, e) };
(@ __closure @ ($a:expr, $b:expr, $c:expr, $d:expr, $e: expr, $f:expr)) => { |(((((a, b), c), d), e), f)| (a, b, c, d, e, f) };
(@ __closure @ ($a:expr, $b:expr, $c:expr, $d:expr, $e: expr, $f:expr, $g:expr)) => { |((((((a, b), c), d), e), f), g)| (a, b, c, d, e, f, g) };
(@ __closure @ ($a:expr, $b:expr, $c:expr, $d:expr, $e: expr, $f:expr, $g:expr, $h:expr)) => { |(((((((a, b), c), d), e), f), g), h)| (a, b, c, d, e, f, g, h) };
(@ __closure @ ($a:expr, $b:expr, $c:expr, $d:expr, $e: expr, $f:expr, $g:expr, $h:expr, $i: expr)) => { |((((((((a, b), c), d), e), f), g), h), i)| (a, b, c, d, e, f, g, h, i) };
(@ __closure @ ($a:expr, $b:expr, $c:expr, $d:expr, $e: expr, $f:expr, $g:expr, $h:expr, $i: expr, $j: expr)) => { |(((((((((a, b), c), d), e), f), g), h), i), j)| (a, b, c, d, e, f, g, h, i, j) };

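These arms implement the usual flattening of nested `zip` tuples; a standalone Rust illustration of what the generated closures do (plain iterators, not the crate's macro):

```rust
fn main() {
    let a = [1, 2];
    let b = ["x", "y"];
    let c = [true, false];

    // Nested zips yield ((a, b), c); the izip!-style closure
    // flattens each item to (a, b, c).
    let zipped: Vec<_> = a
        .iter()
        .zip(b.iter())
        .zip(c.iter())
        .map(|((a, b), c)| (*a, *b, *c))
        .collect();

    assert_eq!(zipped, vec![(1, "x", true), (2, "y", false)]);
}
```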
@@ -7,7 +7,7 @@ use crate::high_level_api::array::{ArrayBackend, BackendDataContainer, BackendDa
use crate::high_level_api::global_state;
use crate::integer::BooleanBlock;
use crate::prelude::{FheDecrypt, FheTryEncrypt};
use crate::{ClientKey, FheId};
use crate::{ClientKey, FheBool, FheId, Tag};
use rayon::prelude::*;
use std::ops::RangeBounds;

@@ -36,6 +36,28 @@ impl ArrayBackend for CpuFheBoolArrayBackend {
type Owned = Vec<BooleanBlock>;
}

impl From<Vec<FheBool>> for CpuFheBoolArray {
fn from(value: Vec<FheBool>) -> Self {
let vec = value
.into_iter()
.map(|b| BooleanBlock::new_unchecked(b.into_raw_parts()))
.collect::<Vec<_>>();

let shape = vec![vec.len()];
Self::new(vec, shape)
}
}

impl From<CpuFheBoolArray> for Vec<FheBool> {
fn from(value: CpuFheBoolArray) -> Self {
value
.into_container()
.into_iter()
.map(|boolean_block| FheBool::new(boolean_block, Tag::default()))
.collect()
}
}

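A short usage sketch of the round trip these two impls enable (assuming `CpuFheBoolArray` is exported at the crate root, as in the test imports further below):

```rust
use tfhe::prelude::*;
use tfhe::{generate_keys, ConfigBuilder, CpuFheBoolArray, FheBool};

fn main() {
    // Hypothetical usage sketch: Vec<FheBool> -> CpuFheBoolArray -> Vec<FheBool>.
    let (cks, _sks) = generate_keys(ConfigBuilder::default());

    let encrypted: Vec<FheBool> = [true, false]
        .iter()
        .map(|&b| FheBool::encrypt(b, &cks))
        .collect();

    let array = CpuFheBoolArray::from(encrypted);
    let back = Vec::<FheBool>::from(array);

    let decrypted: Vec<bool> = back.iter().map(|b| b.decrypt(&cks)).collect();
    assert_eq!(decrypted, vec![true, false]);
}
```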
impl BackendDataContainer for &[BooleanBlock] {
type Backend = CpuFheBoolArrayBackend;


@@ -19,7 +19,7 @@ use crate::integer::server_key::radix_parallel::scalar_div_mod::SignedReciprocab
use crate::integer::server_key::{Reciprocable, ScalarMultiplier};
use crate::integer::{IntegerRadixCiphertext, RadixCiphertext, SignedRadixCiphertext};
use crate::prelude::{FheDecrypt, FheTryEncrypt};
use crate::{ClientKey, Error};
use crate::{ClientKey, Error, FheInt, FheUint, Tag};
use rayon::prelude::*;
use std::marker::PhantomData;
use std::ops::RangeBounds;
@@ -54,6 +54,48 @@ where
type Owned = Vec<T>;
}

impl<Id: FheUintId> From<Vec<FheUint<Id>>> for CpuFheUintArray<Id> {
fn from(value: Vec<FheUint<Id>>) -> Self {
let vec: Vec<_> = value
.into_iter()
.map(|uint| uint.into_raw_parts().0)
.collect();
let shape = vec![vec.len()];
Self::new(vec, shape)
}
}

impl<Id: FheUintId> From<CpuFheUintArray<Id>> for Vec<FheUint<Id>> {
fn from(value: CpuFheUintArray<Id>) -> Self {
value
.into_container()
.into_iter()
.map(|radix| FheUint::new(radix, Tag::default()))
.collect()
}
}

impl<Id: FheIntId> From<Vec<FheInt<Id>>> for CpuFheIntArray<Id> {
fn from(value: Vec<FheInt<Id>>) -> Self {
let vec: Vec<_> = value
.into_iter()
.map(|uint| uint.into_raw_parts().0)
.collect();
let shape = vec![vec.len()];
Self::new(vec, shape)
}
}

impl<Id: FheIntId> From<CpuFheIntArray<Id>> for Vec<FheInt<Id>> {
fn from(value: CpuFheIntArray<Id>) -> Self {
value
.into_container()
.into_iter()
.map(|radix| FheInt::new(radix, Tag::default()))
.collect()
}
}

#[inline]
#[track_caller]
fn par_map_sks_op_on_pair_of_elements<'a, T, F>(

@@ -11,7 +11,7 @@ use super::super::{FheBackendArray, FheBackendArraySlice, FheBackendArraySliceMu
use crate::array::traits::TensorSlice;
use crate::integer::BooleanBlock;
use crate::prelude::{FheDecrypt, FheTryEncrypt};
use crate::{ClientKey, Device};
use crate::{ClientKey, CpuFheBoolArray, Device, FheBool};
use std::borrow::{Borrow, Cow};
use std::ops::RangeBounds;

@@ -33,6 +33,43 @@ impl ArrayBackend for DynFheBoolArrayBackend {
type Owned = InnerBoolArray;
}

impl TryFrom<Vec<FheBool>> for FheBoolArray {
type Error = crate::Error;

fn try_from(values: Vec<FheBool>) -> Result<Self, Self::Error> {
if values.is_empty() {
return Ok(Self::new(InnerBoolArray::Cpu(vec![]), vec![0]));
}

let shape = vec![values.len()];
let device_of_first = values[0].current_device();
let inner = match device_of_first {
Device::Cpu => {
let new_values = values
.into_iter()
.map(|value| value.ciphertext.into_cpu())
.collect::<Vec<_>>();

InnerBoolArray::Cpu(new_values)
}
#[cfg(feature = "gpu")]
Device::CudaGpu => return Err(crate::error!("Arrays do not support GPU")),
#[cfg(feature = "hpu")]
Device::Hpu => return Err(crate::error!("Arrays do not support HPU")),
};

Ok(Self::new(inner, shape))
}
}

impl From<CpuFheBoolArray> for FheBoolArray {
fn from(cpu_array: CpuFheBoolArray) -> Self {
let CpuFheBoolArray { elems, dims, _id } = cpu_array;

Self::new(InnerBoolArray::Cpu(elems), dims)
}
}

impl BitwiseArrayBackend for DynFheBoolArrayBackend {
fn bitand<'a>(
lhs: TensorSlice<'_, Self::Slice<'a>>,

@@ -9,7 +9,7 @@ use crate::high_level_api::array::{ArrayBackend, BackendDataContainer, BackendDa
use crate::high_level_api::global_state;
use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
use crate::prelude::{FheDecrypt, FheTryEncrypt};
use crate::{ClientKey, FheBoolId};
use crate::{ClientKey, FheBool, FheBoolId, Tag};
use rayon::prelude::*;
use std::ops::RangeBounds;

@@ -72,6 +72,31 @@ impl From<Vec<CudaBooleanBlock>> for GpuBooleanOwned {
}
}

impl From<Vec<FheBool>> for GpuFheBoolArray {
fn from(value: Vec<FheBool>) -> Self {
let container = with_cuda_internal_keys(|cuda_keys| {
value
.into_iter()
.map(|val| val.ciphertext.into_gpu(&cuda_keys.streams))
.collect::<Vec<_>>()
});

let shape = vec![container.len()];
Self::new(container, shape)
}
}

impl From<GpuFheBoolArray> for Vec<FheBool> {
fn from(value: GpuFheBoolArray) -> Self {
value
.into_container()
.0
.into_iter()
.map(|cuda_bool_block| FheBool::new(cuda_bool_block, Tag::default()))
.collect()
}
}

impl BackendDataContainer for GpuBooleanSlice<'_> {
type Backend = GpuFheBoolArrayBackend;


@@ -24,7 +24,7 @@ use crate::integer::gpu::ciphertext::{
use crate::integer::server_key::radix_parallel::scalar_div_mod::SignedReciprocable;
use crate::integer::server_key::{Reciprocable, ScalarMultiplier};
use crate::prelude::{CastInto, FheDecrypt, FheTryEncrypt};
use crate::{ClientKey, Error};
use crate::{ClientKey, Error, FheInt, FheUint, Tag};
use rayon::prelude::*;
use std::marker::PhantomData;
use std::ops::RangeBounds;
@@ -60,6 +60,54 @@ where
}
}

impl<Id: FheUintId> From<Vec<FheUint<Id>>> for GpuFheUintArray<Id> {
fn from(value: Vec<FheUint<Id>>) -> Self {
let container = with_cuda_internal_keys(|cuda_keys| {
value
.into_iter()
.map(|elem| elem.ciphertext.into_gpu(&cuda_keys.streams))
.collect::<Vec<_>>()
});
let shape = vec![container.len()];
Self::new(container, shape)
}
}

impl<Id: FheUintId> From<GpuFheUintArray<Id>> for Vec<FheUint<Id>> {
fn from(value: GpuFheUintArray<Id>) -> Self {
value
.into_container()
.0
.into_iter()
.map(|elem| FheUint::new(elem, Tag::default()))
.collect()
}
}

impl<Id: FheIntId> From<Vec<FheInt<Id>>> for GpuFheIntArray<Id> {
fn from(value: Vec<FheInt<Id>>) -> Self {
let container = with_cuda_internal_keys(|cuda_keys| {
value
.into_iter()
.map(|elem| elem.ciphertext.into_gpu(&cuda_keys.streams))
.collect::<Vec<_>>()
});
let shape = vec![container.len()];
Self::new(container, shape)
}
}

impl<Id: FheIntId> From<GpuFheIntArray<Id>> for Vec<FheInt<Id>> {
fn from(value: GpuFheIntArray<Id>) -> Self {
value
.into_container()
.0
.into_iter()
.map(|elem| FheInt::new(elem, Tag::default()))
.collect()
}
}

impl<T> ArrayBackend for GpuIntegerArrayBackend<T>
where
T: CudaIntegerRadixCiphertext,

@@ -2,7 +2,11 @@ mod booleans;
mod signed;
mod unsigned;

use crate::{generate_keys, set_server_key, ClientKey, ConfigBuilder, FheId};
use crate::prelude::*;
use crate::{
generate_keys, set_server_key, ClientKey, ConfigBuilder, CpuFheBoolArray, CpuFheInt32Array,
CpuFheUint32Array, FheBool, FheId, FheInt32, FheUint32,
};
#[cfg(feature = "gpu")]
use crate::{Config, CudaServerKey};
use rand::distributions::{Distribution, Standard};
@@ -11,6 +15,8 @@ use std::fmt::Debug;

use crate::array::traits::IOwnedArray;
use crate::array::ClearArray;
#[cfg(feature = "gpu")]
use crate::array::{GpuFheBoolArray, GpuFheInt32Array, GpuFheUint32Array};
use crate::high_level_api::array::{FheBackendArray, FheBackendArraySlice};
use crate::prelude::{FheDecrypt, FheTryEncrypt};
use std::ops::{BitAnd, BitOr, BitXor};
@@ -293,3 +299,132 @@
assert_eq!(result, expected_result);
}
}

#[cfg(feature = "gpu")]
#[test]
fn test_gpu_conversions() {
let ck = setup_default_gpu();

let num_values = 50;

// Vec<FheUint> <=> GpuFheUint
{
let clears = draw_random_values::<u32>(num_values);
let encrypted = clears
.iter()
.map(|v| FheUint32::encrypt(*v, &ck))
.collect::<Vec<_>>();

let gpu_array = GpuFheUint32Array::from(encrypted);
let decrypted: Vec<u32> = gpu_array.decrypt(&ck);
assert_eq!(decrypted, clears);

let encrypted = Vec::<FheUint32>::from(gpu_array);
let decrypted: Vec<u32> = encrypted
.iter()
.map(|fheuint| fheuint.decrypt(&ck))
.collect();
assert_eq!(decrypted, clears);
}

// Vec<FheInt> <=> GpuFheInt
{
let clears = draw_random_values::<i32>(num_values);
let encrypted = clears
.iter()
.map(|v| FheInt32::encrypt(*v, &ck))
.collect::<Vec<_>>();

let gpu_array = GpuFheInt32Array::from(encrypted);
let decrypted: Vec<i32> = gpu_array.decrypt(&ck);
assert_eq!(decrypted, clears);

let encrypted = Vec::<FheInt32>::from(gpu_array);
let decrypted: Vec<i32> = encrypted.iter().map(|fheint| fheint.decrypt(&ck)).collect();
assert_eq!(decrypted, clears);
}

// Vec<FheBool> <=> GpuFheBool
{
let clears = draw_random_values::<bool>(num_values);
let encrypted = clears
.iter()
.map(|v| FheBool::encrypt(*v, &ck))
.collect::<Vec<_>>();

let gpu_array = GpuFheBoolArray::from(encrypted);
let decrypted: Vec<bool> = gpu_array.decrypt(&ck);
assert_eq!(decrypted, clears);

let encrypted = Vec::<FheBool>::from(gpu_array);
let decrypted: Vec<bool> = encrypted
.iter()
.map(|fhebool| fhebool.decrypt(&ck))
.collect();
assert_eq!(decrypted, clears);
}
}

#[test]
fn test_cpu_conversions() {
let ck = setup_default_cpu();

let num_values = 50;

// Vec<FheUint> <=> CpuFheUint
{
let clears = draw_random_values::<u32>(num_values);
let encrypted = clears
.iter()
.map(|v| FheUint32::encrypt(*v, &ck))
.collect::<Vec<_>>();

let cpu_array = CpuFheUint32Array::from(encrypted);
let decrypted: Vec<u32> = cpu_array.decrypt(&ck);
assert_eq!(decrypted, clears);

let encrypted = Vec::<FheUint32>::from(cpu_array);
let decrypted: Vec<u32> = encrypted
.iter()
.map(|fheuint| fheuint.decrypt(&ck))
.collect();
assert_eq!(decrypted, clears);
}

// Vec<FheInt> <=> CpuFheInt
{
let clears = draw_random_values::<i32>(num_values);
let encrypted = clears
.iter()
.map(|v| FheInt32::encrypt(*v, &ck))
.collect::<Vec<_>>();

let cpu_array = CpuFheInt32Array::from(encrypted);
let decrypted: Vec<i32> = cpu_array.decrypt(&ck);
assert_eq!(decrypted, clears);

let encrypted = Vec::<FheInt32>::from(cpu_array);
let decrypted: Vec<i32> = encrypted.iter().map(|fheint| fheint.decrypt(&ck)).collect();
assert_eq!(decrypted, clears);
}

// Vec<FheBool> <=> CpuFheBool
{
let clears = draw_random_values::<bool>(num_values);
let encrypted = clears
.iter()
.map(|v| FheBool::encrypt(*v, &ck))
.collect::<Vec<_>>();

let cpu_array = CpuFheBoolArray::from(encrypted);
let decrypted: Vec<bool> = cpu_array.decrypt(&ck);
assert_eq!(decrypted, clears);

let encrypted = Vec::<FheBool>::from(cpu_array);
let decrypted: Vec<bool> = encrypted
.iter()
.map(|fhebool| fhebool.decrypt(&ck))
.collect();
assert_eq!(decrypted, clears);
}
}

@@ -25,7 +25,7 @@ use crate::integer::hpu::ciphertext::HpuRadixCiphertext;
pub(in crate::high_level_api) enum InnerBoolean {
Cpu(BooleanBlock),
#[cfg(feature = "gpu")]
Cuda(crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock),
Cuda(CudaBooleanBlock),
#[cfg(feature = "hpu")]
Hpu(HpuRadixCiphertext),
}
@@ -220,7 +220,6 @@ impl InnerBoolean {
&mut cuda_ct.0
}

#[cfg(feature = "gpu")]
pub(crate) fn into_cpu(self) -> BooleanBlock {
match self {
Self::Cpu(cpu_ct) => cpu_ct,

@@ -69,14 +69,6 @@ impl CompressedNoiseSquashingCompressionKey {
let key = self.key.decompress();
NoiseSquashingCompressionKey { key }
}

pub fn from_raw_parts(key: ShortintCompressedNoiseSquashingCompressionKey) -> Self {
Self { key }
}

pub fn into_raw_pars(self) -> ShortintCompressedNoiseSquashingCompressionKey {
self.key
}
}

impl Named for CompressedNoiseSquashingCompressionKey {
@@ -89,16 +81,6 @@ pub struct NoiseSquashingCompressionKey {
pub(crate) key: ShortintNoiseSquashingCompressionKey,
}

impl NoiseSquashingCompressionKey {
pub fn from_raw_parts(key: ShortintNoiseSquashingCompressionKey) -> Self {
Self { key }
}

pub fn into_raw_pars(self) -> ShortintNoiseSquashingCompressionKey {
self.key
}
}

impl Named for NoiseSquashingCompressionKey {
const NAME: &'static str = "integer::NoiseSquashingCompressionKey";
}

@@ -208,37 +208,6 @@ impl CompressedNoiseSquashingCompressionKey {
lwe_per_glwe: self.lwe_per_glwe,
}
}

/// Construct from raw parts
///
/// # Panics
///
/// Panics if lwe_per_glwe is greater than the output polynomial size of the packing key
/// switching key
pub fn from_raw_parts(
packing_key_switching_key: SeededLwePackingKeyswitchKey<Vec<u128>>,
lwe_per_glwe: LweCiphertextCount,
) -> Self {
assert!(
lwe_per_glwe.0 <= packing_key_switching_key.output_polynomial_size().0,
"Cannot pack more than polynomial_size(={}) elements per glwe, {} requested",
packing_key_switching_key.output_polynomial_size().0,
lwe_per_glwe.0,
);
Self {
packing_key_switching_key,
lwe_per_glwe,
}
}

pub fn into_raw_parts(self) -> (SeededLwePackingKeyswitchKey<Vec<u128>>, LweCiphertextCount) {
let Self {
packing_key_switching_key,
lwe_per_glwe,
} = self;

(packing_key_switching_key, lwe_per_glwe)
}
}

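The assertion in `from_raw_parts` encodes the packing bound: a GLWE over a polynomial of size N can hold at most N packed LWEs. A standalone Rust sketch of the same guard (hypothetical names):

```rust
// Hypothetical standalone version of the packing-capacity guard:
// at most `polynomial_size` LWE ciphertexts fit in one packed GLWE.
fn check_packing(lwe_per_glwe: usize, polynomial_size: usize) {
    assert!(
        lwe_per_glwe <= polynomial_size,
        "Cannot pack more than polynomial_size(={polynomial_size}) elements per glwe, {lwe_per_glwe} requested",
    );
}

fn main() {
    check_packing(256, 2048); // within capacity
    // check_packing(4096, 2048); // would panic
}
```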
impl ParameterSetConformant for CompressedNoiseSquashingCompressionKey {

@@ -246,39 +246,6 @@ pub struct NoiseSquashingCompressionKey {
pub(super) lwe_per_glwe: LweCiphertextCount,
}

impl NoiseSquashingCompressionKey {
/// Construct from raw parts
///
/// # Panics
///
/// Panics if lwe_per_glwe is greater than the output polynomial size of the packing key
/// switching key
pub fn from_raw_parts(
packing_key_switching_key: LwePackingKeyswitchKey<Vec<u128>>,
lwe_per_glwe: LweCiphertextCount,
) -> Self {
assert!(
lwe_per_glwe.0 <= packing_key_switching_key.output_polynomial_size().0,
"Cannot pack more than polynomial_size(={}) elements per glwe, {} requested",
packing_key_switching_key.output_polynomial_size().0,
lwe_per_glwe.0,
);
Self {
packing_key_switching_key,
lwe_per_glwe,
}
}

pub fn into_raw_parts(self) -> (LwePackingKeyswitchKey<Vec<u128>>, LweCiphertextCount) {
let Self {
packing_key_switching_key,
lwe_per_glwe,
} = self;

(packing_key_switching_key, lwe_per_glwe)
}
}

impl NoiseSquashingPrivateKey {
pub fn new_noise_squashing_compression_key(
&self,

@@ -46,10 +46,6 @@ mod test_cpu_doc {
"../docs/fhe-computation/advanced-features/zk-pok.md",
advanced_features_zk_pok
);
doctest!(
"../docs/fhe-computation/advanced-features/upgrade-key-chain.md",
upgrade_key_chain
);

// COMPUTE
doctest!(