Compare commits

...

7 Commits

Author SHA1 Message Date
Agnes Leroy 071934532e add back functions 2025-06-04 12:53:45 +02:00
Agnes Leroy 87b41ebbe5 More prints degrees 2025-06-04 12:53:44 +02:00
Agnes Leroy 7ba7dd3e19 Debug with long run tests 2025-06-04 12:53:44 +02:00
Beka Barbakadze 7f122bb435 refactor(gpu): remove debug prints 2025-06-04 12:53:32 +02:00
Beka Barbakadze 56c0811751 fix(gpu): fix ilog2 bug 2025-06-04 12:53:32 +02:00
Beka Barbakadze 580ac5aa99 fix(gpu): mul not using mem reuse 2025-06-04 12:53:32 +02:00
Beka Barbakadze cf34e9b7f0 refactor(gpu): refactor and optimize sum_ciphertext in cuda backend 2025-06-04 12:53:31 +02:00
19 changed files with 1000 additions and 528 deletions

View File

@@ -400,7 +400,8 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI *radix_lwe_vec, int8_t *mem_ptr, void *const *bsks,
CudaRadixCiphertextFFI *radix_lwe_vec, bool reduce_degrees_for_single_carry_propagation,
int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
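The new `reduce_degrees_for_single_carry_propagation` flag asks the partial sum to finish with an extra message/carry extraction pass so that every output block ends up with a degree the subsequent single-carry propagation can accept; callers that do not propagate carries afterwards pass `false` and skip that final pass. Below is a minimal standalone sketch (plain C++, 2_2-style parameters assumed, not the library code) of the chunk-size arithmetic this change relies on: with message_modulus = carry_modulus = 4, at most (4*4 - 1)/(4 - 1) = 5 blocks of degree 3 can be accumulated before a block overflows the message/carry space.

// Standalone sketch (not tfhe-rs code): how many degree-(p-1) blocks fit into
// one block before the message*carry space overflows. 2_2 parameters assumed.
#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t message_modulus = 4; // assumption: 2_2 parameters
  const uint64_t carry_modulus = 4;
  const uint64_t max_degree = message_modulus * carry_modulus - 1; // 15
  const uint64_t block_degree = message_modulus - 1;               // 3
  const uint64_t chunk_size = max_degree / block_degree;           // 5
  printf("chunk_size = %llu\n", (unsigned long long)chunk_size);
  // After summing chunk_size blocks the degree is at most 15; a PBS with the
  // x % message_modulus LUT brings it back to 3, while x / message_modulus
  // extracts the carry that is pushed to the next block.
  return 0;
}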

View File

@@ -1116,18 +1116,116 @@ template <typename Torus> struct int_overflowing_sub_memory {
};
template <typename Torus> struct int_sum_ciphertexts_vec_memory {
CudaRadixCiphertextFFI *new_blocks;
CudaRadixCiphertextFFI *new_blocks_copy;
CudaRadixCiphertextFFI *old_blocks;
CudaRadixCiphertextFFI *small_lwe_vector;
int_radix_params params;
int32_t *d_smart_copy_in;
int32_t *d_smart_copy_out;
bool mem_reuse = false;
size_t max_total_blocks_in_vec;
uint32_t num_blocks_in_radix;
uint32_t max_num_radix_in_vec;
uint64_t *size_tracker;
bool gpu_memory_allocated;
// temporary buffers
CudaRadixCiphertextFFI *current_blocks;
CudaRadixCiphertextFFI *small_lwe_vector;
uint32_t *d_columns_data;
uint32_t *d_columns_counter;
uint32_t **d_columns;
uint32_t *d_new_columns_data;
uint32_t *d_new_columns_counter;
uint32_t **d_new_columns;
uint64_t *d_degrees;
uint32_t *d_pbs_counters;
// lookup table for extracting message and carry
int_radix_lut<Torus> *luts_message_carry;
bool mem_reuse = false;
void setup_index_buffers(cudaStream_t const *streams,
uint32_t const *gpu_indexes) {
d_degrees = (uint64_t *)cuda_malloc_with_size_tracking_async(
max_total_blocks_in_vec * sizeof(uint64_t), streams[0], gpu_indexes[0],
size_tracker, gpu_memory_allocated);
d_pbs_counters = (uint32_t *)cuda_malloc_with_size_tracking_async(
3 * sizeof(uint32_t), streams[0], gpu_indexes[0], size_tracker,
gpu_memory_allocated);
auto num_blocks_in_radix = this->num_blocks_in_radix;
auto max_num_radix_in_vec = this->max_num_radix_in_vec;
auto setup_columns =
[num_blocks_in_radix, max_num_radix_in_vec, streams,
gpu_indexes](uint32_t **&columns, uint32_t *&columns_data,
uint32_t *&columns_counter, uint64_t *size_tracker,
bool gpu_memory_allocated) {
columns_data = (uint32_t *)cuda_malloc_with_size_tracking_async(
num_blocks_in_radix * max_num_radix_in_vec * sizeof(uint32_t),
streams[0], gpu_indexes[0], size_tracker, gpu_memory_allocated);
columns_counter = (uint32_t *)cuda_malloc_with_size_tracking_async(
num_blocks_in_radix * sizeof(uint32_t), streams[0],
gpu_indexes[0], size_tracker, gpu_memory_allocated);
cuda_memset_with_size_tracking_async(
columns_counter, 0, num_blocks_in_radix * sizeof(uint32_t),
streams[0], gpu_indexes[0], gpu_memory_allocated);
uint32_t **h_columns = new uint32_t *[num_blocks_in_radix];
for (int i = 0; i < num_blocks_in_radix; ++i) {
h_columns[i] = columns_data + i * max_num_radix_in_vec;
}
columns = (uint32_t **)cuda_malloc_with_size_tracking_async(
num_blocks_in_radix * sizeof(uint32_t *), streams[0],
gpu_indexes[0], size_tracker, gpu_memory_allocated);
cuda_memcpy_async_to_gpu(columns, h_columns,
num_blocks_in_radix * sizeof(uint32_t *),
streams[0], gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
delete[] h_columns;
};
setup_columns(d_columns, d_columns_data, d_columns_counter, size_tracker,
gpu_memory_allocated);
setup_columns(d_new_columns, d_new_columns_data, d_new_columns_counter,
size_tracker, gpu_memory_allocated);
}
void setup_lookup_tables(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count) {
uint32_t message_modulus = params.message_modulus;
if (!mem_reuse) {
luts_message_carry = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, params, 2, max_total_blocks_in_vec,
gpu_memory_allocated, size_tracker);
}
auto message_acc = luts_message_carry->get_lut(0, 0);
auto carry_acc = luts_message_carry->get_lut(0, 1);
// define functions for each accumulator
auto lut_f_message = [message_modulus](Torus x) -> Torus {
return x % message_modulus;
};
auto lut_f_carry = [message_modulus](Torus x) -> Torus {
return x / message_modulus;
};
// generate accumulators
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], message_acc,
luts_message_carry->get_degree(0),
luts_message_carry->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, message_modulus, params.carry_modulus,
lut_f_message, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], carry_acc,
luts_message_carry->get_degree(1),
luts_message_carry->get_max_degree(1), params.glwe_dimension,
params.polynomial_size, message_modulus, params.carry_modulus,
lut_f_carry, gpu_memory_allocated);
luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
}
int_sum_ciphertexts_vec_memory(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params params,
@@ -1136,103 +1234,84 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
bool allocate_gpu_memory,
uint64_t *size_tracker) {
this->params = params;
gpu_memory_allocated = allocate_gpu_memory;
this->mem_reuse = false;
this->max_total_blocks_in_vec = num_blocks_in_radix * max_num_radix_in_vec;
this->num_blocks_in_radix = num_blocks_in_radix;
this->max_num_radix_in_vec = max_num_radix_in_vec;
this->gpu_memory_allocated = allocate_gpu_memory;
this->size_tracker = size_tracker;
int max_pbs_count = num_blocks_in_radix * max_num_radix_in_vec;
setup_index_buffers(streams, gpu_indexes);
setup_lookup_tables(streams, gpu_indexes, gpu_count);
// allocate gpu memory for intermediate buffers
new_blocks = new CudaRadixCiphertextFFI;
// create and allocate intermediate buffers
current_blocks = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], new_blocks, max_pbs_count,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
new_blocks_copy = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], new_blocks_copy, max_pbs_count,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
old_blocks = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], old_blocks, max_pbs_count,
streams[0], gpu_indexes[0], current_blocks, max_total_blocks_in_vec,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
small_lwe_vector = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], small_lwe_vector, max_pbs_count,
streams[0], gpu_indexes[0], small_lwe_vector, max_total_blocks_in_vec,
params.small_lwe_dimension, size_tracker, allocate_gpu_memory);
d_smart_copy_in = (int32_t *)cuda_malloc_with_size_tracking_async(
max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0],
size_tracker, allocate_gpu_memory);
d_smart_copy_out = (int32_t *)cuda_malloc_with_size_tracking_async(
max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0],
size_tracker, allocate_gpu_memory);
cuda_memset_with_size_tracking_async(
d_smart_copy_in, 0, max_pbs_count * sizeof(int32_t), streams[0],
gpu_indexes[0], allocate_gpu_memory);
cuda_memset_with_size_tracking_async(
d_smart_copy_out, 0, max_pbs_count * sizeof(int32_t), streams[0],
gpu_indexes[0], allocate_gpu_memory);
}
int_sum_ciphertexts_vec_memory(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params params, uint32_t num_blocks_in_radix,
uint32_t max_num_radix_in_vec, CudaRadixCiphertextFFI *new_blocks,
CudaRadixCiphertextFFI *old_blocks,
CudaRadixCiphertextFFI *small_lwe_vector, bool allocate_gpu_memory,
uint32_t max_num_radix_in_vec, CudaRadixCiphertextFFI *current_blocks,
CudaRadixCiphertextFFI *small_lwe_vector,
int_radix_lut<Torus> *reused_lut, bool allocate_gpu_memory,
uint64_t *size_tracker) {
mem_reuse = true;
gpu_memory_allocated = allocate_gpu_memory;
this->mem_reuse = true;
this->params = params;
this->max_total_blocks_in_vec = num_blocks_in_radix * max_num_radix_in_vec;
this->num_blocks_in_radix = num_blocks_in_radix;
this->max_num_radix_in_vec = max_num_radix_in_vec;
this->gpu_memory_allocated = allocate_gpu_memory;
this->size_tracker = size_tracker;
int max_pbs_count = num_blocks_in_radix * max_num_radix_in_vec;
// assign gpu memory for intermediate buffers
this->new_blocks = new_blocks;
this->old_blocks = old_blocks;
this->current_blocks = current_blocks;
this->small_lwe_vector = small_lwe_vector;
new_blocks_copy = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], new_blocks_copy, max_pbs_count,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
d_smart_copy_in = (int32_t *)cuda_malloc_with_size_tracking_async(
max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0],
size_tracker, allocate_gpu_memory);
d_smart_copy_out = (int32_t *)cuda_malloc_with_size_tracking_async(
max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0],
size_tracker, allocate_gpu_memory);
cuda_memset_with_size_tracking_async(
d_smart_copy_in, 0, max_pbs_count * sizeof(int32_t), streams[0],
gpu_indexes[0], allocate_gpu_memory);
cuda_memset_with_size_tracking_async(
d_smart_copy_out, 0, max_pbs_count * sizeof(int32_t), streams[0],
gpu_indexes[0], allocate_gpu_memory);
this->luts_message_carry = reused_lut;
setup_index_buffers(streams, gpu_indexes);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
cuda_drop_with_size_tracking_async(d_smart_copy_in, streams[0],
cuda_drop_with_size_tracking_async(d_degrees, streams[0], gpu_indexes[0],
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_pbs_counters, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_smart_copy_out, streams[0],
cuda_drop_with_size_tracking_async(d_columns_data, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_columns_counter, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_columns, streams[0], gpu_indexes[0],
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_new_columns_data, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_new_columns_counter, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_new_columns, streams[0],
gpu_indexes[0], gpu_memory_allocated);
if (!mem_reuse) {
release_radix_ciphertext_async(streams[0], gpu_indexes[0], new_blocks,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], old_blocks,
release_radix_ciphertext_async(streams[0], gpu_indexes[0], current_blocks,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0],
small_lwe_vector, gpu_memory_allocated);
luts_message_carry->release(streams, gpu_indexes, gpu_count);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
delete new_blocks;
delete old_blocks;
delete current_blocks;
delete small_lwe_vector;
delete luts_message_carry;
}
release_radix_ciphertext_async(streams[0], gpu_indexes[0], new_blocks_copy,
gpu_memory_allocated);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
delete new_blocks_copy;
}
};
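The refactored int_sum_ciphertexts_vec_memory drops the old new/old/copy block buffers and smart-copy index arrays in favour of a single current_blocks buffer plus per-column bookkeeping: d_columns_data is a flat num_blocks_in_radix x max_num_radix_in_vec index array, d_columns holds one device pointer per column, d_columns_counter tracks how many live blocks each column currently has, and the d_new_columns* triple is the double-buffered counterpart swapped in after each round. A minimal host-side sketch of the same layout, assuming nothing beyond the standard library:

// Host-side sketch of the column layout built by setup_index_buffers
// (illustration only; the real buffers live in GPU memory).
#include <cstdint>
#include <vector>

struct Columns {
  std::vector<uint32_t> data;      // num_columns * max_len flat storage
  std::vector<uint32_t *> columns; // one pointer per column into `data`
  std::vector<uint32_t> counter;   // live entries per column

  Columns(uint32_t num_columns, uint32_t max_len)
      : data(num_columns * max_len, 0), columns(num_columns),
        counter(num_columns, 0) {
    for (uint32_t c = 0; c < num_columns; ++c)
      columns[c] = data.data() + c * max_len;
  }
};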
// For sequential algorithm in group propagation
template <typename Torus> struct int_seq_group_prop_memory {
@@ -2549,7 +2628,7 @@ template <typename Torus> struct int_mul_memory {
// radix_lwe_left except the last blocks of each shift
int msb_vector_block_count = num_radix_blocks * (num_radix_blocks - 1) / 2;
int total_block_count = lsb_vector_block_count + msb_vector_block_count;
int total_block_count = num_radix_blocks * num_radix_blocks;
// allocate memory for intermediate buffers
vector_result_sb = new CudaRadixCiphertextFFI;
@@ -2562,13 +2641,13 @@ template <typename Torus> struct int_mul_memory {
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
small_lwe_vector = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], small_lwe_vector, total_block_count,
streams[0], gpu_indexes[0], small_lwe_vector, 2 * total_block_count,
params.small_lwe_dimension, size_tracker, allocate_gpu_memory);
// create int_radix_lut objects for lsb, msb, message, carry
// luts_array -> lut = {lsb_acc, msb_acc}
luts_array = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count,
params, 2, total_block_count,
params, 2, 2 * total_block_count,
allocate_gpu_memory, size_tracker);
auto lsb_acc = luts_array->get_lut(0, 0);
auto msb_acc = luts_array->get_lut(0, 1);
@@ -2602,9 +2681,10 @@ template <typename Torus> struct int_mul_memory {
luts_array->broadcast_lut(streams, gpu_indexes, 0);
// create memory object for sum ciphertexts
sum_ciphertexts_mem = new int_sum_ciphertexts_vec_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
2 * num_radix_blocks, block_mul_res, vector_result_sb, small_lwe_vector,
2 * num_radix_blocks, vector_result_sb, small_lwe_vector, luts_array,
allocate_gpu_memory, size_tracker);
uint32_t uses_carry = 0;
uint32_t requested_flag = outputFlag::FLAG_NONE;
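In int_mul_memory the temporaries are now sized for the full n x n block product: total_block_count becomes num_radix_blocks * num_radix_blocks, small_lwe_vector and luts_array are doubled (presumably to leave room for both the message and the carry extraction of one batched LUT application), and sum_ciphertexts_mem reuses vector_result_sb, small_lwe_vector and luts_array instead of owning separate buffers. A quick sanity check of the block-count identity; the lsb count n(n+1)/2 is an inference, only the msb formula appears in the hunk above:

// Sketch: the lsb/msb shifted-product block counts add up to n^2.
#include <cassert>
#include <cstdio>

int main() {
  for (int n = 1; n <= 16; ++n) {
    int msb = n * (n - 1) / 2;
    int lsb = n * (n + 1) / 2; // assumption, not shown in the diff
    assert(lsb + msb == n * n);
  }
  printf("lsb + msb == n*n for n = 1..16\n");
  return 0;
}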

View File

@@ -520,8 +520,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
if (num_radix_blocks > lut->num_blocks)
PANIC("Cuda error: num radix blocks on which lut is applied should be "
"smaller or equal to the number of lut radix blocks")
if (num_radix_blocks > lwe_array_out->num_radix_blocks ||
num_radix_blocks > lwe_array_in->num_radix_blocks)
if (num_radix_blocks > lwe_array_out->num_radix_blocks)
PANIC("Cuda error: num radix blocks on which lut is applied should be "
"smaller or equal to the number of input & output radix blocks")

View File

@@ -226,7 +226,8 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI *radix_lwe_vec, int8_t *mem_ptr, void *const *bsks,
CudaRadixCiphertextFFI *radix_lwe_vec, bool reduce_degrees_for_single_carry_propagation,
int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
@@ -234,64 +235,59 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
if (radix_lwe_vec->num_radix_blocks % radix_lwe_out->num_radix_blocks != 0)
PANIC("Cuda error: input vector length should be a multiple of the "
"output's number of radix blocks")
// FIXME: this should not be necessary, we should make sure sum_ctxt works in
// the general case
for (int i = 0; i < radix_lwe_vec->num_radix_blocks; i++) {
radix_lwe_vec->degrees[i] = mem->params.message_modulus - 1;
}
switch (mem->params.polynomial_size) {
case 512:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<512>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, mem,
radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
nullptr);
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
break;
case 1024:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
AmortizedDegree<1024>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, mem,
radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
nullptr);
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
break;
case 2048:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
AmortizedDegree<2048>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, mem,
radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
nullptr);
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
break;
case 4096:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
AmortizedDegree<4096>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, mem,
radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
nullptr);
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
break;
case 8192:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
AmortizedDegree<8192>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, mem,
radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
nullptr);
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
break;
case 16384:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
AmortizedDegree<16384>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, mem,
radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
nullptr);
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
break;
default:
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "

View File

@@ -20,28 +20,11 @@
#include <fstream>
#include <iostream>
#include <omp.h>
#include <queue>
#include <sstream>
#include <string>
#include <vector>
template <typename Torus>
__global__ void smart_copy(Torus *dst, Torus *src, int32_t *id_out,
int32_t *id_in, size_t lwe_size) {
size_t tid = threadIdx.x;
size_t b_id = blockIdx.x;
size_t stride = blockDim.x;
auto input_id = id_in[b_id];
auto output_id = id_out[b_id];
auto cur_src = (input_id >= 0) ? &src[input_id * lwe_size] : nullptr;
auto cur_dst = &dst[output_id * lwe_size];
for (int i = tid; i < lwe_size; i += stride) {
cur_dst[i] = (input_id >= 0) ? cur_src[i] : 0;
}
}
template <typename Torus, class params>
__global__ void
all_shifted_lhs_rhs(Torus const *radix_lwe_left, Torus *lsb_ciphertext,
@@ -94,33 +77,173 @@ all_shifted_lhs_rhs(Torus const *radix_lwe_left, Torus *lsb_ciphertext,
}
}
template <typename Torus>
__global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
uint32_t chunk_size, uint32_t block_size,
uint32_t num_blocks) {
__global__ inline void radix_vec_to_columns(uint32_t *const *const columns,
uint32_t *const columns_counter,
const uint64_t *const degrees,
const uint32_t num_radix_blocks,
const uint32_t num_radix_in_vec) {
size_t stride = blockDim.x;
size_t chunk_id = blockIdx.x;
size_t chunk_elem_size = chunk_size * num_blocks * block_size;
size_t radix_elem_size = num_blocks * block_size;
auto src_chunk = &input_blocks[chunk_id * chunk_elem_size];
auto dst_radix = &result_blocks[chunk_id * radix_elem_size];
size_t block_stride = blockIdx.y * block_size;
auto result = &dst_radix[block_stride];
// init shared mem with first radix of chunk
size_t tid = threadIdx.x;
for (int i = tid; i < block_size; i += stride) {
result[i] = src_chunk[block_stride + i];
}
// accumulate rest of the radixes
for (int r_id = 1; r_id < chunk_size; r_id++) {
auto cur_src_radix = &src_chunk[r_id * radix_elem_size];
for (int i = tid; i < block_size; i += stride) {
result[i] += cur_src_radix[block_stride + i];
const uint32_t idx = threadIdx.x;
size_t cnt = 0;
for (int i = 0; i < num_radix_in_vec; i++) {
size_t ct_id = i * num_radix_blocks + idx;
if (degrees[ct_id] != 0) {
columns[idx][cnt] = ct_id;
++cnt;
}
}
columns_counter[idx] = cnt;
}
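radix_vec_to_columns is launched with one thread per radix position (one column per thread); each thread walks the vector of radixes and records, in its column, the flat indices of the blocks whose degree is non-zero, so blocks that are trivially zero never take part in the accumulation. A host-side equivalent, using only the standard library:

// CPU sketch of radix_vec_to_columns: degrees is laid out as
// [radix_0 block_0..block_{n-1}, radix_1 block_0.., ...].
#include <cstdint>
#include <vector>

std::vector<std::vector<uint32_t>>
radix_vec_to_columns_host(const std::vector<uint64_t> &degrees,
                          uint32_t num_radix_blocks,
                          uint32_t num_radix_in_vec) {
  std::vector<std::vector<uint32_t>> columns(num_radix_blocks);
  for (uint32_t col = 0; col < num_radix_blocks; ++col)
    for (uint32_t r = 0; r < num_radix_in_vec; ++r) {
      uint32_t ct_id = r * num_radix_blocks + col;
      if (degrees[ct_id] != 0)
        columns[col].push_back(ct_id);
    }
  return columns;
}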
template <typename Torus>
__global__ inline void prepare_new_columns_and_pbs_indexes(
uint32_t *const *const new_columns, uint32_t *const new_columns_counter,
Torus *const pbs_indexes_in, Torus *const pbs_indexes_out,
Torus *const lut_indexes, uint32_t *const pbs_counters,
const uint32_t *const *const columns, const uint32_t *const columns_counter,
const uint32_t chunk_size) {
__shared__ uint32_t counter, sharedOr;
if (threadIdx.x == 0) {
counter = 0;
sharedOr = 0;
}
__syncthreads();
const uint32_t base_id = threadIdx.x;
const uint32_t column_len = columns_counter[base_id];
uint32_t ct_count = 0;
for (uint32_t i = 0; i + chunk_size <= column_len; i += chunk_size) {
// these indexes are for message ciphertexts
// for message ciphertexts the in and out indexes are the same
const uint32_t in_index = columns[base_id][i];
new_columns[base_id][ct_count] = in_index;
const uint32_t pbs_index = atomicAdd(&counter, 1);
pbs_indexes_in[pbs_index] = in_index;
pbs_indexes_out[pbs_index] = in_index;
lut_indexes[pbs_index] = 0;
++ct_count;
}
// ct1 ct2 ct3
// pbs_indexes: 0, 1, 2
// pbs_indexes: 2, 1, 0
__syncthreads();
uint32_t message_count = counter;
if (base_id > 0) {
const uint32_t prev_base_id = base_id - 1;
const uint32_t prev_column_len = columns_counter[prev_base_id];
for (uint32_t i = 0; i + chunk_size <= prev_column_len; i += chunk_size) {
// these indexes are for carry ciphertexts
// for carry ciphertexts the input is the same as for the message
// the output is placed in the next block of the column
const uint32_t in_index = columns[prev_base_id][i];
const uint32_t out_index = columns[prev_base_id][i + 1];
new_columns[base_id][ct_count] = out_index;
const uint32_t pbs_index = atomicAdd(&counter, 1);
pbs_indexes_in[pbs_index] = in_index;
pbs_indexes_out[pbs_index] = out_index;
lut_indexes[pbs_index] = 1;
++ct_count;
}
}
const uint32_t start_index = column_len - column_len % chunk_size;
for (uint32_t i = start_index; i < column_len; ++i) {
new_columns[base_id][ct_count] = columns[base_id][i];
++ct_count;
}
new_columns_counter[base_id] = ct_count;
if (ct_count > chunk_size) {
atomicOr(&sharedOr, 1);
}
__syncthreads();
if (threadIdx.x == 0) {
pbs_counters[0] = counter;
pbs_counters[1] = message_count;
pbs_counters[2] = sharedOr;
}
}
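For every full chunk in a column, prepare_new_columns_and_pbs_indexes schedules two PBS on the chunk's accumulated first block: a message extraction written back in place (LUT 0) and a carry extraction written to the chunk's second block, which is then handed over to the column one position to the right (LUT 1); blocks left over after the last full chunk are carried into the new column untouched. The shared counters report the total PBS count, the message-only count (the keyswitch only needs the message inputs, since each carry PBS reads the same keyswitched input as its message), and whether another round is needed. A sequential sketch of the same bookkeeping, under the simplifying assumption that PBS slots are assigned in column order rather than by atomics:

// Sequential sketch of one round of message/carry PBS scheduling.
// columns[] holds flat block indices, one vector per radix position.
#include <cstdint>
#include <vector>

struct PbsPlan {
  std::vector<uint32_t> in, out, lut;          // one entry per scheduled PBS
  std::vector<std::vector<uint32_t>> new_cols; // columns for the next round
};

PbsPlan plan_round(const std::vector<std::vector<uint32_t>> &columns,
                   uint32_t chunk_size) {
  PbsPlan p;
  p.new_cols.resize(columns.size());
  for (size_t col = 0; col < columns.size(); ++col) {
    const auto &cur = columns[col];
    // message extractions: in place on the first block of each full chunk
    for (size_t i = 0; i + chunk_size <= cur.size(); i += chunk_size) {
      p.in.push_back(cur[i]); p.out.push_back(cur[i]); p.lut.push_back(0);
      p.new_cols[col].push_back(cur[i]);
    }
    // carry extractions: read the previous column's chunk, write into that
    // chunk's second block, which is re-assigned to this column
    if (col > 0) {
      const auto &prev = columns[col - 1];
      for (size_t i = 0; i + chunk_size <= prev.size(); i += chunk_size) {
        p.in.push_back(prev[i]); p.out.push_back(prev[i + 1]); p.lut.push_back(1);
        p.new_cols[col].push_back(prev[i + 1]);
      }
    }
    // leftover blocks (fewer than chunk_size) survive untouched
    for (size_t i = cur.size() - cur.size() % chunk_size; i < cur.size(); ++i)
      p.new_cols[col].push_back(cur[i]);
  }
  return p;
}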
template <typename Torus>
__global__ inline void prepare_final_pbs_indexes(
Torus *const pbs_indexes_in, Torus *const pbs_indexes_out,
Torus *const lut_indexes, const uint32_t num_radix_blocks) {
int idx = threadIdx.x;
pbs_indexes_in[idx] = idx % num_radix_blocks;
pbs_indexes_out[idx] = idx + idx / num_radix_blocks;
lut_indexes[idx] = idx / num_radix_blocks;
}
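prepare_final_pbs_indexes schedules the optional final message/carry pass over the collapsed result: with num_radix_blocks inputs it builds 2 * num_radix_blocks PBS, the first half re-extracting the message of block i into block i, the second half extracting the carry of block i into block num_radix_blocks + i + 1, so the shifted slice addition at the end of the host function adds each carry to the next result block (block num_radix_blocks itself is zeroed so result block 0 gets no carry). A tiny worked example for four blocks:

// idx:  0 1 2 3 4 5 6 7   (num_radix_blocks = 4, kernel launched with 8 threads)
// in  = idx % 4        -> 0 1 2 3 0 1 2 3
// out = idx + idx / 4  -> 0 1 2 3 5 6 7 8   (carry of block i lands at 5 + i)
// lut = idx / 4        -> 0 0 0 0 1 1 1 1   (0 = message LUT, 1 = carry LUT)
#include <cstdio>
int main() {
  const int num_radix_blocks = 4;
  for (int idx = 0; idx < 2 * num_radix_blocks; ++idx)
    printf("idx=%d in=%d out=%d lut=%d\n", idx, idx % num_radix_blocks,
           idx + idx / num_radix_blocks, idx / num_radix_blocks);
  return 0;
}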
template <typename Torus>
__global__ void calculate_chunks(Torus *const input_blocks,
const uint32_t *const *const columns,
const uint32_t *const columns_counter,
const uint32_t chunk_size,
const uint32_t block_size) {
const uint32_t part_size = blockDim.x;
const uint32_t base_id = blockIdx.x;
const uint32_t part_id = blockIdx.y;
const uint32_t coef_id = part_id * part_size + threadIdx.x;
if (coef_id >= block_size)
return;
const uint32_t column_len = columns_counter[base_id];
if (column_len >= chunk_size) {
const uint32_t num_chunks = column_len / chunk_size;
Torus result = 0;
for (uint32_t chunk_id = 0; chunk_id < num_chunks; ++chunk_id) {
const uint32_t first_ct_id = columns[base_id][chunk_id * chunk_size];
result = input_blocks[first_ct_id * block_size + coef_id];
for (uint32_t ct_id = 1; ct_id < chunk_size; ++ct_id) {
const uint32_t cur_ct_id =
columns[base_id][chunk_id * chunk_size + ct_id];
result += input_blocks[cur_ct_id * block_size + coef_id];
}
input_blocks[first_ct_id * block_size + coef_id] = result;
}
}
}
template <typename Torus>
__global__ void calculate_final_chunk_into_radix(
Torus *const out_radix, const Torus *const input_blocks,
const uint32_t *const *const columns, const uint32_t *const columns_counter,
const uint32_t chunk_size, const uint32_t block_size) {
const uint32_t part_size = blockDim.x;
const uint32_t base_id = blockIdx.x;
const uint32_t part_id = blockIdx.y;
const uint32_t coef_id = part_id * part_size + threadIdx.x;
if (coef_id >= block_size)
return;
const uint32_t column_len = columns_counter[base_id];
Torus result = 0;
if (column_len) {
const uint32_t first_ct_id = columns[base_id][0];
result = input_blocks[first_ct_id * block_size + coef_id];
for (uint32_t i = 1; i < column_len; ++i) {
const uint32_t cur_ct_it = columns[base_id][i];
result += input_blocks[cur_ct_it * block_size + coef_id];
}
}
out_radix[base_id * block_size + coef_id] = result;
}
template <typename Torus, class params>
@@ -167,6 +290,65 @@ __global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
(process_msb) ? cur_msb_ct[params::degree] : 0;
}
}
inline bool at_least_one_column_needs_processing(
const uint64_t *const degrees, const uint32_t num_radix_blocks,
const uint32_t num_radix_in_vec, const uint32_t chunk_size) {
std::vector<uint32_t> columns_count(num_radix_blocks, 0);
for (size_t column = 0; column < num_radix_blocks; ++column) {
for (size_t block = 0; block < num_radix_in_vec; ++block) {
const size_t block_index = block * num_radix_blocks + column;
if (degrees[block_index]) {
columns_count[column]++;
if (columns_count[column] > chunk_size) {
return true;
}
}
}
}
return false;
}
inline void calculate_final_degrees(uint64_t *const out_degrees,
const uint64_t *const input_degrees,
size_t num_blocks, size_t num_radix_in_vec,
size_t chunk_size,
uint64_t message_modulus) {
auto get_degree = [message_modulus](uint64_t degree) -> uint64_t {
return std::min(message_modulus - 1, degree);
};
std::vector<std::queue<uint64_t>> columns(num_blocks);
for (size_t i = 0; i < num_radix_in_vec; ++i) {
for (size_t j = 0; j < num_blocks; ++j) {
if (input_degrees[i * num_blocks + j])
columns[j].push(input_degrees[i * num_blocks + j]);
}
}
for (size_t i = 0; i < num_blocks; ++i) {
auto &col = columns[i];
while (col.size() > 1) {
uint32_t cur_degree = 0;
size_t mn = std::min(chunk_size, col.size());
for (int j = 0; j < mn; ++j) {
cur_degree += col.front();
col.pop();
}
const uint64_t new_degree = get_degree(cur_degree);
col.push(new_degree);
if ((i + 1) < num_blocks) {
columns[i + 1].push(new_degree);
}
}
}
for (int i = 0; i < num_blocks; i++) {
out_degrees[i] = (columns[i].empty()) ? 0 : columns[i].front();
}
}
template <typename Torus>
__host__ uint64_t scratch_cuda_integer_partial_sum_ciphertexts_vec_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -181,15 +363,107 @@ __host__ uint64_t scratch_cuda_integer_partial_sum_ciphertexts_vec_kb(
return size_tracker;
}
__global__ inline void DEBUG_PRINT_COLUMNS(uint32_t *const *const columns,
uint32_t *const columns_counter,
const uint32_t num_radix_blocks) {
printf("cuda_columns_counter:\n");
for (int i = 0; i < num_radix_blocks; i++) {
printf("%d ", columns_counter[i]);
}
printf("\n");
printf("cuda_columns:\n");
for (int i = 0; i < num_radix_blocks; i++) {
printf("column[%d]: ", i);
for (int j = 0; j < columns_counter[i]; j++)
{
printf("%d ", columns[i][j]);
}
printf("\n");
}
printf("\n");
}
__global__ inline void DEBUG_PRINT_COLUMNS_DATA(uint32_t *const *const columns,
uint32_t *const columns_counter,
uint64_t* data,
const uint32_t num_radix_blocks, size_t lwe_size) {
uint64_t delta = 576460752303423488ULL;
__syncthreads();
printf("cuda_new_columns:\n");
__syncthreads();
for (int i = 0; i < num_radix_blocks; i++) {
__syncthreads();
printf("column[%d]: ", i);
__syncthreads();
for (int j = 0; j < columns_counter[i]; j++)
{
__syncthreads();
auto cur_data =data[ columns[i][j] * lwe_size + lwe_size - 1];
cur_data /= delta;
printf("%llu ", cur_data);
__syncthreads();
}
__syncthreads();
printf("\n");
__syncthreads();
}
__syncthreads();
printf("\n");
__syncthreads();
}
template<typename Torus, bool input, bool clear>
__global__ inline void DEBUG_PRINT_PBS_DATA(Torus * data, Torus* input_indexes, Torus*
output_indexes, Torus *lut_indexes, size_t lwe_size, int num) {
printf("input_output_indexes: \n");
for (int i = 0; i < num; i++) {
auto input_val = data[input_indexes[i] * lwe_size + lwe_size -1];
auto output_val = data[output_indexes[i] * lwe_size + lwe_size -1];
auto val = input ? input_val : output_val;
auto val_clear = clear ? val / 576460752303423488ULL : val;
printf("%d %lu %lu %lu %lu %lu\n", i, input_indexes[i], output_indexes[i], lut_indexes[i],
val_clear, val);
}
}
//template<typename Torus>
//__global__ inline void DEBUG_PRINT_RADIX(Torus * data, size_t num_blocks, size_t lwe_size) {
// for (int i = 0; i < num_blocks; i++) {
// auto val = data[i * lwe_size + lwe_size - 1];
// auto val_clear = val / 576460752303423488ULL;
// printf("cuda_partial_sum_result: %lu %lu\n", val, val_clear);
// }
//}
template <typename Torus, class params>
__host__ void host_integer_partial_sum_ciphertexts_vec_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI *terms, void *const *bsks, uint64_t *const *ksks,
CudaRadixCiphertextFFI *terms, bool reduce_degrees_for_single_carry_propagation,
void *const *bsks, uint64_t *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr,
uint32_t num_radix_blocks, uint32_t num_radix_in_vec,
int_radix_lut<Torus> *reused_lut) {
uint32_t num_radix_blocks, uint32_t num_radix_in_vec) {
// cudaDeviceSynchronize();
// print_body<Torus>("cuda_input_partial_sum", (Torus*)terms->ptr, num_radix_blocks * num_radix_in_vec,
// 2048,
// 576460752303423488ULL);
// for (int i = 0; i <num_radix_blocks * num_radix_in_vec; i++ ) {
// printf("cuda_input_degrees: %d\n", terms->degrees[i]);
// }
// cudaDeviceSynchronize();
auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension;
auto big_lwe_size = big_lwe_dimension + 1;
if (terms->lwe_dimension != radix_lwe_out->lwe_dimension)
PANIC("Cuda error: output and input radix ciphertexts should have the same "
@@ -199,22 +473,28 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
PANIC("Cuda error: input vector does not have enough blocks")
if (num_radix_blocks > radix_lwe_out->num_radix_blocks)
PANIC("Cuda error: output does not have enough blocks")
auto new_blocks = mem_ptr->new_blocks;
auto new_blocks_copy = mem_ptr->new_blocks_copy;
auto old_blocks = mem_ptr->old_blocks;
auto current_blocks = mem_ptr->current_blocks;
auto small_lwe_vector = mem_ptr->small_lwe_vector;
auto d_degrees = mem_ptr->d_degrees;
auto d_columns = mem_ptr->d_columns;
auto d_columns_counter = mem_ptr->d_columns_counter;
auto d_new_columns = mem_ptr->d_new_columns;
auto d_new_columns_counter = mem_ptr->d_new_columns_counter;
auto d_pbs_indexes_in = mem_ptr->luts_message_carry->lwe_indexes_in;
auto d_pbs_indexes_out = mem_ptr->luts_message_carry->lwe_indexes_out;
auto d_pbs_counters = mem_ptr->d_pbs_counters;
auto d_smart_copy_in = mem_ptr->d_smart_copy_in;
auto d_smart_copy_out = mem_ptr->d_smart_copy_out;
auto luts_message_carry = mem_ptr->luts_message_carry;
auto message_modulus = mem_ptr->params.message_modulus;
auto carry_modulus = mem_ptr->params.carry_modulus;
auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension;
auto big_lwe_size = big_lwe_dimension + 1;
auto glwe_dimension = mem_ptr->params.glwe_dimension;
auto polynomial_size = mem_ptr->params.polynomial_size;
auto small_lwe_dimension = mem_ptr->params.small_lwe_dimension;
auto small_lwe_size = small_lwe_dimension + 1;
auto chunk_size =
(mem_ptr->params.message_modulus * mem_ptr->params.carry_modulus - 1) /
(mem_ptr->params.message_modulus - 1);
size_t total_blocks_in_vec = num_radix_blocks * num_radix_in_vec;
// In the case of extracting a single LWE these parameters are dummy
uint32_t num_many_lut = 1;
@@ -228,244 +508,202 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
terms, 0, num_radix_blocks);
return;
}
if (old_blocks != terms) {
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], old_blocks,
terms);
}
if (num_radix_in_vec == 2) {
CudaRadixCiphertextFFI old_blocks_slice;
as_radix_ciphertext_slice<Torus>(&old_blocks_slice, old_blocks,
num_radix_blocks, 2 * num_radix_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
&old_blocks_slice, num_radix_blocks);
CudaRadixCiphertextFFI terms_slice;
as_radix_ciphertext_slice<Torus>(&terms_slice, terms, num_radix_blocks,
2 * num_radix_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, terms,
&terms_slice, num_radix_blocks);
return;
}
size_t r = num_radix_in_vec;
size_t total_modulus = message_modulus * carry_modulus;
size_t message_max = message_modulus - 1;
size_t chunk_size = (total_modulus - 1) / message_max;
size_t h_lwe_idx_in[terms->num_radix_blocks];
size_t h_lwe_idx_out[terms->num_radix_blocks];
int32_t h_smart_copy_in[terms->num_radix_blocks];
int32_t h_smart_copy_out[terms->num_radix_blocks];
/// Here it is important to query the default max shared memory on device 0
/// instead of cuda_get_max_shared_memory,
/// to avoid bugs with tree_add_chunks trying to use too much shared memory
auto max_shared_memory = 0;
check_cuda_error(cudaDeviceGetAttribute(
&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock, 0));
// create lut object for message and carry
// we allocate luts_message_carry in the host function (instead of scratch)
// to reduce average memory consumption
int_radix_lut<Torus> *luts_message_carry;
size_t ch_amount = r / chunk_size;
if (!ch_amount)
ch_amount++;
if (reused_lut == nullptr) {
luts_message_carry = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, mem_ptr->params, 2,
2 * ch_amount * num_radix_blocks, true, nullptr);
} else {
luts_message_carry = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, mem_ptr->params, 2,
2 * ch_amount * num_radix_blocks, reused_lut, true, nullptr);
if (current_blocks != terms) {
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
current_blocks, terms);
}
auto message_acc = luts_message_carry->get_lut(0, 0);
auto carry_acc = luts_message_carry->get_lut(0, 1);
// define functions for each accumulator
auto lut_f_message = [message_modulus](Torus x) -> Torus {
return x % message_modulus;
};
auto lut_f_carry = [message_modulus](Torus x) -> Torus {
return x / message_modulus;
};
cuda_memcpy_async_to_gpu(d_degrees, current_blocks->degrees,
total_blocks_in_vec * sizeof(uint64_t), streams[0],
gpu_indexes[0]);
// generate accumulators
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], message_acc,
luts_message_carry->get_degree(0), luts_message_carry->get_max_degree(0),
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
lut_f_message, true);
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], carry_acc, luts_message_carry->get_degree(1),
luts_message_carry->get_max_degree(1), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, lut_f_carry, true);
luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
cuda_set_device(gpu_indexes[0]);
radix_vec_to_columns<<<1, num_radix_blocks, 0, streams[0]>>>(
d_columns, d_columns_counter, d_degrees, num_radix_blocks,
num_radix_in_vec);
while (r > 2) {
size_t cur_total_blocks = r * num_radix_blocks;
size_t ch_amount = r / chunk_size;
if (!ch_amount)
ch_amount++;
dim3 add_grid(ch_amount, num_radix_blocks, 1);
bool needs_processing = at_least_one_column_needs_processing(
current_blocks->degrees, num_radix_blocks, num_radix_in_vec, chunk_size);
cuda_set_device(gpu_indexes[0]);
tree_add_chunks<Torus><<<add_grid, 512, 0, streams[0]>>>(
(Torus *)new_blocks->ptr, (Torus *)old_blocks->ptr,
std::min(r, chunk_size), big_lwe_size, num_radix_blocks);
int number_of_threads = min(256, params::degree);
int part_count = (big_lwe_size + number_of_threads - 1) / number_of_threads;
const dim3 number_of_blocks_2d(num_radix_blocks, part_count, 1);
check_cuda_error(cudaGetLastError());
// h_pbs_counters[0] - total ciphertexts
// h_pbs_counters[1] - message ciphertexts
// h_pbs_counters[2] - at_least_one_column_needs_processing
uint32_t *h_pbs_counters;
cudaMallocHost((void **)&h_pbs_counters, 3 * sizeof(uint32_t));
if (mem_ptr->mem_reuse) {
mem_ptr->setup_lookup_tables(streams, gpu_indexes, gpu_count);
}
size_t total_count = 0;
size_t message_count = 0;
size_t carry_count = 0;
size_t sm_copy_count = 0;
generate_ids_update_degrees(
terms->degrees, h_lwe_idx_in, h_lwe_idx_out, h_smart_copy_in,
h_smart_copy_out, ch_amount, r, num_radix_blocks, chunk_size,
message_max, total_count, message_count, carry_count, sm_copy_count);
auto lwe_indexes_in = luts_message_carry->lwe_indexes_in;
auto lwe_indexes_out = luts_message_carry->lwe_indexes_out;
luts_message_carry->set_lwe_indexes(streams[0], gpu_indexes[0],
h_lwe_idx_in, h_lwe_idx_out);
while (needs_processing) {
// cudaDeviceSynchronize();
// DEBUG_PRINT_COLUMNS<<<1, 1, 0, streams[0]>>>(d_columns, d_columns_counter, num_radix_blocks);
// DEBUG_PRINT_COLUMNS_DATA<<<1, 1, 0, streams[0]>>>(d_columns, d_columns_counter, (uint64_t *)
// (current_blocks->ptr), num_radix_blocks, big_lwe_size);
calculate_chunks<Torus>
<<<number_of_blocks_2d, number_of_threads, 0, streams[0]>>>(
(Torus *)(current_blocks->ptr), d_columns, d_columns_counter,
chunk_size, big_lwe_size);
size_t copy_size = sm_copy_count * sizeof(int32_t);
cuda_memcpy_async_to_gpu(d_smart_copy_in, h_smart_copy_in, copy_size,
streams[0], gpu_indexes[0]);
cuda_memcpy_async_to_gpu(d_smart_copy_out, h_smart_copy_out, copy_size,
streams[0], gpu_indexes[0]);
prepare_new_columns_and_pbs_indexes<<<1, num_radix_blocks, 0, streams[0]>>>(
d_new_columns, d_new_columns_counter, d_pbs_indexes_in,
d_pbs_indexes_out, luts_message_carry->get_lut_indexes(0, 0),
d_pbs_counters, d_columns, d_columns_counter, chunk_size);
// inside d_smart_copy_in there are only -1 values
// it's fine to call smart_copy with same pointer
// as source and destination
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], new_blocks_copy, 0, r * num_radix_blocks,
new_blocks, 0, r * num_radix_blocks);
smart_copy<Torus><<<sm_copy_count, 1024, 0, streams[0]>>>(
(Torus *)new_blocks->ptr, (Torus *)new_blocks_copy->ptr,
d_smart_copy_out, d_smart_copy_in, big_lwe_size);
check_cuda_error(cudaGetLastError());
cuda_memcpy_async_to_cpu(h_pbs_counters, d_pbs_counters,
3 * sizeof(uint32_t), streams[0], gpu_indexes[0]);
if (carry_count > 0)
cuda_set_value_async<Torus>(
streams[0], gpu_indexes[0],
luts_message_carry->get_lut_indexes(0, message_count), 1,
carry_count);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
const uint32_t total_ciphertexts = h_pbs_counters[0];
const uint32_t total_messages = h_pbs_counters[1];
needs_processing = (h_pbs_counters[2] != 0);
auto active_gpu_count = get_active_gpu_count(total_ciphertexts, gpu_count);
// DEBUG_PRINT_PBS_DATA<Torus, true, true><<<1, 1, 0, streams[0]>>>(
// (Torus *)(current_blocks->ptr), d_pbs_indexes_in, d_pbs_indexes_out,
// luts_message_carry->get_lut_indexes(0, 0), big_lwe_size, total_ciphertexts
// );
/// For multi GPU execution we create vectors of pointers for inputs and
/// outputs
std::vector<Torus *> new_blocks_vec = luts_message_carry->lwe_array_in_vec;
std::vector<Torus *> small_lwe_vector_vec =
luts_message_carry->lwe_after_ks_vec;
std::vector<Torus *> lwe_after_pbs_vec =
luts_message_carry->lwe_after_pbs_vec;
std::vector<Torus *> lwe_trivial_indexes_vec =
luts_message_carry->lwe_trivial_indexes_vec;
auto active_gpu_count = get_active_gpu_count(total_count, gpu_count);
if (active_gpu_count == 1) {
/// Apply KS to go from a big LWE dimension to a small LWE dimension
/// After this keyswitch execution, we need to synchronize the streams
/// because the keyswitch and PBS do not operate on the same number of
/// inputs
execute_keyswitch_async<Torus>(
streams, gpu_indexes, 1, (Torus *)small_lwe_vector->ptr,
lwe_indexes_in, (Torus *)new_blocks->ptr, lwe_indexes_in, ksks,
polynomial_size * glwe_dimension, small_lwe_dimension,
mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, message_count);
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs_async<Torus>(
streams, gpu_indexes, 1, (Torus *)new_blocks->ptr, lwe_indexes_out,
luts_message_carry->lut_vec, luts_message_carry->lut_indexes_vec,
(Torus *)small_lwe_vector->ptr, lwe_indexes_in, bsks,
ms_noise_reduction_key, luts_message_carry->buffer, glwe_dimension,
small_lwe_dimension, polynomial_size, mem_ptr->params.pbs_base_log,
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
total_count, mem_ptr->params.pbs_type, num_many_lut, lut_stride);
} else {
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, new_blocks_vec,
(Torus *)new_blocks->ptr, luts_message_carry->h_lwe_indexes_in,
luts_message_carry->using_trivial_lwe_indexes, message_count,
big_lwe_size);
/// Apply KS to go from a big LWE dimension to a small LWE dimension
/// After this keyswitch execution, we need to synchronize the streams
/// because the keyswitch and PBS do not operate on the same number of
/// inputs
execute_keyswitch_async<Torus>(
streams, gpu_indexes, active_gpu_count, small_lwe_vector_vec,
lwe_trivial_indexes_vec, new_blocks_vec, lwe_trivial_indexes_vec,
d_pbs_indexes_in, (Torus *)current_blocks->ptr, d_pbs_indexes_in,
ksks, big_lwe_dimension, small_lwe_dimension,
mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, total_count);
mem_ptr->params.ks_base_log, mem_ptr->params.ks_level,
total_messages);
/// Copy data back to GPU 0, rebuild the lwe array, and scatter again on a
/// different configuration
multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, gpu_count, (Torus *)small_lwe_vector->ptr,
small_lwe_vector_vec, luts_message_carry->h_lwe_indexes_in,
luts_message_carry->using_trivial_lwe_indexes, message_count,
small_lwe_size);
/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, gpu_count, small_lwe_vector_vec,
(Torus *)small_lwe_vector->ptr, luts_message_carry->h_lwe_indexes_in,
luts_message_carry->using_trivial_lwe_indexes, total_count,
small_lwe_size);
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, luts_message_carry->lut_vec,
luts_message_carry->lut_indexes_vec, small_lwe_vector_vec,
lwe_trivial_indexes_vec, bsks, ms_noise_reduction_key,
streams, gpu_indexes, 1, (Torus *)current_blocks->ptr,
d_pbs_indexes_out, luts_message_carry->lut_vec,
luts_message_carry->lut_indexes_vec, (Torus *)small_lwe_vector->ptr,
d_pbs_indexes_in, bsks, ms_noise_reduction_key,
luts_message_carry->buffer, glwe_dimension, small_lwe_dimension,
polynomial_size, mem_ptr->params.pbs_base_log,
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
total_count, mem_ptr->params.pbs_type, num_many_lut, lut_stride);
total_ciphertexts, mem_ptr->params.pbs_type, num_many_lut,
lut_stride);
} else {
cuda_memcpy_async_to_cpu(luts_message_carry->h_lwe_indexes_in,
luts_message_carry->lwe_indexes_in,
total_ciphertexts * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_memcpy_async_to_cpu(luts_message_carry->h_lwe_indexes_out,
luts_message_carry->lwe_indexes_out,
total_ciphertexts * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, (Torus *)new_blocks->ptr,
lwe_after_pbs_vec, luts_message_carry->h_lwe_indexes_out,
luts_message_carry->using_trivial_lwe_indexes, total_count,
big_lwe_size);
/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}
for (uint i = 0; i < total_count; i++) {
auto degrees_index = luts_message_carry->h_lut_indexes[i];
new_blocks->degrees[i] = luts_message_carry->degrees[degrees_index];
new_blocks->noise_levels[i] = NoiseLevel::NOMINAL;
luts_message_carry->using_trivial_lwe_indexes = false;
luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, active_gpu_count, current_blocks,
current_blocks, bsks, ksks, ms_noise_reduction_key,
luts_message_carry, total_ciphertexts);
}
cuda_set_device(gpu_indexes[0]);
// DEBUG_PRINT_PBS_DATA<Torus, false, true><<<1, 1, 0, streams[0]>>>(
// (Torus *)(current_blocks->ptr), d_pbs_indexes_in, d_pbs_indexes_out,
// luts_message_carry->get_lut_indexes(0, 0), big_lwe_size, total_ciphertexts
// );
int rem_blocks = (r > chunk_size) ? r % chunk_size * num_radix_blocks : 0;
int new_blocks_created = 2 * ch_amount * num_radix_blocks;
if (rem_blocks > 0)
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], new_blocks, new_blocks_created,
new_blocks_created + rem_blocks, old_blocks,
cur_total_blocks - rem_blocks, cur_total_blocks);
std::swap(new_blocks, old_blocks);
r = (new_blocks_created + rem_blocks) / num_radix_blocks;
std::swap(d_columns, d_new_columns);
std::swap(d_columns_counter, d_new_columns_counter);
}
luts_message_carry->release(streams, gpu_indexes, gpu_count);
delete (luts_message_carry);
CudaRadixCiphertextFFI old_blocks_slice;
as_radix_ciphertext_slice<Torus>(&old_blocks_slice, old_blocks,
num_radix_blocks, 2 * num_radix_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
&old_blocks_slice, num_radix_blocks);
cudaFreeHost(h_pbs_counters);
calculate_final_chunk_into_radix<Torus>
<<<number_of_blocks_2d, number_of_threads, 0, streams[0]>>>(
(Torus *)(radix_lwe_out->ptr), (Torus *)(current_blocks->ptr),
d_columns, d_columns_counter, chunk_size, big_lwe_size);
if (reduce_degrees_for_single_carry_propagation) {
prepare_final_pbs_indexes<Torus><<<1, 2 * num_radix_blocks, 0, streams[0]>>>(
d_pbs_indexes_in, d_pbs_indexes_out,
luts_message_carry->get_lut_indexes(0, 0), num_radix_blocks);
cuda_memset_async(
(Torus *)(current_blocks->ptr) + big_lwe_size * num_radix_blocks, 0,
big_lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]);
auto active_gpu_count = get_active_gpu_count(2 * num_radix_blocks, gpu_count);
if (active_gpu_count == 1) {
execute_keyswitch_async<Torus>(
streams, gpu_indexes, 1, (Torus *)small_lwe_vector->ptr,
d_pbs_indexes_in, (Torus *)radix_lwe_out->ptr, d_pbs_indexes_in, ksks,
big_lwe_dimension, small_lwe_dimension, mem_ptr->params.ks_base_log,
mem_ptr->params.ks_level, num_radix_blocks);
execute_pbs_async<Torus>(
streams, gpu_indexes, 1, (Torus *)current_blocks->ptr,
d_pbs_indexes_out, luts_message_carry->lut_vec,
luts_message_carry->lut_indexes_vec, (Torus *)small_lwe_vector->ptr,
d_pbs_indexes_in, bsks, ms_noise_reduction_key,
luts_message_carry->buffer, glwe_dimension, small_lwe_dimension,
polynomial_size, mem_ptr->params.pbs_base_log,
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
2 * num_radix_blocks, mem_ptr->params.pbs_type, num_many_lut,
lut_stride);
} else {
cuda_memcpy_async_to_cpu(luts_message_carry->h_lwe_indexes_in,
luts_message_carry->lwe_indexes_in,
2 * num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_memcpy_async_to_cpu(luts_message_carry->h_lwe_indexes_out,
luts_message_carry->lwe_indexes_out,
2 * num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
luts_message_carry->using_trivial_lwe_indexes = false;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, active_gpu_count, current_blocks, radix_lwe_out,
bsks, ksks, ms_noise_reduction_key, luts_message_carry,
2 * num_radix_blocks);
}
// cudaDeviceSynchronize();
// print_body<Torus>("cuda_before_add", (Torus*)radix_lwe_out->ptr, num_radix_blocks, 2048,
// 576460752303423488ULL);
// cudaDeviceSynchronize();
calculate_final_degrees(radix_lwe_out->degrees, terms->degrees,
num_radix_blocks, num_radix_in_vec, chunk_size,
mem_ptr->params.message_modulus);
cuda_set_device(gpu_indexes[0]);
CudaRadixCiphertextFFI current_blocks_slice;
as_radix_ciphertext_slice<Torus>(&current_blocks_slice, current_blocks,
num_radix_blocks, 2 * num_radix_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out,
current_blocks, &current_blocks_slice, num_radix_blocks);
// printf("add_happened\n");
}
// cudaDeviceSynchronize();
//
// print_body<Torus>("cuda_out_after_add", (Torus*)radix_lwe_out->ptr, num_radix_blocks, 2048,
// 576460752303423488ULL);
// cudaDeviceSynchronize();
}
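Taken together, the host function now works as follows: build columns of non-zero blocks, then, while any column holds more than chunk_size blocks, accumulate each column's chunks into the chunk's first block and replace the chunk by its extracted message plus a carry pushed to the next column, and finally fold whatever remains in each column into the output radix, with an optional extra message/carry pass when the result must feed single-carry propagation. The standalone sketch below replays the same control flow on plain integers and checks it against the naive sum; it illustrates the reduction schedule, not the encrypted arithmetic, and assumes 2_2-style parameters.

// Cleartext replay of the column reduction used by
// host_integer_partial_sum_ciphertexts_vec_kb (illustration only).
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const uint64_t msg_mod = 4;                                      // assumption
  const uint64_t carry_mod = 4;                                    // assumption
  const uint64_t chunk_size = (msg_mod * carry_mod - 1) / (msg_mod - 1); // 5
  const uint32_t num_blocks = 4, num_radix = 7;                    // toy sizes
  std::vector<std::vector<uint64_t>> cols(num_blocks);
  uint64_t expected = 0;
  for (uint32_t r = 0; r < num_radix; ++r)
    for (uint32_t b = 0; b < num_blocks; ++b) {
      uint64_t v = (r + b) % msg_mod;  // arbitrary cleartext block values
      expected += v << (2 * b);        // base-4 positional weight
      if (v) cols[b].push_back(v);     // zero blocks never enter a column
    }
  auto needs_processing = [&] {
    for (auto &c : cols)
      if (c.size() > chunk_size) return true;
    return false;
  };
  while (needs_processing()) {
    std::vector<std::vector<uint64_t>> next(num_blocks);
    for (uint32_t b = 0; b < num_blocks; ++b) {
      size_t i = 0;
      for (; i + chunk_size <= cols[b].size(); i += chunk_size) {
        uint64_t sum = 0;
        for (size_t j = 0; j < chunk_size; ++j) sum += cols[b][i + j];
        next[b].push_back(sum % msg_mod);                       // message PBS
        if (b + 1 < num_blocks) next[b + 1].push_back(sum / msg_mod); // carry PBS
      }
      for (; i < cols[b].size(); ++i) next[b].push_back(cols[b][i]); // leftovers
    }
    cols.swap(next);
  }
  // Final fold of each column, with a sequential carry propagation standing in
  // for the final PBS pass plus host_propagate_single_carry.
  uint64_t result = 0, carry = 0;
  for (uint32_t b = 0; b < num_blocks; ++b) {
    uint64_t sum = carry;
    for (uint64_t v : cols[b]) sum += v;
    result += (sum % msg_mod) << (2 * b);
    carry = sum / msg_mod;
  }
  assert(result == expected % (1ULL << (2 * num_blocks)));
  printf("column reduction matches the naive sum\n");
  return 0;
}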
template <typename Torus, class params>
@@ -599,14 +837,28 @@ __host__ void host_integer_mult_radix_kb(
size_t b_id = i % num_blocks;
terms_degree_msb[i] = (b_id > r_id) ? message_modulus - 2 : 0;
}
for (int i = 0; i < num_blocks * 2 * num_blocks; i++)
{
auto cur_ptr = (Torus*)vector_result_sb->ptr;
cur_ptr += i * 2049 + 2048;
print_debug<Torus>("", cur_ptr, 1);
}
for (int i = 0; i < num_blocks * 2 * num_blocks; i++) {
printf("%d\n", vector_result_sb->degrees[i]);
}
host_integer_partial_sum_ciphertexts_vec_kb<Torus, params>(
streams, gpu_indexes, gpu_count, radix_lwe_out, vector_result_sb, bsks,
streams, gpu_indexes, gpu_count, radix_lwe_out, vector_result_sb, true, bsks,
ksks, ms_noise_reduction_key, mem_ptr->sum_ciphertexts_mem, num_blocks,
2 * num_blocks, mem_ptr->luts_array);
2 * num_blocks);
auto scp_mem_ptr = mem_ptr->sc_prop_mem;
uint32_t requested_flag = outputFlag::FLAG_NONE;
uint32_t uses_carry = 0;
host_propagate_single_carry<Torus>(
streams, gpu_indexes, gpu_count, radix_lwe_out, nullptr, nullptr,
scp_mem_ptr, bsks, ksks, ms_noise_reduction_key, requested_flag,

View File

@@ -115,13 +115,10 @@ __host__ void host_integer_scalar_mul_radix(
set_zero_radix_ciphertext_slice_async<T>(streams[0], gpu_indexes[0],
lwe_array, 0, num_radix_blocks);
} else {
for (int i = 0; i < j * num_radix_blocks; i++) {
all_shifted_buffer->degrees[i] = message_modulus - 1;
}
host_integer_partial_sum_ciphertexts_vec_kb<T, params>(
streams, gpu_indexes, gpu_count, lwe_array, all_shifted_buffer, bsks,
streams, gpu_indexes, gpu_count, lwe_array, all_shifted_buffer, true, bsks,
ksks, ms_noise_reduction_key, mem->sum_ciphertexts_vec_mem,
num_radix_blocks, j, nullptr);
num_radix_blocks, j);
auto scp_mem_ptr = mem->sc_prop_mem;
uint32_t requested_flag = outputFlag::FLAG_NONE;

View File

@@ -37,18 +37,21 @@ template <typename T> void print_debug(const char *name, const T *src, int N) {
printf("\n");
}
template <typename T>
__global__ void print_body_kernel(T *src, int N, int lwe_dimension) {
__global__ void print_body_kernel(T *src, int N, int lwe_dimension, T delta) {
for (int i = 0; i < N; i++) {
printf("%lu, ", src[i * (lwe_dimension + 1) + lwe_dimension]);
T body = src[i * (lwe_dimension + 1) + lwe_dimension];
T clear = body / delta;
printf("(%lu, %lu), ", body, clear);
}
}
template <typename T>
void print_body(const char *name, T *src, int n, int lwe_dimension) {
void print_body(const char *name, T *src, int n, int lwe_dimension, T delta) {
printf("%s: ", name);
cudaDeviceSynchronize();
print_body_kernel<<<1, 1>>>(src, n, lwe_dimension);
print_body_kernel<<<1, 1>>>(src, n, lwe_dimension, delta);
cudaDeviceSynchronize();
printf("\n");
}

View File

@@ -1018,6 +1018,7 @@ unsafe extern "C" {
gpu_count: u32,
radix_lwe_out: *mut CudaRadixCiphertextFFI,
radix_lwe_vec: *mut CudaRadixCiphertextFFI,
reduce_degrees_for_single_carry_propagation: bool,
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,

View File

@@ -4008,6 +4008,7 @@ pub unsafe fn unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async<
streams: &CudaStreams,
result: &mut CudaRadixCiphertext,
radix_list: &mut CudaRadixCiphertext,
reduce_degrees_for_single_carry_propagation: bool,
bootstrapping_key: &CudaVec<B>,
keyswitch_key: &CudaVec<T>,
message_modulus: MessageModulus,
@@ -4102,6 +4103,7 @@ pub unsafe fn unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async<
streams.len() as u32,
&raw mut cuda_ffi_result,
&raw mut cuda_ffi_radix_list,
reduce_degrees_for_single_carry_propagation,
mem_ptr,
bootstrapping_key.ptr.as_ptr(),
keyswitch_key.ptr.as_ptr(),

View File

@@ -337,6 +337,7 @@ impl CudaServerKey {
&self,
result: &mut T,
ciphertexts: &[T],
reduce_degrees_for_single_carry_propagation: bool,
streams: &CudaStreams,
) {
if ciphertexts.is_empty() {
@@ -377,6 +378,7 @@ impl CudaServerKey {
streams,
result.as_mut(),
&mut terms,
reduce_degrees_for_single_carry_propagation,
&d_bsk.d_vec,
&self.key_switching_key.d_vec,
self.message_modulus,
@@ -402,6 +404,7 @@ impl CudaServerKey {
streams,
result.as_mut(),
&mut terms,
reduce_degrees_for_single_carry_propagation,
&d_multibit_bsk.d_vec,
&self.key_switching_key.d_vec,
self.message_modulus,
@@ -445,7 +448,7 @@ impl CudaServerKey {
streams: &CudaStreams,
) -> T {
let mut result = self
.unchecked_partial_sum_ciphertexts_async(ciphertexts, streams)
.unchecked_partial_sum_ciphertexts_async(ciphertexts, true, streams)
.unwrap();
self.propagate_single_carry_assign_async(&mut result, streams, None, OutputFlag::None);
@@ -458,7 +461,8 @@ impl CudaServerKey {
ciphertexts: &[T],
streams: &CudaStreams,
) -> Option<T> {
let result = unsafe { self.unchecked_partial_sum_ciphertexts_async(ciphertexts, streams) };
let result =
unsafe { self.unchecked_partial_sum_ciphertexts_async(ciphertexts, false, streams) };
streams.synchronize();
result
}
@@ -470,6 +474,7 @@ impl CudaServerKey {
pub unsafe fn unchecked_partial_sum_ciphertexts_async<T: CudaIntegerRadixCiphertext>(
&self,
ciphertexts: &[T],
reduce_degrees_for_single_carry_propagation: bool,
streams: &CudaStreams,
) -> Option<T> {
if ciphertexts.is_empty() {
@@ -483,7 +488,8 @@ impl CudaServerKey {
return Some(result);
}
self.unchecked_partial_sum_ciphertexts_assign_async(&mut result, ciphertexts, streams);
self.unchecked_partial_sum_ciphertexts_assign_async(&mut result, ciphertexts,
reduce_degrees_for_single_carry_propagation, streams);
Some(result)
}

View File

@@ -93,8 +93,7 @@ impl CudaServerKey {
},
);
let mut output_cts: T =
self.create_trivial_zero_radix_async(num_ct_blocks * num_ct_blocks, streams);
let mut output_cts: T = self.create_trivial_zero_radix_async(num_ct_blocks, streams);
self.compute_prefix_sum_hillis_steele_async(
output_cts.as_mut(),
@@ -455,7 +454,7 @@ impl CudaServerKey {
cts.push(new_trivial);
let result = self
.unchecked_partial_sum_ciphertexts_async(&cts, streams)
.unchecked_partial_sum_ciphertexts_async(&cts, false, streams)
.expect("internal error, empty ciphertext count");
// This is the part where we extract message and carry blocks
@@ -497,28 +496,6 @@ impl CudaServerKey {
.as_mut_slice(0..lwe_size, 0)
.unwrap();
let mut carry_blocks_last = carry_blocks
.as_mut()
.d_blocks
.0
.d_vec
.as_mut_slice(
lwe_size * (counter_num_blocks - 1)..lwe_size * counter_num_blocks,
0,
)
.unwrap();
carry_blocks_last.copy_from_gpu_async(&trivial_last_block_slice, streams, 0);
carry_blocks.as_mut().info.blocks.last_mut().unwrap().degree =
Degree(self.message_modulus.0 - 1);
carry_blocks
.as_mut()
.info
.blocks
.last_mut()
.unwrap()
.noise_level = NoiseLevel::ZERO;
self.apply_lookup_table_async(
carry_blocks.as_mut(),
result.as_ref(),
@@ -527,10 +504,43 @@ impl CudaServerKey {
streams,
);
let mut rotated_carry_blocks: CudaSignedRadixCiphertext =
self.create_trivial_zero_radix(counter_num_blocks, streams);
let mut rotated_slice = rotated_carry_blocks
.as_mut()
.d_blocks
.0
.d_vec
.as_mut_slice(0..(counter_num_blocks) * lwe_size, 0)
.unwrap();
let first_block;
let last_blocks;
(first_block, last_blocks) = rotated_slice.split_at_mut(lwe_size, 0);
let mut tmp_carry_blocks3 = carry_blocks.duplicate(streams);
let carry_slice = tmp_carry_blocks3
.as_mut()
.d_blocks
.0
.d_vec
.as_mut_slice(0..(counter_num_blocks - 1) * lwe_size, 0)
.unwrap();
last_blocks
.unwrap()
.copy_from_gpu_async(&carry_slice, streams, 0);
first_block
.unwrap()
.copy_from_gpu_async(&trivial_last_block_slice, streams, 0);
let mut ciphertexts = Vec::<CudaSignedRadixCiphertext>::with_capacity(3);
for mut info in &mut rotated_carry_blocks.ciphertext.info.blocks {
info.degree = Degree(self.message_modulus.0 - 1);
}
ciphertexts.push(message_blocks);
ciphertexts.push(carry_blocks);
ciphertexts.push(rotated_carry_blocks);
let trivial_ct: CudaSignedRadixCiphertext =
self.create_trivial_radix_async(2u32, counter_num_blocks, streams);

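The slice juggling above effectively rotates the carry vector right by one block, since a carry produced at block i must be counted at block i + 1, with a trivial block taking position 0. A host-side sketch of that intent, on clear values only (the diff itself does this with GPU slice copies):

```rust
// Illustration only, not the GPU code above: shift carries right by one block,
// with a trivial (zero) block in the first position.
fn rotate_carries_right(carries: &[u64]) -> Vec<u64> {
    let mut rotated = vec![0u64; carries.len()];
    // copy carries[0..n-1] into rotated[1..n]; rotated[0] stays trivial
    rotated[1..].copy_from_slice(&carries[..carries.len() - 1]);
    rotated
}

fn main() {
    assert_eq!(rotate_carries_right(&[1, 2, 3, 0]), vec![0, 1, 2, 3]);
}
```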
View File

@@ -1321,6 +1321,14 @@ impl CudaServerKey {
.unwrap();
let mut generates_or_propagates_degrees = vec![0; num_blocks];
let mut generates_or_propagates_noise_levels = vec![0; num_blocks];
for (i, block_index) in (block_range.clone()).enumerate() {
generates_or_propagates_degrees[i] =
generates_or_propagates.info.blocks[block_index].degree.0;
generates_or_propagates_noise_levels[i] = generates_or_propagates.info.blocks
[block_index]
.noise_level
.0;
}
let ct_modulus = output.d_blocks.ciphertext_modulus().raw_modulus_float();
let mut output_slice = output
.d_blocks

View File

@@ -19,29 +19,29 @@ where
P: Into<TestParameters> + Clone,
{
// Binary Ops Executors
let add_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::add);
let sub_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::sub);
//let add_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::add);
//let sub_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::sub);
let bitwise_and_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::bitand);
let bitwise_or_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::bitor);
let bitwise_xor_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::bitxor);
let mul_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::mul);
let max_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::max);
let min_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::min);
//let max_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::max);
//let min_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::min);
// Binary Ops Clear functions
let clear_add = |x, y| x + y;
let clear_sub = |x, y| x - y;
//let clear_add = |x, y| x + y;
//let clear_sub = |x, y| x - y;
let clear_bitwise_and = |x, y| x & y;
let clear_bitwise_or = |x, y| x | y;
let clear_bitwise_xor = |x, y| x ^ y;
let clear_mul = |x, y| x * y;
let clear_max = |x: i64, y: i64| max(x, y);
let clear_min = |x: i64, y: i64| min(x, y);
//let clear_max = |x: i64, y: i64| max(x, y);
//let clear_min = |x: i64, y: i64| min(x, y);
#[allow(clippy::type_complexity)]
let mut binary_ops: Vec<(SignedBinaryOpExecutor, &dyn Fn(i64, i64) -> i64, String)> = vec![
(Box::new(add_executor), &clear_add, "add".to_string()),
(Box::new(sub_executor), &clear_sub, "sub".to_string()),
//(Box::new(add_executor), &clear_add, "add".to_string()),
//(Box::new(sub_executor), &clear_sub, "sub".to_string()),
(
Box::new(bitwise_and_executor),
&clear_bitwise_and,
@@ -58,14 +58,14 @@ where
"bitxor".to_string(),
),
(Box::new(mul_executor), &clear_mul, "mul".to_string()),
(Box::new(max_executor), &clear_max, "max".to_string()),
(Box::new(min_executor), &clear_min, "min".to_string()),
//(Box::new(max_executor), &clear_max, "max".to_string()),
//(Box::new(min_executor), &clear_min, "min".to_string()),
];
let rotate_left_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::rotate_left);
let left_shift_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::left_shift);
let rotate_right_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::rotate_right);
let right_shift_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::right_shift);
//let left_shift_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::left_shift);
//let rotate_right_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::rotate_right);
//let right_shift_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::right_shift);
// Warning this rotate definition only works with 64-bit ciphertexts
let clear_rotate_left = |x: i64, y: u64| x.rotate_left(y as u32);
let clear_left_shift = |x: i64, y: u64| x << y;
@@ -83,21 +83,21 @@ where
&clear_rotate_left,
"rotate left".to_string(),
),
(
Box::new(left_shift_executor),
&clear_left_shift,
"left shift".to_string(),
),
(
Box::new(rotate_right_executor),
&clear_rotate_right,
"rotate right".to_string(),
),
(
Box::new(right_shift_executor),
&clear_right_shift,
"right shift".to_string(),
),
//(
// Box::new(left_shift_executor),
// &clear_left_shift,
// "left shift".to_string(),
//),
//(
// Box::new(rotate_right_executor),
// &clear_rotate_right,
// "rotate right".to_string(),
//),
//(
// Box::new(right_shift_executor),
// &clear_right_shift,
// "right shift".to_string(),
//),
];
// Unary Ops Executors
@@ -125,8 +125,8 @@ where
];
// Scalar binary Ops Executors
let scalar_add_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_add);
let scalar_sub_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_sub);
//let scalar_add_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_add);
//let scalar_sub_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_sub);
let scalar_bitwise_and_executor =
GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_bitand);
let scalar_bitwise_or_executor =
@@ -141,16 +141,16 @@ where
&dyn Fn(i64, i64) -> i64,
String,
)> = vec![
(
Box::new(scalar_add_executor),
&clear_add,
"scalar add".to_string(),
),
(
Box::new(scalar_sub_executor),
&clear_sub,
"scalar sub".to_string(),
),
//(
// Box::new(scalar_add_executor),
// &clear_add,
// "scalar add".to_string(),
//),
//(
// Box::new(scalar_sub_executor),
// &clear_sub,
// "scalar sub".to_string(),
//),
(
Box::new(scalar_bitwise_and_executor),
&clear_bitwise_and,
@@ -175,12 +175,12 @@ where
let scalar_rotate_left_executor =
GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_rotate_left);
let scalar_left_shift_executor =
GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_left_shift);
let scalar_rotate_right_executor =
GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_rotate_right);
let scalar_right_shift_executor =
GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_right_shift);
//let scalar_left_shift_executor =
// GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_left_shift);
//let scalar_rotate_right_executor =
// GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_rotate_right);
//let scalar_right_shift_executor =
// GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_right_shift);
#[allow(clippy::type_complexity)]
let mut scalar_shift_rotate_ops: Vec<(
SignedScalarShiftRotateExecutor,
@@ -192,21 +192,21 @@ where
&clear_rotate_left,
"scalar rotate left".to_string(),
),
(
Box::new(scalar_left_shift_executor),
&clear_left_shift,
"scalar left shift".to_string(),
),
(
Box::new(scalar_rotate_right_executor),
&clear_rotate_right,
"scalar rotate right".to_string(),
),
(
Box::new(scalar_right_shift_executor),
&clear_right_shift,
"scalar right shift".to_string(),
),
//(
// Box::new(scalar_left_shift_executor),
// &clear_left_shift,
// "scalar left shift".to_string(),
//),
//(
// Box::new(scalar_rotate_right_executor),
// &clear_rotate_right,
// "scalar rotate right".to_string(),
//),
//(
// Box::new(scalar_right_shift_executor),
// &clear_right_shift,
// "scalar right shift".to_string(),
//),
];
// Overflowing Ops Executors
@@ -271,11 +271,11 @@ where
// Comparison Ops Executors
let gt_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::gt);
let ge_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::ge);
let lt_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::lt);
let le_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::le);
let eq_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::eq);
let ne_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::ne);
//let ge_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::ge);
//let lt_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::lt);
//let le_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::le);
//let eq_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::eq);
//let ne_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::ne);
// Comparison Ops Clear functions
let clear_gt = |x: i64, y: i64| -> bool { x > y };
@@ -292,20 +292,20 @@ where
String,
)> = vec![
(Box::new(gt_executor), &clear_gt, "gt".to_string()),
(Box::new(ge_executor), &clear_ge, "ge".to_string()),
(Box::new(lt_executor), &clear_lt, "lt".to_string()),
(Box::new(le_executor), &clear_le, "le".to_string()),
(Box::new(eq_executor), &clear_eq, "eq".to_string()),
(Box::new(ne_executor), &clear_ne, "ne".to_string()),
//(Box::new(ge_executor), &clear_ge, "ge".to_string()),
//(Box::new(lt_executor), &clear_lt, "lt".to_string()),
//(Box::new(le_executor), &clear_le, "le".to_string()),
//(Box::new(eq_executor), &clear_eq, "eq".to_string()),
//(Box::new(ne_executor), &clear_ne, "ne".to_string()),
];
// Scalar Comparison Ops Executors
let scalar_gt_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_gt);
let scalar_ge_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_ge);
let scalar_lt_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_lt);
let scalar_le_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_le);
let scalar_eq_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_eq);
let scalar_ne_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_ne);
//let scalar_ge_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_ge);
//let scalar_lt_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_lt);
//let scalar_le_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_le);
//let scalar_eq_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_eq);
//let scalar_ne_executor = GpuMultiDeviceFunctionExecutor::new(&CudaServerKey::scalar_ne);
#[allow(clippy::type_complexity)]
let mut scalar_comparison_ops: Vec<(
@@ -318,31 +318,31 @@ where
&clear_gt,
"scalar gt".to_string(),
),
(
Box::new(scalar_ge_executor),
&clear_ge,
"scalar ge".to_string(),
),
(
Box::new(scalar_lt_executor),
&clear_lt,
"scalar lt".to_string(),
),
(
Box::new(scalar_le_executor),
&clear_le,
"scalar le".to_string(),
),
(
Box::new(scalar_eq_executor),
&clear_eq,
"scalar eq".to_string(),
),
(
Box::new(scalar_ne_executor),
&clear_ne,
"scalar ne".to_string(),
),
//(
// Box::new(scalar_ge_executor),
// &clear_ge,
// "scalar ge".to_string(),
//),
//(
// Box::new(scalar_lt_executor),
// &clear_lt,
// "scalar lt".to_string(),
//),
//(
// Box::new(scalar_le_executor),
// &clear_le,
// "scalar le".to_string(),
//),
//(
// Box::new(scalar_eq_executor),
// &clear_eq,
// "scalar eq".to_string(),
//),
//(
// Box::new(scalar_ne_executor),
// &clear_ne,
// "scalar ne".to_string(),
//),
];
// Select Executor

View File

@@ -289,6 +289,11 @@ impl ServerKey {
.unchecked_partial_sum_ciphertexts_vec_parallelized(cts, None)
.expect("internal error, empty ciphertext count");
for block in &result.blocks {
let val_clear = block.ct.get_body().data / 576460752303423488u64;
println!("cpu_after_first_partial_sum: {:?} {:?} {:?}", block.ct.get_body().data,
val_clear, block.degree.0);
}
// This is the part where we extract message and carry blocks
// while inverting their bits
let (message_blocks, carry_blocks) = rayon::join(
@@ -336,6 +341,11 @@ impl ServerKey {
)
.unwrap();
for block in &result.blocks {
let val_clear = block.ct.get_body().data / 576460752303423488u64;
println!("cpu_after_last_sum: {:?} {:?} {:?}", block.ct.get_body().data,
val_clear, block.degree.0);
}
self.cast_to_unsigned(result, counter_num_blocks)
}
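
The magic constant 576460752303423488 in these debug prints is the plaintext scaling factor Δ. Assuming the usual 2_2 block parameters (message_modulus = 4, carry_modulus = 4, one padding bit over a 64-bit torus), Δ = 2^64 / (2 · 4 · 4) = 2^59; the parameter values are an assumption, not something stated in the diff:

```rust
// Sketch of where the debug constant comes from (assumed default 2_2 parameters).
fn main() {
    let message_modulus: u64 = 4;
    let carry_modulus: u64 = 4;
    // 2^63 / (message_modulus * carry_modulus) == 2^64 / (2 * msg * carry)
    let delta = (1u64 << 63) / (message_modulus * carry_modulus);
    assert_eq!(delta, 576_460_752_303_423_488);
    // the prints above recover an approximate clear value by dividing the raw body by delta
    let body = 3 * delta;
    println!("clear value ≈ {}", body / delta); // 3
}
```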

View File

@@ -450,8 +450,27 @@ impl ServerKey {
return;
}
// for block in lhs.blocks() {
// println!("lhs_degrees: {:?}", block.degree)
// }
//
// for block in rhs.blocks() {
// println!("rhs_degrees: {:?}", block.degree)
// }
//
let terms = self.compute_terms_for_mul_low(lhs, rhs);
println!("cpu_terms");
for radix_ct in &terms {
for block in radix_ct.blocks() {
println!("{:?}", block.ct.get_body().data);
}
}
for radix_ct in &terms {
for block in radix_ct.blocks() {
println!("{:?}", block.degree.0);
}
}
if let Some(result) = self.unchecked_sum_ciphertexts_vec_parallelized(terms) {
*lhs = result;
} else {

View File

@@ -1,5 +1,5 @@
use crate::integer::ciphertext::IntegerRadixCiphertext;
use crate::integer::{BooleanBlock, IntegerCiphertext, RadixCiphertext, ServerKey};
use crate::integer::{BooleanBlock, IntegerCiphertext, RadixCiphertext, ServerKey, SignedRadixCiphertext};
use crate::shortint::ciphertext::Degree;
use crate::shortint::Ciphertext;
use rayon::prelude::*;
@@ -158,6 +158,17 @@ impl ServerKey {
{
let mut result =
self.unchecked_partial_sum_ciphertexts_vec_parallelized(ciphertexts, None)?;
println!("cpu_after_sum_ct");
for block in result.blocks() {
println!("{:?}", block.ct.get_body().data)
}
println!("cpu_after_sum_clear");
for block in result.blocks() {
let body = block.ct.get_body().data;
let delta = 576460752303423488u64;
println!("{:?}", body / delta)
}
self.full_propagate_parallelized(&mut result);
assert!(result.block_carries_are_empty());
@@ -198,6 +209,14 @@ impl ServerKey {
}
});
for radix in &ciphertexts {
//let radix_ct : SignedRadixCiphertext = radix;
for block in radix.blocks() {
let val_clear = block.ct.get_body().data / 576460752303423488u64;
println!("cpu_input_sum: {:?} {:?} {:?}", block.ct.get_body().data,
val_clear, block.degree.0);
}
}
self.unchecked_sum_ciphertexts_vec_parallelized(ciphertexts)
}

View File

@@ -3,4 +3,4 @@ pub(crate) mod test_random_op_sequence;
pub(crate) mod test_signed_erc20;
pub(crate) mod test_signed_random_op_sequence;
pub(crate) const NB_CTXT_LONG_RUN: usize = 32;
pub(crate) const NB_TESTS_LONG_RUN: usize = 20000;
pub(crate) const NB_TESTS_LONG_RUN: usize = 200;

View File

@@ -578,16 +578,19 @@ pub(crate) fn random_op_sequence_test<P>(
"Noise level greater than nominal value on op {fn_name} for block {k}",
)
});
// Determinism check
let res_1 = binary_op_executor.execute((&left_vec[i], &right_vec[i]));
assert_eq!(
res, res_1,
"Determinism check failed on binary op {fn_name} with clear inputs {clear_left} and {clear_right}.",
);
let input_degrees_left: Vec<u64> =
left_vec[i].blocks.iter().map(|b| b.degree.0).collect();
let input_degrees_right: Vec<u64> =
right_vec[i].blocks.iter().map(|b| b.degree.0).collect();
let output_degrees: Vec<u64> =
res.blocks.iter().map(|b| b.degree.0).collect();
// Determinism check
let res_1 = binary_op_executor.execute((&left_vec[i], &right_vec[i]));
assert_eq!(
res, res_1,
"Determinism check failed on binary op {fn_name} with clear inputs {clear_left} and {clear_right} with input degrees {input_degrees_left:?} and {input_degrees_right:?}",
);
println!("Input degrees left: {input_degrees_left:?}, right {input_degrees_right:?}, Output degrees {:?}", output_degrees);
let decrypted_res: u64 = cks.decrypt(&res);
let expected_res: u64 = clear_fn(clear_left, clear_right);
@@ -633,13 +636,14 @@ pub(crate) fn random_op_sequence_test<P>(
"Noise level greater than nominal value on op {fn_name} for block {k}",
)
});
let input_degrees: Vec<u64> = input.blocks.iter().map(|b| b.degree.0).collect();
let output_degrees: Vec<u64> = res.blocks.iter().map(|b| b.degree.0).collect();
// Determinism check
let res_1 = unary_op_executor.execute(input);
assert_eq!(
res, res_1,
"Determinism check failed on unary op {fn_name} with clear input {clear_input}.",
"Determinism check failed on unary op {fn_name} with clear input {clear_input} with input degrees {input_degrees:?}.",
);
let input_degrees: Vec<u64> = input.blocks.iter().map(|b| b.degree.0).collect();
let decrypted_res: u64 = cks.decrypt(&res);
let expected_res: u64 = clear_fn(clear_input);
if i % 2 == 0 {
@@ -650,6 +654,7 @@ pub(crate) fn random_op_sequence_test<P>(
clear_right_vec[j] = expected_res;
}
println!("Input degrees: {input_degrees:?}, Output degrees {:?}", output_degrees);
// Correctness check
assert_eq!(
decrypted_res, expected_res,
@@ -675,16 +680,18 @@ pub(crate) fn random_op_sequence_test<P>(
"Noise level greater than nominal value on op {fn_name} for block {k}",
)
});
let input_degrees_left: Vec<u64> =
left_vec[i].blocks.iter().map(|b| b.degree.0).collect();
let output_degrees: Vec<u64> =
res.blocks.iter().map(|b| b.degree.0).collect();
// Determinism check
let res_1 = scalar_binary_op_executor.execute((&left_vec[i], clear_right_vec[i]));
assert_eq!(
res, res_1,
"Determinism check failed on binary op {fn_name} with clear inputs {clear_left} and {clear_right}.",
"Determinism check failed on binary op {fn_name} with clear inputs {clear_left} and {clear_right} with input degrees {input_degrees_left:?}.",
);
let decrypted_res: u64 = cks.decrypt(&res);
let expected_res: u64 = clear_fn(clear_left, clear_right);
let input_degrees_left: Vec<u64> =
left_vec[i].blocks.iter().map(|b| b.degree.0).collect();
if i % 2 == 0 {
left_vec[j] = res.clone();
@@ -693,6 +700,7 @@ pub(crate) fn random_op_sequence_test<P>(
right_vec[j] = res.clone();
clear_right_vec[j] = expected_res;
}
println!("Input degrees: {input_degrees_left:?}, Output degrees {:?}", output_degrees);
// Correctness check
assert_eq!(
@@ -738,6 +746,9 @@ pub(crate) fn random_op_sequence_test<P>(
left_vec[i].blocks.iter().map(|b| b.degree.0).collect();
let input_degrees_right: Vec<u64> =
right_vec[i].blocks.iter().map(|b| b.degree.0).collect();
let output_degrees: Vec<u64> =
res.blocks.iter().map(|b| b.degree.0).collect();
println!("Input degrees left: {input_degrees_left:?}, right {input_degrees_right:?}, Output degrees {:?}", output_degrees);
let decrypted_res: u64 = cks.decrypt(&res);
let decrypted_overflow = cks.decrypt_bool(&overflow);
let (expected_res, expected_overflow) = clear_fn(clear_left, clear_right);
@@ -787,6 +798,9 @@ pub(crate) fn random_op_sequence_test<P>(
);
let input_degrees_left: Vec<u64> =
left_vec[i].blocks.iter().map(|b| b.degree.0).collect();
let output_degrees: Vec<u64> =
res.blocks.iter().map(|b| b.degree.0).collect();
println!("Input degrees: {input_degrees_left:?}, Output degrees {:?}", output_degrees);
// Determinism check
let (res_1, overflow_1) =
scalar_overflowing_op_executor.execute((&left_vec[i], clear_right_vec[i]));
@@ -929,6 +943,9 @@ pub(crate) fn random_op_sequence_test<P>(
left_vec[i].blocks.iter().map(|b| b.degree.0).collect();
let input_degrees_right: Vec<u64> =
right_vec[i].blocks.iter().map(|b| b.degree.0).collect();
let output_degrees: Vec<u64> =
res.blocks.iter().map(|b| b.degree.0).collect();
println!("Input degrees left: {input_degrees_left:?}, Input degrees right: {input_degrees_right:?}, Output degrees {:?}", output_degrees);
let decrypted_res: u64 = cks.decrypt(&res);
let expected_res = clear_fn(clear_bool, clear_left, clear_right);
@@ -990,6 +1007,11 @@ pub(crate) fn random_op_sequence_test<P>(
left_vec[i].blocks.iter().map(|b| b.degree.0).collect();
let input_degrees_right: Vec<u64> =
right_vec[i].blocks.iter().map(|b| b.degree.0).collect();
let output_degrees_q: Vec<u64> =
res_q.blocks.iter().map(|b| b.degree.0).collect();
let output_degrees_r: Vec<u64> =
res_r.blocks.iter().map(|b| b.degree.0).collect();
println!("Input degrees left: {input_degrees_left:?}, Input degrees right: {input_degrees_right:?}, Output degrees q {:?}, Output degrees r {:?}", output_degrees_q, output_degrees_r);
let decrypted_res_q: u64 = cks.decrypt(&res_q);
let decrypted_res_r: u64 = cks.decrypt(&res_r);
let (expected_res_q, expected_res_r) = clear_fn(clear_left, clear_right);
@@ -1056,6 +1078,11 @@ pub(crate) fn random_op_sequence_test<P>(
);
let input_degrees_left: Vec<u64> =
left_vec[i].blocks.iter().map(|b| b.degree.0).collect();
let output_degrees_r: Vec<u64> =
res_r.blocks.iter().map(|b| b.degree.0).collect();
let output_degrees_q: Vec<u64> =
res_q.blocks.iter().map(|b| b.degree.0).collect();
println!("Input degrees left: {input_degrees_left:?}, Output degrees q {:?}, Output degrees r {:?}", output_degrees_q, output_degrees_r);
let decrypted_res_q: u64 = cks.decrypt(&res_q);
let decrypted_res_r: u64 = cks.decrypt(&res_r);
let (expected_res_q, expected_res_r) = clear_fn(clear_left, clear_right);
@@ -1114,6 +1141,9 @@ pub(crate) fn random_op_sequence_test<P>(
"Determinism check failed on op {fn_name} with clear input {clear_input}.",
);
let input_degrees: Vec<u64> = input.blocks.iter().map(|b| b.degree.0).collect();
let output_degrees: Vec<u64> =
res.blocks.iter().map(|b| b.degree.0).collect();
println!("Input degrees: {input_degrees:?}, Output degrees {:?}", output_degrees);
let cast_res = sks.cast_to_unsigned(res, NB_CTXT_LONG_RUN);
let decrypted_res: u64 = cks.decrypt(&cast_res);
let expected_res = clear_fn(clear_input);

View File

@@ -670,16 +670,19 @@ pub(crate) fn signed_random_op_sequence_test<P>(
"Noise level greater than nominal value on op {fn_name} for block {k}",
)
});
// Determinism check
let res_1 = binary_op_executor.execute((&left_vec[i], &right_vec[i]));
assert_eq!(
res, res_1,
"Determinism check failed on binary op {fn_name} with clear inputs {clear_left} and {clear_right}.",
);
let input_degrees_left: Vec<u64> =
left_vec[i].blocks.iter().map(|b| b.degree.0).collect();
let input_degrees_right: Vec<u64> =
right_vec[i].blocks.iter().map(|b| b.degree.0).collect();
let output_degrees: Vec<u64> =
res.blocks.iter().map(|b| b.degree.0).collect();
// Determinism check
let res_1 = binary_op_executor.execute((&left_vec[i], &right_vec[i]));
assert_eq!(
res, res_1,
"Determinism check failed on binary op {fn_name} with clear inputs {clear_left} and {clear_right} with input degrees {input_degrees_left:?} and {input_degrees_right:?}",
);
println!("Input degrees left: {input_degrees_left:?}, right {input_degrees_right:?}, Output degrees {:?}", output_degrees);
let decrypt_signed_res: i64 = cks.decrypt_signed(&res);
let expected_res: i64 = clear_fn(clear_left, clear_right);
@@ -731,6 +734,9 @@ pub(crate) fn signed_random_op_sequence_test<P>(
"Determinism check failed on unary op {fn_name} with clear input {clear_input}.",
);
let input_degrees: Vec<u64> = input.blocks.iter().map(|b| b.degree.0).collect();
let output_degrees: Vec<u64> =
res.blocks.iter().map(|b| b.degree.0).collect();
println!("Output degrees {:?}", output_degrees);
let decrypt_signed_res: i64 = cks.decrypt_signed(&res);
let expected_res: i64 = clear_fn(clear_input);
if i % 2 == 0 {
@@ -774,6 +780,9 @@ pub(crate) fn signed_random_op_sequence_test<P>(
);
let input_degrees_left: Vec<u64> =
left_vec[i].blocks.iter().map(|b| b.degree.0).collect();
let output_degrees: Vec<u64> =
res.blocks.iter().map(|b| b.degree.0).collect();
println!("Output degrees {:?}", output_degrees);
let decrypt_signed_res: i64 = cks.decrypt_signed(&res);
let expected_res: i64 = clear_fn(clear_left, clear_right);
@@ -829,6 +838,9 @@ pub(crate) fn signed_random_op_sequence_test<P>(
left_vec[i].blocks.iter().map(|b| b.degree.0).collect();
let input_degrees_right: Vec<u64> =
right_vec[i].blocks.iter().map(|b| b.degree.0).collect();
let output_degrees: Vec<u64> =
res.blocks.iter().map(|b| b.degree.0).collect();
println!("Output degrees {:?}", output_degrees);
let decrypt_signed_res: i64 = cks.decrypt_signed(&res);
let decrypt_signed_overflow = cks.decrypt_bool(&overflow);
let (expected_res, expected_overflow) = clear_fn(clear_left, clear_right);
@@ -889,6 +901,9 @@ pub(crate) fn signed_random_op_sequence_test<P>(
);
let input_degrees_left: Vec<u64> =
left_vec[i].blocks.iter().map(|b| b.degree.0).collect();
let output_degrees: Vec<u64> =
res.blocks.iter().map(|b| b.degree.0).collect();
println!("Output degrees {:?}", output_degrees);
let decrypt_signed_res: i64 = cks.decrypt_signed(&res);
let decrypt_signed_overflow = cks.decrypt_bool(&overflow);
let (expected_res, expected_overflow) = clear_fn(clear_left, clear_right);
@@ -1020,6 +1035,9 @@ pub(crate) fn signed_random_op_sequence_test<P>(
left_vec[i].blocks.iter().map(|b| b.degree.0).collect();
let input_degrees_right: Vec<u64> =
right_vec[i].blocks.iter().map(|b| b.degree.0).collect();
let output_degrees: Vec<u64> =
res.blocks.iter().map(|b| b.degree.0).collect();
println!("Output degrees {:?}", output_degrees);
let decrypt_signed_res: i64 = cks.decrypt_signed(&res);
let expected_res = clear_fn(clear_bool, clear_left, clear_right);
@@ -1081,6 +1099,12 @@ pub(crate) fn signed_random_op_sequence_test<P>(
left_vec[i].blocks.iter().map(|b| b.degree.0).collect();
let input_degrees_right: Vec<u64> =
right_vec[i].blocks.iter().map(|b| b.degree.0).collect();
let output_degrees_q: Vec<u64> =
res_q.blocks.iter().map(|b| b.degree.0).collect();
let output_degrees_r: Vec<u64> =
res_r.blocks.iter().map(|b| b.degree.0).collect();
println!("Output degrees {:?}", output_degrees_q);
println!("Output degrees {:?}", output_degrees_r);
let decrypt_signed_res_q: i64 = cks.decrypt_signed(&res_q);
let decrypt_signed_res_r: i64 = cks.decrypt_signed(&res_r);
let (expected_res_q, expected_res_r) = clear_fn(clear_left, clear_right);
@@ -1147,6 +1171,12 @@ pub(crate) fn signed_random_op_sequence_test<P>(
);
let input_degrees_left: Vec<u64> =
left_vec[i].blocks.iter().map(|b| b.degree.0).collect();
let output_q_degrees: Vec<u64> =
res_q.blocks.iter().map(|b| b.degree.0).collect();
let output_r_degrees: Vec<u64> =
res_r.blocks.iter().map(|b| b.degree.0).collect();
println!("Output r degrees {:?}", output_r_degrees);
println!("Output q degrees {:?}", output_q_degrees);
let decrypt_signed_res_q: i64 = cks.decrypt_signed(&res_q);
let decrypt_signed_res_r: i64 = cks.decrypt_signed(&res_r);
let (expected_res_q, expected_res_r) = clear_fn(clear_left, clear_right);
@@ -1205,6 +1235,9 @@ pub(crate) fn signed_random_op_sequence_test<P>(
"Determinism check failed on op {fn_name} with clear input {clear_input}.",
);
let input_degrees: Vec<u64> = input.blocks.iter().map(|b| b.degree.0).collect();
let output_degrees: Vec<u64> =
res.blocks.iter().map(|b| b.degree.0).collect();
println!("Output degrees {:?}", output_degrees);
let cast_res = sks.cast_to_signed(res, NB_CTXT_LONG_RUN);
let decrypt_signed_res: i64 = cks.decrypt_signed(&cast_res);
let expected_res = clear_fn(clear_input) as i64;
@@ -1252,6 +1285,9 @@ pub(crate) fn signed_random_op_sequence_test<P>(
left_vec[i].blocks.iter().map(|b| b.degree.0).collect();
let input_degrees_right: Vec<u64> =
unsigned_right.blocks.iter().map(|b| b.degree.0).collect();
let output_degrees: Vec<u64> =
res.blocks.iter().map(|b| b.degree.0).collect();
println!("Output degrees {:?}", output_degrees);
let decrypt_signed_res: i64 = cks.decrypt_signed(&res);
let expected_res: i64 = clear_fn(clear_left, clear_right as u64);
@@ -1297,6 +1333,9 @@ pub(crate) fn signed_random_op_sequence_test<P>(
);
let input_degrees_left: Vec<u64> =
left_vec[i].blocks.iter().map(|b| b.degree.0).collect();
let output_degrees: Vec<u64> =
res.blocks.iter().map(|b| b.degree.0).collect();
println!("Output degrees {:?}", output_degrees);
let decrypt_signed_res: i64 = cks.decrypt_signed(&res);
let expected_res: i64 = clear_fn(clear_left, clear_right as u64);