Mirror of https://github.com/zama-ai/tfhe-rs.git, synced 2026-01-11 15:48:20 -05:00

refactor(gpu): refactor and optimize sum_ciphertext in cuda backend

Committed by: Agnès Leroy
Parent: 9864dba009
Commit: 1936ec6d84
@@ -400,8 +400,9 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI *radix_lwe_vec, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
CudaRadixCiphertextFFI *radix_lwe_vec,
bool reduce_degrees_for_single_carry_propagation, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);

void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
@@ -414,7 +415,8 @@ uint64_t scratch_cuda_integer_scalar_mul_kb_64(
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory, bool allocate_ms_array);
PBS_TYPE pbs_type, uint32_t num_scalar_bits, bool allocate_gpu_memory,
bool allocate_ms_array);

void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
@@ -553,8 +555,8 @@ uint64_t scratch_cuda_integer_radix_scalar_mul_high_kb_64(
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool anticipated_buffer_drop, bool allocate_gpu_memory,
bool allocate_ms_array);
PBS_TYPE pbs_type, uint32_t num_scalar_bits, bool anticipated_buffer_drop,
bool allocate_gpu_memory, bool allocate_ms_array);

void cuda_integer_radix_scalar_mul_high_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,

@@ -249,8 +249,10 @@ template <typename Torus> struct int_radix_lut {
num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0],
size_tracker, allocate_gpu_memory);

h_lwe_indexes_in = (Torus *)malloc(num_radix_blocks * sizeof(Torus));
h_lwe_indexes_out = (Torus *)malloc(num_radix_blocks * sizeof(Torus));
cudaMallocHost((void **)&h_lwe_indexes_in,
num_radix_blocks * sizeof(Torus));
cudaMallocHost((void **)&h_lwe_indexes_out,
num_radix_blocks * sizeof(Torus));

for (int i = 0; i < num_radix_blocks; i++)
h_lwe_indexes_in[i] = i;
@@ -370,8 +372,10 @@ template <typename Torus> struct int_radix_lut {
num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0],
size_tracker, allocate_gpu_memory);

h_lwe_indexes_in = (Torus *)malloc(num_radix_blocks * sizeof(Torus));
h_lwe_indexes_out = (Torus *)malloc(num_radix_blocks * sizeof(Torus));
cudaMallocHost((void **)&h_lwe_indexes_in,
num_radix_blocks * sizeof(Torus));
cudaMallocHost((void **)&h_lwe_indexes_out,
num_radix_blocks * sizeof(Torus));

for (int i = 0; i < num_radix_blocks; i++)
h_lwe_indexes_in[i] = i;
@@ -466,8 +470,10 @@ template <typename Torus> struct int_radix_lut {
num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0],
size_tracker, allocate_gpu_memory);

h_lwe_indexes_in = (Torus *)malloc(num_radix_blocks * sizeof(Torus));
h_lwe_indexes_out = (Torus *)malloc(num_radix_blocks * sizeof(Torus));
cudaMallocHost((void **)&h_lwe_indexes_in,
num_radix_blocks * sizeof(Torus));
cudaMallocHost((void **)&h_lwe_indexes_out,
num_radix_blocks * sizeof(Torus));

for (int i = 0; i < num_radix_blocks; i++)
h_lwe_indexes_in[i] = i;
@@ -582,6 +588,7 @@ template <typename Torus> struct int_radix_lut {
streams[i], gpu_indexes[i], gpu_memory_allocated);
}
}
cuda_set_device(gpu_indexes[0]);
}

void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -604,8 +611,8 @@ template <typename Torus> struct int_radix_lut {
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
lut_vec.clear();
lut_indexes_vec.clear();
free(h_lwe_indexes_in);
free(h_lwe_indexes_out);
cudaFreeHost(h_lwe_indexes_in);
cudaFreeHost(h_lwe_indexes_out);

if (!mem_reuse) {
release_radix_ciphertext_async(streams[0], gpu_indexes[0],
@@ -1319,18 +1326,123 @@ template <typename Torus> struct int_overflowing_sub_memory {
|
||||
};
|
||||
|
||||
template <typename Torus> struct int_sum_ciphertexts_vec_memory {
|
||||
CudaRadixCiphertextFFI *new_blocks;
|
||||
CudaRadixCiphertextFFI *new_blocks_copy;
|
||||
CudaRadixCiphertextFFI *old_blocks;
|
||||
CudaRadixCiphertextFFI *small_lwe_vector;
|
||||
|
||||
int_radix_params params;
|
||||
|
||||
int32_t *d_smart_copy_in;
|
||||
int32_t *d_smart_copy_out;
|
||||
|
||||
bool mem_reuse = false;
|
||||
uint32_t max_total_blocks_in_vec;
|
||||
uint32_t num_blocks_in_radix;
|
||||
uint32_t max_num_radix_in_vec;
|
||||
uint32_t chunk_size;
|
||||
uint64_t *size_tracker;
|
||||
bool gpu_memory_allocated;
|
||||
|
||||
// temporary buffers
|
||||
CudaRadixCiphertextFFI *current_blocks;
|
||||
CudaRadixCiphertextFFI *small_lwe_vector;
|
||||
|
||||
uint32_t *d_columns_data;
|
||||
uint32_t *d_columns_counter;
|
||||
uint32_t **d_columns;
|
||||
|
||||
uint32_t *d_new_columns_data;
|
||||
uint32_t *d_new_columns_counter;
|
||||
uint32_t **d_new_columns;
|
||||
|
||||
uint64_t *d_degrees;
|
||||
|
||||
// lookup table for extracting message and carry
|
||||
int_radix_lut<Torus> *luts_message_carry;
|
||||
|
||||
bool mem_reuse = false;
|
||||
bool allocated_luts_message_carry;
|
||||
|
||||
void setup_index_buffers(cudaStream_t const *streams,
|
||||
uint32_t const *gpu_indexes) {
|
||||
|
||||
d_degrees = (uint64_t *)cuda_malloc_with_size_tracking_async(
|
||||
max_total_blocks_in_vec * sizeof(uint64_t), streams[0], gpu_indexes[0],
|
||||
size_tracker, gpu_memory_allocated);
|
||||
|
||||
auto num_blocks_in_radix = this->num_blocks_in_radix;
|
||||
auto max_num_radix_in_vec = this->max_num_radix_in_vec;
|
||||
auto setup_columns =
|
||||
[num_blocks_in_radix, max_num_radix_in_vec, streams,
|
||||
gpu_indexes](uint32_t **&columns, uint32_t *&columns_data,
|
||||
uint32_t *&columns_counter, uint64_t *size_tracker,
|
||||
bool gpu_memory_allocated) {
|
||||
columns_data = (uint32_t *)cuda_malloc_with_size_tracking_async(
|
||||
num_blocks_in_radix * max_num_radix_in_vec * sizeof(uint32_t),
|
||||
streams[0], gpu_indexes[0], size_tracker, gpu_memory_allocated);
|
||||
columns_counter = (uint32_t *)cuda_malloc_with_size_tracking_async(
|
||||
num_blocks_in_radix * sizeof(uint32_t), streams[0],
|
||||
gpu_indexes[0], size_tracker, gpu_memory_allocated);
|
||||
cuda_memset_with_size_tracking_async(
|
||||
columns_counter, 0, num_blocks_in_radix * sizeof(uint32_t),
|
||||
streams[0], gpu_indexes[0], gpu_memory_allocated);
|
||||
uint32_t **h_columns = new uint32_t *[num_blocks_in_radix];
|
||||
for (int i = 0; i < num_blocks_in_radix; ++i) {
|
||||
h_columns[i] = columns_data + i * max_num_radix_in_vec;
|
||||
}
|
||||
columns = (uint32_t **)cuda_malloc_with_size_tracking_async(
|
||||
num_blocks_in_radix * sizeof(uint32_t *), streams[0],
|
||||
gpu_indexes[0], size_tracker, gpu_memory_allocated);
|
||||
if (gpu_memory_allocated) {
|
||||
cuda_memcpy_async_to_gpu(columns, h_columns,
|
||||
num_blocks_in_radix * sizeof(uint32_t *),
|
||||
streams[0], gpu_indexes[0]);
|
||||
}
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
delete[] h_columns;
|
||||
};
|
||||
|
||||
setup_columns(d_columns, d_columns_data, d_columns_counter, size_tracker,
|
||||
gpu_memory_allocated);
|
||||
setup_columns(d_new_columns, d_new_columns_data, d_new_columns_counter,
|
||||
size_tracker, gpu_memory_allocated);
|
||||
}
|
||||
|
||||
void setup_lookup_tables(cudaStream_t const *streams,
|
||||
uint32_t const *gpu_indexes, uint32_t gpu_count) {
|
||||
uint32_t message_modulus = params.message_modulus;
|
||||
|
||||
if (!mem_reuse) {
|
||||
uint32_t pbs_count = std::max(2 * (max_total_blocks_in_vec / chunk_size),
|
||||
2 * num_blocks_in_radix);
|
||||
if (max_total_blocks_in_vec > 0) {
|
||||
luts_message_carry = new int_radix_lut<Torus>(
|
||||
streams, gpu_indexes, gpu_count, params, 2, pbs_count,
|
||||
gpu_memory_allocated, size_tracker);
|
||||
} else {
|
||||
allocated_luts_message_carry = false;
|
||||
}
|
||||
}
|
||||
if (allocated_luts_message_carry) {
|
||||
auto message_acc = luts_message_carry->get_lut(0, 0);
|
||||
auto carry_acc = luts_message_carry->get_lut(0, 1);
|
||||
|
||||
// define functions for each accumulator
|
||||
auto lut_f_message = [message_modulus](Torus x) -> Torus {
|
||||
return x % message_modulus;
|
||||
};
|
||||
auto lut_f_carry = [message_modulus](Torus x) -> Torus {
|
||||
return x / message_modulus;
|
||||
};
|
||||
|
||||
// generate accumulators
|
||||
generate_device_accumulator<Torus>(
|
||||
streams[0], gpu_indexes[0], message_acc,
|
||||
luts_message_carry->get_degree(0),
|
||||
luts_message_carry->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, message_modulus, params.carry_modulus,
|
||||
lut_f_message, gpu_memory_allocated);
|
||||
generate_device_accumulator<Torus>(
|
||||
streams[0], gpu_indexes[0], carry_acc,
|
||||
luts_message_carry->get_degree(1),
|
||||
luts_message_carry->get_max_degree(1), params.glwe_dimension,
|
||||
params.polynomial_size, message_modulus, params.carry_modulus,
|
||||
lut_f_carry, gpu_memory_allocated);
|
||||
luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
|
||||
}
|
||||
}
|
||||
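The two accumulators built in setup_lookup_tables implement the usual message/carry split of a block value x: the message LUT keeps x % message_modulus and the carry LUT extracts x / message_modulus. A minimal clear-value illustration (the parameter values are assumptions, not taken from this diff):

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t message_modulus = 4; // e.g. 2-bit messages with a 2-bit carry space
  const uint64_t x = 11;              // a block that accumulated 11 = 2 * 4 + 3
  const uint64_t message = x % message_modulus; // 3, stays in the same block
  const uint64_t carry = x / message_modulus;   // 2, propagated to the next block
  assert(message == 3 && carry == 2);
  return 0;
}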
int_sum_ciphertexts_vec_memory(cudaStream_t const *streams,
|
||||
uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, int_radix_params params,
|
||||
@@ -1339,103 +1451,87 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
|
||||
bool allocate_gpu_memory,
|
||||
uint64_t *size_tracker) {
|
||||
this->params = params;
|
||||
gpu_memory_allocated = allocate_gpu_memory;
|
||||
this->mem_reuse = false;
|
||||
this->max_total_blocks_in_vec = num_blocks_in_radix * max_num_radix_in_vec;
|
||||
this->num_blocks_in_radix = num_blocks_in_radix;
|
||||
this->max_num_radix_in_vec = max_num_radix_in_vec;
|
||||
this->gpu_memory_allocated = allocate_gpu_memory;
|
||||
this->size_tracker = size_tracker;
|
||||
this->chunk_size = (params.message_modulus * params.carry_modulus - 1) /
|
||||
(params.message_modulus - 1);
|
||||
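For example, with message_modulus = 4 and carry_modulus = 4 (a common 2_2-style parameter set, used here only as an illustration), chunk_size = (4 * 4 - 1) / (4 - 1) = 5: five blocks of degree at most message_modulus - 1 = 3 can be summed into a single block without exceeding the total modulus of 16.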
this->allocated_luts_message_carry = true;
|
||||
setup_index_buffers(streams, gpu_indexes);
|
||||
setup_lookup_tables(streams, gpu_indexes, gpu_count);
|
||||
|
||||
int max_pbs_count = num_blocks_in_radix * max_num_radix_in_vec;
|
||||
|
||||
// allocate gpu memory for intermediate buffers
|
||||
new_blocks = new CudaRadixCiphertextFFI;
|
||||
// create and allocate intermediate buffers
|
||||
current_blocks = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], new_blocks, max_pbs_count,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
new_blocks_copy = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], new_blocks_copy, max_pbs_count,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
old_blocks = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], old_blocks, max_pbs_count,
|
||||
streams[0], gpu_indexes[0], current_blocks, max_total_blocks_in_vec,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
small_lwe_vector = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], small_lwe_vector, max_pbs_count,
|
||||
streams[0], gpu_indexes[0], small_lwe_vector, max_total_blocks_in_vec,
|
||||
params.small_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
d_smart_copy_in = (int32_t *)cuda_malloc_with_size_tracking_async(
|
||||
max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0],
|
||||
size_tracker, allocate_gpu_memory);
|
||||
d_smart_copy_out = (int32_t *)cuda_malloc_with_size_tracking_async(
|
||||
max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0],
|
||||
size_tracker, allocate_gpu_memory);
|
||||
cuda_memset_with_size_tracking_async(
|
||||
d_smart_copy_in, 0, max_pbs_count * sizeof(int32_t), streams[0],
|
||||
gpu_indexes[0], allocate_gpu_memory);
|
||||
cuda_memset_with_size_tracking_async(
|
||||
d_smart_copy_out, 0, max_pbs_count * sizeof(int32_t), streams[0],
|
||||
gpu_indexes[0], allocate_gpu_memory);
|
||||
}
|
||||
|
||||
int_sum_ciphertexts_vec_memory(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, int_radix_params params, uint32_t num_blocks_in_radix,
|
||||
uint32_t max_num_radix_in_vec, CudaRadixCiphertextFFI *new_blocks,
|
||||
CudaRadixCiphertextFFI *old_blocks,
|
||||
CudaRadixCiphertextFFI *small_lwe_vector, bool allocate_gpu_memory,
|
||||
uint32_t max_num_radix_in_vec, CudaRadixCiphertextFFI *current_blocks,
|
||||
CudaRadixCiphertextFFI *small_lwe_vector,
|
||||
int_radix_lut<Torus> *reused_lut, bool allocate_gpu_memory,
|
||||
uint64_t *size_tracker) {
|
||||
mem_reuse = true;
|
||||
gpu_memory_allocated = allocate_gpu_memory;
|
||||
this->mem_reuse = true;
|
||||
this->params = params;
|
||||
this->max_total_blocks_in_vec = num_blocks_in_radix * max_num_radix_in_vec;
|
||||
this->num_blocks_in_radix = num_blocks_in_radix;
|
||||
this->max_num_radix_in_vec = max_num_radix_in_vec;
|
||||
this->gpu_memory_allocated = allocate_gpu_memory;
|
||||
this->size_tracker = size_tracker;
|
||||
this->chunk_size = (params.message_modulus * params.carry_modulus - 1) /
|
||||
(params.message_modulus - 1);
|
||||
this->allocated_luts_message_carry = true;
|
||||
|
||||
int max_pbs_count = num_blocks_in_radix * max_num_radix_in_vec;
|
||||
|
||||
// assign gpu memory for intermediate buffers
|
||||
this->new_blocks = new_blocks;
|
||||
this->old_blocks = old_blocks;
|
||||
this->current_blocks = current_blocks;
|
||||
this->small_lwe_vector = small_lwe_vector;
|
||||
new_blocks_copy = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], new_blocks_copy, max_pbs_count,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
d_smart_copy_in = (int32_t *)cuda_malloc_with_size_tracking_async(
|
||||
max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0],
|
||||
size_tracker, allocate_gpu_memory);
|
||||
d_smart_copy_out = (int32_t *)cuda_malloc_with_size_tracking_async(
|
||||
max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0],
|
||||
size_tracker, allocate_gpu_memory);
|
||||
cuda_memset_with_size_tracking_async(
|
||||
d_smart_copy_in, 0, max_pbs_count * sizeof(int32_t), streams[0],
|
||||
gpu_indexes[0], allocate_gpu_memory);
|
||||
cuda_memset_with_size_tracking_async(
|
||||
d_smart_copy_out, 0, max_pbs_count * sizeof(int32_t), streams[0],
|
||||
gpu_indexes[0], allocate_gpu_memory);
|
||||
this->luts_message_carry = reused_lut;
|
||||
setup_index_buffers(streams, gpu_indexes);
|
||||
}
|
||||
|
||||
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count) {
|
||||
cuda_drop_with_size_tracking_async(d_smart_copy_in, streams[0],
|
||||
cuda_drop_with_size_tracking_async(d_degrees, streams[0], gpu_indexes[0],
|
||||
gpu_memory_allocated);
|
||||
cuda_drop_with_size_tracking_async(d_columns_data, streams[0],
|
||||
gpu_indexes[0], gpu_memory_allocated);
|
||||
cuda_drop_with_size_tracking_async(d_smart_copy_out, streams[0],
|
||||
cuda_drop_with_size_tracking_async(d_columns_counter, streams[0],
|
||||
gpu_indexes[0], gpu_memory_allocated);
|
||||
cuda_drop_with_size_tracking_async(d_columns, streams[0], gpu_indexes[0],
|
||||
gpu_memory_allocated);
|
||||
|
||||
cuda_drop_with_size_tracking_async(d_new_columns_data, streams[0],
|
||||
gpu_indexes[0], gpu_memory_allocated);
|
||||
cuda_drop_with_size_tracking_async(d_new_columns_counter, streams[0],
|
||||
gpu_indexes[0], gpu_memory_allocated);
|
||||
cuda_drop_with_size_tracking_async(d_new_columns, streams[0],
|
||||
gpu_indexes[0], gpu_memory_allocated);
|
||||
|
||||
if (!mem_reuse) {
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0], new_blocks,
|
||||
gpu_memory_allocated);
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0], old_blocks,
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0], current_blocks,
|
||||
gpu_memory_allocated);
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0],
|
||||
small_lwe_vector, gpu_memory_allocated);
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
delete new_blocks;
|
||||
delete old_blocks;
|
||||
if (allocated_luts_message_carry) {
|
||||
luts_message_carry->release(streams, gpu_indexes, gpu_count);
|
||||
delete luts_message_carry;
|
||||
}
|
||||
|
||||
delete current_blocks;
|
||||
delete small_lwe_vector;
|
||||
}
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0], new_blocks_copy,
|
||||
gpu_memory_allocated);
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
delete new_blocks_copy;
|
||||
}
|
||||
};
|
||||
|
||||
// For sequential algorithm in group propagation
|
||||
template <typename Torus> struct int_seq_group_prop_memory {
|
||||
|
||||
@@ -2752,7 +2848,7 @@ template <typename Torus> struct int_mul_memory {
|
||||
// radix_lwe_left except the last blocks of each shift
|
||||
int msb_vector_block_count = num_radix_blocks * (num_radix_blocks - 1) / 2;
|
||||
|
||||
int total_block_count = lsb_vector_block_count + msb_vector_block_count;
|
||||
int total_block_count = num_radix_blocks * num_radix_blocks;
|
||||
|
||||
// allocate memory for intermediate buffers
|
||||
vector_result_sb = new CudaRadixCiphertextFFI;
|
||||
@@ -2765,13 +2861,13 @@ template <typename Torus> struct int_mul_memory {
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
small_lwe_vector = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], small_lwe_vector, total_block_count,
|
||||
streams[0], gpu_indexes[0], small_lwe_vector, 2 * total_block_count,
|
||||
params.small_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
// create int_radix_lut objects for lsb, msb, message, carry
|
||||
// luts_array -> lut = {lsb_acc, msb_acc}
|
||||
luts_array = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count,
|
||||
params, 2, total_block_count,
|
||||
params, 2, 2 * total_block_count,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
auto lsb_acc = luts_array->get_lut(0, 0);
|
||||
auto msb_acc = luts_array->get_lut(0, 1);
|
||||
@@ -2808,7 +2904,7 @@ template <typename Torus> struct int_mul_memory {
|
||||
// create memory object for sum ciphertexts
|
||||
sum_ciphertexts_mem = new int_sum_ciphertexts_vec_memory<Torus>(
|
||||
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
|
||||
2 * num_radix_blocks, block_mul_res, vector_result_sb, small_lwe_vector,
|
||||
2 * num_radix_blocks, vector_result_sb, small_lwe_vector, luts_array,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
uint32_t uses_carry = 0;
|
||||
uint32_t requested_flag = outputFlag::FLAG_NONE;
|
||||
@@ -4587,26 +4683,28 @@ template <typename Torus> struct int_scalar_mul_buffer {
|
||||
int_sc_prop_memory<Torus> *sc_prop_mem;
|
||||
bool anticipated_buffers_drop;
|
||||
bool gpu_memory_allocated;
|
||||
uint32_t num_ciphertext_bits;
|
||||
|
||||
int_scalar_mul_buffer(cudaStream_t const *streams,
|
||||
uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
int_radix_params params, uint32_t num_radix_blocks,
|
||||
bool allocate_gpu_memory, bool anticipated_buffer_drop,
|
||||
uint64_t *size_tracker) {
|
||||
uint32_t num_scalar_bits, bool allocate_gpu_memory,
|
||||
bool anticipated_buffer_drop, uint64_t *size_tracker) {
|
||||
gpu_memory_allocated = allocate_gpu_memory;
|
||||
this->params = params;
|
||||
this->anticipated_buffers_drop = anticipated_buffer_drop;
|
||||
|
||||
uint32_t msg_bits = (uint32_t)std::log2(params.message_modulus);
|
||||
size_t num_ciphertext_bits = msg_bits * num_radix_blocks;
|
||||
num_ciphertext_bits = msg_bits * num_scalar_bits;
|
||||
|
||||
//// Contains all shifted values of lhs for shift in range (0..msg_bits)
|
||||
//// The idea is that with these we can create all other shift that are
|
||||
/// in / range (0..total_bits) for free (block rotation)
|
||||
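A small host-side sketch of that block-rotation idea, with purely illustrative names that are not part of this diff, assuming msg_bits bits of message per block:

#include <cstdint>

// Hypothetical helper, for illustration only: a left shift by `shift` bits splits
// into a small in-block shift (taken from preshifted_buffer) plus a rotation by
// whole blocks, and the rotation needs no PBS.
struct ShiftDecomposition {
  uint32_t preshift;       // in-block shift, in 0..msg_bits-1
  uint32_t block_rotation; // number of whole blocks to rotate by
};

inline ShiftDecomposition decompose_shift(uint32_t shift, uint32_t msg_bits) {
  return {shift % msg_bits, shift / msg_bits};
}
// e.g. with msg_bits = 2, decompose_shift(5, 2) == {1, 2}:
// reuse the preshift by 1 bit, then rotate by 2 blocks (4 more bits).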
preshifted_buffer = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], preshifted_buffer, num_ciphertext_bits,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
streams[0], gpu_indexes[0], preshifted_buffer,
|
||||
msg_bits * num_radix_blocks, params.big_lwe_dimension, size_tracker,
|
||||
allocate_gpu_memory);
|
||||
|
||||
all_shifted_buffer = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
@@ -4623,9 +4721,11 @@ template <typename Torus> struct int_scalar_mul_buffer {
|
||||
streams, gpu_indexes, gpu_count, LEFT_SHIFT, params, num_radix_blocks,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
|
||||
sum_ciphertexts_vec_mem = new int_sum_ciphertexts_vec_memory<Torus>(
|
||||
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
|
||||
num_ciphertext_bits, allocate_gpu_memory, size_tracker);
|
||||
if (num_ciphertext_bits > 0) {
|
||||
sum_ciphertexts_vec_mem = new int_sum_ciphertexts_vec_memory<Torus>(
|
||||
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
|
||||
num_ciphertext_bits, allocate_gpu_memory, size_tracker);
|
||||
}
|
||||
uint32_t uses_carry = 0;
|
||||
uint32_t requested_flag = outputFlag::FLAG_NONE;
|
||||
sc_prop_mem = new int_sc_prop_memory<Torus>(
|
||||
@@ -4637,9 +4737,11 @@ template <typename Torus> struct int_scalar_mul_buffer {
|
||||
uint32_t gpu_count) {
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0],
|
||||
all_shifted_buffer, gpu_memory_allocated);
|
||||
sum_ciphertexts_vec_mem->release(streams, gpu_indexes, gpu_count);
|
||||
if (num_ciphertext_bits > 0) {
|
||||
sum_ciphertexts_vec_mem->release(streams, gpu_indexes, gpu_count);
|
||||
delete sum_ciphertexts_vec_mem;
|
||||
}
|
||||
sc_prop_mem->release(streams, gpu_indexes, gpu_count);
|
||||
delete sum_ciphertexts_vec_mem;
|
||||
delete sc_prop_mem;
|
||||
delete all_shifted_buffer;
|
||||
if (!anticipated_buffers_drop) {
|
||||
@@ -4907,7 +5009,7 @@ template <typename Torus> struct int_scalar_mul_high {
|
||||
int_scalar_mul_high(cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, const int_radix_params params,
|
||||
uint32_t num_radix_blocks, const bool allocate_gpu_memory,
|
||||
SHIFT_OR_ROTATE_TYPE shift_type,
|
||||
SHIFT_OR_ROTATE_TYPE shift_type, uint32_t num_scalar_bits,
|
||||
bool anticipated_buffer_drop, uint64_t *size_tracker) {
|
||||
|
||||
this->params = params;
|
||||
@@ -4919,7 +5021,8 @@ template <typename Torus> struct int_scalar_mul_high {
|
||||
|
||||
this->scalar_mul_mem = new int_scalar_mul_buffer<Torus>(
|
||||
streams, gpu_indexes, gpu_count, params, 2 * num_radix_blocks,
|
||||
allocate_gpu_memory, anticipated_buffer_drop, size_tracker);
|
||||
num_scalar_bits, allocate_gpu_memory, anticipated_buffer_drop,
|
||||
size_tracker);
|
||||
|
||||
this->tmp = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
|
||||
@@ -521,8 +521,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
|
||||
if (num_radix_blocks > lut->num_blocks)
|
||||
PANIC("Cuda error: num radix blocks on which lut is applied should be "
|
||||
"smaller or equal to the number of lut radix blocks")
|
||||
if (num_radix_blocks > lwe_array_out->num_radix_blocks ||
|
||||
num_radix_blocks > lwe_array_in->num_radix_blocks)
|
||||
if (num_radix_blocks > lwe_array_out->num_radix_blocks)
|
||||
PANIC("Cuda error: num radix blocks on which lut is applied should be "
|
||||
"smaller or equal to the number of input & output radix blocks")
|
||||
|
||||
@@ -1835,9 +1834,6 @@ void host_propagate_single_carry(
|
||||
PUSH_RANGE("propagate sc")
|
||||
auto num_radix_blocks = lwe_array->num_radix_blocks;
|
||||
auto params = mem->params;
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
auto polynomial_size = params.polynomial_size;
|
||||
uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1;
|
||||
auto lut_stride = mem->lut_stride;
|
||||
auto num_many_lut = mem->num_many_lut;
|
||||
CudaRadixCiphertextFFI output_flag;
|
||||
@@ -1853,6 +1849,7 @@ void host_propagate_single_carry(
|
||||
host_addition<Torus>(streams[0], gpu_indexes[0], lwe_array, lwe_array,
|
||||
input_carries, 1);
|
||||
}
|
||||
|
||||
// Step 1
|
||||
host_compute_shifted_blocks_and_states<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array, mem->shifted_blocks_state_mem,
|
||||
|
||||
@@ -226,72 +226,68 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
CudaRadixCiphertextFFI *radix_lwe_vec, int8_t *mem_ptr, void *const *bsks,
|
||||
void *const *ksks,
|
||||
CudaRadixCiphertextFFI *radix_lwe_vec,
|
||||
bool reduce_degrees_for_single_carry_propagation, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
|
||||
|
||||
auto mem = (int_sum_ciphertexts_vec_memory<uint64_t> *)mem_ptr;
|
||||
if (radix_lwe_vec->num_radix_blocks % radix_lwe_out->num_radix_blocks != 0)
|
||||
PANIC("Cuda error: input vector length should be a multiple of the "
|
||||
"output's number of radix blocks")
|
||||
// FIXME: this should not be necessary, we should make sure sum_ctxt works in
|
||||
// the general case
|
||||
for (int i = 0; i < radix_lwe_vec->num_radix_blocks; i++) {
|
||||
radix_lwe_vec->degrees[i] = mem->params.message_modulus - 1;
|
||||
}
|
||||
switch (mem->params.polynomial_size) {
|
||||
case 512:
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<512>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
|
||||
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
|
||||
radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks,
|
||||
(uint64_t **)(ksks), ms_noise_reduction_key, mem,
|
||||
radix_lwe_out->num_radix_blocks,
|
||||
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
|
||||
nullptr);
|
||||
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
|
||||
break;
|
||||
case 1024:
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
|
||||
AmortizedDegree<1024>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
|
||||
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
|
||||
radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks,
|
||||
(uint64_t **)(ksks), ms_noise_reduction_key, mem,
|
||||
radix_lwe_out->num_radix_blocks,
|
||||
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
|
||||
nullptr);
|
||||
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
|
||||
break;
|
||||
case 2048:
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
|
||||
AmortizedDegree<2048>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
|
||||
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
|
||||
radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks,
|
||||
(uint64_t **)(ksks), ms_noise_reduction_key, mem,
|
||||
radix_lwe_out->num_radix_blocks,
|
||||
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
|
||||
nullptr);
|
||||
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
|
||||
break;
|
||||
case 4096:
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
|
||||
AmortizedDegree<4096>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
|
||||
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
|
||||
radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks,
|
||||
(uint64_t **)(ksks), ms_noise_reduction_key, mem,
|
||||
radix_lwe_out->num_radix_blocks,
|
||||
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
|
||||
nullptr);
|
||||
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
|
||||
break;
|
||||
case 8192:
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
|
||||
AmortizedDegree<8192>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
|
||||
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
|
||||
radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks,
|
||||
(uint64_t **)(ksks), ms_noise_reduction_key, mem,
|
||||
radix_lwe_out->num_radix_blocks,
|
||||
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
|
||||
nullptr);
|
||||
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
|
||||
break;
|
||||
case 16384:
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
|
||||
AmortizedDegree<16384>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
|
||||
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
|
||||
radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks,
|
||||
(uint64_t **)(ksks), ms_noise_reduction_key, mem,
|
||||
radix_lwe_out->num_radix_blocks,
|
||||
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
|
||||
nullptr);
|
||||
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
|
||||
|
||||
@@ -20,28 +20,11 @@
#include <fstream>
#include <iostream>
#include <omp.h>
#include <queue>
#include <sstream>
#include <string>
#include <vector>

template <typename Torus>
__global__ void smart_copy(Torus *dst, Torus *src, int32_t *id_out,
int32_t *id_in, size_t lwe_size) {
size_t tid = threadIdx.x;
size_t b_id = blockIdx.x;
size_t stride = blockDim.x;

auto input_id = id_in[b_id];
auto output_id = id_out[b_id];

auto cur_src = (input_id >= 0) ? &src[input_id * lwe_size] : nullptr;
auto cur_dst = &dst[output_id * lwe_size];

for (int i = tid; i < lwe_size; i += stride) {
cur_dst[i] = (input_id >= 0) ? cur_src[i] : 0;
}
}

template <typename Torus, class params>
|
||||
__global__ void
|
||||
all_shifted_lhs_rhs(Torus const *radix_lwe_left, Torus *lsb_ciphertext,
|
||||
@@ -94,33 +77,155 @@ all_shifted_lhs_rhs(Torus const *radix_lwe_left, Torus *lsb_ciphertext,
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
|
||||
uint32_t chunk_size, uint32_t block_size,
|
||||
uint32_t num_blocks) {
|
||||
__global__ inline void radix_vec_to_columns(uint32_t *const *const columns,
|
||||
uint32_t *const columns_counter,
|
||||
const uint64_t *const degrees,
|
||||
const uint32_t num_radix_blocks,
|
||||
const uint32_t num_radix_in_vec) {
|
||||
|
||||
size_t stride = blockDim.x;
|
||||
size_t chunk_id = blockIdx.x;
|
||||
size_t chunk_elem_size = chunk_size * num_blocks * block_size;
|
||||
size_t radix_elem_size = num_blocks * block_size;
|
||||
auto src_chunk = &input_blocks[chunk_id * chunk_elem_size];
|
||||
auto dst_radix = &result_blocks[chunk_id * radix_elem_size];
|
||||
size_t block_stride = blockIdx.y * block_size;
|
||||
auto result = &dst_radix[block_stride];
|
||||
|
||||
// init shared mem with first radix of chunk
|
||||
size_t tid = threadIdx.x;
|
||||
for (int i = tid; i < block_size; i += stride) {
|
||||
result[i] = src_chunk[block_stride + i];
|
||||
}
|
||||
|
||||
// accumulate rest of the radixes
|
||||
for (int r_id = 1; r_id < chunk_size; r_id++) {
|
||||
auto cur_src_radix = &src_chunk[r_id * radix_elem_size];
|
||||
for (int i = tid; i < block_size; i += stride) {
|
||||
result[i] += cur_src_radix[block_stride + i];
|
||||
const uint32_t idx = threadIdx.x;
|
||||
size_t cnt = 0;
|
||||
for (int i = 0; i < num_radix_in_vec; i++) {
|
||||
size_t ct_id = i * num_radix_blocks + idx;
|
||||
if (degrees[ct_id] != 0) {
|
||||
columns[idx][cnt] = ct_id;
|
||||
++cnt;
|
||||
}
|
||||
}
|
||||
columns_counter[idx] = cnt;
|
||||
}
|
||||
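A host-side rendering of the column layout this kernel produces, on made-up degrees (illustrative only): block j of the i-th radix in the vector sits at index i * num_radix_blocks + j, and only blocks with a non-zero degree are recorded in column j.

#include <cstdint>
#include <vector>

int main() {
  const uint32_t num_radix_blocks = 2, num_radix_in_vec = 3;
  // degrees of 3 radix ciphertexts of 2 blocks each; 0 marks an empty block
  const uint64_t degrees[] = {3, 3, 0, 3, 3, 0};
  std::vector<std::vector<uint32_t>> columns(num_radix_blocks);
  for (uint32_t j = 0; j < num_radix_blocks; ++j)     // one column per block position
    for (uint32_t i = 0; i < num_radix_in_vec; ++i) { // scan the vector of radixes
      const uint32_t ct_id = i * num_radix_blocks + j;
      if (degrees[ct_id] != 0)
        columns[j].push_back(ct_id);
    }
  // columns[0] == {0, 4} and columns[1] == {1, 3}; the counters are 2 and 2
  return 0;
}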
|
||||
template <typename Torus>
|
||||
__global__ inline void prepare_new_columns_and_pbs_indexes(
|
||||
uint32_t *const *const new_columns, uint32_t *const new_columns_counter,
|
||||
Torus *const pbs_indexes_in, Torus *const pbs_indexes_out,
|
||||
Torus *const lut_indexes, const uint32_t *const *const columns,
|
||||
const uint32_t *const columns_counter, const uint32_t chunk_size) {
|
||||
__shared__ uint32_t counter;
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
counter = 0;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
const uint32_t base_id = threadIdx.x;
|
||||
const uint32_t column_len = columns_counter[base_id];
|
||||
|
||||
uint32_t ct_count = 0;
|
||||
for (uint32_t i = 0; i + chunk_size <= column_len; i += chunk_size) {
|
||||
// those indexes are for message ciphertexts
|
||||
// for message ciphertexts in and out index should be same
|
||||
const uint32_t in_index = columns[base_id][i];
|
||||
new_columns[base_id][ct_count] = in_index;
|
||||
const uint32_t pbs_index = atomicAdd(&counter, 1);
|
||||
pbs_indexes_in[pbs_index] = in_index;
|
||||
pbs_indexes_out[pbs_index] = in_index;
|
||||
lut_indexes[pbs_index] = 0;
|
||||
++ct_count;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if (base_id > 0) {
|
||||
const uint32_t prev_base_id = base_id - 1;
|
||||
const uint32_t prev_column_len = columns_counter[prev_base_id];
|
||||
|
||||
for (uint32_t i = 0; i + chunk_size <= prev_column_len; i += chunk_size) {
|
||||
// those indexes are for carry ciphertexts
|
||||
// for carry ciphertexts input is same as for message
|
||||
// output will be placed to next block in the column
|
||||
const uint32_t in_index = columns[prev_base_id][i];
|
||||
const uint32_t out_index = columns[prev_base_id][i + 1];
|
||||
new_columns[base_id][ct_count] = out_index;
|
||||
const uint32_t pbs_index = atomicAdd(&counter, 1);
|
||||
pbs_indexes_in[pbs_index] = in_index;
|
||||
pbs_indexes_out[pbs_index] = out_index;
|
||||
lut_indexes[pbs_index] = 1;
|
||||
++ct_count;
|
||||
}
|
||||
}
|
||||
|
||||
const uint32_t start_index = column_len - column_len % chunk_size;
|
||||
for (uint32_t i = start_index; i < column_len; ++i) {
|
||||
new_columns[base_id][ct_count] = columns[base_id][i];
|
||||
++ct_count;
|
||||
}
|
||||
|
||||
new_columns_counter[base_id] = ct_count;
|
||||
}
|
||||
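For example, with chunk_size = 5 and a column of 7 blocks, one full chunk is formed: its first block gets a message PBS scheduled in place (in == out, LUT index 0) and stays at the head of the new column; each full chunk of the column to its left additionally contributes a carry PBS whose output is written into the second block of that left-hand chunk, and that block index is appended to this column's new list (LUT index 1); finally the 7 % 5 = 2 blocks that do not fill a chunk are carried over to the new column untouched.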
|
||||
template <typename Torus>
|
||||
__global__ inline void prepare_final_pbs_indexes(
|
||||
Torus *const pbs_indexes_in, Torus *const pbs_indexes_out,
|
||||
Torus *const lut_indexes, const uint32_t num_radix_blocks) {
|
||||
int idx = threadIdx.x;
|
||||
pbs_indexes_in[idx] = idx % num_radix_blocks;
|
||||
pbs_indexes_out[idx] = idx + idx / num_radix_blocks;
|
||||
lut_indexes[idx] = idx / num_radix_blocks;
|
||||
}
|
||||
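The index pattern this kernel lays out can be reproduced on the host; a minimal sketch for num_radix_blocks = 3 (values chosen only for illustration):

#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t num_radix_blocks = 3;
  for (uint32_t idx = 0; idx < 2 * num_radix_blocks; ++idx) {
    const uint32_t in = idx % num_radix_blocks;        // source block index
    const uint32_t out = idx + idx / num_radix_blocks; // destination slot
    const uint32_t lut = idx / num_radix_blocks;       // 0 = message LUT, 1 = carry LUT
    std::printf("pbs %u: in=%u out=%u lut=%u\n", idx, in, out, lut);
  }
  // in = 0 1 2 0 1 2, out = 0 1 2 4 5 6, lut = 0 0 0 1 1 1:
  // message results go back in place, carry results are shifted up by one slot.
  return 0;
}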
|
||||
template <typename Torus>
|
||||
__global__ void calculate_chunks(Torus *const input_blocks,
|
||||
const uint32_t *const *const columns,
|
||||
const uint32_t *const columns_counter,
|
||||
const uint32_t chunk_size,
|
||||
const uint32_t block_size) {
|
||||
|
||||
const uint32_t part_size = blockDim.x;
|
||||
const uint32_t base_id = blockIdx.x;
|
||||
const uint32_t part_id = blockIdx.y;
|
||||
const uint32_t coef_id = part_id * part_size + threadIdx.x;
|
||||
|
||||
if (coef_id >= block_size)
|
||||
return;
|
||||
|
||||
const uint32_t column_len = columns_counter[base_id];
|
||||
|
||||
if (column_len >= chunk_size) {
|
||||
const uint32_t num_chunks = column_len / chunk_size;
|
||||
Torus result = 0;
|
||||
|
||||
for (uint32_t chunk_id = 0; chunk_id < num_chunks; ++chunk_id) {
|
||||
const uint32_t first_ct_id = columns[base_id][chunk_id * chunk_size];
|
||||
result = input_blocks[first_ct_id * block_size + coef_id];
|
||||
|
||||
for (uint32_t ct_id = 1; ct_id < chunk_size; ++ct_id) {
|
||||
const uint32_t cur_ct_id =
|
||||
columns[base_id][chunk_id * chunk_size + ct_id];
|
||||
result += input_blocks[cur_ct_id * block_size + coef_id];
|
||||
}
|
||||
|
||||
input_blocks[first_ct_id * block_size + coef_id] = result;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__global__ void calculate_final_chunk_into_radix(
|
||||
Torus *const out_radix, const Torus *const input_blocks,
|
||||
const uint32_t *const *const columns, const uint32_t *const columns_counter,
|
||||
const uint32_t chunk_size, const uint32_t block_size) {
|
||||
|
||||
const uint32_t part_size = blockDim.x;
|
||||
const uint32_t base_id = blockIdx.x;
|
||||
const uint32_t part_id = blockIdx.y;
|
||||
const uint32_t coef_id = part_id * part_size + threadIdx.x;
|
||||
|
||||
if (coef_id >= block_size)
|
||||
return;
|
||||
|
||||
const uint32_t column_len = columns_counter[base_id];
|
||||
|
||||
Torus result = 0;
|
||||
if (column_len) {
|
||||
const uint32_t first_ct_id = columns[base_id][0];
|
||||
result = input_blocks[first_ct_id * block_size + coef_id];
|
||||
|
||||
for (uint32_t i = 1; i < column_len; ++i) {
|
||||
const uint32_t cur_ct_it = columns[base_id][i];
|
||||
result += input_blocks[cur_ct_it * block_size + coef_id];
|
||||
}
|
||||
}
|
||||
out_radix[base_id * block_size + coef_id] = result;
|
||||
}
|
||||
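In words: once every column holds at most chunk_size blocks, this kernel adds whatever remains in column base_id coefficient by coefficient straight into output block base_id, and an empty column simply writes zeros. For instance, a column whose remaining block indexes are {4, 9} produces out_radix block base_id as the coefficient-wise sum of blocks 4 and 9 of input_blocks.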
|
||||
template <typename Torus, class params>
|
||||
@@ -167,6 +272,113 @@ __global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
|
||||
(process_msb) ? cur_msb_ct[params::degree] : 0;
|
||||
}
|
||||
}
|
||||
|
||||
struct radix_columns {
|
||||
std::vector<size_t> columns_counter;
|
||||
size_t num_blocks;
|
||||
size_t num_radix_in_vec;
|
||||
size_t chunk_size;
|
||||
radix_columns(const uint64_t *const input_degrees, size_t num_blocks,
|
||||
size_t num_radix_in_vec, size_t chunk_size,
|
||||
bool &needs_processing)
|
||||
: num_blocks(num_blocks), num_radix_in_vec(num_radix_in_vec),
|
||||
chunk_size(chunk_size) {
|
||||
needs_processing = false;
|
||||
columns_counter.resize(num_blocks, 0);
|
||||
for (size_t i = 0; i < num_radix_in_vec; ++i) {
|
||||
for (size_t j = 0; j < num_blocks; ++j) {
|
||||
if (input_degrees[i * num_blocks + j])
|
||||
columns_counter[j] += 1;
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < num_blocks; ++i) {
|
||||
if (columns_counter[i] > chunk_size) {
|
||||
needs_processing = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void next_accumulation(size_t &total_ciphertexts, size_t &message_ciphertexts,
|
||||
bool &needs_processing) {
|
||||
message_ciphertexts = 0;
|
||||
total_ciphertexts = 0;
|
||||
needs_processing = false;
|
||||
for (int i = num_blocks - 1; i > 0; --i) {
|
||||
size_t cur_count = columns_counter[i];
|
||||
size_t prev_count = columns_counter[i - 1];
|
||||
size_t new_count = 0;
|
||||
|
||||
// accumulated_blocks from current columns
|
||||
new_count += cur_count / chunk_size;
|
||||
// all accumulated message blocks need a pbs
|
||||
message_ciphertexts += new_count;
|
||||
// carry blocks from previous columns
|
||||
new_count += prev_count / chunk_size;
|
||||
// both carry and message blocks that need a pbs
|
||||
total_ciphertexts += new_count;
|
||||
// now add the remaining non-accumulated blocks that do not require a pbs
|
||||
new_count += cur_count % chunk_size;
|
||||
|
||||
columns_counter[i] = new_count;
|
||||
|
||||
if (new_count > chunk_size)
|
||||
needs_processing = true;
|
||||
}
|
||||
|
||||
// now do it for 0th block
|
||||
size_t new_count = columns_counter[0] / chunk_size;
|
||||
message_ciphertexts += new_count;
|
||||
total_ciphertexts += new_count;
|
||||
new_count += columns_counter[0] % chunk_size;
|
||||
columns_counter[0] = new_count;
|
||||
|
||||
if (new_count > chunk_size) {
|
||||
needs_processing = true;
|
||||
}
|
||||
}
|
||||
};
|
||||
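As a concrete count (illustrative numbers): with chunk_size = 5, a column currently holding 12 blocks whose left neighbour holds 7 blocks is updated to 12 / 5 = 2 message blocks (counted in message_ciphertexts), plus 7 / 5 = 1 carry block received from the neighbour, plus 12 % 5 = 2 leftover blocks, i.e. 5 blocks in total; the 2 + 1 = 3 freshly accumulated blocks are the ones that need a PBS and are counted in total_ciphertexts.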
|
||||
inline void calculate_final_degrees(uint64_t *const out_degrees,
|
||||
const uint64_t *const input_degrees,
|
||||
size_t num_blocks, size_t num_radix_in_vec,
|
||||
size_t chunk_size,
|
||||
uint64_t message_modulus) {
|
||||
|
||||
auto get_degree = [message_modulus](uint64_t degree) -> uint64_t {
|
||||
return std::min(message_modulus - 1, degree);
|
||||
};
|
||||
std::vector<std::queue<uint64_t>> columns(num_blocks);
|
||||
for (size_t i = 0; i < num_radix_in_vec; ++i) {
|
||||
for (size_t j = 0; j < num_blocks; ++j) {
|
||||
if (input_degrees[i * num_blocks + j])
|
||||
columns[j].push(input_degrees[i * num_blocks + j]);
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < num_blocks; ++i) {
|
||||
auto &col = columns[i];
|
||||
while (col.size() > 1) {
|
||||
uint32_t cur_degree = 0;
|
||||
size_t mn = std::min(chunk_size, col.size());
|
||||
for (int j = 0; j < mn; ++j) {
|
||||
cur_degree += col.front();
|
||||
col.pop();
|
||||
}
|
||||
const uint64_t new_degree = get_degree(cur_degree);
|
||||
col.push(new_degree);
|
||||
if ((i + 1) < num_blocks) {
|
||||
columns[i + 1].push(new_degree);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < num_blocks; i++) {
|
||||
out_degrees[i] = (columns[i].empty()) ? 0 : columns[i].front();
|
||||
}
|
||||
}
|
||||
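A host-side usage sketch of calculate_final_degrees (the values are illustrative, and the call assumes the helper above is in scope):

#include <cstdint>

int main() {
  // 3 radix ciphertexts of 2 blocks each, every block of degree 3 (message_modulus = 4)
  const uint64_t in_degrees[6] = {3, 3, 3, 3, 3, 3};
  uint64_t out_degrees[2] = {0, 0};
  calculate_final_degrees(out_degrees, in_degrees, /*num_blocks=*/2,
                          /*num_radix_in_vec=*/3, /*chunk_size=*/5,
                          /*message_modulus=*/4);
  // every column saturates at message_modulus - 1, so out_degrees == {3, 3}
  return 0;
}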
|
||||
template <typename Torus>
|
||||
__host__ uint64_t scratch_cuda_integer_partial_sum_ciphertexts_vec_kb(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
@@ -185,11 +397,14 @@ template <typename Torus, class params>
|
||||
__host__ void host_integer_partial_sum_ciphertexts_vec_kb(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
CudaRadixCiphertextFFI *terms, void *const *bsks, uint64_t *const *ksks,
|
||||
CudaRadixCiphertextFFI *terms,
|
||||
bool reduce_degrees_for_single_carry_propagation, void *const *bsks,
|
||||
uint64_t *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr,
|
||||
uint32_t num_radix_blocks, uint32_t num_radix_in_vec,
|
||||
int_radix_lut<Torus> *reused_lut) {
|
||||
uint32_t num_radix_blocks, uint32_t num_radix_in_vec) {
|
||||
auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension;
|
||||
auto big_lwe_size = big_lwe_dimension + 1;
|
||||
|
||||
if (terms->lwe_dimension != radix_lwe_out->lwe_dimension)
|
||||
PANIC("Cuda error: output and input radix ciphertexts should have the same "
|
||||
@@ -199,22 +414,29 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
|
||||
PANIC("Cuda error: input vector does not have enough blocks")
|
||||
if (num_radix_blocks > radix_lwe_out->num_radix_blocks)
|
||||
PANIC("Cuda error: output does not have enough blocks")
|
||||
auto new_blocks = mem_ptr->new_blocks;
|
||||
auto new_blocks_copy = mem_ptr->new_blocks_copy;
|
||||
auto old_blocks = mem_ptr->old_blocks;
|
||||
if (num_radix_in_vec == 0)
|
||||
return;
|
||||
|
||||
auto current_blocks = mem_ptr->current_blocks;
|
||||
auto small_lwe_vector = mem_ptr->small_lwe_vector;
|
||||
auto d_degrees = mem_ptr->d_degrees;
|
||||
auto d_columns = mem_ptr->d_columns;
|
||||
auto d_columns_counter = mem_ptr->d_columns_counter;
|
||||
auto d_new_columns = mem_ptr->d_new_columns;
|
||||
auto d_new_columns_counter = mem_ptr->d_new_columns_counter;
|
||||
auto d_pbs_indexes_in = mem_ptr->luts_message_carry->lwe_indexes_in;
|
||||
auto d_pbs_indexes_out = mem_ptr->luts_message_carry->lwe_indexes_out;
|
||||
|
||||
auto d_smart_copy_in = mem_ptr->d_smart_copy_in;
|
||||
auto d_smart_copy_out = mem_ptr->d_smart_copy_out;
|
||||
auto luts_message_carry = mem_ptr->luts_message_carry;
|
||||
|
||||
auto message_modulus = mem_ptr->params.message_modulus;
|
||||
auto carry_modulus = mem_ptr->params.carry_modulus;
|
||||
auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension;
|
||||
auto big_lwe_size = big_lwe_dimension + 1;
|
||||
auto glwe_dimension = mem_ptr->params.glwe_dimension;
|
||||
auto polynomial_size = mem_ptr->params.polynomial_size;
|
||||
auto small_lwe_dimension = mem_ptr->params.small_lwe_dimension;
|
||||
auto small_lwe_size = small_lwe_dimension + 1;
|
||||
auto chunk_size =
|
||||
(mem_ptr->params.message_modulus * mem_ptr->params.carry_modulus - 1) /
|
||||
(mem_ptr->params.message_modulus - 1);
|
||||
|
||||
size_t total_blocks_in_vec = num_radix_blocks * num_radix_in_vec;
|
||||
|
||||
// In the case of extracting a single LWE these parameters are dummy
|
||||
uint32_t num_many_lut = 1;
|
||||
@@ -228,244 +450,166 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
|
||||
terms, 0, num_radix_blocks);
|
||||
return;
|
||||
}
|
||||
if (old_blocks != terms) {
|
||||
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], old_blocks,
|
||||
terms);
|
||||
}
|
||||
|
||||
if (num_radix_in_vec == 2) {
|
||||
CudaRadixCiphertextFFI old_blocks_slice;
|
||||
as_radix_ciphertext_slice<Torus>(&old_blocks_slice, old_blocks,
|
||||
num_radix_blocks, 2 * num_radix_blocks);
|
||||
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
|
||||
&old_blocks_slice, num_radix_blocks);
|
||||
CudaRadixCiphertextFFI terms_slice;
|
||||
as_radix_ciphertext_slice<Torus>(&terms_slice, terms, num_radix_blocks,
|
||||
2 * num_radix_blocks);
|
||||
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, terms,
|
||||
&terms_slice, num_radix_blocks);
|
||||
return;
|
||||
}
|
||||
|
||||
size_t r = num_radix_in_vec;
|
||||
size_t total_modulus = message_modulus * carry_modulus;
|
||||
size_t message_max = message_modulus - 1;
|
||||
size_t chunk_size = (total_modulus - 1) / message_max;
|
||||
|
||||
size_t h_lwe_idx_in[terms->num_radix_blocks];
|
||||
size_t h_lwe_idx_out[terms->num_radix_blocks];
|
||||
int32_t h_smart_copy_in[terms->num_radix_blocks];
|
||||
int32_t h_smart_copy_out[terms->num_radix_blocks];
|
||||
|
||||
/// Here it is important to query the default max shared memory on device 0
|
||||
/// instead of cuda_get_max_shared_memory,
|
||||
/// to avoid bugs with tree_add_chunks trying to use too much shared memory
|
||||
auto max_shared_memory = 0;
|
||||
check_cuda_error(cudaDeviceGetAttribute(
|
||||
&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock, 0));
|
||||
|
||||
// create lut object for message and carry
|
||||
// we allocate luts_message_carry in the host function (instead of scratch)
|
||||
// to reduce average memory consumption
|
||||
int_radix_lut<Torus> *luts_message_carry;
|
||||
size_t ch_amount = r / chunk_size;
|
||||
if (!ch_amount)
|
||||
ch_amount++;
|
||||
if (reused_lut == nullptr) {
|
||||
luts_message_carry = new int_radix_lut<Torus>(
|
||||
streams, gpu_indexes, gpu_count, mem_ptr->params, 2,
|
||||
2 * ch_amount * num_radix_blocks, true, nullptr);
|
||||
} else {
|
||||
luts_message_carry = new int_radix_lut<Torus>(
|
||||
streams, gpu_indexes, gpu_count, mem_ptr->params, 2,
|
||||
2 * ch_amount * num_radix_blocks, reused_lut, true, nullptr);
|
||||
if (mem_ptr->mem_reuse) {
|
||||
mem_ptr->setup_lookup_tables(streams, gpu_indexes, gpu_count);
|
||||
}
|
||||
auto message_acc = luts_message_carry->get_lut(0, 0);
|
||||
auto carry_acc = luts_message_carry->get_lut(0, 1);
|
||||
|
||||
// define functions for each accumulator
|
||||
auto lut_f_message = [message_modulus](Torus x) -> Torus {
|
||||
return x % message_modulus;
|
||||
};
|
||||
auto lut_f_carry = [message_modulus](Torus x) -> Torus {
|
||||
return x / message_modulus;
|
||||
};
|
||||
if (current_blocks != terms) {
|
||||
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
|
||||
current_blocks, terms);
|
||||
}
|
||||
|
||||
// generate accumulators
|
||||
generate_device_accumulator<Torus>(
|
||||
streams[0], gpu_indexes[0], message_acc,
|
||||
luts_message_carry->get_degree(0), luts_message_carry->get_max_degree(0),
|
||||
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
|
||||
lut_f_message, true);
|
||||
generate_device_accumulator<Torus>(
|
||||
streams[0], gpu_indexes[0], carry_acc, luts_message_carry->get_degree(1),
|
||||
luts_message_carry->get_max_degree(1), glwe_dimension, polynomial_size,
|
||||
message_modulus, carry_modulus, lut_f_carry, true);
|
||||
luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
|
||||
cuda_memcpy_async_to_gpu(d_degrees, current_blocks->degrees,
|
||||
total_blocks_in_vec * sizeof(uint64_t), streams[0],
|
||||
gpu_indexes[0]);
|
||||
|
||||
while (r > 2) {
|
||||
size_t cur_total_blocks = r * num_radix_blocks;
|
||||
size_t ch_amount = r / chunk_size;
|
||||
if (!ch_amount)
|
||||
ch_amount++;
|
||||
dim3 add_grid(ch_amount, num_radix_blocks, 1);
|
||||
cuda_set_device(gpu_indexes[0]);
|
||||
radix_vec_to_columns<<<1, num_radix_blocks, 0, streams[0]>>>(
|
||||
d_columns, d_columns_counter, d_degrees, num_radix_blocks,
|
||||
num_radix_in_vec);
|
||||
|
||||
cuda_set_device(gpu_indexes[0]);
|
||||
tree_add_chunks<Torus><<<add_grid, 512, 0, streams[0]>>>(
|
||||
(Torus *)new_blocks->ptr, (Torus *)old_blocks->ptr,
|
||||
std::min(r, chunk_size), big_lwe_size, num_radix_blocks);
|
||||
bool needs_processing = false;
|
||||
radix_columns current_columns(current_blocks->degrees, num_radix_blocks,
|
||||
num_radix_in_vec, chunk_size, needs_processing);
|
||||
int number_of_threads = min(256, params::degree);
|
||||
int part_count = (big_lwe_size + number_of_threads - 1) / number_of_threads;
|
||||
const dim3 number_of_blocks_2d(num_radix_blocks, part_count, 1);
|
||||
|
||||
check_cuda_error(cudaGetLastError());
|
||||
while (needs_processing) {
|
||||
calculate_chunks<Torus>
|
||||
<<<number_of_blocks_2d, number_of_threads, 0, streams[0]>>>(
|
||||
(Torus *)(current_blocks->ptr), d_columns, d_columns_counter,
|
||||
chunk_size, big_lwe_size);
|
||||
|
||||
size_t total_count = 0;
|
||||
size_t message_count = 0;
|
||||
size_t carry_count = 0;
|
||||
size_t sm_copy_count = 0;
|
||||
prepare_new_columns_and_pbs_indexes<<<1, num_radix_blocks, 0, streams[0]>>>(
|
||||
d_new_columns, d_new_columns_counter, d_pbs_indexes_in,
|
||||
d_pbs_indexes_out, luts_message_carry->get_lut_indexes(0, 0), d_columns,
|
||||
d_columns_counter, chunk_size);
|
||||
|
||||
generate_ids_update_degrees(
|
||||
terms->degrees, h_lwe_idx_in, h_lwe_idx_out, h_smart_copy_in,
|
||||
h_smart_copy_out, ch_amount, r, num_radix_blocks, chunk_size,
|
||||
message_max, total_count, message_count, carry_count, sm_copy_count);
|
||||
auto lwe_indexes_in = luts_message_carry->lwe_indexes_in;
|
||||
auto lwe_indexes_out = luts_message_carry->lwe_indexes_out;
|
||||
luts_message_carry->set_lwe_indexes(streams[0], gpu_indexes[0],
|
||||
h_lwe_idx_in, h_lwe_idx_out);
|
||||
size_t total_ciphertexts;
|
||||
size_t total_messages;
|
||||
current_columns.next_accumulation(total_ciphertexts, total_messages,
|
||||
needs_processing);
|
||||
|
||||
size_t copy_size = sm_copy_count * sizeof(int32_t);
|
||||
cuda_memcpy_async_to_gpu(d_smart_copy_in, h_smart_copy_in, copy_size,
|
||||
streams[0], gpu_indexes[0]);
|
||||
cuda_memcpy_async_to_gpu(d_smart_copy_out, h_smart_copy_out, copy_size,
|
||||
streams[0], gpu_indexes[0]);
|
||||
|
||||
// inside d_smart_copy_in there are only -1 values
|
||||
// it's fine to call smart_copy with same pointer
|
||||
// as source and destination
|
||||
copy_radix_ciphertext_slice_async<Torus>(
|
||||
streams[0], gpu_indexes[0], new_blocks_copy, 0, r * num_radix_blocks,
|
||||
new_blocks, 0, r * num_radix_blocks);
|
||||
smart_copy<Torus><<<sm_copy_count, 1024, 0, streams[0]>>>(
|
||||
(Torus *)new_blocks->ptr, (Torus *)new_blocks_copy->ptr,
|
||||
d_smart_copy_out, d_smart_copy_in, big_lwe_size);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
if (carry_count > 0)
|
||||
cuda_set_value_async<Torus>(
|
||||
streams[0], gpu_indexes[0],
|
||||
luts_message_carry->get_lut_indexes(0, message_count), 1,
|
||||
carry_count);
|
||||
|
||||
luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
|
||||
|
||||
/// For multi GPU execution we create vectors of pointers for inputs and
|
||||
/// outputs
|
||||
std::vector<Torus *> new_blocks_vec = luts_message_carry->lwe_array_in_vec;
|
||||
std::vector<Torus *> small_lwe_vector_vec =
|
||||
luts_message_carry->lwe_after_ks_vec;
std::vector<Torus *> lwe_after_pbs_vec =
luts_message_carry->lwe_after_pbs_vec;
std::vector<Torus *> lwe_trivial_indexes_vec =
luts_message_carry->lwe_trivial_indexes_vec;

auto active_gpu_count = get_active_gpu_count(total_count, gpu_count);
auto active_gpu_count = get_active_gpu_count(total_ciphertexts, gpu_count);
if (active_gpu_count == 1) {
/// Apply KS to go from a big LWE dimension to a small LWE dimension
/// After this keyswitch execution, we need to synchronize the streams
/// because the keyswitch and PBS do not operate on the same number of
/// inputs
execute_keyswitch_async<Torus>(
streams, gpu_indexes, 1, (Torus *)small_lwe_vector->ptr,
lwe_indexes_in, (Torus *)new_blocks->ptr, lwe_indexes_in, ksks,
polynomial_size * glwe_dimension, small_lwe_dimension,
mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, message_count);

/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs_async<Torus>(
streams, gpu_indexes, 1, (Torus *)new_blocks->ptr, lwe_indexes_out,
luts_message_carry->lut_vec, luts_message_carry->lut_indexes_vec,
(Torus *)small_lwe_vector->ptr, lwe_indexes_in, bsks,
ms_noise_reduction_key, luts_message_carry->buffer, glwe_dimension,
small_lwe_dimension, polynomial_size, mem_ptr->params.pbs_base_log,
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
total_count, mem_ptr->params.pbs_type, num_many_lut, lut_stride);
} else {
cuda_synchronize_stream(streams[0], gpu_indexes[0]);

multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, new_blocks_vec,
(Torus *)new_blocks->ptr, luts_message_carry->h_lwe_indexes_in,
luts_message_carry->using_trivial_lwe_indexes, message_count,
big_lwe_size);

/// Apply KS to go from a big LWE dimension to a small LWE dimension
/// After this keyswitch execution, we need to synchronize the streams
/// because the keyswitch and PBS do not operate on the same number of
/// inputs
execute_keyswitch_async<Torus>(
streams, gpu_indexes, active_gpu_count, small_lwe_vector_vec,
lwe_trivial_indexes_vec, new_blocks_vec, lwe_trivial_indexes_vec,
d_pbs_indexes_in, (Torus *)current_blocks->ptr, d_pbs_indexes_in,
ksks, big_lwe_dimension, small_lwe_dimension,
mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, total_count);
mem_ptr->params.ks_base_log, mem_ptr->params.ks_level,
total_messages);

/// Copy data back to GPU 0, rebuild the lwe array, and scatter again on a
/// different configuration
multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, gpu_count, (Torus *)small_lwe_vector->ptr,
small_lwe_vector_vec, luts_message_carry->h_lwe_indexes_in,
luts_message_carry->using_trivial_lwe_indexes, message_count,
small_lwe_size);
/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}

multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, gpu_count, small_lwe_vector_vec,
(Torus *)small_lwe_vector->ptr, luts_message_carry->h_lwe_indexes_in,
luts_message_carry->using_trivial_lwe_indexes, total_count,
small_lwe_size);

/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, luts_message_carry->lut_vec,
luts_message_carry->lut_indexes_vec, small_lwe_vector_vec,
lwe_trivial_indexes_vec, bsks, ms_noise_reduction_key,
streams, gpu_indexes, 1, (Torus *)current_blocks->ptr,
d_pbs_indexes_out, luts_message_carry->lut_vec,
luts_message_carry->lut_indexes_vec, (Torus *)small_lwe_vector->ptr,
d_pbs_indexes_in, bsks, ms_noise_reduction_key,
luts_message_carry->buffer, glwe_dimension, small_lwe_dimension,
polynomial_size, mem_ptr->params.pbs_base_log,
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
total_count, mem_ptr->params.pbs_type, num_many_lut, lut_stride);
total_ciphertexts, mem_ptr->params.pbs_type, num_many_lut,
lut_stride);
} else {
cuda_memcpy_async_to_cpu(luts_message_carry->h_lwe_indexes_in,
luts_message_carry->lwe_indexes_in,
total_ciphertexts * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_memcpy_async_to_cpu(luts_message_carry->h_lwe_indexes_out,
luts_message_carry->lwe_indexes_out,
total_ciphertexts * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);

multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, (Torus *)new_blocks->ptr,
lwe_after_pbs_vec, luts_message_carry->h_lwe_indexes_out,
luts_message_carry->using_trivial_lwe_indexes, total_count,
big_lwe_size);
/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
luts_message_carry->using_trivial_lwe_indexes = false;
luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);

integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, active_gpu_count, current_blocks,
current_blocks, bsks, ksks, ms_noise_reduction_key,
luts_message_carry, total_ciphertexts);
}
for (uint i = 0; i < total_count; i++) {
auto degrees_index = luts_message_carry->h_lut_indexes[i];
new_blocks->degrees[i] = luts_message_carry->degrees[degrees_index];
new_blocks->noise_levels[i] = NoiseLevel::NOMINAL;
}

int rem_blocks = (r > chunk_size) ? r % chunk_size * num_radix_blocks : 0;
int new_blocks_created = 2 * ch_amount * num_radix_blocks;

if (rem_blocks > 0)
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], new_blocks, new_blocks_created,
new_blocks_created + rem_blocks, old_blocks,
cur_total_blocks - rem_blocks, cur_total_blocks);
std::swap(new_blocks, old_blocks);
r = (new_blocks_created + rem_blocks) / num_radix_blocks;
cuda_set_device(gpu_indexes[0]);
std::swap(d_columns, d_new_columns);
std::swap(d_columns_counter, d_new_columns_counter);
}
luts_message_carry->release(streams, gpu_indexes, gpu_count);
delete (luts_message_carry);

CudaRadixCiphertextFFI old_blocks_slice;
as_radix_ciphertext_slice<Torus>(&old_blocks_slice, old_blocks,
num_radix_blocks, 2 * num_radix_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
&old_blocks_slice, num_radix_blocks);
calculate_final_chunk_into_radix<Torus>
<<<number_of_blocks_2d, number_of_threads, 0, streams[0]>>>(
(Torus *)(radix_lwe_out->ptr), (Torus *)(current_blocks->ptr),
d_columns, d_columns_counter, chunk_size, big_lwe_size);

if (reduce_degrees_for_single_carry_propagation) {
prepare_final_pbs_indexes<Torus>
<<<1, 2 * num_radix_blocks, 0, streams[0]>>>(
d_pbs_indexes_in, d_pbs_indexes_out,
luts_message_carry->get_lut_indexes(0, 0), num_radix_blocks);

cuda_memset_async(
(Torus *)(current_blocks->ptr) + big_lwe_size * num_radix_blocks, 0,
big_lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]);

auto active_gpu_count =
get_active_gpu_count(2 * num_radix_blocks, gpu_count);

if (active_gpu_count == 1) {
execute_keyswitch_async<Torus>(
streams, gpu_indexes, 1, (Torus *)small_lwe_vector->ptr,
d_pbs_indexes_in, (Torus *)radix_lwe_out->ptr, d_pbs_indexes_in, ksks,
big_lwe_dimension, small_lwe_dimension, mem_ptr->params.ks_base_log,
mem_ptr->params.ks_level, num_radix_blocks);

execute_pbs_async<Torus>(
streams, gpu_indexes, 1, (Torus *)current_blocks->ptr,
d_pbs_indexes_out, luts_message_carry->lut_vec,
luts_message_carry->lut_indexes_vec, (Torus *)small_lwe_vector->ptr,
d_pbs_indexes_in, bsks, ms_noise_reduction_key,
luts_message_carry->buffer, glwe_dimension, small_lwe_dimension,
polynomial_size, mem_ptr->params.pbs_base_log,
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
2 * num_radix_blocks, mem_ptr->params.pbs_type, num_many_lut,
lut_stride);
} else {
cuda_memcpy_async_to_cpu(luts_message_carry->h_lwe_indexes_in,
luts_message_carry->lwe_indexes_in,
2 * num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_memcpy_async_to_cpu(luts_message_carry->h_lwe_indexes_out,
luts_message_carry->lwe_indexes_out,
2 * num_radix_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);

luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
luts_message_carry->using_trivial_lwe_indexes = false;

integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, active_gpu_count, current_blocks, radix_lwe_out,
bsks, ksks, ms_noise_reduction_key, luts_message_carry,
2 * num_radix_blocks);
}
calculate_final_degrees(radix_lwe_out->degrees, terms->degrees,
num_radix_blocks, num_radix_in_vec, chunk_size,
mem_ptr->params.message_modulus);
cuda_set_device(gpu_indexes[0]);
CudaRadixCiphertextFFI current_blocks_slice;
as_radix_ciphertext_slice<Torus>(&current_blocks_slice, current_blocks,
num_radix_blocks, 2 * num_radix_blocks);

host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out,
current_blocks, &current_blocks_slice,
num_radix_blocks);
}
}
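
The reduction above can be summarized on clear values: rows of radix blocks are summed column by column in chunks, each column sum is split by the message and carry LUTs into a message row and a carry row shifted one block towards the most significant end, and the loop repeats until only two rows remain, which are added without propagating carries. The Rust sketch below models exactly that on plain integers; the function names, the chunk size and the parameters are assumptions for the illustration, not values taken from the backend.

// Clear-value model of the chunked partial-sum reduction (illustrative only).
// chunk_size must be at least 3 for the row count to shrink on every pass.
fn partial_sum_clear(mut rows: Vec<Vec<u64>>, message_modulus: u64, chunk_size: usize) -> Vec<u64> {
    let num_blocks = rows[0].len();
    while rows.len() > 2 {
        let mut next: Vec<Vec<u64>> = Vec::new();
        for chunk in rows.chunks(chunk_size) {
            if chunk.len() == 1 {
                // A leftover row is forwarded untouched, like rem_blocks above.
                next.push(chunk[0].clone());
                continue;
            }
            // Column-wise sum of the chunk; values may exceed message_modulus.
            let mut col_sum = vec![0u64; num_blocks];
            for row in chunk {
                for (acc, b) in col_sum.iter_mut().zip(row.iter()) {
                    *acc += *b;
                }
            }
            // "Message" LUT keeps the low digit of every column.
            let msg: Vec<u64> = col_sum.iter().map(|v| v % message_modulus).collect();
            // "Carry" LUT keeps the high digits, shifted one block towards the MSB;
            // the carry leaving the top block is dropped (radix arithmetic is modular).
            let mut carry = vec![0u64; num_blocks];
            for i in 1..num_blocks {
                carry[i] = col_sum[i - 1] / message_modulus;
            }
            next.push(msg);
            next.push(carry);
        }
        rows = next;
    }
    // Final addition of the two remaining rows, without carry propagation,
    // mirroring the non-propagated result returned by sum_ciphertexts.
    let mut out = vec![0u64; num_blocks];
    for row in &rows {
        for (acc, b) in out.iter_mut().zip(row.iter()) {
            *acc += *b;
        }
    }
    out
}

// Decode a block vector by propagating carries and weighting by message_modulus^i.
fn decode(blocks: &[u64], message_modulus: u64) -> u64 {
    let mut carry = 0u64;
    let mut value = 0u64;
    let mut weight = 1u64;
    for b in blocks {
        let v = *b + carry;
        value += (v % message_modulus) * weight;
        carry = v / message_modulus;
        weight *= message_modulus;
    }
    value
}

fn main() {
    // Five 4-block values with 2-bit messages (message_modulus = 4, chunk_size = 4).
    let rows = vec![
        vec![3, 2, 1, 0],
        vec![1, 3, 3, 2],
        vec![2, 2, 0, 1],
        vec![3, 3, 3, 3],
        vec![0, 1, 2, 3],
    ];
    let expected: u64 = rows.iter().map(|r| decode(r, 4)).sum::<u64>() % 4u64.pow(4);
    let reduced = partial_sum_clear(rows, 4, 4);
    assert_eq!(decode(&reduced, 4) % 4u64.pow(4), expected);
    println!("reduced blocks = {:?}", reduced);
}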

template <typename Torus, class params>
@@ -600,9 +744,9 @@ __host__ void host_integer_mult_radix_kb(
terms_degree_msb[i] = (b_id > r_id) ? message_modulus - 2 : 0;
}
host_integer_partial_sum_ciphertexts_vec_kb<Torus, params>(
streams, gpu_indexes, gpu_count, radix_lwe_out, vector_result_sb, bsks,
ksks, ms_noise_reduction_key, mem_ptr->sum_ciphertexts_mem, num_blocks,
2 * num_blocks, mem_ptr->luts_array);
streams, gpu_indexes, gpu_count, radix_lwe_out, vector_result_sb, true,
bsks, ksks, ms_noise_reduction_key, mem_ptr->sum_ciphertexts_mem,
num_blocks, 2 * num_blocks);

auto scp_mem_ptr = mem_ptr->sc_prop_mem;
uint32_t requested_flag = outputFlag::FLAG_NONE;

@@ -6,7 +6,8 @@ uint64_t scratch_cuda_integer_scalar_mul_kb_64(
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory, bool allocate_ms_array) {
PBS_TYPE pbs_type, uint32_t num_scalar_bits, bool allocate_gpu_memory,
bool allocate_ms_array) {

int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
@@ -17,7 +18,7 @@ uint64_t scratch_cuda_integer_scalar_mul_kb_64(
return scratch_cuda_integer_radix_scalar_mul_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_scalar_mul_buffer<uint64_t> **)mem_ptr, num_blocks, params,
allocate_gpu_memory);
num_scalar_bits, allocate_gpu_memory);
}

uint64_t scratch_cuda_integer_radix_scalar_mul_high_kb_64(
@@ -26,8 +27,8 @@ uint64_t scratch_cuda_integer_radix_scalar_mul_high_kb_64(
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool anticipated_buffer_drop, bool allocate_gpu_memory,
bool allocate_ms_array) {
PBS_TYPE pbs_type, uint32_t num_scalar_bits, bool anticipated_buffer_drop,
bool allocate_gpu_memory, bool allocate_ms_array) {

int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
@@ -38,7 +39,7 @@ uint64_t scratch_cuda_integer_radix_scalar_mul_high_kb_64(
return scratch_cuda_integer_radix_scalar_mul_high_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_scalar_mul_high<uint64_t> **)mem_ptr, num_blocks, params,
anticipated_buffer_drop, allocate_gpu_memory);
num_scalar_bits, anticipated_buffer_drop, allocate_gpu_memory);
}

void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(

@@ -33,12 +33,12 @@ __host__ uint64_t scratch_cuda_integer_radix_scalar_mul_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_scalar_mul_buffer<T> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
bool allocate_gpu_memory) {
uint32_t num_scalar_bits, bool allocate_gpu_memory) {

uint64_t size_tracker = 0;
*mem_ptr = new int_scalar_mul_buffer<T>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
allocate_gpu_memory, true, &size_tracker);
num_scalar_bits, allocate_gpu_memory, true, &size_tracker);
return size_tracker;
}

@@ -116,13 +116,10 @@ __host__ void host_integer_scalar_mul_radix(
set_zero_radix_ciphertext_slice_async<T>(streams[0], gpu_indexes[0],
lwe_array, 0, num_radix_blocks);
} else {
for (int i = 0; i < j * num_radix_blocks; i++) {
all_shifted_buffer->degrees[i] = message_modulus - 1;
}
host_integer_partial_sum_ciphertexts_vec_kb<T, params>(
streams, gpu_indexes, gpu_count, lwe_array, all_shifted_buffer, bsks,
ksks, ms_noise_reduction_key, mem->sum_ciphertexts_vec_mem,
num_radix_blocks, j, nullptr);
streams, gpu_indexes, gpu_count, lwe_array, all_shifted_buffer, true,
bsks, ksks, ms_noise_reduction_key, mem->sum_ciphertexts_vec_mem,
num_radix_blocks, j);

auto scp_mem_ptr = mem->sc_prop_mem;
uint32_t requested_flag = outputFlag::FLAG_NONE;
@@ -177,13 +174,15 @@ __host__ uint64_t scratch_cuda_integer_radix_scalar_mul_high_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_scalar_mul_high<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
bool anticipated_buffer_drop, bool allocate_gpu_memory) {
uint32_t num_scalar_bits, bool anticipated_buffer_drop,
bool allocate_gpu_memory) {

uint64_t size_tracker = 0;

*mem_ptr = new int_scalar_mul_high<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
allocate_gpu_memory, LEFT_SHIFT, anticipated_buffer_drop, &size_tracker);
allocate_gpu_memory, LEFT_SHIFT, num_scalar_bits, anticipated_buffer_drop,
&size_tracker);

return size_tracker;
}

@@ -38,17 +38,19 @@ template <typename T> void print_debug(const char *name, const T *src, int N) {
}

template <typename T>
__global__ void print_body_kernel(T *src, int N, int lwe_dimension) {
__global__ void print_body_kernel(T *src, int N, int lwe_dimension, T delta) {
for (int i = 0; i < N; i++) {
printf("%lu, ", src[i * (lwe_dimension + 1) + lwe_dimension]);
T body = src[i * (lwe_dimension + 1) + lwe_dimension];
T clear = body / delta;
printf("(%lu, %lu), ", body, clear);
}
}

template <typename T>
void print_body(const char *name, T *src, int n, int lwe_dimension) {
void print_body(const char *name, T *src, int n, int lwe_dimension, T delta) {
printf("%s: ", name);
cudaDeviceSynchronize();
print_body_kernel<<<1, 1>>>(src, n, lwe_dimension);
print_body_kernel<<<1, 1>>>(src, n, lwe_dimension, delta);
cudaDeviceSynchronize();
printf("\n");
}
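
For a trivial LWE ciphertext (zero mask), the body coefficient is the plaintext scaled by delta plus a small noise term, which is why dividing the body by delta in the debug kernel above recovers the encoded digit. The short Rust sketch below shows this on clear values; the delta and noise values are assumptions chosen for the example, not parameters read from the library.

// Clear-value illustration of the (body, clear) pair printed by print_body_kernel.
fn main() {
    // Assumed example scaling: 64-bit torus, 4-bit plaintext space plus one
    // padding bit, so delta = 2^59.
    let delta: u64 = 1u64 << 59;
    let plaintext: u64 = 3;
    let noise: u64 = 1 << 40; // small compared to delta
    // With a zero mask the body is m * delta + e.
    let body = plaintext * delta + noise;
    // Straight division, as done in the kernel above.
    println!("({}, {})", body, body / delta);
    assert_eq!(body / delta, plaintext);
    // A rounded division is more robust when the noise can be negative-centred.
    let rounded = body.wrapping_add(delta / 2) / delta;
    assert_eq!(rounded, plaintext);
}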

@@ -1018,6 +1018,7 @@ unsafe extern "C" {
gpu_count: u32,
radix_lwe_out: *mut CudaRadixCiphertextFFI,
radix_lwe_vec: *mut CudaRadixCiphertextFFI,
reduce_degrees_for_single_carry_propagation: bool,
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
@@ -1050,6 +1051,7 @@ unsafe extern "C" {
message_modulus: u32,
carry_modulus: u32,
pbs_type: PBS_TYPE,
num_scalar_bits: u32,
allocate_gpu_memory: bool,
allocate_ms_array: bool,
) -> u64;
@@ -1350,6 +1352,7 @@ unsafe extern "C" {
message_modulus: u32,
carry_modulus: u32,
pbs_type: PBS_TYPE,
num_scalar_bits: u32,
anticipated_buffer_drop: bool,
allocate_gpu_memory: bool,
allocate_ms_array: bool,

@@ -354,6 +354,14 @@ pub unsafe fn unchecked_scalar_mul_integer_radix_kb_async<T: UnsignedInteger, B:
&mut lwe_array_degrees,
&mut lwe_array_noise_levels,
);
let msg_bits = message_modulus.0.ilog2() as usize;
let num_blocks = lwe_array.d_blocks.lwe_ciphertext_count().0 as u32;
let num_ciphertext_bits = msg_bits * num_blocks as usize;
let num_scalar_bits = decomposed_scalar
.iter()
.take(num_ciphertext_bits)
.filter(|&&rhs_bit| rhs_bit == T::ONE)
.count() as u32;

scratch_cuda_integer_scalar_mul_kb_64(
streams.ptr.as_ptr(),
@@ -372,6 +380,7 @@ pub unsafe fn unchecked_scalar_mul_integer_radix_kb_async<T: UnsignedInteger, B:
message_modulus.0 as u32,
carry_modulus.0 as u32,
pbs_type as u32,
num_scalar_bits,
true,
allocate_ms_noise_array,
);
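
The num_scalar_bits value forwarded to the scratch call above counts the set bits of the decomposed scalar, capped at the number of bits the ciphertext can actually hold, presumably so the temporary buffers are sized only for the shifted copies that will be computed. A minimal clear-value model of that count, with illustrative inputs:

// Mirror of the counting logic above, on plain values (illustrative only).
fn count_scalar_bits(decomposed_scalar: &[u64], msg_bits: usize, num_blocks: usize) -> u32 {
    let num_ciphertext_bits = msg_bits * num_blocks;
    decomposed_scalar
        .iter()
        .take(num_ciphertext_bits)
        .filter(|&&bit| bit == 1)
        .count() as u32
}

fn main() {
    // Scalar 11 = 0b1011, decomposed LSB first, multiplied onto a 4-block
    // ciphertext with 2-bit message blocks: 3 of the first 8 bits are set.
    let decomposed = vec![1u64, 1, 0, 1];
    assert_eq!(count_scalar_bits(&decomposed, 2, 4), 3);
}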

@@ -402,8 +411,9 @@ pub unsafe fn unchecked_scalar_mul_integer_radix_kb_async<T: UnsignedInteger, B:
}

#[allow(clippy::too_many_arguments)]
pub fn get_scalar_mul_integer_radix_kb_size_on_gpu(
pub fn get_scalar_mul_integer_radix_kb_size_on_gpu<T: UnsignedInteger>(
streams: &CudaStreams,
decomposed_scalar: &[T],
message_modulus: MessageModulus,
carry_modulus: CarryModulus,
glwe_dimension: GlweDimension,
@@ -420,6 +430,14 @@ pub fn get_scalar_mul_integer_radix_kb_size_on_gpu(
) -> u64 {
let allocate_ms_noise_array = noise_reduction_key.is_some();
let mut mem_ptr: *mut i8 = std::ptr::null_mut();
let msg_bits = message_modulus.0.ilog2() as usize;
let num_ciphertext_bits = msg_bits * num_blocks as usize;
let num_scalar_bits = decomposed_scalar
.iter()
.take(num_ciphertext_bits)
.filter(|&&rhs_bit| rhs_bit == T::ONE)
.count() as u32;

let size_tracker = unsafe {
scratch_cuda_integer_scalar_mul_kb_64(
streams.ptr.as_ptr(),
@@ -438,6 +456,7 @@ pub fn get_scalar_mul_integer_radix_kb_size_on_gpu(
message_modulus.0 as u32,
carry_modulus.0 as u32,
pbs_type as u32,
num_scalar_bits,
false,
allocate_ms_noise_array,
)
@@ -2726,9 +2745,15 @@ pub unsafe fn unchecked_scalar_mul_high_integer_radix_kb_async<
has_at_least_one_set[i % msg_bits] = 1;
}
}

let value_rhs: u64 = rhs.cast_into();

let num_ciphertext_bits = msg_bits * num_blocks as usize;
let num_scalar_bits = decomposed_scalar
.iter()
.take(num_ciphertext_bits)
.filter(|&&rhs_bit| rhs_bit == 1)
.count() as u32;

scratch_cuda_integer_radix_scalar_mul_high_kb_64(
streams.ptr.as_ptr(),
streams.gpu_indexes_ptr(),
@@ -2746,6 +2771,7 @@ pub unsafe fn unchecked_scalar_mul_high_integer_radix_kb_async<
message_modulus.0 as u32,
carry_modulus.0 as u32,
pbs_type as u32,
num_scalar_bits,
true,
true,
allocate_ms_noise_array,
@@ -4536,6 +4562,7 @@ pub unsafe fn unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async<
streams: &CudaStreams,
result: &mut CudaRadixCiphertext,
radix_list: &mut CudaRadixCiphertext,
reduce_degrees_for_single_carry_propagation: bool,
bootstrapping_key: &CudaVec<B>,
keyswitch_key: &CudaVec<T>,
message_modulus: MessageModulus,
@@ -4630,6 +4657,7 @@ pub unsafe fn unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async<
streams.len() as u32,
&raw mut cuda_ffi_result,
&raw mut cuda_ffi_radix_list,
reduce_degrees_for_single_carry_propagation,
mem_ptr,
bootstrapping_key.ptr.as_ptr(),
keyswitch_key.ptr.as_ptr(),

@@ -338,6 +338,7 @@ impl CudaServerKey {
&self,
result: &mut T,
ciphertexts: &[T],
reduce_degrees_for_single_carry_propagation: bool,
streams: &CudaStreams,
) {
if ciphertexts.is_empty() {
@@ -378,6 +379,7 @@ impl CudaServerKey {
streams,
result.as_mut(),
&mut terms,
reduce_degrees_for_single_carry_propagation,
&d_bsk.d_vec,
&self.key_switching_key.d_vec,
self.message_modulus,
@@ -403,6 +405,7 @@ impl CudaServerKey {
streams,
result.as_mut(),
&mut terms,
reduce_degrees_for_single_carry_propagation,
&d_multibit_bsk.d_vec,
&self.key_switching_key.d_vec,
self.message_modulus,
@@ -446,7 +449,7 @@ impl CudaServerKey {
streams: &CudaStreams,
) -> T {
let mut result = self
.unchecked_partial_sum_ciphertexts_async(ciphertexts, streams)
.unchecked_partial_sum_ciphertexts_async(ciphertexts, true, streams)
.unwrap();

self.propagate_single_carry_assign_async(&mut result, streams, None, OutputFlag::None);
@@ -459,7 +462,8 @@ impl CudaServerKey {
ciphertexts: &[T],
streams: &CudaStreams,
) -> Option<T> {
let result = unsafe { self.unchecked_partial_sum_ciphertexts_async(ciphertexts, streams) };
let result =
unsafe { self.unchecked_partial_sum_ciphertexts_async(ciphertexts, false, streams) };
streams.synchronize();
result
}
@@ -471,6 +475,7 @@ impl CudaServerKey {
pub unsafe fn unchecked_partial_sum_ciphertexts_async<T: CudaIntegerRadixCiphertext>(
&self,
ciphertexts: &[T],
reduce_degrees_for_single_carry_propagation: bool,
streams: &CudaStreams,
) -> Option<T> {
if ciphertexts.is_empty() {
@@ -484,7 +489,12 @@ impl CudaServerKey {
return Some(result);
}

self.unchecked_partial_sum_ciphertexts_assign_async(&mut result, ciphertexts, streams);
self.unchecked_partial_sum_ciphertexts_assign_async(
&mut result,
ciphertexts,
reduce_degrees_for_single_carry_propagation,
streams,
);

Some(result)
}

@@ -7,7 +7,6 @@ use crate::integer::gpu::reverse_blocks_inplace_async;
use crate::integer::gpu::server_key::CudaServerKey;
use crate::integer::server_key::radix_parallel::ilog2::{BitValue, Direction};
use crate::shortint::ciphertext::Degree;
use crate::shortint::parameters::NoiseLevel;

impl CudaServerKey {
/// This function takes a ciphertext in radix representation
@@ -93,8 +92,7 @@ impl CudaServerKey {
},
);

let mut output_cts: T =
self.create_trivial_zero_radix_async(num_ct_blocks * num_ct_blocks, streams);
let mut output_cts: T = self.create_trivial_zero_radix_async(num_ct_blocks, streams);

self.compute_prefix_sum_hillis_steele_async(
output_cts.as_mut(),
@@ -455,7 +453,7 @@ impl CudaServerKey {
cts.push(new_trivial);

let result = self
.unchecked_partial_sum_ciphertexts_async(&cts, streams)
.unchecked_partial_sum_ciphertexts_async(&cts, false, streams)
.expect("internal error, empty ciphertext count");

// This is the part where we extract message and carry blocks
@@ -497,28 +495,6 @@ impl CudaServerKey {
.as_mut_slice(0..lwe_size, 0)
.unwrap();

let mut carry_blocks_last = carry_blocks
.as_mut()
.d_blocks
.0
.d_vec
.as_mut_slice(
lwe_size * (counter_num_blocks - 1)..lwe_size * counter_num_blocks,
0,
)
.unwrap();

carry_blocks_last.copy_from_gpu_async(&trivial_last_block_slice, streams, 0);
carry_blocks.as_mut().info.blocks.last_mut().unwrap().degree =
Degree(self.message_modulus.0 - 1);
carry_blocks
.as_mut()
.info
.blocks
.last_mut()
.unwrap()
.noise_level = NoiseLevel::ZERO;

self.apply_lookup_table_async(
carry_blocks.as_mut(),
result.as_ref(),
@@ -527,10 +503,43 @@ impl CudaServerKey {
streams,
);

let mut rotated_carry_blocks: CudaSignedRadixCiphertext =
self.create_trivial_zero_radix(counter_num_blocks, streams);

let mut rotated_slice = rotated_carry_blocks
.as_mut()
.d_blocks
.0
.d_vec
.as_mut_slice(0..(counter_num_blocks) * lwe_size, 0)
.unwrap();

let first_block;
let last_blocks;
(first_block, last_blocks) = rotated_slice.split_at_mut(lwe_size, 0);

let mut tmp_carry_blocks3 = carry_blocks.duplicate(streams);
let carry_slice = tmp_carry_blocks3
.as_mut()
.d_blocks
.0
.d_vec
.as_mut_slice(0..(counter_num_blocks - 1) * lwe_size, 0)
.unwrap();

last_blocks
.unwrap()
.copy_from_gpu_async(&carry_slice, streams, 0);
first_block
.unwrap()
.copy_from_gpu_async(&trivial_last_block_slice, streams, 0);
let mut ciphertexts = Vec::<CudaSignedRadixCiphertext>::with_capacity(3);

for info in &mut rotated_carry_blocks.ciphertext.info.blocks {
info.degree = Degree(self.message_modulus.0 - 1);
}
ciphertexts.push(message_blocks);
ciphertexts.push(carry_blocks);
ciphertexts.push(rotated_carry_blocks);

let trivial_ct: CudaSignedRadixCiphertext =
self.create_trivial_radix_async(2u32, counter_num_blocks, streams);

@@ -1243,6 +1243,14 @@ impl CudaServerKey {
.unwrap();
let mut generates_or_propagates_degrees = vec![0; num_blocks];
let mut generates_or_propagates_noise_levels = vec![0; num_blocks];
for (i, block_index) in (block_range.clone()).enumerate() {
generates_or_propagates_degrees[i] =
generates_or_propagates.info.blocks[block_index].degree.0;
generates_or_propagates_noise_levels[i] = generates_or_propagates.info.blocks
[block_index]
.noise_level
.0;
}
let ct_modulus = output.d_blocks.ciphertext_modulus().raw_modulus_float();
let mut output_slice = output
.d_blocks

@@ -323,9 +323,16 @@ impl CudaServerKey {
}
};

let decomposed_scalar = BlockDecomposer::with_early_stop_at_zero(scalar, 1)
.iter_as::<u64>()
.collect::<Vec<_>>();
if decomposed_scalar.is_empty() {
return 0;
}
let scalar_mul_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_scalar_mul_integer_radix_kb_size_on_gpu(
streams,
decomposed_scalar.as_slice(),
self.message_modulus,
self.carry_modulus,
d_bsk.glwe_dimension,
@@ -345,6 +352,7 @@ impl CudaServerKey {
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_scalar_mul_integer_radix_kb_size_on_gpu(
streams,
decomposed_scalar.as_slice(),
self.message_modulus,
self.carry_modulus,
d_multibit_bsk.glwe_dimension,