mirror of https://github.com/zama-ai/concrete.git (synced 2026-02-08 19:44:57 -05:00)

chore(cuda): format sources and add check in ci
@@ -8,171 +8,93 @@ extern "C" {

void cuda_initialize_twiddles(uint32_t polynomial_size, uint32_t gpu_index);

void cuda_convert_lwe_bootstrap_key_32(void *dest, void *src, void *v_stream,
uint32_t gpu_index, uint32_t input_lwe_dim, uint32_t glwe_dim,
uint32_t l_gadget, uint32_t polynomial_size);
uint32_t gpu_index,
uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t l_gadget,
uint32_t polynomial_size);

void cuda_convert_lwe_bootstrap_key_64(void *dest, void *src, void *v_stream,
uint32_t gpu_index, uint32_t input_lwe_dim, uint32_t glwe_dim,
uint32_t l_gadget, uint32_t polynomial_size);
uint32_t gpu_index,
uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t l_gadget,
uint32_t polynomial_size);

void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
void *v_stream,
void *lwe_out,
void *test_vector,
void *test_vector_indexes,
void *lwe_in,
void *bootstrapping_key,
uint32_t lwe_dimension,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t base_log,
uint32_t l_gadget,
uint32_t num_samples,
uint32_t num_test_vectors,
uint32_t lwe_idx,
uint32_t max_shared_memory);
void *v_stream, void *lwe_out, void *test_vector, void *test_vector_indexes,
void *lwe_in, void *bootstrapping_key, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t l_gadget, uint32_t num_samples, uint32_t num_test_vectors,
uint32_t lwe_idx, uint32_t max_shared_memory);

void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
void *v_stream,
void *lwe_out,
void *test_vector,
void *test_vector_indexes,
void *lwe_in,
void *bootstrapping_key,
uint32_t lwe_dimension,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t base_log,
uint32_t l_gadget,
uint32_t num_samples,
uint32_t num_test_vectors,
uint32_t lwe_idx,
uint32_t max_shared_memory);
void *v_stream, void *lwe_out, void *test_vector, void *test_vector_indexes,
void *lwe_in, void *bootstrapping_key, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t l_gadget, uint32_t num_samples, uint32_t num_test_vectors,
uint32_t lwe_idx, uint32_t max_shared_memory);

void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
void *v_stream,
void *lwe_out,
void *test_vector,
void *test_vector_indexes,
void *lwe_in,
void *bootstrapping_key,
uint32_t lwe_dimension,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t base_log,
uint32_t l_gadget,
uint32_t num_samples,
uint32_t num_test_vectors,
uint32_t lwe_idx,
uint32_t max_shared_memory);
void *v_stream, void *lwe_out, void *test_vector, void *test_vector_indexes,
void *lwe_in, void *bootstrapping_key, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t l_gadget, uint32_t num_samples, uint32_t num_test_vectors,
uint32_t lwe_idx, uint32_t max_shared_memory);

void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
void *v_stream,
void *lwe_out,
void *test_vector,
void *test_vector_indexes,
void *lwe_in,
void *bootstrapping_key,
uint32_t lwe_dimension,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t base_log,
uint32_t l_gadget,
uint32_t num_samples,
uint32_t num_test_vectors,
uint32_t lwe_idx,
uint32_t max_shared_memory);
void *v_stream, void *lwe_out, void *test_vector, void *test_vector_indexes,
void *lwe_in, void *bootstrapping_key, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t l_gadget, uint32_t num_samples, uint32_t num_test_vectors,
uint32_t lwe_idx, uint32_t max_shared_memory);

void cuda_cmux_tree_32(
void *v_stream,
void *glwe_out,
void *ggsw_in,
void *lut_vector,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t base_log,
uint32_t l_gadget,
uint32_t r,
uint32_t max_shared_memory);
void cuda_cmux_tree_32(void *v_stream, void *glwe_out, void *ggsw_in,
void *lut_vector, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log,
uint32_t l_gadget, uint32_t r,
uint32_t max_shared_memory);

void cuda_cmux_tree_64(
void *v_stream,
void *glwe_out,
void *ggsw_in,
void *lut_vector,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t base_log,
uint32_t l_gadget,
uint32_t r,
uint32_t max_shared_memory);

void cuda_extract_bits_32(
void *v_stream,
void *list_lwe_out,
void *lwe_in,
void *lwe_in_buffer,
void *lwe_in_shifted_buffer,
void *lwe_out_ks_buffer,
void *lwe_out_pbs_buffer,
void *lut_pbs,
void *lut_vector_indexes,
void *ksk,
void *fourier_bsk,
uint32_t number_of_bits,
uint32_t delta_log,
uint32_t lwe_dimension_before,
uint32_t lwe_dimension_after,
uint32_t glwe_dimension,
uint32_t base_log_bsk,
uint32_t l_gadget_bsk,
uint32_t base_log_ksk,
uint32_t l_gadget_ksk,
uint32_t number_of_samples);

void cuda_extract_bits_64(
void *v_stream,
void *list_lwe_out,
void *lwe_in,
void *lwe_in_buffer,
void *lwe_in_shifted_buffer,
void *lwe_out_ks_buffer,
void *lwe_out_pbs_buffer,
void *lut_pbs,
void *lut_vector_indexes,
void *ksk,
void *fourier_bsk,
uint32_t number_of_bits,
uint32_t delta_log,
uint32_t lwe_dimension_before,
uint32_t lwe_dimension_after,
uint32_t glwe_dimension,
uint32_t base_log_bsk,
uint32_t l_gadget_bsk,
uint32_t base_log_ksk,
uint32_t l_gadget_ksk,
uint32_t number_of_samples);
void cuda_cmux_tree_64(void *v_stream, void *glwe_out, void *ggsw_in,
void *lut_vector, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log,
uint32_t l_gadget, uint32_t r,
uint32_t max_shared_memory);

void cuda_extract_bits_32(void *v_stream, void *list_lwe_out, void *lwe_in,
void *lwe_in_buffer, void *lwe_in_shifted_buffer,
void *lwe_out_ks_buffer, void *lwe_out_pbs_buffer,
void *lut_pbs, void *lut_vector_indexes, void *ksk,
void *fourier_bsk, uint32_t number_of_bits,
uint32_t delta_log, uint32_t lwe_dimension_before,
uint32_t lwe_dimension_after, uint32_t glwe_dimension,
uint32_t base_log_bsk, uint32_t l_gadget_bsk,
uint32_t base_log_ksk, uint32_t l_gadget_ksk,
uint32_t number_of_samples);

void cuda_extract_bits_64(void *v_stream, void *list_lwe_out, void *lwe_in,
void *lwe_in_buffer, void *lwe_in_shifted_buffer,
void *lwe_out_ks_buffer, void *lwe_out_pbs_buffer,
void *lut_pbs, void *lut_vector_indexes, void *ksk,
void *fourier_bsk, uint32_t number_of_bits,
uint32_t delta_log, uint32_t lwe_dimension_before,
uint32_t lwe_dimension_after, uint32_t glwe_dimension,
uint32_t base_log_bsk, uint32_t l_gadget_bsk,
uint32_t base_log_ksk, uint32_t l_gadget_ksk,
uint32_t number_of_samples);
};

#ifdef __CUDACC__
__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
int glwe_dimension,
uint32_t l_gadget);
int glwe_dimension, uint32_t l_gadget);

template <typename T>
__device__ T*
get_ith_mask_kth_block(T* ptr, int i, int k, int level, uint32_t polynomial_size,
int glwe_dimension, uint32_t l_gadget);
__device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
uint32_t polynomial_size,
int glwe_dimension, uint32_t l_gadget);

template <typename T>
__device__ T*
get_ith_body_kth_block(T *ptr, int i, int k, int level, uint32_t polynomial_size,
int glwe_dimension, uint32_t l_gadget);
__device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level,
uint32_t polynomial_size,
int glwe_dimension, uint32_t l_gadget);
#endif

#endif // CUDA_BOOTSTRAP_H
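
For reference, a minimal host-side sketch of how the amortized bootstrap entry point declared above might be driven. Only the function signature comes from this header; the parameter values, the attribute query, and the stream handling are illustrative assumptions.

// Illustrative usage sketch (not part of this commit). The chosen values
// satisfy the wrappers' assertions: base_log <= 16, glwe_dimension == 1,
// polynomial_size in {512, 1024, 2048, 4096, 8192}.
#include <cstdint>
#include <cuda_runtime.h>

void example_amortized_pbs(void *d_lwe_out, void *d_test_vector,
                           void *d_test_vector_indexes, void *d_lwe_in,
                           void *d_bsk) {
  const uint32_t lwe_dimension = 600; // assumed parameter set
  const uint32_t glwe_dimension = 1, polynomial_size = 1024;
  const uint32_t base_log = 4, l_gadget = 3;
  const uint32_t num_samples = 16, num_test_vectors = 1, lwe_idx = 0;

  cudaStream_t stream;
  cudaStreamCreate(&stream);
  // max_shared_memory is the opt-in per-block shared memory capacity,
  // which the host wrappers use to pick a kernel variant.
  int max_shared_memory = 0;
  cudaDeviceGetAttribute(&max_shared_memory,
                         cudaDevAttrMaxSharedMemoryPerBlockOptin, 0);
  // v_stream is a type-erased cudaStream_t*, matching the
  // static_cast<cudaStream_t *>(v_stream) done on the host side.
  cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
      &stream, d_lwe_out, d_test_vector, d_test_vector_indexes, d_lwe_in,
      d_bsk, lwe_dimension, glwe_dimension, polynomial_size, base_log,
      l_gadget, num_samples, num_test_vectors, lwe_idx,
      (uint32_t)max_shared_memory);
  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);
}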
@@ -5,20 +5,15 @@

extern "C" {

void cuda_keyswitch_lwe_ciphertext_vector_32(void *v_stream, void *lwe_out, void *lwe_in,
void *ksk,
uint32_t lwe_dimension_before,
uint32_t lwe_dimension_after,
uint32_t base_log, uint32_t l_gadget,
uint32_t num_samples);

void cuda_keyswitch_lwe_ciphertext_vector_64(void *v_stream, void *lwe_out, void *lwe_in,
void *ksk,
uint32_t lwe_dimension_before,
uint32_t lwe_dimension_after,
uint32_t base_log, uint32_t l_gadget,
uint32_t num_samples);
void cuda_keyswitch_lwe_ciphertext_vector_32(
void *v_stream, void *lwe_out, void *lwe_in, void *ksk,
uint32_t lwe_dimension_before, uint32_t lwe_dimension_after,
uint32_t base_log, uint32_t l_gadget, uint32_t num_samples);

void cuda_keyswitch_lwe_ciphertext_vector_64(
void *v_stream, void *lwe_out, void *lwe_in, void *ksk,
uint32_t lwe_dimension_before, uint32_t lwe_dimension_after,
uint32_t base_log, uint32_t l_gadget, uint32_t num_samples);
}

#endif // CNCRT_KS_H_
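
Similarly, a hedged sketch of a call to the keyswitch entry point; the example dimensions are assumptions, and only the signature is taken from the header above.

// Illustrative usage sketch (not part of this commit); device buffers are
// assumed to be allocated and populated elsewhere.
#include <cstdint>
#include <cuda_runtime.h>

void example_keyswitch(void *d_lwe_out, void *d_lwe_in, void *d_ksk) {
  const uint32_t lwe_dimension_before = 1024; // assumed dimensions
  const uint32_t lwe_dimension_after = 600;
  const uint32_t base_log = 3, l_gadget = 5;
  const uint32_t num_samples = 16;

  cudaStream_t stream;
  cudaStreamCreate(&stream);
  cuda_keyswitch_lwe_ciphertext_vector_64(
      &stream, d_lwe_out, d_lwe_in, d_ksk, lwe_dimension_before,
      lwe_dimension_after, base_log, l_gadget, num_samples);
  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);
}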
@@ -58,64 +58,57 @@
*/

void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
void *v_stream,
void *lwe_out,
void *lut_vector,
void *lut_vector_indexes,
void *lwe_in,
void *bootstrapping_key,
uint32_t lwe_dimension,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t base_log,
uint32_t l_gadget,
uint32_t num_samples,
uint32_t num_lut_vectors,
uint32_t lwe_idx,
uint32_t max_shared_memory) {
void *v_stream, void *lwe_out, void *lut_vector, void *lut_vector_indexes,
void *lwe_in, void *bootstrapping_key, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t l_gadget, uint32_t num_samples, uint32_t num_lut_vectors,
uint32_t lwe_idx, uint32_t max_shared_memory) {

assert(("Error (GPU amortized PBS): base log should be <= 16", base_log <= 16));
assert(("Error (GPU amortized PBS): glwe_dimension should be equal to 1", glwe_dimension == 1));
assert(("Error (GPU amortized PBS): polynomial size should be one of 512, 1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
assert(
("Error (GPU amortized PBS): base log should be <= 16", base_log <= 16));
assert(("Error (GPU amortized PBS): glwe_dimension should be equal to 1",
glwe_dimension == 1));
assert(("Error (GPU amortized PBS): polynomial size should be one of 512, "
"1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 ||
polynomial_size == 2048 || polynomial_size == 4096 ||
polynomial_size == 8192));

switch (polynomial_size) {
case 512:
host_bootstrap_amortized<uint32_t, Degree<512>>(
v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
base_log, l_gadget, num_samples,
num_lut_vectors, lwe_idx, max_shared_memory);
break;
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
l_gadget, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
break;
case 1024:
host_bootstrap_amortized<uint32_t, Degree<1024>>(
v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, l_gadget, num_samples,
num_lut_vectors, lwe_idx, max_shared_memory);
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
l_gadget, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
break;
case 2048:
host_bootstrap_amortized<uint32_t, Degree<2048>>(
v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, l_gadget, num_samples,
num_lut_vectors, lwe_idx, max_shared_memory);
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
l_gadget, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
break;
case 4096:
host_bootstrap_amortized<uint32_t, Degree<4096>>(
v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, l_gadget, num_samples,
num_lut_vectors, lwe_idx, max_shared_memory);
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
l_gadget, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
break;
case 8192:
host_bootstrap_amortized<uint32_t, Degree<8192>>(
v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, l_gadget, num_samples,
num_lut_vectors, lwe_idx, max_shared_memory);
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
l_gadget, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
break;
default:
break;
@@ -123,64 +116,57 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
}

void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
void *v_stream,
void *lwe_out,
void *lut_vector,
void *lut_vector_indexes,
void *lwe_in,
void *bootstrapping_key,
uint32_t lwe_dimension,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t base_log,
uint32_t l_gadget,
uint32_t num_samples,
uint32_t num_lut_vectors,
uint32_t lwe_idx,
uint32_t max_shared_memory) {
void *v_stream, void *lwe_out, void *lut_vector, void *lut_vector_indexes,
void *lwe_in, void *bootstrapping_key, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t l_gadget, uint32_t num_samples, uint32_t num_lut_vectors,
uint32_t lwe_idx, uint32_t max_shared_memory) {

assert(
("Error (GPU amortized PBS): base log should be <= 16", base_log <= 16));
assert(("Error (GPU amortized PBS): glwe_dimension should be equal to 1",
glwe_dimension == 1));
assert(("Error (GPU amortized PBS): polynomial size should be one of 512, "
"1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 ||
polynomial_size == 2048 || polynomial_size == 4096 ||
polynomial_size == 8192));

assert(("Error (GPU amortized PBS): base log should be <= 16", base_log <= 16));
assert(("Error (GPU amortized PBS): glwe_dimension should be equal to 1", glwe_dimension == 1));
assert(("Error (GPU amortized PBS): polynomial size should be one of 512, 1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));

switch (polynomial_size) {
case 512:
host_bootstrap_amortized<uint64_t, Degree<512>>(
v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
base_log, l_gadget, num_samples,
num_lut_vectors, lwe_idx, max_shared_memory);
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
l_gadget, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
break;
case 1024:
host_bootstrap_amortized<uint64_t, Degree<1024>>(
v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, l_gadget, num_samples,
num_lut_vectors, lwe_idx, max_shared_memory);
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
l_gadget, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
break;
case 2048:
host_bootstrap_amortized<uint64_t, Degree<2048>>(
v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, l_gadget, num_samples,
num_lut_vectors, lwe_idx, max_shared_memory);
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
l_gadget, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
break;
case 4096:
host_bootstrap_amortized<uint64_t, Degree<4096>>(
v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, l_gadget, num_samples,
num_lut_vectors, lwe_idx, max_shared_memory);
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
l_gadget, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
break;
case 8192:
host_bootstrap_amortized<uint64_t, Degree<8192>>(
v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, l_gadget, num_samples,
num_lut_vectors, lwe_idx, max_shared_memory);
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
l_gadget, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
break;
default:
break;

@@ -54,18 +54,10 @@ template <typename Torus, class params, sharedMemDegree SMD>
* is not FULLSM
*/
__global__ void device_bootstrap_amortized(
Torus *lwe_out,
Torus *lut_vector,
uint32_t *lut_vector_indexes,
Torus *lwe_in,
double2 *bootstrapping_key,
char *device_mem,
uint32_t lwe_mask_size,
uint32_t polynomial_size,
uint32_t base_log,
uint32_t l_gadget,
uint32_t lwe_idx,
size_t device_memory_size_per_sample) {
Torus *lwe_out, Torus *lut_vector, uint32_t *lut_vector_indexes,
Torus *lwe_in, double2 *bootstrapping_key, char *device_mem,
uint32_t lwe_mask_size, uint32_t polynomial_size, uint32_t base_log,
uint32_t l_gadget, uint32_t lwe_idx, size_t device_memory_size_per_sample) {
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
// much faster than global memory
@@ -103,8 +95,8 @@ __global__ void device_bootstrap_amortized(

auto block_lwe_in = &lwe_in[blockIdx.x * (lwe_mask_size + 1)];
Torus *block_lut_vector =
&lut_vector[lut_vector_indexes[lwe_idx + blockIdx.x] * params::degree * 2];
&lut_vector[lut_vector_indexes[lwe_idx + blockIdx.x] * params::degree *
2];

GadgetMatrix<Torus, params> gadget(base_log, l_gadget);

@@ -114,11 +106,11 @@ __global__ void device_bootstrap_amortized(
2 * params::degree); // 2 * params::log2_degree + 1);

divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
params::degree / params::opt>(
accumulator_mask, block_lut_vector, b_hat, false);

divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
params::degree / params::opt>(
accumulator_body, &block_lut_vector[params::degree], b_hat, false);

// Loop over all the mask elements of the sample to accumulate
@@ -147,11 +139,11 @@ __global__ void device_bootstrap_amortized(
// Perform a rounding to increase the accuracy of the
// bootstrapped ciphertext
round_to_closest_multiple_inplace<Torus, params::opt,
params::degree / params::opt>(
params::degree / params::opt>(
accumulator_mask_rotated, base_log, l_gadget);

round_to_closest_multiple_inplace<Torus, params::opt,
params::degree / params::opt>(
params::degree / params::opt>(
accumulator_body_rotated, base_log, l_gadget);
// Initialize the polynomial multiplication via FFT arrays
// The polynomial multiplications happen at the block level
@@ -195,13 +187,11 @@ __global__ void device_bootstrap_amortized(
// Get the bootstrapping key piece necessary for the multiplication
// It is already in the Fourier domain
auto bsk_mask_slice = PolynomialFourier<double2, params>(
get_ith_mask_kth_block(
bootstrapping_key, iteration, 0, decomp_level,
polynomial_size, 1, l_gadget));
get_ith_mask_kth_block(bootstrapping_key, iteration, 0, decomp_level,
polynomial_size, 1, l_gadget));
auto bsk_body_slice = PolynomialFourier<double2, params>(
get_ith_body_kth_block(
bootstrapping_key, iteration, 0, decomp_level,
polynomial_size, 1, l_gadget));
get_ith_body_kth_block(bootstrapping_key, iteration, 0, decomp_level,
polynomial_size, 1, l_gadget));

synchronize_threads_in_block();

@@ -230,7 +220,7 @@ __global__ void device_bootstrap_amortized(
polynomial_size, 1, l_gadget));
auto bsk_body_slice_2 = PolynomialFourier<double2, params>(
get_ith_body_kth_block(bootstrapping_key, iteration, 1, decomp_level,
polynomial_size, 1, l_gadget));
polynomial_size, 1, l_gadget));

synchronize_threads_in_block();

@@ -305,20 +295,11 @@ __global__ void device_bootstrap_amortized(

template <typename Torus, class params>
__host__ void host_bootstrap_amortized(
void *v_stream,
Torus *lwe_out,
Torus *lut_vector,
uint32_t *lut_vector_indexes,
Torus *lwe_in,
double2 *bootstrapping_key,
uint32_t input_lwe_dimension,
uint32_t polynomial_size,
uint32_t base_log,
uint32_t l_gadget,
uint32_t input_lwe_ciphertext_count,
uint32_t num_lut_vectors,
uint32_t lwe_idx,
uint32_t max_shared_memory) {
void *v_stream, Torus *lwe_out, Torus *lut_vector,
uint32_t *lut_vector_indexes, Torus *lwe_in, double2 *bootstrapping_key,
uint32_t input_lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t l_gadget, uint32_t input_lwe_ciphertext_count,
uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory) {

int SM_FULL = sizeof(Torus) * polynomial_size + // accumulator mask
sizeof(Torus) * polynomial_size + // accumulator body
@@ -354,28 +335,24 @@ __host__ void host_bootstrap_amortized(
// from one of three templates (no use, partial use or full use
// of shared memory)
if (max_shared_memory < SM_PART) {
checkCudaErrors(cudaMalloc((void **)&d_mem, DM_FULL * input_lwe_ciphertext_count));
device_bootstrap_amortized<Torus, params, NOSM>
<<<grid, thds, 0, *stream>>>(
lwe_out, lut_vector, lut_vector_indexes, lwe_in,
bootstrapping_key, d_mem,
input_lwe_dimension, polynomial_size,
base_log, l_gadget, lwe_idx, DM_FULL);
checkCudaErrors(
cudaMalloc((void **)&d_mem, DM_FULL * input_lwe_ciphertext_count));
device_bootstrap_amortized<Torus, params, NOSM><<<grid, thds, 0, *stream>>>(
lwe_out, lut_vector, lut_vector_indexes, lwe_in, bootstrapping_key,
d_mem, input_lwe_dimension, polynomial_size, base_log, l_gadget,
lwe_idx, DM_FULL);
} else if (max_shared_memory < SM_FULL) {
cudaFuncSetAttribute(device_bootstrap_amortized<Torus, params, PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
SM_PART);
cudaFuncSetCacheConfig(
device_bootstrap_amortized<Torus, params, PARTIALSM>,
cudaFuncCachePreferShared);
checkCudaErrors(cudaMalloc((void **)&d_mem, DM_PART * input_lwe_ciphertext_count));
cudaFuncAttributeMaxDynamicSharedMemorySize, SM_PART);
cudaFuncSetCacheConfig(device_bootstrap_amortized<Torus, params, PARTIALSM>,
cudaFuncCachePreferShared);
checkCudaErrors(
cudaMalloc((void **)&d_mem, DM_PART * input_lwe_ciphertext_count));
device_bootstrap_amortized<Torus, params, PARTIALSM>
<<<grid, thds, SM_PART, *stream>>>(
lwe_out, lut_vector, lut_vector_indexes,
lwe_in, bootstrapping_key,
d_mem, input_lwe_dimension, polynomial_size,
base_log, l_gadget, lwe_idx,
DM_PART);
<<<grid, thds, SM_PART, *stream>>>(
lwe_out, lut_vector, lut_vector_indexes, lwe_in, bootstrapping_key,
d_mem, input_lwe_dimension, polynomial_size, base_log, l_gadget,
lwe_idx, DM_PART);
} else {
// For devices with compute capability 7.x a single thread block can
// address the full capacity of shared memory. Shared memory on the
@@ -384,26 +361,22 @@ __host__ void host_bootstrap_amortized(
// just does nothing and the amount of shared memory used is 48 KB
checkCudaErrors(cudaFuncSetAttribute(
device_bootstrap_amortized<Torus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
SM_FULL));
cudaFuncAttributeMaxDynamicSharedMemorySize, SM_FULL));
checkCudaErrors(cudaFuncSetCacheConfig(
device_bootstrap_amortized<Torus, params, FULLSM>,
cudaFuncCachePreferShared));
checkCudaErrors(cudaMalloc((void **)&d_mem, 0));

device_bootstrap_amortized<Torus, params, FULLSM>
<<<grid, thds, SM_FULL, *stream>>>(
lwe_out, lut_vector, lut_vector_indexes,
lwe_in, bootstrapping_key,
d_mem, input_lwe_dimension, polynomial_size,
base_log, l_gadget, lwe_idx,
0);
<<<grid, thds, SM_FULL, *stream>>>(
lwe_out, lut_vector, lut_vector_indexes, lwe_in, bootstrapping_key,
d_mem, input_lwe_dimension, polynomial_size, base_log, l_gadget,
lwe_idx, 0);
}
// Synchronize the streams before copying the result to lwe_out at the right
// place
cudaStreamSynchronize(*stream);
cudaFree(d_mem);

}

template <typename Torus, class params>
@@ -415,8 +388,8 @@ int cuda_get_pbs_per_gpu(int polynomial_size) {
cudaDeviceProp device_properties;
cudaGetDeviceProperties(&device_properties, 0);
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&blocks_per_sm, device_bootstrap_amortized<Torus, params>,
num_threads, 0);
&blocks_per_sm, device_bootstrap_amortized<Torus, params>, num_threads,
0);

return device_properties.multiProcessorCount * blocks_per_sm;
}

@@ -57,76 +57,66 @@
* values for the FFT
*/
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
void *v_stream,
void *lwe_out,
void *lut_vector,
void *lut_vector_indexes,
void *lwe_in,
void *bootstrapping_key,
uint32_t lwe_dimension,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t base_log,
uint32_t l_gadget,
uint32_t num_samples,
uint32_t num_lut_vectors,
uint32_t lwe_idx,
uint32_t max_shared_memory) {
void *v_stream, void *lwe_out, void *lut_vector, void *lut_vector_indexes,
void *lwe_in, void *bootstrapping_key, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t l_gadget, uint32_t num_samples, uint32_t num_lut_vectors,
uint32_t lwe_idx, uint32_t max_shared_memory) {

assert(("Error (GPU low latency PBS): base log should be <= 16", base_log <= 16));
assert(("Error (GPU low latency PBS): glwe_dimension should be equal to 1",
glwe_dimension == 1));
assert(("Error (GPU low latency PBS): polynomial size should be one of 512, 1024, 2048",
polynomial_size == 512 || polynomial_size == 1024 || polynomial_size == 2048));
// The number of samples should be lower than SM/(4 * (k + 1) * l) (the
// factor 4 being related to the occupancy of 50%). The only supported
// value for k is 1, so k + 1 = 2 for now.
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
assert(("Error (GPU low latency PBS): the number of input LWEs must be lower or equal to the "
"number of streaming multiprocessors on the device divided by 8 * l_gadget",
num_samples <= number_of_sm / 4. / 2. / l_gadget));
assert(("Error (GPU low latency PBS): base log should be <= 16",
base_log <= 16));
assert(("Error (GPU low latency PBS): glwe_dimension should be equal to 1",
glwe_dimension == 1));
assert(("Error (GPU low latency PBS): polynomial size should be one of 512, "
"1024, 2048",
polynomial_size == 512 || polynomial_size == 1024 ||
polynomial_size == 2048));
// The number of samples should be lower than SM/(4 * (k + 1) * l) (the
// factor 4 being related to the occupancy of 50%). The only supported
// value for k is 1, so k + 1 = 2 for now.
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
assert(("Error (GPU low latency PBS): the number of input LWEs must be lower "
"or equal to the "
"number of streaming multiprocessors on the device divided by 8 * "
"l_gadget",
num_samples <= number_of_sm / 4. / 2. / l_gadget));
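// Worked example of the bound above (illustrative numbers, not from this
// diff): on a device with 108 SMs and l_gadget = 2, num_samples must be
// <= 108 / 4 / 2 / 2 = 6.75, i.e. at most 6 input LWE ciphertexts.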

switch (polynomial_size) {
case 512:
host_bootstrap_low_latency<uint32_t, Degree<512>>(
v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
base_log, l_gadget, num_samples,
num_lut_vectors);
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
l_gadget, num_samples, num_lut_vectors);
break;
case 1024:
host_bootstrap_low_latency<uint32_t, Degree<1024>>(
v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
base_log, l_gadget, num_samples,
num_lut_vectors);
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
l_gadget, num_samples, num_lut_vectors);
break;
case 2048:
host_bootstrap_low_latency<uint32_t, Degree<2048>>(
v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
base_log, l_gadget, num_samples,
num_lut_vectors);
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
l_gadget, num_samples, num_lut_vectors);
break;
case 4096:
host_bootstrap_low_latency<uint32_t, Degree<4096>>(
v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
base_log, l_gadget, num_samples,
num_lut_vectors);
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
l_gadget, num_samples, num_lut_vectors);
break;
case 8192:
host_bootstrap_low_latency<uint32_t, Degree<8192>>(
v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
base_log, l_gadget, num_samples,
num_lut_vectors);
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
l_gadget, num_samples, num_lut_vectors);
break;
default:
break;
@@ -134,79 +124,68 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
}

void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
void *v_stream,
void *lwe_out,
void *lut_vector,
void *lut_vector_indexes,
void *lwe_in,
void *bootstrapping_key,
uint32_t lwe_dimension,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t base_log,
uint32_t l_gadget,
uint32_t num_samples,
uint32_t num_lut_vectors,
uint32_t lwe_idx,
uint32_t max_shared_memory) {
void *v_stream, void *lwe_out, void *lut_vector, void *lut_vector_indexes,
void *lwe_in, void *bootstrapping_key, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t l_gadget, uint32_t num_samples, uint32_t num_lut_vectors,
uint32_t lwe_idx, uint32_t max_shared_memory) {

assert(("Error (GPU low latency PBS): base log should be <= 16", base_log <= 16));
assert(("Error (GPU low latency PBS): glwe_dimension should be equal to 1",
glwe_dimension == 1));
assert(("Error (GPU low latency PBS): polynomial size should be one of 512, 1024, 2048",
polynomial_size == 512 || polynomial_size == 1024 || polynomial_size == 2048));
// The number of samples should be lower than SM/(4 * (k + 1) * l) (the
// factor 4 being related to the occupancy of 50%). The only supported
// value for k is 1, so k + 1 = 2 for now.
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
assert(("Error (GPU low latency PBS): the number of input LWEs must be lower or equal to the "
"number of streaming multiprocessors on the device divided by 8 * l_gadget",
num_samples <= number_of_sm / 4. / 2. / l_gadget));
assert(("Error (GPU low latency PBS): base log should be <= 16",
base_log <= 16));
assert(("Error (GPU low latency PBS): glwe_dimension should be equal to 1",
glwe_dimension == 1));
assert(("Error (GPU low latency PBS): polynomial size should be one of 512, "
"1024, 2048",
polynomial_size == 512 || polynomial_size == 1024 ||
polynomial_size == 2048));
// The number of samples should be lower than SM/(4 * (k + 1) * l) (the
// factor 4 being related to the occupancy of 50%). The only supported
// value for k is 1, so k + 1 = 2 for now.
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
assert(("Error (GPU low latency PBS): the number of input LWEs must be lower "
"or equal to the "
"number of streaming multiprocessors on the device divided by 8 * "
"l_gadget",
num_samples <= number_of_sm / 4. / 2. / l_gadget));

switch (polynomial_size) {
case 512:
host_bootstrap_low_latency<uint64_t, Degree<512>>(
v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
base_log, l_gadget, num_samples,
num_lut_vectors);
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
l_gadget, num_samples, num_lut_vectors);
break;
case 1024:
host_bootstrap_low_latency<uint64_t, Degree<1024>>(
v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
base_log, l_gadget, num_samples,
num_lut_vectors);
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
l_gadget, num_samples, num_lut_vectors);
break;
case 2048:
host_bootstrap_low_latency<uint64_t, Degree<2048>>(
v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
base_log, l_gadget, num_samples,
num_lut_vectors);
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
l_gadget, num_samples, num_lut_vectors);
break;
case 4096:
host_bootstrap_low_latency<uint64_t, Degree<4096>>(
v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
base_log, l_gadget, num_samples,
num_lut_vectors);
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
l_gadget, num_samples, num_lut_vectors);
break;
case 8192:
host_bootstrap_low_latency<uint64_t, Degree<8192>>(
v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
base_log, l_gadget, num_samples,
num_lut_vectors);
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
l_gadget, num_samples, num_lut_vectors);
break;
default:
break;
}
}

@@ -29,17 +29,13 @@ namespace cg = cooperative_groups;

template <typename Torus, class params>
__device__ void
mul_trgsw_trlwe(Torus *accumulator,
double2 *fft,
int16_t *trlwe_decomposed,
double2 *mask_join_buffer,
double2 *body_join_buffer,
double2 *bootstrapping_key,
int polynomial_size, int l_gadget, int iteration, grid_group &grid) {
mul_trgsw_trlwe(Torus *accumulator, double2 *fft, int16_t *trlwe_decomposed,
double2 *mask_join_buffer, double2 *body_join_buffer,
double2 *bootstrapping_key, int polynomial_size, int l_gadget,
int iteration, grid_group &grid) {

// Put the decomposed TRLWE sample in the Fourier domain
real_to_complex_compressed<int16_t, params>(trlwe_decomposed,
fft);
real_to_complex_compressed<int16_t, params>(trlwe_decomposed, fft);
synchronize_threads_in_block();

// Switch to the FFT space
@@ -49,52 +45,49 @@ mul_trgsw_trlwe(Torus *accumulator,
correction_direct_fft_inplace<params>(fft);
synchronize_threads_in_block();

// Get the pieces of the bootstrapping key that will be needed for the
// external product; blockIdx.x is the ID of the block that's executing
// this function, so we end up getting the lines of the bootstrapping key
// needed to perform the external product in this block (corresponding to
// the same decomposition level)

auto bsk_mask_slice = PolynomialFourier<double2, params>(
get_ith_mask_kth_block(
bootstrapping_key, iteration, blockIdx.y, blockIdx.x,
polynomial_size, 1, l_gadget));
auto bsk_body_slice = PolynomialFourier<double2, params>(
get_ith_body_kth_block(
bootstrapping_key, iteration, blockIdx.y, blockIdx.x,
polynomial_size, 1, l_gadget));
auto bsk_mask_slice = PolynomialFourier<double2, params>(
get_ith_mask_kth_block(bootstrapping_key, iteration, blockIdx.y,
blockIdx.x, polynomial_size, 1, l_gadget));
auto bsk_body_slice = PolynomialFourier<double2, params>(
get_ith_body_kth_block(bootstrapping_key, iteration, blockIdx.y,
blockIdx.x, polynomial_size, 1, l_gadget));

// Perform the matrix multiplication between the RGSW and the TRLWE,
// each block operating on a single level for mask and body

auto first_processed_bsk = (blockIdx.y == 0) ? bsk_mask_slice : bsk_body_slice;
auto second_processed_bsk = (blockIdx.y == 0) ? bsk_body_slice : bsk_mask_slice;
auto first_processed_bsk =
(blockIdx.y == 0) ? bsk_mask_slice : bsk_body_slice;
auto second_processed_bsk =
(blockIdx.y == 0) ? bsk_body_slice : bsk_mask_slice;

auto first_processed_acc = (blockIdx.y == 0) ?
&mask_join_buffer[params::degree / 2 * blockIdx.x] :
&body_join_buffer[params::degree / 2 * blockIdx.x];
auto second_processed_acc = (blockIdx.y == 0) ?
&body_join_buffer[params::degree / 2 * blockIdx.x] :
&mask_join_buffer[params::degree / 2 * blockIdx.x];
auto first_processed_acc =
(blockIdx.y == 0) ? &mask_join_buffer[params::degree / 2 * blockIdx.x]
: &body_join_buffer[params::degree / 2 * blockIdx.x];
auto second_processed_acc =
(blockIdx.y == 0) ? &body_join_buffer[params::degree / 2 * blockIdx.x]
: &mask_join_buffer[params::degree / 2 * blockIdx.x];

int tid = threadIdx.x;

//first product
for(int i = 0; i < params::opt / 2; i++) {
first_processed_acc[tid] = fft[tid] * first_processed_bsk.m_values[tid];
tid += params::degree / params::opt;
// first product
for (int i = 0; i < params::opt / 2; i++) {
first_processed_acc[tid] = fft[tid] * first_processed_bsk.m_values[tid];
tid += params::degree / params::opt;
}

grid.sync();
tid = threadIdx.x;
//second product
for(int i = 0; i < params::opt / 2; i++) {
second_processed_acc[tid] += fft[tid] * second_processed_bsk.m_values[tid];
tid += params::degree / params::opt;
}

// second product
for (int i = 0; i < params::opt / 2; i++) {
second_processed_acc[tid] += fft[tid] * second_processed_bsk.m_values[tid];
tid += params::degree / params::opt;
}

// -----------------------------------------------------------------

@@ -102,24 +95,24 @@ mul_trgsw_trlwe(Torus *accumulator,
// values needed from every other block
grid.sync();

auto src_acc = (blockIdx.y == 0) ? mask_join_buffer : body_join_buffer;
auto src_acc = (blockIdx.y == 0) ? mask_join_buffer : body_join_buffer;

// copy first product into fft buffer
tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
fft[tid] = src_acc[tid];
tid += params::degree / params::opt;
fft[tid] = src_acc[tid];
tid += params::degree / params::opt;
}
synchronize_threads_in_block();

// accumulate rest of the products into fft buffer
for (int l = 1; l < gridDim.x; l++) {
auto cur_src_acc = &src_acc[l * params::degree / 2];
tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
fft[tid] += cur_src_acc[tid];
tid += params::degree / params::opt;
}
auto cur_src_acc = &src_acc[l * params::degree / 2];
tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
fft[tid] += cur_src_acc[tid];
tid += params::degree / params::opt;
}
}

synchronize_threads_in_block();
@@ -142,49 +135,46 @@ template <typename Torus, class params>
* Kernel launched by the low latency version of the
* bootstrapping, that uses cooperative groups
* lwe_out - vector of output LWEs, with length (polynomial_size+1)*num_samples
* lut_vector - vector of look up tables with length polynomial_size * num_samples
* lut_vector_indexes - mapping between lwe_in and lut_vector
* lwe_in - vector of lwe inputs with length (lwe_mask_size + 1) * num_samples
* lut_vector - vector of look up tables with length polynomial_size *
* num_samples lut_vector_indexes - mapping between lwe_in and lut_vector lwe_in
* - vector of lwe inputs with length (lwe_mask_size + 1) * num_samples
*
*/
__global__ void device_bootstrap_low_latency(
Torus *lwe_out,
Torus *lut_vector,
Torus *lwe_in,
double2 *bootstrapping_key,
double2 *mask_join_buffer,
double2 *body_join_buffer,
uint32_t lwe_mask_size,
uint32_t polynomial_size, uint32_t base_log, uint32_t l_gadget
) {
Torus *lwe_out, Torus *lut_vector, Torus *lwe_in,
double2 *bootstrapping_key, double2 *mask_join_buffer,
double2 *body_join_buffer, uint32_t lwe_mask_size, uint32_t polynomial_size,
uint32_t base_log, uint32_t l_gadget) {

grid_group grid = this_grid();

// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
// much faster than global memory
extern __shared__ char sharedmem[];

char* selected_memory = sharedmem;
char *selected_memory = sharedmem;

int16_t *accumulator_decomposed = (int16_t *)selected_memory;
Torus *accumulator = (Torus*)accumulator_decomposed +
polynomial_size / (sizeof(Torus) / sizeof(int16_t));
double2 *accumulator_fft = (double2*)accumulator +
polynomial_size / (sizeof(double2) / sizeof(Torus));
Torus *accumulator = (Torus *)accumulator_decomposed +
polynomial_size / (sizeof(Torus) / sizeof(int16_t));
double2 *accumulator_fft =
(double2 *)accumulator +
polynomial_size / (sizeof(double2) / sizeof(Torus));
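// Illustrative layout of the shared buffer above (assuming
// polynomial_size = 1024 and Torus = uint64_t; not part of this diff):
// accumulator_decomposed spans 1024 * 2 B = 2 KB, so accumulator starts
// 1024 / (8 / 2) = 256 Torus elements (2 KB) past it; accumulator spans
// 1024 * 8 B = 8 KB, so accumulator_fft starts 1024 / (16 / 8) = 512
// double2 elements (8 KB) past accumulator.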

// Reuse memory from accumulator_fft for accumulator_rotated
Torus* accumulator_rotated = (Torus*)accumulator_fft;
Torus *accumulator_rotated = (Torus *)accumulator_fft;

// The third dimension of the block is used to determine on which ciphertext
// this block is operating, in the case of batch bootstraps
auto block_lwe_in = &lwe_in[blockIdx.z * (lwe_mask_size + 1)];

auto block_lut_vector =
&lut_vector[blockIdx.z * params::degree * 2];
auto block_lut_vector = &lut_vector[blockIdx.z * params::degree * 2];

auto block_mask_join_buffer = &mask_join_buffer[blockIdx.z * l_gadget * params::degree / 2];
auto block_body_join_buffer = &body_join_buffer[blockIdx.z * l_gadget * params::degree / 2];
auto block_mask_join_buffer =
&mask_join_buffer[blockIdx.z * l_gadget * params::degree / 2];
auto block_body_join_buffer =
&body_join_buffer[blockIdx.z * l_gadget * params::degree / 2];
// Since the space in L1 cache is small, we use the same memory location for
// the rotated accumulator and the fft accumulator, since we know that the
@@ -192,19 +182,17 @@ __global__ void device_bootstrap_low_latency(
GadgetMatrix<Torus, params> gadget(base_log, l_gadget);

// Put "b" in [0, 2N[
Torus b_hat = rescale_torus_element(
block_lwe_in[lwe_mask_size],
2 * params::degree);
Torus b_hat =
rescale_torus_element(block_lwe_in[lwe_mask_size], 2 * params::degree);
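// In other words (illustrative formula, not from this diff): with N =
// params::degree and w the bit width of Torus, the rescaling is expected
// to compute b_hat ~= round(b * 2N / 2^w), so the exponent lies in [0, 2N).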

if (blockIdx.y == 0) {
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, block_lut_vector, b_hat, false);
}
else {
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[params::degree], b_hat, false);
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, block_lut_vector, b_hat, false);
} else {
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[params::degree], b_hat, false);
}

for (int i = 0; i < lwe_mask_size; i++) {
@@ -217,15 +205,14 @@ __global__ void device_bootstrap_low_latency(
// Perform ACC * (X^a_hat - 1)
multiply_by_monomial_negacyclic_and_sub_polynomial<
Torus, params::opt, params::degree / params::opt>(
accumulator, accumulator_rotated, a_hat);

Torus, params::opt, params::degree / params::opt>(
accumulator, accumulator_rotated, a_hat);

// Perform a rounding to increase the accuracy of the
// bootstrapped ciphertext
round_to_closest_multiple_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator_rotated, base_log, l_gadget);
params::degree / params::opt>(
accumulator_rotated, base_log, l_gadget);

// Decompose the accumulator. Each block gets one level of the
// decomposition, for the mask and the body (so block 0 will have the
@@ -239,15 +226,11 @@ __global__ void device_bootstrap_low_latency(
synchronize_threads_in_block();
// Perform G^-1(ACC) * RGSW -> TRLWE
mul_trgsw_trlwe<Torus, params>(
accumulator,
accumulator_fft,
accumulator_decomposed,
block_mask_join_buffer,
block_body_join_buffer,
bootstrapping_key,
accumulator, accumulator_fft, accumulator_decomposed,
block_mask_join_buffer, block_body_join_buffer, bootstrapping_key,
polynomial_size, l_gadget, i, grid);
}

auto block_lwe_out = &lwe_out[blockIdx.z * (polynomial_size + 1)];

if (blockIdx.x == 0 && blockIdx.y == 0) {
@@ -258,41 +241,31 @@ __global__ void device_bootstrap_low_latency(
} else if (blockIdx.x == 0 && blockIdx.y == 1) {
sample_extract_body<Torus, params>(block_lwe_out, accumulator);
}

}

/*
* Host wrapper to the low latency version
* of bootstrapping
*/
template <typename Torus, class params>
__host__ void host_bootstrap_low_latency(
void *v_stream,
Torus *lwe_out,
Torus *lut_vector,
uint32_t *lut_vector_indexes,
Torus *lwe_in,
double2 *bootstrapping_key,
uint32_t lwe_mask_size,
uint32_t polynomial_size,
uint32_t base_log,
uint32_t l_gadget,
uint32_t num_samples,
uint32_t num_lut_vectors) {
void *v_stream, Torus *lwe_out, Torus *lut_vector,
uint32_t *lut_vector_indexes, Torus *lwe_in, double2 *bootstrapping_key,
uint32_t lwe_mask_size, uint32_t polynomial_size, uint32_t base_log,
uint32_t l_gadget, uint32_t num_samples, uint32_t num_lut_vectors) {

auto stream = static_cast<cudaStream_t *>(v_stream);

int buffer_size_per_gpu = l_gadget * num_samples * polynomial_size / 2 * sizeof(double2);
int buffer_size_per_gpu =
l_gadget * num_samples * polynomial_size / 2 * sizeof(double2);
double2 *mask_buffer_fft;
double2 *body_buffer_fft;
checkCudaErrors(cudaMalloc((void **)&mask_buffer_fft, buffer_size_per_gpu));
checkCudaErrors(cudaMalloc((void **)&body_buffer_fft, buffer_size_per_gpu));

int bytes_needed =
sizeof(int16_t) * polynomial_size + // accumulator_decomp
sizeof(Torus) * polynomial_size + // accumulator
sizeof(double2) * polynomial_size / 2; // accumulator fft
int bytes_needed = sizeof(int16_t) * polynomial_size + // accumulator_decomp
sizeof(Torus) * polynomial_size + // accumulator
sizeof(double2) * polynomial_size / 2; // accumulator fft
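// Illustrative total (assuming polynomial_size = 1024 and Torus = uint64_t;
// not from this diff): bytes_needed = 2048 + 8192 + 8192 = 18432 B, i.e.
// 18 KB of dynamic shared memory per block.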
|
||||
|
||||
int thds = polynomial_size / params::opt;
|
||||
dim3 grid(l_gadget, 2, num_samples);
|
||||
@@ -307,17 +280,18 @@ __host__ void host_bootstrap_low_latency(
|
||||
kernel_args[6] = &lwe_mask_size;
|
||||
kernel_args[7] = &polynomial_size;
|
||||
kernel_args[8] = &base_log;
|
||||
kernel_args[9] =&l_gadget;
|
||||
kernel_args[9] = &l_gadget;
|
||||
|
||||
checkCudaErrors(cudaFuncSetAttribute(device_bootstrap_low_latency<Torus,
|
||||
params>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||
bytes_needed));
|
||||
checkCudaErrors(cudaFuncSetAttribute(
|
||||
device_bootstrap_low_latency<Torus, params>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, bytes_needed));
|
||||
cudaFuncSetCacheConfig(device_bootstrap_low_latency<Torus, params>,
|
||||
cudaFuncCachePreferShared);
|
||||
|
||||
checkCudaErrors(cudaLaunchCooperativeKernel ( (void *)device_bootstrap_low_latency<Torus, params>, grid, thds, (void**)kernel_args, bytes_needed, *stream )) ;
|
||||
|
||||
cudaFuncCachePreferShared);
|
||||
|
||||
checkCudaErrors(cudaLaunchCooperativeKernel(
|
||||
(void *)device_bootstrap_low_latency<Torus, params>, grid, thds,
|
||||
(void **)kernel_args, bytes_needed, *stream));
|
||||
|
||||
// Synchronize the streams before copying the result to lwe_out at the right
|
||||
// place
|
||||
cudaStreamSynchronize(*stream);

@@ -1,167 +1,144 @@
#include "bootstrap_wop.cuh"

void cuda_cmux_tree_32(
    void *v_stream,
    void *glwe_out,
    void *ggsw_in,
    void *lut_vector,
    uint32_t glwe_dimension,
    uint32_t polynomial_size,
    uint32_t base_log,
    uint32_t l_gadget,
    uint32_t r,
    uint32_t max_shared_memory) {
void cuda_cmux_tree_32(void *v_stream, void *glwe_out, void *ggsw_in,
                       void *lut_vector, uint32_t glwe_dimension,
                       uint32_t polynomial_size, uint32_t base_log,
                       uint32_t l_gadget, uint32_t r,
                       uint32_t max_shared_memory) {

  assert(("Error (GPU Cmux tree): base log should be <= 16", base_log <= 16));
  assert(("Error (GPU Cmux tree): polynomial size should be one of 512, 1024, 2048, 4096, 8192",
          polynomial_size == 512 || polynomial_size == 1024 || polynomial_size == 2048 ||
          polynomial_size == 4096 || polynomial_size == 8192));
  // For larger k we will need to adjust the mask size
  assert(("Error (GPU Cmux tree): glwe_dimension should be equal to 1", glwe_dimension == 1));
  assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should be >= 1 ",
          r >= 1));
  assert(("Error (GPU Cmux tree): base log should be <= 16", base_log <= 16));
  assert(("Error (GPU Cmux tree): polynomial size should be one of 512, 1024, "
          "2048, 4096, 8192",
          polynomial_size == 512 || polynomial_size == 1024 ||
              polynomial_size == 2048 || polynomial_size == 4096 ||
              polynomial_size == 8192));
  // For larger k we will need to adjust the mask size
  assert(("Error (GPU Cmux tree): glwe_dimension should be equal to 1",
          glwe_dimension == 1));
  assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should "
          "be >= 1 ",
          r >= 1));

  switch (polynomial_size) {
  case 512:
    host_cmux_tree<uint32_t, int32_t, Degree<512>>(
        v_stream,
        (uint32_t *) glwe_out, (uint32_t *) ggsw_in, (uint32_t *) lut_vector,
        glwe_dimension, polynomial_size, base_log, l_gadget, r,
        max_shared_memory);
    break;
  case 1024:
    host_cmux_tree<uint32_t, int32_t, Degree<1024>>(
        v_stream,
        (uint32_t *) glwe_out, (uint32_t *) ggsw_in, (uint32_t *) lut_vector,
        glwe_dimension, polynomial_size, base_log, l_gadget, r,
        max_shared_memory);
    break;
  case 2048:
    host_cmux_tree<uint32_t, int32_t, Degree<2048>>(
        v_stream,
        (uint32_t *) glwe_out, (uint32_t *) ggsw_in, (uint32_t *) lut_vector,
        glwe_dimension, polynomial_size, base_log, l_gadget, r,
        max_shared_memory);
    break;
  case 4096:
    host_cmux_tree<uint32_t, int32_t, Degree<4096>>(
        v_stream,
        (uint32_t *) glwe_out, (uint32_t *) ggsw_in, (uint32_t *) lut_vector,
        glwe_dimension, polynomial_size, base_log, l_gadget, r,
        max_shared_memory);
    break;
  case 8192:
    host_cmux_tree<uint32_t, int32_t, Degree<8192>>(
        v_stream,
        (uint32_t *) glwe_out, (uint32_t *) ggsw_in, (uint32_t *) lut_vector,
        glwe_dimension, polynomial_size, base_log, l_gadget, r,
        max_shared_memory);
    break;
  default:
    break;
  }
  switch (polynomial_size) {
  case 512:
    host_cmux_tree<uint32_t, int32_t, Degree<512>>(
        v_stream, (uint32_t *)glwe_out, (uint32_t *)ggsw_in,
        (uint32_t *)lut_vector, glwe_dimension, polynomial_size, base_log,
        l_gadget, r, max_shared_memory);
    break;
  case 1024:
    host_cmux_tree<uint32_t, int32_t, Degree<1024>>(
        v_stream, (uint32_t *)glwe_out, (uint32_t *)ggsw_in,
        (uint32_t *)lut_vector, glwe_dimension, polynomial_size, base_log,
        l_gadget, r, max_shared_memory);
    break;
  case 2048:
    host_cmux_tree<uint32_t, int32_t, Degree<2048>>(
        v_stream, (uint32_t *)glwe_out, (uint32_t *)ggsw_in,
        (uint32_t *)lut_vector, glwe_dimension, polynomial_size, base_log,
        l_gadget, r, max_shared_memory);
    break;
  case 4096:
    host_cmux_tree<uint32_t, int32_t, Degree<4096>>(
        v_stream, (uint32_t *)glwe_out, (uint32_t *)ggsw_in,
        (uint32_t *)lut_vector, glwe_dimension, polynomial_size, base_log,
        l_gadget, r, max_shared_memory);
    break;
  case 8192:
    host_cmux_tree<uint32_t, int32_t, Degree<8192>>(
        v_stream, (uint32_t *)glwe_out, (uint32_t *)ggsw_in,
        (uint32_t *)lut_vector, glwe_dimension, polynomial_size, base_log,
        l_gadget, r, max_shared_memory);
    break;
  default:
    break;
  }
}

void cuda_cmux_tree_64(
    void *v_stream,
    void *glwe_out,
    void *ggsw_in,
    void *lut_vector,
    uint32_t glwe_dimension,
    uint32_t polynomial_size,
    uint32_t base_log,
    uint32_t l_gadget,
    uint32_t r,
    uint32_t max_shared_memory) {
void cuda_cmux_tree_64(void *v_stream, void *glwe_out, void *ggsw_in,
                       void *lut_vector, uint32_t glwe_dimension,
                       uint32_t polynomial_size, uint32_t base_log,
                       uint32_t l_gadget, uint32_t r,
                       uint32_t max_shared_memory) {

  assert(("Error (GPU Cmux tree): base log should be <= 16", base_log <= 16));
  assert(("Error (GPU Cmux tree): polynomial size should be one of 512, 1024, 2048, 4096, 8192",
          polynomial_size == 512 || polynomial_size == 1024 || polynomial_size == 2048 ||
          polynomial_size == 4096 || polynomial_size == 8192));
  // For larger k we will need to adjust the mask size
  assert(("Error (GPU Cmux tree): glwe_dimension should be equal to 1", glwe_dimension == 1));
  assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should be >= 1 ",
          r >= 1));
  assert(("Error (GPU Cmux tree): base log should be <= 16", base_log <= 16));
  assert(("Error (GPU Cmux tree): polynomial size should be one of 512, 1024, "
          "2048, 4096, 8192",
          polynomial_size == 512 || polynomial_size == 1024 ||
              polynomial_size == 2048 || polynomial_size == 4096 ||
              polynomial_size == 8192));
  // For larger k we will need to adjust the mask size
  assert(("Error (GPU Cmux tree): glwe_dimension should be equal to 1",
          glwe_dimension == 1));
  assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should "
          "be >= 1 ",
          r >= 1));

  switch (polynomial_size) {
  case 512:
    host_cmux_tree<uint64_t, int64_t, Degree<512>>(
        v_stream,
        (uint64_t *) glwe_out, (uint64_t *) ggsw_in,(uint64_t *) lut_vector,
        glwe_dimension, polynomial_size, base_log, l_gadget, r,
        max_shared_memory);
    break;
  case 1024:
    host_cmux_tree<uint64_t, int64_t, Degree<1024>>(
        v_stream,
        (uint64_t *) glwe_out, (uint64_t *) ggsw_in,(uint64_t *) lut_vector,
        glwe_dimension, polynomial_size, base_log, l_gadget, r,
        max_shared_memory);
    break;
  case 2048:
    host_cmux_tree<uint64_t, int64_t, Degree<2048>>(
        v_stream,
        (uint64_t *) glwe_out, (uint64_t *) ggsw_in,(uint64_t *) lut_vector,
        glwe_dimension, polynomial_size, base_log, l_gadget, r,
        max_shared_memory);
    break;
  case 4096:
    host_cmux_tree<uint64_t, int64_t, Degree<4096>>(
        v_stream,
        (uint64_t *) glwe_out, (uint64_t *) ggsw_in,(uint64_t *) lut_vector,
        glwe_dimension, polynomial_size, base_log, l_gadget, r,
        max_shared_memory);
    break;
  case 8192:
    host_cmux_tree<uint64_t, int64_t, Degree<8192>>(
        v_stream,
        (uint64_t *) glwe_out, (uint64_t *) ggsw_in,(uint64_t *) lut_vector,
        glwe_dimension, polynomial_size, base_log, l_gadget, r,
        max_shared_memory);
    break;
  default:
    break;
  }
  switch (polynomial_size) {
  case 512:
    host_cmux_tree<uint64_t, int64_t, Degree<512>>(
        v_stream, (uint64_t *)glwe_out, (uint64_t *)ggsw_in,
        (uint64_t *)lut_vector, glwe_dimension, polynomial_size, base_log,
        l_gadget, r, max_shared_memory);
    break;
  case 1024:
    host_cmux_tree<uint64_t, int64_t, Degree<1024>>(
        v_stream, (uint64_t *)glwe_out, (uint64_t *)ggsw_in,
        (uint64_t *)lut_vector, glwe_dimension, polynomial_size, base_log,
        l_gadget, r, max_shared_memory);
    break;
  case 2048:
    host_cmux_tree<uint64_t, int64_t, Degree<2048>>(
        v_stream, (uint64_t *)glwe_out, (uint64_t *)ggsw_in,
        (uint64_t *)lut_vector, glwe_dimension, polynomial_size, base_log,
        l_gadget, r, max_shared_memory);
    break;
  case 4096:
    host_cmux_tree<uint64_t, int64_t, Degree<4096>>(
        v_stream, (uint64_t *)glwe_out, (uint64_t *)ggsw_in,
        (uint64_t *)lut_vector, glwe_dimension, polynomial_size, base_log,
        l_gadget, r, max_shared_memory);
    break;
  case 8192:
    host_cmux_tree<uint64_t, int64_t, Degree<8192>>(
        v_stream, (uint64_t *)glwe_out, (uint64_t *)ggsw_in,
        (uint64_t *)lut_vector, glwe_dimension, polynomial_size, base_log,
        l_gadget, r, max_shared_memory);
    break;
  default:
    break;
  }
}
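
// Illustrative caller sketch (assumptions: d_glwe_out, d_ggsw_in and
// d_lut_vector are hypothetical names for device buffers already allocated
// and populated; the parameter values are examples only, not recommended
// settings):
//
//   cudaStream_t stream;
//   cudaStreamCreate(&stream);
//   int max_shared_memory = 0;
//   cudaDeviceGetAttribute(&max_shared_memory,
//                          cudaDevAttrMaxSharedMemoryPerBlockOptin, 0);
//   cuda_cmux_tree_64(&stream, d_glwe_out, d_ggsw_in, d_lut_vector,
//                     /*glwe_dimension=*/1, /*polynomial_size=*/2048,
//                     /*base_log=*/8, /*l_gadget=*/4, /*r=*/10,
//                     (uint32_t)max_shared_memory);
//   cudaStreamSynchronize(stream);
//   cudaStreamDestroy(stream);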

void cuda_extract_bits_32(
    void *v_stream,
    void *list_lwe_out,
    void *lwe_in,
    void *lwe_in_buffer,
    void *lwe_in_shifted_buffer,
    void *lwe_out_ks_buffer,
    void *lwe_out_pbs_buffer,
    void *lut_pbs,
    void *lut_vector_indexes,
    void *ksk,
    void *fourier_bsk,
    uint32_t number_of_bits,
    uint32_t delta_log,
    uint32_t lwe_dimension_before,
    uint32_t lwe_dimension_after,
    uint32_t glwe_dimension,
    uint32_t base_log_bsk,
    uint32_t l_gadget_bsk,
    uint32_t base_log_ksk,
    uint32_t l_gadget_ksk,
    uint32_t number_of_samples)
{
  assert(("Error (GPU extract bits): base log should be <= 16", base_log_bsk <= 16));
  assert(("Error (GPU extract bits): glwe_dimension should be equal to 1", glwe_dimension == 1));
  assert(("Error (GPU extract bits): lwe_dimension_before should be one of 512, 1024, 2048",
          lwe_dimension_before == 512 || lwe_dimension_before == 1024 ||
          lwe_dimension_before == 2048));
  // The number of samples should be lower than the number of streaming
  // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
  // to the occupancy of 50%). The only supported value for k is 1, so
  // k + 1 = 2 for now.
  int number_of_sm = 0;
  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
  assert(("Error (GPU extract bits): the number of input LWEs must be lower or equal to the "
          "number of streaming multiprocessors on the device divided by 8 * l_gadget_bsk",
          number_of_samples <= number_of_sm / 4. / 2. / l_gadget_bsk));
void cuda_extract_bits_32(void *v_stream, void *list_lwe_out, void *lwe_in,
                          void *lwe_in_buffer, void *lwe_in_shifted_buffer,
                          void *lwe_out_ks_buffer, void *lwe_out_pbs_buffer,
                          void *lut_pbs, void *lut_vector_indexes, void *ksk,
                          void *fourier_bsk, uint32_t number_of_bits,
                          uint32_t delta_log, uint32_t lwe_dimension_before,
                          uint32_t lwe_dimension_after, uint32_t glwe_dimension,
                          uint32_t base_log_bsk, uint32_t l_gadget_bsk,
                          uint32_t base_log_ksk, uint32_t l_gadget_ksk,
                          uint32_t number_of_samples) {
  assert(("Error (GPU extract bits): base log should be <= 16",
          base_log_bsk <= 16));
  assert(("Error (GPU extract bits): glwe_dimension should be equal to 1",
          glwe_dimension == 1));
  assert(("Error (GPU extract bits): lwe_dimension_before should be one of "
          "512, 1024, 2048",
          lwe_dimension_before == 512 || lwe_dimension_before == 1024 ||
          lwe_dimension_before == 2048));
  // The number of samples should be lower than the number of streaming
  // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
  // to the occupancy of 50%). The only supported value for k is 1, so
  // k + 1 = 2 for now.
  int number_of_sm = 0;
  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
  assert(("Error (GPU extract bits): the number of input LWEs must be lower or "
          "equal to the "
          "number of streaming multiprocessors on the device divided by 8 * "
          "l_gadget_bsk",
          number_of_samples <= number_of_sm / 4. / 2. / l_gadget_bsk));
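  // Worked example (illustrative): on a device with 108 streaming
  // multiprocessors (e.g. an A100) and l_gadget_bsk = 3, the bound is
  // 108 / 4 / 2 / 3 = 4.5, so at most 4 input LWE ciphertexts can be
  // processed per call.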

  switch (lwe_dimension_before) {
  case 512:
@@ -170,9 +147,9 @@ void cuda_extract_bits_32(
        (uint32_t *)lwe_in_buffer, (uint32_t *)lwe_in_shifted_buffer,
        (uint32_t *)lwe_out_ks_buffer, (uint32_t *)lwe_out_pbs_buffer,
        (uint32_t *)lut_pbs, (uint32_t *)lut_vector_indexes, (uint32_t *)ksk,
        (double2 *)fourier_bsk, number_of_bits, delta_log,
        lwe_dimension_before, lwe_dimension_after, base_log_bsk, l_gadget_bsk,
        base_log_ksk, l_gadget_ksk, number_of_samples);
        (double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_before,
        lwe_dimension_after, base_log_bsk, l_gadget_bsk, base_log_ksk,
        l_gadget_ksk, number_of_samples);
    break;
  case 1024:
    host_extract_bits<uint32_t, Degree<1024>>(
@@ -180,9 +157,9 @@ void cuda_extract_bits_32(
        (uint32_t *)lwe_in_buffer, (uint32_t *)lwe_in_shifted_buffer,
        (uint32_t *)lwe_out_ks_buffer, (uint32_t *)lwe_out_pbs_buffer,
        (uint32_t *)lut_pbs, (uint32_t *)lut_vector_indexes, (uint32_t *)ksk,
        (double2 *)fourier_bsk, number_of_bits, delta_log,
        lwe_dimension_before, lwe_dimension_after, base_log_bsk, l_gadget_bsk,
        base_log_ksk, l_gadget_ksk, number_of_samples);
        (double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_before,
        lwe_dimension_after, base_log_bsk, l_gadget_bsk, base_log_ksk,
        l_gadget_ksk, number_of_samples);
    break;
  case 2048:
    host_extract_bits<uint32_t, Degree<2048>>(
@@ -190,55 +167,44 @@ void cuda_extract_bits_32(
        (uint32_t *)lwe_in_buffer, (uint32_t *)lwe_in_shifted_buffer,
        (uint32_t *)lwe_out_ks_buffer, (uint32_t *)lwe_out_pbs_buffer,
        (uint32_t *)lut_pbs, (uint32_t *)lut_vector_indexes, (uint32_t *)ksk,
        (double2 *)fourier_bsk, number_of_bits, delta_log,
        lwe_dimension_before, lwe_dimension_after, base_log_bsk, l_gadget_bsk,
        base_log_ksk, l_gadget_ksk, number_of_samples);
        (double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_before,
        lwe_dimension_after, base_log_bsk, l_gadget_bsk, base_log_ksk,
        l_gadget_ksk, number_of_samples);
    break;
  default:
    break;
  }

}

void cuda_extract_bits_64(
    void *v_stream,
    void *list_lwe_out,
    void *lwe_in,
    void *lwe_in_buffer,
    void *lwe_in_shifted_buffer,
    void *lwe_out_ks_buffer,
    void *lwe_out_pbs_buffer,
    void *lut_pbs,
    void *lut_vector_indexes,
    void *ksk,
    void *fourier_bsk,
    uint32_t number_of_bits,
    uint32_t delta_log,
    uint32_t lwe_dimension_before,
    uint32_t lwe_dimension_after,
    uint32_t glwe_dimension,
    uint32_t base_log_bsk,
    uint32_t l_gadget_bsk,
    uint32_t base_log_ksk,
    uint32_t l_gadget_ksk,
    uint32_t number_of_samples)
{
  assert(("Error (GPU extract bits): base log should be <= 16", base_log_bsk <= 16));
  assert(("Error (GPU extract bits): glwe_dimension should be equal to 1", glwe_dimension == 1));
  assert(("Error (GPU extract bits): lwe_dimension_before should be one of 512, 1024, 2048",
          lwe_dimension_before == 512 || lwe_dimension_before == 1024 ||
          lwe_dimension_before == 2048));
  // The number of samples should be lower than the number of streaming
  // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
  // to the occupancy of 50%). The only supported value for k is 1, so
  // k + 1 = 2 for now.
  int number_of_sm = 0;
  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
  assert(("Error (GPU extract bits): the number of input LWEs must be lower or equal to the "
          "number of streaming multiprocessors on the device divided by 8 * l_gadget_bsk",
          number_of_samples <= number_of_sm / 4. / 2. / l_gadget_bsk));
void cuda_extract_bits_64(void *v_stream, void *list_lwe_out, void *lwe_in,
                          void *lwe_in_buffer, void *lwe_in_shifted_buffer,
                          void *lwe_out_ks_buffer, void *lwe_out_pbs_buffer,
                          void *lut_pbs, void *lut_vector_indexes, void *ksk,
                          void *fourier_bsk, uint32_t number_of_bits,
                          uint32_t delta_log, uint32_t lwe_dimension_before,
                          uint32_t lwe_dimension_after, uint32_t glwe_dimension,
                          uint32_t base_log_bsk, uint32_t l_gadget_bsk,
                          uint32_t base_log_ksk, uint32_t l_gadget_ksk,
                          uint32_t number_of_samples) {
  assert(("Error (GPU extract bits): base log should be <= 16",
          base_log_bsk <= 16));
  assert(("Error (GPU extract bits): glwe_dimension should be equal to 1",
          glwe_dimension == 1));
  assert(("Error (GPU extract bits): lwe_dimension_before should be one of "
          "512, 1024, 2048",
          lwe_dimension_before == 512 || lwe_dimension_before == 1024 ||
          lwe_dimension_before == 2048));
  // The number of samples should be lower than the number of streaming
  // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
  // to the occupancy of 50%). The only supported value for k is 1, so
  // k + 1 = 2 for now.
  int number_of_sm = 0;
  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
  assert(("Error (GPU extract bits): the number of input LWEs must be lower or "
          "equal to the "
          "number of streaming multiprocessors on the device divided by 8 * "
          "l_gadget_bsk",
          number_of_samples <= number_of_sm / 4. / 2. / l_gadget_bsk));

  switch (lwe_dimension_before) {
  case 512:
@@ -247,9 +213,9 @@ void cuda_extract_bits_64(
        (uint64_t *)lwe_in_buffer, (uint64_t *)lwe_in_shifted_buffer,
        (uint64_t *)lwe_out_ks_buffer, (uint64_t *)lwe_out_pbs_buffer,
        (uint64_t *)lut_pbs, (uint32_t *)lut_vector_indexes, (uint64_t *)ksk,
        (double2 *)fourier_bsk, number_of_bits, delta_log,
        lwe_dimension_before, lwe_dimension_after, base_log_bsk, l_gadget_bsk,
        base_log_ksk, l_gadget_ksk, number_of_samples);
        (double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_before,
        lwe_dimension_after, base_log_bsk, l_gadget_bsk, base_log_ksk,
        l_gadget_ksk, number_of_samples);
    break;
  case 1024:
    host_extract_bits<uint64_t, Degree<1024>>(
@@ -257,9 +223,9 @@ void cuda_extract_bits_64(
        (uint64_t *)lwe_in_buffer, (uint64_t *)lwe_in_shifted_buffer,
        (uint64_t *)lwe_out_ks_buffer, (uint64_t *)lwe_out_pbs_buffer,
        (uint64_t *)lut_pbs, (uint32_t *)lut_vector_indexes, (uint64_t *)ksk,
        (double2 *)fourier_bsk, number_of_bits, delta_log,
        lwe_dimension_before, lwe_dimension_after, base_log_bsk, l_gadget_bsk,
        base_log_ksk, l_gadget_ksk, number_of_samples);
        (double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_before,
        lwe_dimension_after, base_log_bsk, l_gadget_bsk, base_log_ksk,
        l_gadget_ksk, number_of_samples);
    break;
  case 2048:
    host_extract_bits<uint64_t, Degree<2048>>(
@@ -267,14 +233,11 @@ void cuda_extract_bits_64(
        (uint64_t *)lwe_in_buffer, (uint64_t *)lwe_in_shifted_buffer,
        (uint64_t *)lwe_out_ks_buffer, (uint64_t *)lwe_out_pbs_buffer,
        (uint64_t *)lut_pbs, (uint32_t *)lut_vector_indexes, (uint64_t *)ksk,
        (double2 *)fourier_bsk, number_of_bits, delta_log,
        lwe_dimension_before, lwe_dimension_after, base_log_bsk, l_gadget_bsk,
        base_log_ksk, l_gadget_ksk, number_of_samples);
        (double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_before,
        lwe_dimension_after, base_log_bsk, l_gadget_bsk, base_log_ksk,
        l_gadget_ksk, number_of_samples);
    break;
  default:
    break;
  }

}

@@ -5,79 +5,80 @@

#include "../include/helper_cuda.h"
#include "bootstrap.h"
#include "bootstrap_low_latency.cuh"
#include "complex/operations.cuh"
#include "crypto/ggsw.cuh"
#include "crypto/torus.cuh"
#include "fft/bnsmfft.cuh"
#include "fft/smfft.cuh"
#include "fft/twiddles.cuh"
#include "keyswitch.cuh"
#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial.cuh"
#include "polynomial/polynomial_math.cuh"
#include "utils/memory.cuh"
#include "utils/timer.cuh"
#include "keyswitch.cuh"
#include "bootstrap_low_latency.cuh"
#include "crypto/ggsw.cuh"

template <typename T, class params>
__device__ void fft(double2 *output, T *input){
  synchronize_threads_in_block();
__device__ void fft(double2 *output, T *input) {
  synchronize_threads_in_block();

  // Reduce the size of the FFT to be performed by storing
  // the real-valued polynomial into a complex polynomial
  real_to_complex_compressed<T, params>(input, output);
  synchronize_threads_in_block();
  // Reduce the size of the FFT to be performed by storing
  // the real-valued polynomial into a complex polynomial
  real_to_complex_compressed<T, params>(input, output);
  synchronize_threads_in_block();

  // Switch to the FFT space
  NSMFFT_direct<HalfDegree<params>>(output);
  synchronize_threads_in_block();
  // Switch to the FFT space
  NSMFFT_direct<HalfDegree<params>>(output);
  synchronize_threads_in_block();

  correction_direct_fft_inplace<params>(output);
  synchronize_threads_in_block();
  correction_direct_fft_inplace<params>(output);
  synchronize_threads_in_block();
}

template <typename T, typename ST, class params>
__device__ void fft(double2 *output, T *input){
  synchronize_threads_in_block();
__device__ void fft(double2 *output, T *input) {
  synchronize_threads_in_block();

  // Reduce the size of the FFT to be performed by storing
  // the real-valued polynomial into a complex polynomial
  real_to_complex_compressed<T, ST, params>(input, output);
  synchronize_threads_in_block();
  // Reduce the size of the FFT to be performed by storing
  // the real-valued polynomial into a complex polynomial
  real_to_complex_compressed<T, ST, params>(input, output);
  synchronize_threads_in_block();

  // Switch to the FFT space
  NSMFFT_direct<HalfDegree<params>>(output);
  synchronize_threads_in_block();
  // Switch to the FFT space
  NSMFFT_direct<HalfDegree<params>>(output);
  synchronize_threads_in_block();

  correction_direct_fft_inplace<params>(output);
  synchronize_threads_in_block();
  correction_direct_fft_inplace<params>(output);
  synchronize_threads_in_block();
}

template <class params>
__device__ void ifft_inplace(double2 *data){
  synchronize_threads_in_block();
template <class params> __device__ void ifft_inplace(double2 *data) {
  synchronize_threads_in_block();

  correction_inverse_fft_inplace<params>(data);
  synchronize_threads_in_block();
  correction_inverse_fft_inplace<params>(data);
  synchronize_threads_in_block();

  NSMFFT_inverse<HalfDegree<params>>(data);
  synchronize_threads_in_block();
  NSMFFT_inverse<HalfDegree<params>>(data);
  synchronize_threads_in_block();
}
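
// Note (added for clarity): the "compressed" real-to-complex step above is
// what lets these helpers run a half-size transform. N real coefficients are
// packed into N / 2 complex values, NSMFFT_direct / NSMFFT_inverse then
// operate on HalfDegree<params>, and the correction_* kernels compensate for
// the packing. The exact packing convention is the one implemented by
// real_to_complex_compressed in the fft/ headers included above.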

/*
 * Receives an array of GLWE ciphertexts and two indexes to ciphertexts in this array,
 * and an array of GGSW ciphertexts with an index to one ciphertext in it. Computes a CMUX with these
 * operands and writes the output to a particular index of glwe_out.
 * Receives an array of GLWE ciphertexts and two indexes to ciphertexts in this
 * array, and an array of GGSW ciphertexts with an index to one ciphertext in it.
 * Computes a CMUX with these operands and writes the output to a particular
 * index of glwe_out.
 *
 * This function needs polynomial_size threads per block.
 *
 * - glwe_out: An array where the result should be written to.
 * - glwe_in: An array where the GLWE inputs are stored.
 * - ggsw_in: An array where the GGSW input is stored. In the fourier domain.
 * - selected_memory: An array to be used for the accumulators. Can be in the shared memory or
 * global memory.
 * - output_idx: The index of the output where the glwe ciphertext should be written.
 * - selected_memory: An array to be used for the accumulators. Can be in the
 * shared memory or global memory.
 * - output_idx: The index of the output where the glwe ciphertext should be
 * written.
 * - input_idx1: The index of the first glwe ciphertext we will use.
 * - input_idx2: The index of the second glwe ciphertext we will use.
 * - glwe_dim: This is k.
@@ -87,154 +88,146 @@ __device__ void ifft_inplace(double2 *data){
 * - ggsw_idx: The index of the GGSW we will use.
 */
template <typename Torus, typename STorus, class params>
__device__ void cmux(
    Torus *glwe_out, Torus* glwe_in, double2 *ggsw_in, char *selected_memory,
    uint32_t output_idx, uint32_t input_idx1, uint32_t input_idx2,
    uint32_t glwe_dim, uint32_t polynomial_size, uint32_t base_log, uint32_t l_gadget,
    uint32_t ggsw_idx){
__device__ void cmux(Torus *glwe_out, Torus *glwe_in, double2 *ggsw_in,
                     char *selected_memory, uint32_t output_idx,
                     uint32_t input_idx1, uint32_t input_idx2,
                     uint32_t glwe_dim, uint32_t polynomial_size,
                     uint32_t base_log, uint32_t l_gadget, uint32_t ggsw_idx) {

  // Define glwe_sub
  Torus *glwe_sub_mask = (Torus *) selected_memory;
  Torus *glwe_sub_body = (Torus *) glwe_sub_mask + (ptrdiff_t)polynomial_size;
  // Define glwe_sub
  Torus *glwe_sub_mask = (Torus *)selected_memory;
  Torus *glwe_sub_body = (Torus *)glwe_sub_mask + (ptrdiff_t)polynomial_size;

  int16_t *glwe_mask_decomposed = (int16_t *)(glwe_sub_body + polynomial_size);
  int16_t *glwe_body_decomposed =
  int16_t *glwe_mask_decomposed = (int16_t *)(glwe_sub_body + polynomial_size);
  int16_t *glwe_body_decomposed =
      (int16_t *)glwe_mask_decomposed + (ptrdiff_t)polynomial_size;

  double2 *mask_res_fft = (double2 *)(glwe_body_decomposed +
      polynomial_size);
  double2 *body_res_fft =
      (double2 *)mask_res_fft + (ptrdiff_t)polynomial_size / 2;
  double2 *mask_res_fft = (double2 *)(glwe_body_decomposed + polynomial_size);
  double2 *body_res_fft =
      (double2 *)mask_res_fft + (ptrdiff_t)polynomial_size / 2;

  double2 *glwe_fft =
      (double2 *)body_res_fft + (ptrdiff_t)(polynomial_size / 2);
  double2 *glwe_fft =
      (double2 *)body_res_fft + (ptrdiff_t)(polynomial_size / 2);

  GadgetMatrix<Torus, params> gadget(base_log, l_gadget);
  GadgetMatrix<Torus, params> gadget(base_log, l_gadget);

  /////////////////////////////////////
  /////////////////////////////////////

  // glwe2-glwe1
  // glwe2-glwe1

  // Copy m0 to shared memory to preserve data
  auto m0_mask = &glwe_in[input_idx1 * (glwe_dim + 1) * polynomial_size];
  auto m0_body = m0_mask + polynomial_size;
  // Copy m0 to shared memory to preserve data
  auto m0_mask = &glwe_in[input_idx1 * (glwe_dim + 1) * polynomial_size];
  auto m0_body = m0_mask + polynomial_size;

  // Just gets the pointer for m1 on global memory
  auto m1_mask = &glwe_in[input_idx2 * (glwe_dim + 1) * polynomial_size];
  auto m1_body = m1_mask + polynomial_size;
  // Just gets the pointer for m1 on global memory
  auto m1_mask = &glwe_in[input_idx2 * (glwe_dim + 1) * polynomial_size];
  auto m1_body = m1_mask + polynomial_size;

  // Mask
  sub_polynomial<Torus, params>(
      glwe_sub_mask, m1_mask, m0_mask
  );
  // Body
  sub_polynomial<Torus, params>(
      glwe_sub_body, m1_body, m0_body
  );
  // Mask
  sub_polynomial<Torus, params>(glwe_sub_mask, m1_mask, m0_mask);
  // Body
  sub_polynomial<Torus, params>(glwe_sub_body, m1_body, m0_body);

  synchronize_threads_in_block();

  // Initialize the polynomial multiplication via FFT arrays
  // The polynomial multiplications happen at the block level
  // and each thread handles two or more coefficients
  int pos = threadIdx.x;
  for (int j = 0; j < params::opt / 2; j++) {
    mask_res_fft[pos].x = 0;
    mask_res_fft[pos].y = 0;
    body_res_fft[pos].x = 0;
    body_res_fft[pos].y = 0;
    pos += params::degree / params::opt;
  }

  // Subtract each glwe operand, decompose the resulting
  // polynomial coefficients to multiply each decomposed level
  // with the corresponding part of the LUT
  for (int decomp_level = 0; decomp_level < l_gadget; decomp_level++) {

    // Decomposition
    gadget.decompose_one_level(glwe_mask_decomposed, glwe_sub_mask,
                               decomp_level);
    gadget.decompose_one_level(glwe_body_decomposed, glwe_sub_body,
                               decomp_level);

    // First, perform the polynomial multiplication for the mask
    synchronize_threads_in_block();
    fft<int16_t, params>(glwe_fft, glwe_mask_decomposed);

    // External product and accumulate
    // Get the piece necessary for the multiplication
    auto mask_fourier =
        get_ith_mask_kth_block(ggsw_in, ggsw_idx, 0, decomp_level,
                               polynomial_size, glwe_dim, l_gadget);
    auto body_fourier =
        get_ith_body_kth_block(ggsw_in, ggsw_idx, 0, decomp_level,
                               polynomial_size, glwe_dim, l_gadget);

    synchronize_threads_in_block();

  // Initialize the polynomial multiplication via FFT arrays
  // The polynomial multiplications happen at the block level
  // and each thread handles two or more coefficients
  int pos = threadIdx.x;
  for (int j = 0; j < params::opt / 2; j++) {
    mask_res_fft[pos].x = 0;
    mask_res_fft[pos].y = 0;
    body_res_fft[pos].x = 0;
    body_res_fft[pos].y = 0;
    pos += params::degree / params::opt;
  }

  // Subtract each glwe operand, decompose the resulting
  // polynomial coefficients to multiply each decomposed level
  // with the corresponding part of the LUT
  for (int decomp_level = 0; decomp_level < l_gadget; decomp_level++) {

    // Decomposition
    gadget.decompose_one_level(glwe_mask_decomposed,
                               glwe_sub_mask,
                               decomp_level);
    gadget.decompose_one_level(glwe_body_decomposed,
                               glwe_sub_body,
                               decomp_level);

    // First, perform the polynomial multiplication for the mask
    synchronize_threads_in_block();
    fft<int16_t, params>(glwe_fft, glwe_mask_decomposed);

    // External product and accumulate
    // Get the piece necessary for the multiplication
    auto mask_fourier = get_ith_mask_kth_block(
        ggsw_in, ggsw_idx, 0, decomp_level,
        polynomial_size, glwe_dim, l_gadget);
    auto body_fourier = get_ith_body_kth_block(
        ggsw_in, ggsw_idx, 0, decomp_level,
        polynomial_size, glwe_dim, l_gadget);

    synchronize_threads_in_block();

    // Perform the coefficient-wise product
    synchronize_threads_in_block();
    polynomial_product_accumulate_in_fourier_domain<params, double2>(
        mask_res_fft, glwe_fft, mask_fourier);
    polynomial_product_accumulate_in_fourier_domain<params, double2>(
        body_res_fft, glwe_fft, body_fourier);

    // Now handle the polynomial multiplication for the body
    // in the same way
    synchronize_threads_in_block();
    fft<int16_t, params>(glwe_fft, glwe_body_decomposed);

    // External product and accumulate
    // Get the piece necessary for the multiplication
    mask_fourier = get_ith_mask_kth_block(
        ggsw_in, ggsw_idx, 1, decomp_level,
        polynomial_size, glwe_dim, l_gadget);
    body_fourier = get_ith_body_kth_block(
        ggsw_in, ggsw_idx, 1, decomp_level,
        polynomial_size, glwe_dim, l_gadget);

    synchronize_threads_in_block();

    polynomial_product_accumulate_in_fourier_domain<params, double2>(
        mask_res_fft, glwe_fft, mask_fourier);
    polynomial_product_accumulate_in_fourier_domain<params, double2>(
        body_res_fft, glwe_fft, body_fourier);

  }

  // IFFT
    // Perform the coefficient-wise product
    synchronize_threads_in_block();
  ifft_inplace<params>(mask_res_fft);
  ifft_inplace<params>(body_res_fft);
    polynomial_product_accumulate_in_fourier_domain<params, double2>(
        mask_res_fft, glwe_fft, mask_fourier);
    polynomial_product_accumulate_in_fourier_domain<params, double2>(
        body_res_fft, glwe_fft, body_fourier);

    // Now handle the polynomial multiplication for the body
    // in the same way
    synchronize_threads_in_block();
    fft<int16_t, params>(glwe_fft, glwe_body_decomposed);

    // External product and accumulate
    // Get the piece necessary for the multiplication
    mask_fourier = get_ith_mask_kth_block(ggsw_in, ggsw_idx, 1, decomp_level,
                                          polynomial_size, glwe_dim, l_gadget);
    body_fourier = get_ith_body_kth_block(ggsw_in, ggsw_idx, 1, decomp_level,
                                          polynomial_size, glwe_dim, l_gadget);

    synchronize_threads_in_block();

  // Write the output
  Torus *mb_mask = &glwe_out[output_idx * (glwe_dim + 1) * polynomial_size];
  Torus *mb_body = mb_mask + polynomial_size;
    polynomial_product_accumulate_in_fourier_domain<params, double2>(
        mask_res_fft, glwe_fft, mask_fourier);
    polynomial_product_accumulate_in_fourier_domain<params, double2>(
        body_res_fft, glwe_fft, body_fourier);
  }

  int tid = threadIdx.x;
  for(int i = 0; i < params::opt; i++){
    mb_mask[tid] = m0_mask[tid];
    mb_body[tid] = m0_body[tid];
    tid += params::degree / params::opt;
  }
  // IFFT
  synchronize_threads_in_block();
  ifft_inplace<params>(mask_res_fft);
  ifft_inplace<params>(body_res_fft);
  synchronize_threads_in_block();

  add_to_torus<Torus, params>(mask_res_fft, mb_mask);
  add_to_torus<Torus, params>(body_res_fft, mb_body);
  // Write the output
  Torus *mb_mask = &glwe_out[output_idx * (glwe_dim + 1) * polynomial_size];
  Torus *mb_body = mb_mask + polynomial_size;

  int tid = threadIdx.x;
  for (int i = 0; i < params::opt; i++) {
    mb_mask[tid] = m0_mask[tid];
    mb_body[tid] = m0_body[tid];
    tid += params::degree / params::opt;
  }

  add_to_torus<Torus, params>(mask_res_fft, mb_mask);
  add_to_torus<Torus, params>(body_res_fft, mb_body);
}

/**
 * Computes several CMUXes using an array of GLWE ciphertexts and a single GGSW ciphertext.
 * The GLWE ciphertexts are picked two-by-two in sequence. Each thread block computes a single CMUX.
 * Computes several CMUXes using an array of GLWE ciphertexts and a single GGSW
 * ciphertext. The GLWE ciphertexts are picked two-by-two in sequence. Each
 * thread block computes a single CMUX.
 *
 * - glwe_out: An array where the result should be written to.
 * - glwe_in: An array where the GLWE inputs are stored.
 * - ggsw_in: An array where the GGSW input is stored. In the fourier domain.
 * - device_mem: A pointer for the global memory in case the shared memory is not big enough to
 * store the accumulators.
 * - device_memory_size_per_block: Memory size needed to store all accumulators for a single block.
 * - device_mem: A pointer for the global memory in case the shared memory is
 * not big enough to store the accumulators.
 * - device_memory_size_per_block: Memory size needed to store all accumulators
 * for a single block.
 * - glwe_dim: This is k.
 * - polynomial_size: size of the polynomials. This is N.
 * - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
@@ -242,34 +235,29 @@ __device__ void cmux(
 * - ggsw_idx: The index of the GGSW we will use.
 */
template <typename Torus, typename STorus, class params, sharedMemDegree SMD>
__global__ void device_batch_cmux(
    Torus *glwe_out, Torus* glwe_in, double2 *ggsw_in,
    char *device_mem, size_t device_memory_size_per_block,
    uint32_t glwe_dim, uint32_t polynomial_size, uint32_t base_log, uint32_t l_gadget,
    uint32_t ggsw_idx){
__global__ void
device_batch_cmux(Torus *glwe_out, Torus *glwe_in, double2 *ggsw_in,
                  char *device_mem, size_t device_memory_size_per_block,
                  uint32_t glwe_dim, uint32_t polynomial_size,
                  uint32_t base_log, uint32_t l_gadget, uint32_t ggsw_idx) {

  int cmux_idx = blockIdx.x;
  int output_idx = cmux_idx;
  int input_idx1 = (cmux_idx << 1);
  int input_idx2 = (cmux_idx << 1) + 1;
  int cmux_idx = blockIdx.x;
  int output_idx = cmux_idx;
  int input_idx1 = (cmux_idx << 1);
  int input_idx2 = (cmux_idx << 1) + 1;

  // We use shared memory for intermediate result
  extern __shared__ char sharedmem[];
  char *selected_memory;
  // We use shared memory for intermediate result
  extern __shared__ char sharedmem[];
  char *selected_memory;

  if constexpr (SMD == FULLSM)
    selected_memory = sharedmem;
  else
    selected_memory = &device_mem[blockIdx.x * device_memory_size_per_block];

  cmux<Torus, STorus, params>(
      glwe_out, glwe_in, ggsw_in,
      selected_memory,
      output_idx, input_idx1, input_idx2,
      glwe_dim, polynomial_size,
      base_log, l_gadget,
      ggsw_idx);
  if constexpr (SMD == FULLSM)
    selected_memory = sharedmem;
  else
    selected_memory = &device_mem[blockIdx.x * device_memory_size_per_block];

  cmux<Torus, STorus, params>(glwe_out, glwe_in, ggsw_in, selected_memory,
                              output_idx, input_idx1, input_idx2, glwe_dim,
                              polynomial_size, base_log, l_gadget, ggsw_idx);
}
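
// Illustrative launch sketch (mirrors the call sites in host_cmux_tree below;
// num_cmuxes, memory_needed_per_block, stream and the argument buffers are
// assumed to be set up exactly as in that function):
//
//   dim3 grid(num_cmuxes, 1, 1);
//   dim3 thds(polynomial_size / params::opt, 1, 1);
//   device_batch_cmux<Torus, STorus, params, FULLSM>
//       <<<grid, thds, memory_needed_per_block, *stream>>>(
//           output, input, d_ggsw_fft_in, d_mem, memory_needed_per_block,
//           glwe_dimension, polynomial_size, base_log, l_gadget, layer_idx);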

/*
 * This kernel executes the CMUX tree used by the hybrid packing of the WoPBS.
@@ -279,242 +267,222 @@ __global__ void device_batch_cmux(
 * - v_stream: The CUDA stream that should be used.
 * - glwe_out: A device array for the output GLWE ciphertext.
 * - ggsw_in: A device array for the GGSW ciphertexts used in each layer.
 * - lut_vector: A device array for the GLWE ciphertexts used in the first layer.
 * - lut_vector: A device array for the GLWE ciphertexts used in the first
 * layer.
 * - polynomial_size: size of the polynomials. This is N.
 * - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
 * - l_gadget: number of decomposition levels in the gadget matrix (~4)
 * - r: Number of layers in the tree.
 */
template <typename Torus, typename STorus, class params>
void host_cmux_tree(
    void *v_stream,
    Torus *glwe_out,
    Torus *ggsw_in,
    Torus *lut_vector,
    uint32_t glwe_dimension,
    uint32_t polynomial_size,
    uint32_t base_log,
    uint32_t l_gadget,
    uint32_t r,
    uint32_t max_shared_memory) {
void host_cmux_tree(void *v_stream, Torus *glwe_out, Torus *ggsw_in,
                    Torus *lut_vector, uint32_t glwe_dimension,
                    uint32_t polynomial_size, uint32_t base_log,
                    uint32_t l_gadget, uint32_t r, uint32_t max_shared_memory) {

  auto stream = static_cast<cudaStream_t *>(v_stream);
  int num_lut = (1<<r);
  auto stream = static_cast<cudaStream_t *>(v_stream);
  int num_lut = (1 << r);

  cuda_initialize_twiddles(polynomial_size, 0);
  cuda_initialize_twiddles(polynomial_size, 0);

  int memory_needed_per_block =
      sizeof(Torus) * polynomial_size + // glwe_sub_mask
      sizeof(Torus) * polynomial_size + // glwe_sub_body
      sizeof(int16_t) * polynomial_size + // glwe_mask_decomposed
      sizeof(int16_t) * polynomial_size + // glwe_body_decomposed
      sizeof(double2) * polynomial_size/2 + // mask_res_fft
      sizeof(double2) * polynomial_size/2 + // body_res_fft
      sizeof(double2) * polynomial_size/2; // glwe_fft
  int memory_needed_per_block =
      sizeof(Torus) * polynomial_size +       // glwe_sub_mask
      sizeof(Torus) * polynomial_size +       // glwe_sub_body
      sizeof(int16_t) * polynomial_size +     // glwe_mask_decomposed
      sizeof(int16_t) * polynomial_size +     // glwe_body_decomposed
      sizeof(double2) * polynomial_size / 2 + // mask_res_fft
      sizeof(double2) * polynomial_size / 2 + // body_res_fft
      sizeof(double2) * polynomial_size / 2;  // glwe_fft

  dim3 thds(polynomial_size / params::opt, 1, 1);
  dim3 thds(polynomial_size / params::opt, 1, 1);

  //////////////////////
  // std::cout << "Applying the FFT on m^tree" << std::endl;
  double2 *d_ggsw_fft_in;
  int ggsw_size = r * polynomial_size * (glwe_dimension + 1) * (glwe_dimension + 1) * l_gadget;
  //////////////////////
  double2 *d_ggsw_fft_in;
  int ggsw_size = r * polynomial_size * (glwe_dimension + 1) *
                  (glwe_dimension + 1) * l_gadget;

#if (CUDART_VERSION < 11020)
  checkCudaErrors(cudaMalloc((void **)&d_ggsw_fft_in, ggsw_size * sizeof(double)));
#else
  checkCudaErrors(cudaMallocAsync((void **)&d_ggsw_fft_in, ggsw_size * sizeof(double), *stream));
#endif
#if (CUDART_VERSION < 11020)
  checkCudaErrors(
      cudaMalloc((void **)&d_ggsw_fft_in, ggsw_size * sizeof(double)));
#else
  checkCudaErrors(cudaMallocAsync((void **)&d_ggsw_fft_in,
                                  ggsw_size * sizeof(double), *stream));
#endif

  batch_fft_ggsw_vector<Torus, STorus, params>(
      v_stream, d_ggsw_fft_in, ggsw_in, r, glwe_dimension, polynomial_size, l_gadget);
  batch_fft_ggsw_vector<Torus, STorus, params>(v_stream, d_ggsw_fft_in, ggsw_in,
                                               r, glwe_dimension,
                                               polynomial_size, l_gadget);

  //////////////////////
  //////////////////////

  // Allocate global memory in case parameters are too large
  char *d_mem;
  if (max_shared_memory < memory_needed_per_block) {
#if (CUDART_VERSION < 11020)
    checkCudaErrors(cudaMalloc((void **) &d_mem, memory_needed_per_block * (1 << (r - 1))));
#else
    checkCudaErrors(cudaMallocAsync((void **) &d_mem, memory_needed_per_block * (1 << (r - 1)), *stream));
#endif
  }else{
    checkCudaErrors(cudaFuncSetAttribute(
        device_batch_cmux<Torus, STorus, params, FULLSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize,
        memory_needed_per_block));
    checkCudaErrors(cudaFuncSetCacheConfig(
        device_batch_cmux<Torus, STorus, params, FULLSM>,
        cudaFuncCachePreferShared));
  }
  // Allocate global memory in case parameters are too large
  char *d_mem;
  if (max_shared_memory < memory_needed_per_block) {
#if (CUDART_VERSION < 11020)
    checkCudaErrors(
        cudaMalloc((void **)&d_mem, memory_needed_per_block * (1 << (r - 1))));
#else
    checkCudaErrors(cudaMallocAsync(
        (void **)&d_mem, memory_needed_per_block * (1 << (r - 1)), *stream));
#endif
  } else {
    checkCudaErrors(cudaFuncSetAttribute(
        device_batch_cmux<Torus, STorus, params, FULLSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, memory_needed_per_block));
    checkCudaErrors(
        cudaFuncSetCacheConfig(device_batch_cmux<Torus, STorus, params, FULLSM>,
                               cudaFuncCachePreferShared));
  }

  // Allocate buffers
  int glwe_size = (glwe_dimension + 1) * polynomial_size;
  Torus *d_buffer1, *d_buffer2;
  // Allocate buffers
  int glwe_size = (glwe_dimension + 1) * polynomial_size;
  Torus *d_buffer1, *d_buffer2;

#if (CUDART_VERSION < 11020)
  checkCudaErrors(cudaMalloc((void **)&d_buffer1, num_lut * glwe_size * sizeof(Torus)));
  checkCudaErrors(cudaMalloc((void **)&d_buffer2, num_lut * glwe_size * sizeof(Torus)));
#else
  checkCudaErrors(cudaMallocAsync((void **)&d_buffer1, num_lut * glwe_size * sizeof(Torus), *stream));
  checkCudaErrors(cudaMallocAsync((void **)&d_buffer2, num_lut * glwe_size * sizeof(Torus), *stream));
#endif
  checkCudaErrors(cudaMemcpyAsync(
      d_buffer1, lut_vector,
      num_lut * glwe_size * sizeof(Torus),
      cudaMemcpyDeviceToDevice, *stream));
#if (CUDART_VERSION < 11020)
  checkCudaErrors(
      cudaMalloc((void **)&d_buffer1, num_lut * glwe_size * sizeof(Torus)));
  checkCudaErrors(
      cudaMalloc((void **)&d_buffer2, num_lut * glwe_size * sizeof(Torus)));
#else
  checkCudaErrors(cudaMallocAsync(
      (void **)&d_buffer1, num_lut * glwe_size * sizeof(Torus), *stream));
  checkCudaErrors(cudaMallocAsync(
      (void **)&d_buffer2, num_lut * glwe_size * sizeof(Torus), *stream));
#endif
  checkCudaErrors(cudaMemcpyAsync(d_buffer1, lut_vector,
                                  num_lut * glwe_size * sizeof(Torus),
                                  cudaMemcpyDeviceToDevice, *stream));

  Torus *output;
  // Run the cmux tree
  for(int layer_idx = 0; layer_idx < r; layer_idx++){
    output = (layer_idx % 2? d_buffer1 : d_buffer2);
    Torus *input = (layer_idx % 2? d_buffer2 : d_buffer1);
  Torus *output;
  // Run the cmux tree
  for (int layer_idx = 0; layer_idx < r; layer_idx++) {
    output = (layer_idx % 2 ? d_buffer1 : d_buffer2);
    Torus *input = (layer_idx % 2 ? d_buffer2 : d_buffer1);

    int num_cmuxes = (1<<(r-1-layer_idx));
    dim3 grid(num_cmuxes, 1, 1);
    int num_cmuxes = (1 << (r - 1 - layer_idx));
    dim3 grid(num_cmuxes, 1, 1);

    // walks horizontally through the leaves
    if(max_shared_memory < memory_needed_per_block)
      device_batch_cmux<Torus, STorus, params, NOSM>
          <<<grid, thds, memory_needed_per_block, *stream>>>(
              output, input, d_ggsw_fft_in,
              d_mem, memory_needed_per_block,
              glwe_dimension, // k
              polynomial_size, base_log, l_gadget,
              layer_idx // r
          );
    else
      device_batch_cmux<Torus, STorus, params, FULLSM>
          <<<grid, thds, memory_needed_per_block, *stream>>>(
              output, input, d_ggsw_fft_in,
              d_mem, memory_needed_per_block,
              glwe_dimension, // k
              polynomial_size, base_log, l_gadget,
              layer_idx // r
          );
    // walks horizontally through the leaves
    if (max_shared_memory < memory_needed_per_block)
      device_batch_cmux<Torus, STorus, params, NOSM>
          <<<grid, thds, memory_needed_per_block, *stream>>>(
              output, input, d_ggsw_fft_in, d_mem, memory_needed_per_block,
              glwe_dimension, // k
              polynomial_size, base_log, l_gadget,
              layer_idx // r
          );
    else
      device_batch_cmux<Torus, STorus, params, FULLSM>
          <<<grid, thds, memory_needed_per_block, *stream>>>(
              output, input, d_ggsw_fft_in, d_mem, memory_needed_per_block,
              glwe_dimension, // k
              polynomial_size, base_log, l_gadget,
              layer_idx // r
          );
  }

  }
  checkCudaErrors(cudaMemcpyAsync(
      glwe_out, output, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
      cudaMemcpyDeviceToDevice, *stream));

  checkCudaErrors(cudaMemcpyAsync(
      glwe_out, output,
      (glwe_dimension+1) * polynomial_size * sizeof(Torus),
      cudaMemcpyDeviceToDevice, *stream));

  // We only need synchronization to assert that data is in glwe_out before
  // returning. Memory release can be added to the stream and processed
  // later.
  checkCudaErrors(cudaStreamSynchronize(*stream));

  // Free memory
#if (CUDART_VERSION < 11020)
  checkCudaErrors(cudaFree(d_ggsw_fft_in));
  checkCudaErrors(cudaFree(d_buffer1));
  checkCudaErrors(cudaFree(d_buffer2));
  if(max_shared_memory < memory_needed_per_block)
    checkCudaErrors(cudaFree(d_mem));
#else
  checkCudaErrors(cudaFreeAsync(d_ggsw_fft_in, *stream));
  checkCudaErrors(cudaFreeAsync(d_buffer1, *stream));
  checkCudaErrors(cudaFreeAsync(d_buffer2, *stream));
  if(max_shared_memory < memory_needed_per_block)
    checkCudaErrors(cudaFreeAsync(d_mem, *stream));
#endif
  // We only need synchronization to assert that data is in glwe_out before
  // returning. Memory release can be added to the stream and processed
  // later.
  checkCudaErrors(cudaStreamSynchronize(*stream));

  // Free memory
#if (CUDART_VERSION < 11020)
  checkCudaErrors(cudaFree(d_ggsw_fft_in));
  checkCudaErrors(cudaFree(d_buffer1));
  checkCudaErrors(cudaFree(d_buffer2));
  if (max_shared_memory < memory_needed_per_block)
    checkCudaErrors(cudaFree(d_mem));
#else
  checkCudaErrors(cudaFreeAsync(d_ggsw_fft_in, *stream));
  checkCudaErrors(cudaFreeAsync(d_buffer1, *stream));
  checkCudaErrors(cudaFreeAsync(d_buffer2, *stream));
  if (max_shared_memory < memory_needed_per_block)
    checkCudaErrors(cudaFreeAsync(d_mem, *stream));
#endif
}
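
// Worked example (illustrative): for r = 3 the tree starts from
// num_lut = 1 << 3 = 8 GLWE leaves; layer 0 runs 1 << (3 - 1 - 0) = 4 CMUXes
// into d_buffer2, layer 1 runs 2 into d_buffer1, and layer 2 runs the final
// CMUX into d_buffer2, whose single surviving ciphertext is then copied into
// glwe_out.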

// only works for big lwe for ks+bs case
// state_lwe_buffer is copied from big lwe input
// shifted_lwe_buffer is scalar multiplication of lwe input
// blockIdx.x refers to input ciphertext id
template <typename Torus, class params>
__global__ void copy_and_shift_lwe(Torus *dst_copy, Torus *dst_shift,
                                   Torus *src, Torus value)
{
  int blockId = blockIdx.x;
  int tid = threadIdx.x;
  auto cur_dst_copy = &dst_copy[blockId * (params::degree + 1)];
  auto cur_dst_shift = &dst_shift[blockId * (params::degree + 1)];
  auto cur_src = &src[blockId * (params::degree + 1)];

                                   Torus *src, Torus value) {
  int blockId = blockIdx.x;
  int tid = threadIdx.x;
  auto cur_dst_copy = &dst_copy[blockId * (params::degree + 1)];
  auto cur_dst_shift = &dst_shift[blockId * (params::degree + 1)];
  auto cur_src = &src[blockId * (params::degree + 1)];

#pragma unroll
  for (int i = 0; i < params::opt; i++) {
    cur_dst_copy[tid] = cur_src[tid];
    cur_dst_shift[tid] = cur_src[tid] * value;
    tid += params::degree / params::opt;
  }
  for (int i = 0; i < params::opt; i++) {
    cur_dst_copy[tid] = cur_src[tid];
    cur_dst_shift[tid] = cur_src[tid] * value;
    tid += params::degree / params::opt;
  }

  if (threadIdx.x == params::degree / params::opt - 1) {
    cur_dst_copy[params::degree] = cur_src[params::degree];
    cur_dst_shift[params::degree] = cur_src[params::degree] * value;
  }
  if (threadIdx.x == params::degree / params::opt - 1) {
    cur_dst_copy[params::degree] = cur_src[params::degree];
    cur_dst_shift[params::degree] = cur_src[params::degree] * value;
  }
}
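
// Note (added for clarity): each ciphertext here holds params::degree mask
// coefficients plus one body coefficient, hence the (params::degree + 1)
// stride; the unrolled loop covers exactly the params::degree mask entries,
// so the last thread of the block copies and shifts the body element
// separately.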

// only works for small lwe in ks+bs case
// function copies lwe when length is not a power of two
template <typename Torus>
__global__ void copy_small_lwe(Torus *dst, Torus *src, uint32_t small_lwe_size, uint32_t number_of_bits,
                               uint32_t lwe_id)
{
__global__ void copy_small_lwe(Torus *dst, Torus *src, uint32_t small_lwe_size,
                               uint32_t number_of_bits, uint32_t lwe_id) {

  size_t blockId = blockIdx.x;
  size_t threads_per_block = blockDim.x;
  size_t opt = small_lwe_size / threads_per_block;
  size_t rem = small_lwe_size & (threads_per_block - 1);

  auto cur_lwe_list = &dst[blockId * small_lwe_size * number_of_bits];
  auto cur_dst = &cur_lwe_list[lwe_id * small_lwe_size];
  auto cur_src = &src[blockId * small_lwe_size];

  size_t blockId = blockIdx.x;
  size_t threads_per_block = blockDim.x;
  size_t opt = small_lwe_size / threads_per_block;
  size_t rem = small_lwe_size & (threads_per_block - 1);

  auto cur_lwe_list = &dst[blockId * small_lwe_size * number_of_bits];
  auto cur_dst = &cur_lwe_list[lwe_id * small_lwe_size];
  auto cur_src = &src[blockId * small_lwe_size];

  size_t tid = threadIdx.x;
  for (int i = 0; i < opt; i++) {
    cur_dst[tid] = cur_src[tid];
    tid += threads_per_block;
  }

  if (threadIdx.x < rem)
    cur_dst[tid] = cur_src[tid];

  size_t tid = threadIdx.x;
  for (int i = 0; i < opt; i++) {
    cur_dst[tid] = cur_src[tid];
    tid += threads_per_block;
  }

  if (threadIdx.x < rem)
    cur_dst[tid] = cur_src[tid];
}
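
// Worked example (illustrative): with the 256-thread launch used by
// host_extract_bits and small_lwe_size = 513 (a 512-coefficient mask plus
// the body), opt = 513 / 256 = 2 and rem = 513 & 255 = 1, so every thread
// copies two elements and thread 0 picks up the last one. Note that
// computing rem with a bitmask assumes the block size is a power of two.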
|
||||
|
||||
|
||||
// only used in extract bits for one ciphertext
|
||||
// should be called with one block and one thread
|
||||
// NOTE: check if putting this functionality in copy_small_lwe or
|
||||
// fill_pbs_lut vector is faster
|
||||
template <typename Torus>
|
||||
__global__ void add_to_body(Torus *lwe, size_t lwe_dimension,
|
||||
Torus value) {
|
||||
lwe[blockIdx.x * (lwe_dimension + 1) + lwe_dimension] += value;
|
||||
|
||||
__global__ void add_to_body(Torus *lwe, size_t lwe_dimension, Torus value) {
|
||||
lwe[blockIdx.x * (lwe_dimension + 1) + lwe_dimension] += value;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// Fill lut(only body) for the current bit (equivalent to trivial encryption as
|
||||
// mask is 0s)
|
||||
// The LUT is filled with -alpha in each coefficient where alpha = delta*2^{bit_idx-1}
|
||||
// The LUT is filled with -alpha in each coefficient where alpha =
|
||||
// delta*2^{bit_idx-1}
|
||||
template <typename Torus, class params>
|
||||
__global__ void fill_lut_body_for_current_bit(Torus *lut, Torus value)
|
||||
{
|
||||
Torus *cur_poly = &lut[params::degree];
|
||||
size_t tid = threadIdx.x;
|
||||
__global__ void fill_lut_body_for_current_bit(Torus *lut, Torus value) {
|
||||
Torus *cur_poly = &lut[params::degree];
|
||||
size_t tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
cur_poly[tid] = value;
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
cur_poly[tid] = value;
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
// Add alpha where alpha = delta*2^{bit_idx-1} to end up with an encryption of 0 if the
|
||||
// extracted bit was 0 and 1 in the other case
|
||||
// Add alpha where alpha = delta*2^{bit_idx-1} to end up with an encryption of 0
|
||||
// if the extracted bit was 0 and 1 in the other case
|
||||
//
|
||||
// Remove the extracted bit from the state LWE to get a 0 at the extracted bit
|
||||
// location.
@@ -525,8 +493,7 @@ __global__ void fill_lut_body_for_current_bit(Torus *lut, Torus value)
template <typename Torus, class params>
__global__ void add_sub_and_mul_lwe(Torus *shifted_lwe, Torus *state_lwe,
                                    Torus *pbs_lwe_out, Torus add_value,
                                    Torus mul_value)
{
                                    Torus mul_value) {
  size_t tid = threadIdx.x;
  size_t blockId = blockIdx.x;
  auto cur_shifted_lwe = &shifted_lwe[blockId * (params::degree + 1)];
@@ -546,81 +513,57 @@ __global__ void add_sub_and_mul_lwe(Torus *shifted_lwe, Torus *state_lwe,
  }
}

template <typename Torus, class params>
__host__ void host_extract_bits(
    void *v_stream,
    Torus *list_lwe_out,
    Torus *lwe_in,
    Torus *lwe_in_buffer,
    Torus *lwe_in_shifted_buffer,
    Torus *lwe_out_ks_buffer,
    Torus *lwe_out_pbs_buffer,
    Torus *lut_pbs,
    uint32_t *lut_vector_indexes,
    Torus *ksk,
    double2 *fourier_bsk,
    uint32_t number_of_bits,
    uint32_t delta_log,
    uint32_t lwe_dimension_before,
    uint32_t lwe_dimension_after,
    uint32_t base_log_bsk,
    uint32_t l_gadget_bsk,
    uint32_t base_log_ksk,
    uint32_t l_gadget_ksk,
    uint32_t number_of_samples)
{
  auto stream = static_cast<cudaStream_t *>(v_stream);
  uint32_t ciphertext_n_bits = sizeof(Torus) * 8;
    void *v_stream, Torus *list_lwe_out, Torus *lwe_in, Torus *lwe_in_buffer,
    Torus *lwe_in_shifted_buffer, Torus *lwe_out_ks_buffer,
    Torus *lwe_out_pbs_buffer, Torus *lut_pbs, uint32_t *lut_vector_indexes,
    Torus *ksk, double2 *fourier_bsk, uint32_t number_of_bits,
    uint32_t delta_log, uint32_t lwe_dimension_before,
    uint32_t lwe_dimension_after, uint32_t base_log_bsk, uint32_t l_gadget_bsk,
    uint32_t base_log_ksk, uint32_t l_gadget_ksk, uint32_t number_of_samples) {

  int blocks = 1;
  int threads = params::degree / params::opt;
  auto stream = static_cast<cudaStream_t *>(v_stream);
  uint32_t ciphertext_n_bits = sizeof(Torus) * 8;

  copy_and_shift_lwe<Torus, params><<<blocks, threads, 0, *stream>>>
      (lwe_in_buffer, lwe_in_shifted_buffer, lwe_in,
       1ll << (ciphertext_n_bits - delta_log - 1));
  int blocks = 1;
  int threads = params::degree / params::opt;

  for (int bit_idx = 0; bit_idx < number_of_bits; bit_idx++) {
    cuda_keyswitch_lwe_ciphertext_vector(v_stream, lwe_out_ks_buffer,
                                         lwe_in_shifted_buffer, ksk,
                                         lwe_dimension_before,
                                         lwe_dimension_after, base_log_ksk,
                                         l_gadget_ksk, 1);
  copy_and_shift_lwe<Torus, params><<<blocks, threads, 0, *stream>>>(
      lwe_in_buffer, lwe_in_shifted_buffer, lwe_in,
      1ll << (ciphertext_n_bits - delta_log - 1));

    copy_small_lwe<<<1, 256, 0, *stream>>>(list_lwe_out,
                                           lwe_out_ks_buffer,
                                           lwe_dimension_after + 1,
                                           number_of_bits,
                                           number_of_bits - bit_idx - 1);
  for (int bit_idx = 0; bit_idx < number_of_bits; bit_idx++) {
    cuda_keyswitch_lwe_ciphertext_vector(
        v_stream, lwe_out_ks_buffer, lwe_in_shifted_buffer, ksk,
        lwe_dimension_before, lwe_dimension_after, base_log_ksk, l_gadget_ksk,
        1);

    if (bit_idx == number_of_bits - 1) {
      break;
    }
    copy_small_lwe<<<1, 256, 0, *stream>>>(
        list_lwe_out, lwe_out_ks_buffer, lwe_dimension_after + 1,
        number_of_bits, number_of_bits - bit_idx - 1);

    add_to_body<Torus><<<1, 1, 0, *stream>>>(lwe_out_ks_buffer,
                                             lwe_dimension_after,
                                             1ll << (ciphertext_n_bits - 2));

    fill_lut_body_for_current_bit<Torus, params>
        <<<blocks, threads, 0, *stream>>>(lut_pbs,
                                          0ll - 1ll << (delta_log - 1 + bit_idx));

    host_bootstrap_low_latency<Torus, params>(v_stream, lwe_out_pbs_buffer,
                                              lut_pbs, lut_vector_indexes,
                                              lwe_out_ks_buffer, fourier_bsk,
                                              lwe_dimension_after, lwe_dimension_before,
                                              base_log_bsk, l_gadget_bsk,
                                              number_of_samples, 1);

    add_sub_and_mul_lwe<Torus, params><<<1, threads, 0, *stream>>>(
        lwe_in_shifted_buffer, lwe_in_buffer, lwe_out_pbs_buffer,
        1ll << (delta_log - 1 + bit_idx),
        1ll << (ciphertext_n_bits - delta_log - bit_idx - 2) );
    if (bit_idx == number_of_bits - 1) {
      break;
    }

    add_to_body<Torus><<<1, 1, 0, *stream>>>(
        lwe_out_ks_buffer, lwe_dimension_after, 1ll << (ciphertext_n_bits - 2));

    fill_lut_body_for_current_bit<Torus, params>
        <<<blocks, threads, 0, *stream>>>(
            lut_pbs, 0ll - 1ll << (delta_log - 1 + bit_idx));

    host_bootstrap_low_latency<Torus, params>(
        v_stream, lwe_out_pbs_buffer, lut_pbs, lut_vector_indexes,
        lwe_out_ks_buffer, fourier_bsk, lwe_dimension_after,
        lwe_dimension_before, base_log_bsk, l_gadget_bsk, number_of_samples, 1);

    add_sub_and_mul_lwe<Torus, params><<<1, threads, 0, *stream>>>(
        lwe_in_shifted_buffer, lwe_in_buffer, lwe_out_pbs_buffer,
        1ll << (delta_log - 1 + bit_idx),
        1ll << (ciphertext_n_bits - delta_log - bit_idx - 2));
  }
}
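Read as exact plaintext arithmetic (ignoring noise), each iteration peels one bit off the message, LSB first, while copy_small_lwe stores it at index number_of_bits - bit_idx - 1 so the output list ends up MSB first. A minimal CPU mirror of the same recurrence; the 64-bit word, delta_log, and the message are illustrative values, not taken from these sources:

#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t n_bits = 64;
  const uint32_t delta_log = 59;     // assumed scaling: 5 message bits above it
  const uint32_t number_of_bits = 5;
  uint64_t msg = 0b10110;            // message to decompose
  uint64_t state = msg << delta_log; // plaintext analogue of lwe_in
  uint64_t shifted = state << (n_bits - delta_log - 1); // LSB moved to the MSB
  for (uint32_t bit_idx = 0; bit_idx < number_of_bits; bit_idx++) {
    uint64_t bit = shifted >> (n_bits - 1); // keyswitch output: bit at the MSB
    printf("bit %u = %llu\n", bit_idx, (unsigned long long)bit);
    if (bit_idx == number_of_bits - 1)
      break; // like the GPU loop, stop before the last (unneeded) PBS round
    state -= bit << (delta_log + bit_idx); // remove the extracted bit
    shifted = state << (n_bits - delta_log - bit_idx - 2); // next bit to MSB
  }
  return 0;
}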

#endif //WO_PBS_H
#endif // WO_PBS_H

@@ -10,26 +10,29 @@
__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
                                         int glwe_dimension,
                                         uint32_t l_gadget) {
  return i * polynomial_size / 2 * (glwe_dimension + 1) * (glwe_dimension + 1) * l_gadget;
  return i * polynomial_size / 2 * (glwe_dimension + 1) * (glwe_dimension + 1) *
         l_gadget;
}

template <typename T>
__device__ T*
get_ith_mask_kth_block(T* ptr, int i, int k, int level, uint32_t polynomial_size,
                       int glwe_dimension, uint32_t l_gadget) {
__device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
                                     uint32_t polynomial_size,
                                     int glwe_dimension, uint32_t l_gadget) {
  return &ptr[get_start_ith_ggsw(i, polynomial_size, glwe_dimension, l_gadget) +
              level * polynomial_size / 2 * (glwe_dimension + 1) * (glwe_dimension + 1) +
              k * polynomial_size / 2 * (glwe_dimension + 1)];
              level * polynomial_size / 2 * (glwe_dimension + 1) *
                  (glwe_dimension + 1) +
              k * polynomial_size / 2 * (glwe_dimension + 1)];
}

template <typename T>
__device__ T*
get_ith_body_kth_block(T *ptr, int i, int k, int level, uint32_t polynomial_size,
                       int glwe_dimension, uint32_t l_gadget) {
  return &ptr[get_start_ith_ggsw(i, polynomial_size, glwe_dimension, l_gadget) +
              level * polynomial_size / 2 * (glwe_dimension + 1) * (glwe_dimension + 1) +
              k * polynomial_size / 2 * (glwe_dimension + 1) +
              polynomial_size / 2];
__device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level,
                                     uint32_t polynomial_size,
                                     int glwe_dimension, uint32_t l_gadget) {
  return &ptr[get_start_ith_ggsw(i, polynomial_size, glwe_dimension, l_gadget) +
              level * polynomial_size / 2 * (glwe_dimension + 1) *
                  (glwe_dimension + 1) +
              k * polynomial_size / 2 * (glwe_dimension + 1) +
              polynomial_size / 2];
}
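These getters walk a flattened GGSW layout: each ciphertext holds (glwe_dimension + 1)^2 * l_gadget Fourier polynomials of polynomial_size / 2 complex coefficients, and the body polynomial sits polynomial_size / 2 entries after its mask. A worked example of the offsets, with illustrative parameters:

#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t polynomial_size = 1024; // N; N/2 complex coeffs per polynomial
  const int glwe_dimension = 1;          // k
  const uint32_t l_gadget = 2;           // decomposition levels
  // One GGSW ciphertext occupies (k+1)^2 * l_gadget polynomials:
  int ggsw_stride = polynomial_size / 2 * (glwe_dimension + 1) *
                    (glwe_dimension + 1) * l_gadget; // 4096 complex values here
  // Start of GGSW i = 1, then level 1, block k = 0:
  int i = 1, level = 1, k = 0;
  int mask_offset = i * ggsw_stride +
                    level * polynomial_size / 2 * (glwe_dimension + 1) *
                        (glwe_dimension + 1) +
                    k * polynomial_size / 2 * (glwe_dimension + 1);
  int body_offset = mask_offset + polynomial_size / 2;
  printf("mask at %d, body at %d\n", mask_offset, body_offset); // 6144, 6656
  return 0;
}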

void cuda_initialize_twiddles(uint32_t polynomial_size, uint32_t gpu_index) {
@@ -65,21 +68,21 @@ void cuda_initialize_twiddles(uint32_t polynomial_size, uint32_t gpu_index) {

template <typename T, typename ST>
void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
                                    uint32_t gpu_index, uint32_t input_lwe_dim, uint32_t glwe_dim,
                                    uint32_t l_gadget, uint32_t polynomial_size) {
                                    uint32_t gpu_index, uint32_t input_lwe_dim,
                                    uint32_t glwe_dim, uint32_t l_gadget,
                                    uint32_t polynomial_size) {

  cudaSetDevice(gpu_index);
  int shared_memory_size = sizeof(double) * polynomial_size;

  int total_polynomials =
      input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) *
      l_gadget;
      input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * l_gadget;

  // Here the buffer size is the size of double2 times the number of polynomials
  // times the polynomial size over 2 because the polynomials are compressed
  // into the complex domain to perform the FFT
  size_t buffer_size = total_polynomials * polynomial_size / 2 * sizeof
      (double2);
  size_t buffer_size =
      total_polynomials * polynomial_size / 2 * sizeof(double2);

  int gridSize = total_polynomials;
  int blockSize = polynomial_size / choose_opt(polynomial_size);
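For a feel of the sizes involved, a quick back-of-the-envelope with illustrative parameters (not taken from these sources); the example uses 2 * sizeof(double) in place of the CUDA double2 type so it compiles as plain C++:

#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t input_lwe_dim = 750, glwe_dim = 1, l_gadget = 2,
                 polynomial_size = 2048;
  uint64_t total_polynomials =
      (uint64_t)input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * l_gadget;
  // Each polynomial of N doubles is stored as N/2 complex (double2) values.
  uint64_t buffer_size =
      total_polynomials * polynomial_size / 2 * 2 * sizeof(double);
  printf("%llu polynomials, %.1f MiB on device\n",
         (unsigned long long)total_polynomials,
         buffer_size / (1024.0 * 1024.0)); // 6000 polynomials, 93.8 MiB
  return 0;
}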
@@ -110,23 +113,23 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
  switch (polynomial_size) {
  case 512:
    batch_NSMFFT<FFTDegree<Degree<512>, ForwardFFT>>
        <<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
    break;
  case 1024:
    batch_NSMFFT<FFTDegree<Degree<1024>, ForwardFFT>>
        <<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
    break;
  case 2048:
    batch_NSMFFT<FFTDegree<Degree<2048>, ForwardFFT>>
        <<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
    break;
  case 4096:
    batch_NSMFFT<FFTDegree<Degree<4096>, ForwardFFT>>
        <<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
    break;
  case 8192:
    batch_NSMFFT<FFTDegree<Degree<8192>, ForwardFFT>>
        <<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
    break;
  default:
    break;
@@ -134,44 +137,58 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,

  cudaFree(d_bsk);
  free(h_bsk);
}
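The switch turns a runtime polynomial_size into a compile-time template argument so the kernel can size its loops and shared memory statically. A minimal standalone sketch of the same dispatch pattern, with simplified stand-in types rather than the library's:

#include <cstdint>
#include <cstdio>

template <uint32_t N> struct Degree {
  static constexpr uint32_t degree = N;
};

template <class params> void run() {
  // In the real code this would launch a kernel templated on params.
  printf("compiled for degree %u\n", params::degree);
}

void dispatch(uint32_t polynomial_size) {
  switch (polynomial_size) {
  case 512:  run<Degree<512>>();  break;
  case 1024: run<Degree<1024>>(); break;
  case 2048: run<Degree<2048>>(); break;
  default:   break; // unsupported sizes fall through silently, as above
  }
}

int main() { dispatch(2048); }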

void cuda_convert_lwe_bootstrap_key_32(void *dest, void *src, void *v_stream,
                                       uint32_t gpu_index, uint32_t input_lwe_dim, uint32_t glwe_dim,
                                       uint32_t l_gadget, uint32_t polynomial_size) {
  cuda_convert_lwe_bootstrap_key<uint32_t, int32_t>((double2 *)dest, (int32_t *)src,
                                                    v_stream, gpu_index, input_lwe_dim,
                                                    glwe_dim, l_gadget, polynomial_size);
                                       uint32_t gpu_index,
                                       uint32_t input_lwe_dim,
                                       uint32_t glwe_dim, uint32_t l_gadget,
                                       uint32_t polynomial_size) {
  cuda_convert_lwe_bootstrap_key<uint32_t, int32_t>(
      (double2 *)dest, (int32_t *)src, v_stream, gpu_index, input_lwe_dim,
      glwe_dim, l_gadget, polynomial_size);
}

void cuda_convert_lwe_bootstrap_key_64(void *dest, void *src, void *v_stream,
                                       uint32_t gpu_index, uint32_t input_lwe_dim, uint32_t glwe_dim,
                                       uint32_t l_gadget, uint32_t polynomial_size) {
  cuda_convert_lwe_bootstrap_key<uint64_t, int64_t>((double2 *)dest, (int64_t *)src,
                                                    v_stream, gpu_index, input_lwe_dim,
                                                    glwe_dim, l_gadget, polynomial_size);
                                       uint32_t gpu_index,
                                       uint32_t input_lwe_dim,
                                       uint32_t glwe_dim, uint32_t l_gadget,
                                       uint32_t polynomial_size) {
  cuda_convert_lwe_bootstrap_key<uint64_t, int64_t>(
      (double2 *)dest, (int64_t *)src, v_stream, gpu_index, input_lwe_dim,
      glwe_dim, l_gadget, polynomial_size);
}
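A hedged usage sketch of the 64-bit C wrapper above: the parameter values and buffer handling are illustrative only, and the prototype is repeated locally for self-containment (normally it comes from the device header):

#include <cuda_runtime.h>
#include <cstdint>
#include <cstdlib>

extern "C" void cuda_convert_lwe_bootstrap_key_64(
    void *dest, void *src, void *v_stream, uint32_t gpu_index,
    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t l_gadget,
    uint32_t polynomial_size);

int main() {
  const uint32_t input_lwe_dim = 750, glwe_dim = 1, l_gadget = 2, N = 2048;
  const size_t n_polys =
      (size_t)input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * l_gadget;

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // Standard-domain key on the host, Fourier-domain key on the device.
  int64_t *h_bsk = (int64_t *)calloc(n_polys * N, sizeof(int64_t));
  double2 *d_fourier_bsk;
  cudaMalloc(&d_fourier_bsk, n_polys * N / 2 * sizeof(double2));

  cuda_convert_lwe_bootstrap_key_64(d_fourier_bsk, h_bsk, &stream,
                                    /*gpu_index=*/0, input_lwe_dim, glwe_dim,
                                    l_gadget, N);

  cudaStreamSynchronize(stream);
  cudaFree(d_fourier_bsk);
  free(h_bsk);
  cudaStreamDestroy(stream);
  return 0;
}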

// We need these lines so the compiler knows how to specialize these functions
template __device__ uint64_t*
get_ith_mask_kth_block(uint64_t* ptr, int i, int k, int level, uint32_t polynomial_size,
                       int glwe_dimension, uint32_t l_gadget);
template __device__ uint32_t*
get_ith_mask_kth_block(uint32_t* ptr, int i, int k, int level, uint32_t polynomial_size,
                       int glwe_dimension, uint32_t l_gadget);
template __device__ double2*
get_ith_mask_kth_block(double2* ptr, int i, int k, int level, uint32_t polynomial_size,
                       int glwe_dimension, uint32_t l_gadget);
template __device__ uint64_t*
get_ith_body_kth_block(uint64_t *ptr, int i, int k, int level, uint32_t polynomial_size,
                       int glwe_dimension, uint32_t l_gadget);
template __device__ uint32_t*
get_ith_body_kth_block(uint32_t *ptr, int i, int k, int level, uint32_t polynomial_size,
                       int glwe_dimension, uint32_t l_gadget);
template __device__ double2*
get_ith_body_kth_block(double2 *ptr, int i, int k, int level, uint32_t polynomial_size,
                       int glwe_dimension, uint32_t l_gadget);
template __device__ uint64_t *get_ith_mask_kth_block(uint64_t *ptr, int i,
                                                     int k, int level,
                                                     uint32_t polynomial_size,
                                                     int glwe_dimension,
                                                     uint32_t l_gadget);
template __device__ uint32_t *get_ith_mask_kth_block(uint32_t *ptr, int i,
                                                     int k, int level,
                                                     uint32_t polynomial_size,
                                                     int glwe_dimension,
                                                     uint32_t l_gadget);
template __device__ double2 *get_ith_mask_kth_block(double2 *ptr, int i, int k,
                                                    int level,
                                                    uint32_t polynomial_size,
                                                    int glwe_dimension,
                                                    uint32_t l_gadget);
template __device__ uint64_t *get_ith_body_kth_block(uint64_t *ptr, int i,
                                                     int k, int level,
                                                     uint32_t polynomial_size,
                                                     int glwe_dimension,
                                                     uint32_t l_gadget);
template __device__ uint32_t *get_ith_body_kth_block(uint32_t *ptr, int i,
                                                     int k, int level,
                                                     uint32_t polynomial_size,
                                                     int glwe_dimension,
                                                     uint32_t l_gadget);
template __device__ double2 *get_ith_body_kth_block(double2 *ptr, int i, int k,
                                                    int level,
                                                    uint32_t polynomial_size,
                                                    int glwe_dimension,
                                                    uint32_t l_gadget);

#endif // CNCRT_BSK_H
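Explicit instantiation, as used above, forces the compiler to emit code for each listed type in this translation unit so other units can link against it without seeing the template body. A minimal standalone illustration of the mechanism (toy function, not from the sources):

#include <cstdio>

template <typename T> T *advance_by(T *ptr, int n) { return ptr + n; }

// Explicit instantiations: object code for these two types is emitted here,
// so the definition does not have to be visible at every call site.
template int *advance_by<int>(int *ptr, int n);
template double *advance_by<double>(double *ptr, int n);

int main() {
  int buf[4] = {1, 2, 3, 4};
  printf("%d\n", *advance_by(buf, 2)); // prints 3
  return 0;
}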

@@ -2,51 +2,49 @@
#define CONCRETE_CORE_GGSW_CUH

template <typename T, typename ST, class params>
__global__ void batch_fft_ggsw_vectors(double2 *dest, T *src){
__global__ void batch_fft_ggsw_vectors(double2 *dest, T *src) {

  extern __shared__ char sharedmem[];

  double2 *shared_output = (double2*) sharedmem;
  double2 *shared_output = (double2 *)sharedmem;

  // Compression
  int offset = blockIdx.x * blockDim.x;
  int tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt >> 1; i++) {
    ST x = src[(2 * tid) + params::opt * offset];
    ST y = src[(2 * tid + 1) + params::opt * offset];
    shared_output[tid].x = x / (double)std::numeric_limits<T>::max();
    shared_output[tid].y = y / (double)std::numeric_limits<T>::max();
    tid += params::degree / params::opt;
  }
  synchronize_threads_in_block();
  // Compression
  int offset = blockIdx.x * blockDim.x;
  int tid = threadIdx.x;
  int log_2_opt = params::opt >> 1;
#pragma unroll
  for (int i = 0; i < log_2_opt; i++) {
    ST x = src[(2 * tid) + params::opt * offset];
    ST y = src[(2 * tid + 1) + params::opt * offset];
    shared_output[tid].x = x / (double)std::numeric_limits<T>::max();
    shared_output[tid].y = y / (double)std::numeric_limits<T>::max();
    tid += params::degree / params::opt;
  }
  synchronize_threads_in_block();

  // Switch to the FFT space
  NSMFFT_direct<HalfDegree<params>>(shared_output);
  synchronize_threads_in_block();

  correction_direct_fft_inplace<params>(shared_output);
  synchronize_threads_in_block();

  // Write the output to global memory
  tid = threadIdx.x;
  for (int j = 0; j < params::opt >> 1; j++) {
    dest[tid + (params::opt >> 1) * offset] = shared_output[tid];
    tid += params::degree / params::opt;
  }
  // Write the output to global memory
  tid = threadIdx.x;
  for (int j = 0; j < log_2_opt; j++) {
    dest[tid + (params::opt >> 1) * offset] = shared_output[tid];
    tid += params::degree / params::opt;
  }
}
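The "compression" step packs a real polynomial of params::degree signed coefficients into degree / 2 complex values (even coefficients into .x, odd into .y), normalized to roughly [-1, 1] by the maximum value of the torus type. A host-side sketch of the same packing, with std::complex standing in for double2:

#include <complex>
#include <cstdint>
#include <limits>
#include <vector>

// Pack a degree-N polynomial with signed coefficients into N/2 complex
// values, normalized like the kernel's compression loop.
std::vector<std::complex<double>> compress(const std::vector<int64_t> &poly) {
  const double norm = (double)std::numeric_limits<uint64_t>::max();
  std::vector<std::complex<double>> out(poly.size() / 2);
  for (size_t j = 0; j < out.size(); j++)
    out[j] = {poly[2 * j] / norm, poly[2 * j + 1] / norm};
  return out;
}

int main() {
  std::vector<int64_t> poly(1024, 1); // degree-1024 polynomial, all ones
  auto packed = compress(poly);       // 512 complex coefficients
  return (int)packed.size() - 512;    // returns 0
}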

/**
 * Applies the FFT transform on sequence of GGSW ciphertexts already in the global memory
 * Applies the FFT transform on sequence of GGSW ciphertexts already in the
 * global memory
 */
template <typename T, typename ST, class params>
void batch_fft_ggsw_vector(
    void *v_stream,
    double2 *dest, T *src,
    uint32_t r,
    uint32_t glwe_dim,
    uint32_t polynomial_size,
    uint32_t l_gadget) {
void batch_fft_ggsw_vector(void *v_stream, double2 *dest, T *src, uint32_t r,
                           uint32_t glwe_dim, uint32_t polynomial_size,
                           uint32_t l_gadget) {

  auto stream = static_cast<cudaStream_t *>(v_stream);

@@ -56,11 +54,9 @@ void batch_fft_ggsw_vector(
  int gridSize = total_polynomials;
  int blockSize = polynomial_size / params::opt;

  batch_fft_ggsw_vectors<T, ST, params><<<gridSize, blockSize, shared_memory_size, *stream>>>(dest,
                                                                                              src);
  batch_fft_ggsw_vectors<T, ST, params>
      <<<gridSize, blockSize, shared_memory_size, *stream>>>(dest, src);
  checkCudaErrors(cudaGetLastError());
}

#endif //CONCRETE_CORE_GGSW_CUH
#endif // CONCRETE_CORE_GGSW_CUH

@@ -62,7 +62,7 @@ int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
    // error code: zero copy size
    return -3;
  }

  if (gpu_index >= cuda_get_number_of_gpus()) {
    // error code: invalid gpu_index
    return -2;
@@ -75,8 +75,8 @@ int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
  }
  auto stream = static_cast<cudaStream_t *>(v_stream);
  cudaSetDevice(gpu_index);
  checkCudaErrors(cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice,
                                  *stream));
  checkCudaErrors(
      cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice, *stream));
  return 0;
}

@@ -117,8 +117,8 @@ int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
  }
  auto stream = static_cast<cudaStream_t *>(v_stream);
  cudaSetDevice(gpu_index);
  checkCudaErrors(cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost,
                                  *stream));
  checkCudaErrors(
      cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, *stream));
  return 0;
}
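These wrappers report failure through negative return codes (-3 for a zero copy size, -2 for an invalid gpu_index, as shown above) rather than by throwing, so callers are expected to check them. A hedged usage sketch; the trailing parameters of the prototype are inferred from the bodies above, so treat the exact signature as illustrative:

#include <cstdint>
#include <cstdio>
#include <cstdlib>

int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
                             void *v_stream, uint32_t gpu_index);

void upload_or_die(void *d_buf, void *h_buf, uint64_t bytes, void *v_stream) {
  int rc = cuda_memcpy_async_to_gpu(d_buf, h_buf, bytes, v_stream,
                                    /*gpu_index=*/0);
  if (rc != 0) {
    // -3: zero copy size, -2: invalid gpu_index, per the checks above.
    fprintf(stderr, "cuda_memcpy_async_to_gpu failed with code %d\n", rc);
    abort();
  }
}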

@@ -14,18 +14,15 @@
 * This function calls a wrapper to a device kernel that performs the keyswitch
 * - num_samples blocks of threads are launched
 */
void cuda_keyswitch_lwe_ciphertext_vector_32(void *v_stream, void *lwe_out, void *lwe_in,
                                             void *ksk,
                                             uint32_t lwe_dimension_before,
                                             uint32_t lwe_dimension_after,
                                             uint32_t base_log, uint32_t l_gadget,
                                             uint32_t num_samples) {
  cuda_keyswitch_lwe_ciphertext_vector(
      v_stream, static_cast<uint32_t *>(lwe_out), static_cast<uint32_t *>(lwe_in),
      static_cast<uint32_t*>(ksk),
      lwe_dimension_before, lwe_dimension_after,
      base_log, l_gadget,
      num_samples);
void cuda_keyswitch_lwe_ciphertext_vector_32(
    void *v_stream, void *lwe_out, void *lwe_in, void *ksk,
    uint32_t lwe_dimension_before, uint32_t lwe_dimension_after,
    uint32_t base_log, uint32_t l_gadget, uint32_t num_samples) {
  cuda_keyswitch_lwe_ciphertext_vector(
      v_stream, static_cast<uint32_t *>(lwe_out),
      static_cast<uint32_t *>(lwe_in), static_cast<uint32_t *>(ksk),
      lwe_dimension_before, lwe_dimension_after, base_log, l_gadget,
      num_samples);
}

/* Perform keyswitch on a batch of input LWE ciphertexts for 64 bits
@@ -38,18 +35,13 @@ void cuda_keyswitch_lwe_ciphertext_vector_32(void *v_stream, void *lwe_out, void
 * This function calls a wrapper to a device kernel that performs the keyswitch
 * - num_samples blocks of threads are launched
 */
void cuda_keyswitch_lwe_ciphertext_vector_64(void *v_stream, void *lwe_out, void *lwe_in,
                                             void *ksk,
                                             uint32_t lwe_dimension_before,
                                             uint32_t lwe_dimension_after,
                                             uint32_t base_log, uint32_t l_gadget,
                                             uint32_t num_samples) {
  cuda_keyswitch_lwe_ciphertext_vector(
      v_stream, static_cast<uint64_t *>(lwe_out), static_cast<uint64_t *> (lwe_in),
      static_cast<uint64_t*>(ksk),
      lwe_dimension_before, lwe_dimension_after,
      base_log, l_gadget,
      num_samples);
void cuda_keyswitch_lwe_ciphertext_vector_64(
    void *v_stream, void *lwe_out, void *lwe_in, void *ksk,
    uint32_t lwe_dimension_before, uint32_t lwe_dimension_after,
    uint32_t base_log, uint32_t l_gadget, uint32_t num_samples) {
  cuda_keyswitch_lwe_ciphertext_vector(
      v_stream, static_cast<uint64_t *>(lwe_out),
      static_cast<uint64_t *>(lwe_in), static_cast<uint64_t *>(ksk),
      lwe_dimension_before, lwe_dimension_after, base_log, l_gadget,
      num_samples);
}
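A hedged usage sketch of the 64-bit wrapper above; the dimensions and gadget parameters are illustrative, the prototype is repeated locally, and the device buffers are assumed to be already allocated and filled:

#include <cstdint>

void cuda_keyswitch_lwe_ciphertext_vector_64(
    void *v_stream, void *lwe_out, void *lwe_in, void *ksk,
    uint32_t lwe_dimension_before, uint32_t lwe_dimension_after,
    uint32_t base_log, uint32_t l_gadget, uint32_t num_samples);

// Keyswitch a batch of 16 ciphertexts from dimension 2048 down to 750;
// one CUDA block is launched per sample, as the comment above notes.
void keyswitch_batch(void *v_stream, uint64_t *d_lwe_out, uint64_t *d_lwe_in,
                     uint64_t *d_ksk) {
  const uint32_t before = 2048, after = 750;  // illustrative dimensions
  const uint32_t base_log = 2, l_gadget = 5, num_samples = 16;
  // d_lwe_in holds num_samples * (before + 1) torus values;
  // d_lwe_out receives num_samples * (after + 1) torus values.
  cuda_keyswitch_lwe_ciphertext_vector_64(v_stream, d_lwe_out, d_lwe_in, d_ksk,
                                          before, after, base_log, l_gadget,
                                          num_samples);
}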

@@ -9,24 +9,23 @@

template <typename Torus>
__device__ Torus *get_ith_block(Torus *ksk, int i, int level,
                                uint32_t lwe_dimension_after,
                                uint32_t l_gadget) {
  int pos = i * l_gadget * (lwe_dimension_after + 1) +
            level * (lwe_dimension_after + 1);
  Torus *ptr = &ksk[pos];
  return ptr;
                                uint32_t lwe_dimension_after,
                                uint32_t l_gadget) {
  int pos = i * l_gadget * (lwe_dimension_after + 1) +
            level * (lwe_dimension_after + 1);
  Torus *ptr = &ksk[pos];
  return ptr;
}

template <typename Torus>
__device__ Torus decompose_one(Torus &state, Torus mod_b_mask,
                               int base_log) {
  Torus res = state & mod_b_mask;
  state >>= base_log;
  Torus carry = ((res - 1ll) | state) & res;
  carry >>= base_log - 1;
  state += carry;
  res -= carry << base_log;
  return res;
__device__ Torus decompose_one(Torus &state, Torus mod_b_mask, int base_log) {
  Torus res = state & mod_b_mask;
  state >>= base_log;
  Torus carry = ((res - 1ll) | state) & res;
  carry >>= base_log - 1;
  state += carry;
  res -= carry << base_log;
  return res;
}
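decompose_one emits gadget digits in a balanced range (roughly [-B/2, B/2] for base B = 2^base_log): when a digit's top bit is set, the carry logic subtracts B from it and propagates 1 into the next digit. A host-side check of that behavior, reusing the same arithmetic on uint64_t:

#include <cstdint>
#include <cstdio>

// Host copy of the device routine above, for inspection.
uint64_t decompose_one_host(uint64_t &state, uint64_t mod_b_mask,
                            int base_log) {
  uint64_t res = state & mod_b_mask;
  state >>= base_log;
  uint64_t carry = ((res - 1ll) | state) & res;
  carry >>= base_log - 1;
  state += carry;
  res -= carry << base_log;
  return res;
}

int main() {
  const int base_log = 4; // base B = 16
  const uint64_t mask = (1ull << base_log) - 1;
  uint64_t state = 0x9F;  // two raw digits: 0x9 and 0xF
  for (int j = 0; j < 2; j++) {
    // Print as int64_t so the balanced (negative) digits are visible:
    // yields -1 then -6, with the remaining carry left in state.
    int64_t d = (int64_t)decompose_one_host(state, mask, base_log);
    printf("digit %d = %lld\n", j, (long long)d);
  }
  return 0;
}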

/*
@@ -43,23 +42,19 @@ __device__ Torus decompose_one(Torus &state, Torus mod_b_mask,
 *
 */
template <typename Torus>
__global__ void keyswitch(Torus *lwe_out, Torus *lwe_in,
                          Torus *ksk,
__global__ void keyswitch(Torus *lwe_out, Torus *lwe_in, Torus *ksk,
                          uint32_t lwe_dimension_before,
                          uint32_t lwe_dimension_after,
                          uint32_t base_log,
                          uint32_t l_gadget,
                          int lwe_lower, int lwe_upper, int cutoff) {
                          uint32_t lwe_dimension_after, uint32_t base_log,
                          uint32_t l_gadget, int lwe_lower, int lwe_upper,
                          int cutoff) {
  int tid = threadIdx.x;

  extern __shared__ char sharedmem[];

  Torus *local_lwe_out = (Torus *)sharedmem;

  auto block_lwe_in =
      get_chunk(lwe_in, blockIdx.x, lwe_dimension_before + 1);
  auto block_lwe_out =
      get_chunk(lwe_out, blockIdx.x, lwe_dimension_after + 1);
  auto block_lwe_in = get_chunk(lwe_in, blockIdx.x, lwe_dimension_before + 1);
  auto block_lwe_out = get_chunk(lwe_out, blockIdx.x, lwe_dimension_after + 1);

  auto gadget = GadgetMatrixSingle<Torus>(base_log, l_gadget);

@@ -77,26 +72,22 @@ __global__ void keyswitch(Torus *lwe_out, Torus *lwe_in,
  }

  if (tid == 0) {
    local_lwe_out[lwe_dimension_after] =
        block_lwe_in[lwe_dimension_before];
    local_lwe_out[lwe_dimension_after] = block_lwe_in[lwe_dimension_before];
  }

  for (int i = 0; i < lwe_dimension_before; i++) {

    __syncthreads();

    Torus a_i = round_to_closest_multiple(block_lwe_in[i], base_log,
                                          l_gadget);
    Torus a_i = round_to_closest_multiple(block_lwe_in[i], base_log, l_gadget);

    Torus state = a_i >> (sizeof(Torus) * 8 - base_log * l_gadget);
    Torus mod_b_mask = (1ll << base_log) - 1ll;

    for (int j = 0; j < l_gadget; j++) {
      auto ksk_block = get_ith_block(ksk, i, l_gadget - j - 1,
                                     lwe_dimension_after,
                                     l_gadget);
      Torus decomposed = decompose_one<Torus>(state, mod_b_mask,
                                              base_log);
                                     lwe_dimension_after, l_gadget);
      Torus decomposed = decompose_one<Torus>(state, mod_b_mask, base_log);
      for (int k = 0; k < lwe_part_per_thd; k++) {
        int idx = tid + k * blockDim.x;
        local_lwe_out[idx] -= (Torus)ksk_block[idx] * decomposed;
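Before decomposition, each mask coefficient is rounded to the closest multiple of 2^{n_bits - base_log * l_gadget}, so only the top base_log * l_gadget bits survive into the digit loop. A host-side sketch of that rounding step (one common way to implement it; the library's round_to_closest_multiple may differ in details):

#include <cstdint>
#include <cstdio>

// Round a 64-bit torus value to the closest multiple of
// 2^(64 - base_log * l_gadget); only base_log * l_gadget top bits remain.
uint64_t round_to_closest_multiple_host(uint64_t x, int base_log,
                                        int l_gadget) {
  const int shift = 64 - base_log * l_gadget;
  const uint64_t half = 1ull << (shift - 1); // half of the kept granularity
  return ((x + half) >> shift) << shift;
}

int main() {
  uint64_t x = 0x123456789ABCDEFull;
  uint64_t r =
      round_to_closest_multiple_host(x, /*base_log=*/2, /*l_gadget=*/5);
  printf("%016llx -> %016llx\n", (unsigned long long)x, (unsigned long long)r);
  return 0;
}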
@@ -112,13 +103,10 @@ __global__ void keyswitch(Torus *lwe_out, Torus *lwe_in,

/// assume lwe_in in the gpu
template <typename Torus>
__host__ void cuda_keyswitch_lwe_ciphertext_vector(void *v_stream, Torus *lwe_out, Torus *lwe_in,
                                                   Torus *ksk,
                                                   uint32_t lwe_dimension_before,
                                                   uint32_t lwe_dimension_after,
                                                   uint32_t base_log,
                                                   uint32_t l_gadget,
                                                   uint32_t num_samples) {
__host__ void cuda_keyswitch_lwe_ciphertext_vector(
    void *v_stream, Torus *lwe_out, Torus *lwe_in, Torus *ksk,
    uint32_t lwe_dimension_before, uint32_t lwe_dimension_after,
    uint32_t base_log, uint32_t l_gadget, uint32_t num_samples) {

  constexpr int ideal_threads = 128;

@@ -136,11 +124,9 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector(void *v_stream, Torus *lwe_ou
    lwe_upper = (int)ceil((double)lwe_dim / (double)ideal_threads);
  }

  int lwe_size_after =
      (lwe_dimension_after + 1) * num_samples;
  int lwe_size_after = (lwe_dimension_after + 1) * num_samples;

  int shared_mem =
      sizeof(Torus) * (lwe_dimension_after + 1);
  int shared_mem = sizeof(Torus) * (lwe_dimension_after + 1);

  cudaMemset(lwe_out, 0, sizeof(Torus) * lwe_size_after);

@@ -156,7 +142,6 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector(void *v_stream, Torus *lwe_ou
                                l_gadget, lwe_lower, lwe_upper, cutoff);

  cudaStreamSynchronize(*stream);
}

#endif

@@ -497,7 +497,6 @@ public:
    }
    synchronize_threads_in_block();
  }
};
template <typename T, class params> class Vector {
public:

@@ -30,9 +30,10 @@ __device__ void polynomial_product_in_fourier_domain(FT *result, FT *first,
}

template <class params, typename FT>
__device__ void polynomial_product_in_fourier_domain(
    PolynomialFourier<FT, params> &result, PolynomialFourier<FT, params> &first,
    PolynomialFourier<FT, params> &second) {
__device__ void
polynomial_product_in_fourier_domain(PolynomialFourier<FT, params> &result,
                                     PolynomialFourier<FT, params> &first,
                                     PolynomialFourier<FT, params> &second) {
  int tid = threadIdx.x;
  for (int i = 0; i < params::opt / 2; i++) {
    result[tid] = first[tid] * second[tid];

@@ -72,8 +73,9 @@ __device__ void polynomial_product_accumulate_in_fourier_domain(
}

template <class params, typename T>
__device__ void polynomial_product_accumulate_in_fourier_domain(
    T *result, T *first, T *second) {
__device__ void polynomial_product_accumulate_in_fourier_domain(T *result,
                                                                T *first,
                                                                T *second) {
  int tid = threadIdx.x;
  for (int i = 0; i < params::opt / 2; i++) {
    result[tid] += first[tid] * second[tid];
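In the Fourier domain a negacyclic polynomial product reduces to a pointwise complex multiply (plus an add for the accumulate variant), which is why these kernels are simple strided loops. A host-side sketch with std::complex standing in for double2:

#include <complex>
#include <cstdio>
#include <vector>

using cplx = std::complex<double>;

// result[i] += first[i] * second[i]: a full polynomial product, once all
// three operands live in the (half-size) Fourier domain.
void product_accumulate(std::vector<cplx> &result,
                        const std::vector<cplx> &first,
                        const std::vector<cplx> &second) {
  for (size_t i = 0; i < result.size(); i++)
    result[i] += first[i] * second[i];
}

int main() {
  std::vector<cplx> acc(4, {0.0, 0.0}), a(4, {1.0, 2.0}), b(4, {3.0, -1.0});
  product_accumulate(acc, a, b);
  printf("acc[0] = (%f, %f)\n", acc[0].real(), acc[0].imag()); // (5, 5)
  return 0;
}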