chore(cuda): format sources and add check in ci

This commit is contained in:
Agnes Leroy
2022-10-14 11:57:43 +02:00
committed by Agnès Leroy
parent acbad678ec
commit c22aa3e4e9
15 changed files with 1044 additions and 1318 deletions

View File

@@ -8,171 +8,93 @@ extern "C" {
void cuda_initialize_twiddles(uint32_t polynomial_size, uint32_t gpu_index);
void cuda_convert_lwe_bootstrap_key_32(void *dest, void *src, void *v_stream,
uint32_t gpu_index, uint32_t input_lwe_dim, uint32_t glwe_dim,
uint32_t l_gadget, uint32_t polynomial_size);
uint32_t gpu_index,
uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t l_gadget,
uint32_t polynomial_size);
void cuda_convert_lwe_bootstrap_key_64(void *dest, void *src, void *v_stream,
uint32_t gpu_index, uint32_t input_lwe_dim, uint32_t glwe_dim,
uint32_t l_gadget, uint32_t polynomial_size);
uint32_t gpu_index,
uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t l_gadget,
uint32_t polynomial_size);
void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
void *v_stream,
void *lwe_out,
void *test_vector,
void *test_vector_indexes,
void *lwe_in,
void *bootstrapping_key,
uint32_t lwe_dimension,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t base_log,
uint32_t l_gadget,
uint32_t num_samples,
uint32_t num_test_vectors,
uint32_t lwe_idx,
uint32_t max_shared_memory);
void *v_stream, void *lwe_out, void *test_vector, void *test_vector_indexes,
void *lwe_in, void *bootstrapping_key, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t l_gadget, uint32_t num_samples, uint32_t num_test_vectors,
uint32_t lwe_idx, uint32_t max_shared_memory);
void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
void *v_stream,
void *lwe_out,
void *test_vector,
void *test_vector_indexes,
void *lwe_in,
void *bootstrapping_key,
uint32_t lwe_dimension,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t base_log,
uint32_t l_gadget,
uint32_t num_samples,
uint32_t num_test_vectors,
uint32_t lwe_idx,
uint32_t max_shared_memory);
void *v_stream, void *lwe_out, void *test_vector, void *test_vector_indexes,
void *lwe_in, void *bootstrapping_key, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t l_gadget, uint32_t num_samples, uint32_t num_test_vectors,
uint32_t lwe_idx, uint32_t max_shared_memory);
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
void *v_stream,
void *lwe_out,
void *test_vector,
void *test_vector_indexes,
void *lwe_in,
void *bootstrapping_key,
uint32_t lwe_dimension,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t base_log,
uint32_t l_gadget,
uint32_t num_samples,
uint32_t num_test_vectors,
uint32_t lwe_idx,
uint32_t max_shared_memory);
void *v_stream, void *lwe_out, void *test_vector, void *test_vector_indexes,
void *lwe_in, void *bootstrapping_key, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t l_gadget, uint32_t num_samples, uint32_t num_test_vectors,
uint32_t lwe_idx, uint32_t max_shared_memory);
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
void *v_stream,
void *lwe_out,
void *test_vector,
void *test_vector_indexes,
void *lwe_in,
void *bootstrapping_key,
uint32_t lwe_dimension,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t base_log,
uint32_t l_gadget,
uint32_t num_samples,
uint32_t num_test_vectors,
uint32_t lwe_idx,
uint32_t max_shared_memory);
void *v_stream, void *lwe_out, void *test_vector, void *test_vector_indexes,
void *lwe_in, void *bootstrapping_key, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t l_gadget, uint32_t num_samples, uint32_t num_test_vectors,
uint32_t lwe_idx, uint32_t max_shared_memory);
void cuda_cmux_tree_32(
void *v_stream,
void *glwe_out,
void *ggsw_in,
void *lut_vector,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t base_log,
uint32_t l_gadget,
uint32_t r,
uint32_t max_shared_memory);
void cuda_cmux_tree_32(void *v_stream, void *glwe_out, void *ggsw_in,
void *lut_vector, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log,
uint32_t l_gadget, uint32_t r,
uint32_t max_shared_memory);
void cuda_cmux_tree_64(
void *v_stream,
void *glwe_out,
void *ggsw_in,
void *lut_vector,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t base_log,
uint32_t l_gadget,
uint32_t r,
uint32_t max_shared_memory);
void cuda_extract_bits_32(
void *v_stream,
void *list_lwe_out,
void *lwe_in,
void *lwe_in_buffer,
void *lwe_in_shifted_buffer,
void *lwe_out_ks_buffer,
void *lwe_out_pbs_buffer,
void *lut_pbs,
void *lut_vector_indexes,
void *ksk,
void *fourier_bsk,
uint32_t number_of_bits,
uint32_t delta_log,
uint32_t lwe_dimension_before,
uint32_t lwe_dimension_after,
uint32_t glwe_dimension,
uint32_t base_log_bsk,
uint32_t l_gadget_bsk,
uint32_t base_log_ksk,
uint32_t l_gadget_ksk,
uint32_t number_of_samples);
void cuda_extract_bits_64(
void *v_stream,
void *list_lwe_out,
void *lwe_in,
void *lwe_in_buffer,
void *lwe_in_shifted_buffer,
void *lwe_out_ks_buffer,
void *lwe_out_pbs_buffer,
void *lut_pbs,
void *lut_vector_indexes,
void *ksk,
void *fourier_bsk,
uint32_t number_of_bits,
uint32_t delta_log,
uint32_t lwe_dimension_before,
uint32_t lwe_dimension_after,
uint32_t glwe_dimension,
uint32_t base_log_bsk,
uint32_t l_gadget_bsk,
uint32_t base_log_ksk,
uint32_t l_gadget_ksk,
uint32_t number_of_samples);
void cuda_cmux_tree_64(void *v_stream, void *glwe_out, void *ggsw_in,
void *lut_vector, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log,
uint32_t l_gadget, uint32_t r,
uint32_t max_shared_memory);
void cuda_extract_bits_32(void *v_stream, void *list_lwe_out, void *lwe_in,
void *lwe_in_buffer, void *lwe_in_shifted_buffer,
void *lwe_out_ks_buffer, void *lwe_out_pbs_buffer,
void *lut_pbs, void *lut_vector_indexes, void *ksk,
void *fourier_bsk, uint32_t number_of_bits,
uint32_t delta_log, uint32_t lwe_dimension_before,
uint32_t lwe_dimension_after, uint32_t glwe_dimension,
uint32_t base_log_bsk, uint32_t l_gadget_bsk,
uint32_t base_log_ksk, uint32_t l_gadget_ksk,
uint32_t number_of_samples);
void cuda_extract_bits_64(void *v_stream, void *list_lwe_out, void *lwe_in,
void *lwe_in_buffer, void *lwe_in_shifted_buffer,
void *lwe_out_ks_buffer, void *lwe_out_pbs_buffer,
void *lut_pbs, void *lut_vector_indexes, void *ksk,
void *fourier_bsk, uint32_t number_of_bits,
uint32_t delta_log, uint32_t lwe_dimension_before,
uint32_t lwe_dimension_after, uint32_t glwe_dimension,
uint32_t base_log_bsk, uint32_t l_gadget_bsk,
uint32_t base_log_ksk, uint32_t l_gadget_ksk,
uint32_t number_of_samples);
};
#ifdef __CUDACC__
__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
int glwe_dimension,
uint32_t l_gadget);
int glwe_dimension, uint32_t l_gadget);
template <typename T>
__device__ T*
get_ith_mask_kth_block(T* ptr, int i, int k, int level, uint32_t polynomial_size,
int glwe_dimension, uint32_t l_gadget);
__device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
uint32_t polynomial_size,
int glwe_dimension, uint32_t l_gadget);
template <typename T>
__device__ T*
get_ith_body_kth_block(T *ptr, int i, int k, int level, uint32_t polynomial_size,
int glwe_dimension, uint32_t l_gadget);
__device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level,
uint32_t polynomial_size,
int glwe_dimension, uint32_t l_gadget);
#endif
#endif // CUDA_BOOTSTRAP_H
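
For context on how this FFI surface is meant to be driven, below is a minimal host-side sketch calling the 64-bit amortized PBS entry point declared above. It is an illustration only, not code from this commit: the header filename, parameter values and buffer names are assumptions; only the function signature comes from the declarations shown here. The bootstrapping key is assumed to have already been converted with cuda_convert_lwe_bootstrap_key_64.

// Hypothetical usage sketch (not part of this commit). All values below are
// illustrative assumptions.
#include <cstdint>
#include <cuda_runtime.h>
#include "bootstrap.h" // the header shown above; exact filename assumed

void example_amortized_pbs_64(void *d_lwe_out, void *d_test_vector,
                              void *d_test_vector_indexes, void *d_lwe_in,
                              void *d_bootstrapping_key) {
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // Assumed TFHE parameters: one test vector shared by ten input ciphertexts.
  uint32_t lwe_dimension = 592, glwe_dimension = 1, polynomial_size = 1024;
  uint32_t base_log = 3, l_gadget = 6;
  uint32_t num_samples = 10, num_test_vectors = 1, lwe_idx = 0;

  // The wrappers use max_shared_memory to pick a no/partial/full shared
  // memory kernel variant.
  int max_shared_memory = 0;
  cudaDeviceGetAttribute(&max_shared_memory,
                         cudaDevAttrMaxSharedMemoryPerBlockOptin, 0);

  // v_stream is an opaque pointer to a cudaStream_t.
  cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
      &stream, d_lwe_out, d_test_vector, d_test_vector_indexes, d_lwe_in,
      d_bootstrapping_key, lwe_dimension, glwe_dimension, polynomial_size,
      base_log, l_gadget, num_samples, num_test_vectors, lwe_idx,
      (uint32_t)max_shared_memory);

  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);
}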

View File

@@ -5,20 +5,15 @@
extern "C" {
void cuda_keyswitch_lwe_ciphertext_vector_32(void *v_stream, void *lwe_out, void *lwe_in,
void *ksk,
uint32_t lwe_dimension_before,
uint32_t lwe_dimension_after,
uint32_t base_log, uint32_t l_gadget,
uint32_t num_samples);
void cuda_keyswitch_lwe_ciphertext_vector_64(void *v_stream, void *lwe_out, void *lwe_in,
void *ksk,
uint32_t lwe_dimension_before,
uint32_t lwe_dimension_after,
uint32_t base_log, uint32_t l_gadget,
uint32_t num_samples);
void cuda_keyswitch_lwe_ciphertext_vector_32(
void *v_stream, void *lwe_out, void *lwe_in, void *ksk,
uint32_t lwe_dimension_before, uint32_t lwe_dimension_after,
uint32_t base_log, uint32_t l_gadget, uint32_t num_samples);
void cuda_keyswitch_lwe_ciphertext_vector_64(
void *v_stream, void *lwe_out, void *lwe_in, void *ksk,
uint32_t lwe_dimension_before, uint32_t lwe_dimension_after,
uint32_t base_log, uint32_t l_gadget, uint32_t num_samples);
}
#endif // CNCRT_KS_H_
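
A similar hedged sketch for the keyswitch entry point above; the filename and parameter values are illustrative assumptions, only the signature is taken from the declaration.

// Hypothetical usage sketch (not part of this commit).
#include <cstdint>
#include <cuda_runtime.h>
#include "keyswitch.h" // the header shown above; exact filename assumed

void example_keyswitch_64(void *d_lwe_out, void *d_lwe_in, void *d_ksk) {
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  uint32_t lwe_dimension_before = 1024; // input LWE dimension (assumed)
  uint32_t lwe_dimension_after = 592;   // output LWE dimension (assumed)
  uint32_t base_log = 4, l_gadget = 5;  // gadget decomposition (assumed)
  uint32_t num_samples = 10;            // ciphertexts in the batch (assumed)

  cuda_keyswitch_lwe_ciphertext_vector_64(&stream, d_lwe_out, d_lwe_in, d_ksk,
                                          lwe_dimension_before,
                                          lwe_dimension_after, base_log,
                                          l_gadget, num_samples);

  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);
}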

View File

@@ -58,64 +58,57 @@
*/
void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
void *v_stream,
void *lwe_out,
void *lut_vector,
void *lut_vector_indexes,
void *lwe_in,
void *bootstrapping_key,
uint32_t lwe_dimension,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t base_log,
uint32_t l_gadget,
uint32_t num_samples,
uint32_t num_lut_vectors,
uint32_t lwe_idx,
uint32_t max_shared_memory) {
void *v_stream, void *lwe_out, void *lut_vector, void *lut_vector_indexes,
void *lwe_in, void *bootstrapping_key, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t l_gadget, uint32_t num_samples, uint32_t num_lut_vectors,
uint32_t lwe_idx, uint32_t max_shared_memory) {
assert(("Error (GPU amortized PBS): base log should be <= 16", base_log <= 16));
assert(("Error (GPU amortized PBS): glwe_dimension should be equal to 1", glwe_dimension == 1));
assert(("Error (GPU amortized PBS): polynomial size should be one of 512, 1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
assert(
("Error (GPU amortized PBS): base log should be <= 16", base_log <= 16));
assert(("Error (GPU amortized PBS): glwe_dimension should be equal to 1",
glwe_dimension == 1));
assert(("Error (GPU amortized PBS): polynomial size should be one of 512, "
"1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 ||
polynomial_size == 2048 || polynomial_size == 4096 ||
polynomial_size == 8192));
switch (polynomial_size) {
case 512:
host_bootstrap_amortized<uint32_t, Degree<512>>(
v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
base_log, l_gadget, num_samples,
num_lut_vectors, lwe_idx, max_shared_memory);
break;
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
l_gadget, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
break;
case 1024:
host_bootstrap_amortized<uint32_t, Degree<1024>>(
v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, l_gadget, num_samples,
num_lut_vectors, lwe_idx, max_shared_memory);
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
l_gadget, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
break;
case 2048:
host_bootstrap_amortized<uint32_t, Degree<2048>>(
v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, l_gadget, num_samples,
num_lut_vectors, lwe_idx, max_shared_memory);
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
l_gadget, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
break;
case 4096:
host_bootstrap_amortized<uint32_t, Degree<4096>>(
v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, l_gadget, num_samples,
num_lut_vectors, lwe_idx, max_shared_memory);
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
l_gadget, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
break;
case 8192:
host_bootstrap_amortized<uint32_t, Degree<8192>>(
v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, l_gadget, num_samples,
num_lut_vectors, lwe_idx, max_shared_memory);
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
l_gadget, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
break;
default:
break;
@@ -123,64 +116,57 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
}
void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
void *v_stream,
void *lwe_out,
void *lut_vector,
void *lut_vector_indexes,
void *lwe_in,
void *bootstrapping_key,
uint32_t lwe_dimension,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t base_log,
uint32_t l_gadget,
uint32_t num_samples,
uint32_t num_lut_vectors,
uint32_t lwe_idx,
uint32_t max_shared_memory) {
void *v_stream, void *lwe_out, void *lut_vector, void *lut_vector_indexes,
void *lwe_in, void *bootstrapping_key, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t l_gadget, uint32_t num_samples, uint32_t num_lut_vectors,
uint32_t lwe_idx, uint32_t max_shared_memory) {
assert(
("Error (GPU amortized PBS): base log should be <= 16", base_log <= 16));
assert(("Error (GPU amortized PBS): glwe_dimension should be equal to 1",
glwe_dimension == 1));
assert(("Error (GPU amortized PBS): polynomial size should be one of 512, "
"1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 ||
polynomial_size == 2048 || polynomial_size == 4096 ||
polynomial_size == 8192));
assert(("Error (GPU amortized PBS): base log should be <= 16", base_log <= 16));
assert(("Error (GPU amortized PBS): glwe_dimension should be equal to 1", glwe_dimension == 1));
assert(("Error (GPU amortized PBS): polynomial size should be one of 512, 1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
switch (polynomial_size) {
case 512:
host_bootstrap_amortized<uint64_t, Degree<512>>(
v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
base_log, l_gadget, num_samples,
num_lut_vectors, lwe_idx, max_shared_memory);
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
l_gadget, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
break;
case 1024:
host_bootstrap_amortized<uint64_t, Degree<1024>>(
v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, l_gadget, num_samples,
num_lut_vectors, lwe_idx, max_shared_memory);
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
l_gadget, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
break;
case 2048:
host_bootstrap_amortized<uint64_t, Degree<2048>>(
v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, l_gadget, num_samples,
num_lut_vectors, lwe_idx, max_shared_memory);
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
l_gadget, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
break;
case 4096:
host_bootstrap_amortized<uint64_t, Degree<4096>>(
v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, l_gadget, num_samples,
num_lut_vectors, lwe_idx, max_shared_memory);
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
l_gadget, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
break;
case 8192:
host_bootstrap_amortized<uint64_t, Degree<8192>>(
v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, l_gadget, num_samples,
num_lut_vectors, lwe_idx, max_shared_memory);
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
l_gadget, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
break;
default:
break;
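
One remark on the assert(("message", condition)) pattern used throughout these wrappers: the outer parentheses turn the argument into a single comma expression, so only the right-hand condition is actually tested, while the string literal survives into the expression text that assert prints on failure. A standalone illustration (not part of this commit):

#include <assert.h>

int main(void) {
  unsigned base_log = 20;
  // The comma expression evaluates to `base_log <= 16`; the string is only
  // there so the failure message printed by assert() names the constraint.
  assert(("base log should be <= 16", base_log <= 16));
  return 0;
}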

View File

@@ -54,18 +54,10 @@ template <typename Torus, class params, sharedMemDegree SMD>
* is not FULLSM
*/
__global__ void device_bootstrap_amortized(
Torus *lwe_out,
Torus *lut_vector,
uint32_t *lut_vector_indexes,
Torus *lwe_in,
double2 *bootstrapping_key,
char *device_mem,
uint32_t lwe_mask_size,
uint32_t polynomial_size,
uint32_t base_log,
uint32_t l_gadget,
uint32_t lwe_idx,
size_t device_memory_size_per_sample) {
Torus *lwe_out, Torus *lut_vector, uint32_t *lut_vector_indexes,
Torus *lwe_in, double2 *bootstrapping_key, char *device_mem,
uint32_t lwe_mask_size, uint32_t polynomial_size, uint32_t base_log,
uint32_t l_gadget, uint32_t lwe_idx, size_t device_memory_size_per_sample) {
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
// much faster than global memory
@@ -103,8 +95,8 @@ __global__ void device_bootstrap_amortized(
auto block_lwe_in = &lwe_in[blockIdx.x * (lwe_mask_size + 1)];
Torus *block_lut_vector =
&lut_vector[lut_vector_indexes[lwe_idx + blockIdx.x] * params::degree * 2];
&lut_vector[lut_vector_indexes[lwe_idx + blockIdx.x] * params::degree *
2];
GadgetMatrix<Torus, params> gadget(base_log, l_gadget);
@@ -114,11 +106,11 @@ __global__ void device_bootstrap_amortized(
2 * params::degree); // 2 * params::log2_degree + 1);
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
params::degree / params::opt>(
accumulator_mask, block_lut_vector, b_hat, false);
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
params::degree / params::opt>(
accumulator_body, &block_lut_vector[params::degree], b_hat, false);
// Loop over all the mask elements of the sample to accumulate
@@ -147,11 +139,11 @@ __global__ void device_bootstrap_amortized(
// Perform a rounding to increase the accuracy of the
// bootstrapped ciphertext
round_to_closest_multiple_inplace<Torus, params::opt,
params::degree / params::opt>(
params::degree / params::opt>(
accumulator_mask_rotated, base_log, l_gadget);
round_to_closest_multiple_inplace<Torus, params::opt,
params::degree / params::opt>(
params::degree / params::opt>(
accumulator_body_rotated, base_log, l_gadget);
// Initialize the polynomial multiplication via FFT arrays
// The polynomial multiplications happens at the block level
@@ -195,13 +187,11 @@ __global__ void device_bootstrap_amortized(
// Get the bootstrapping key piece necessary for the multiplication
// It is already in the Fourier domain
auto bsk_mask_slice = PolynomialFourier<double2, params>(
get_ith_mask_kth_block(
bootstrapping_key, iteration, 0, decomp_level,
polynomial_size, 1, l_gadget));
get_ith_mask_kth_block(bootstrapping_key, iteration, 0, decomp_level,
polynomial_size, 1, l_gadget));
auto bsk_body_slice = PolynomialFourier<double2, params>(
get_ith_body_kth_block(
bootstrapping_key, iteration, 0, decomp_level,
polynomial_size, 1, l_gadget));
get_ith_body_kth_block(bootstrapping_key, iteration, 0, decomp_level,
polynomial_size, 1, l_gadget));
synchronize_threads_in_block();
@@ -230,7 +220,7 @@ __global__ void device_bootstrap_amortized(
polynomial_size, 1, l_gadget));
auto bsk_body_slice_2 = PolynomialFourier<double2, params>(
get_ith_body_kth_block(bootstrapping_key, iteration, 1, decomp_level,
polynomial_size, 1, l_gadget));
polynomial_size, 1, l_gadget));
synchronize_threads_in_block();
@@ -305,20 +295,11 @@ __global__ void device_bootstrap_amortized(
template <typename Torus, class params>
__host__ void host_bootstrap_amortized(
void *v_stream,
Torus *lwe_out,
Torus *lut_vector,
uint32_t *lut_vector_indexes,
Torus *lwe_in,
double2 *bootstrapping_key,
uint32_t input_lwe_dimension,
uint32_t polynomial_size,
uint32_t base_log,
uint32_t l_gadget,
uint32_t input_lwe_ciphertext_count,
uint32_t num_lut_vectors,
uint32_t lwe_idx,
uint32_t max_shared_memory) {
void *v_stream, Torus *lwe_out, Torus *lut_vector,
uint32_t *lut_vector_indexes, Torus *lwe_in, double2 *bootstrapping_key,
uint32_t input_lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t l_gadget, uint32_t input_lwe_ciphertext_count,
uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory) {
int SM_FULL = sizeof(Torus) * polynomial_size + // accumulator mask
sizeof(Torus) * polynomial_size + // accumulator body
@@ -354,28 +335,24 @@ __host__ void host_bootstrap_amortized(
// from one of three templates (no use, partial use or full use
// of shared memory)
if (max_shared_memory < SM_PART) {
checkCudaErrors(cudaMalloc((void **)&d_mem, DM_FULL * input_lwe_ciphertext_count));
device_bootstrap_amortized<Torus, params, NOSM>
<<<grid, thds, 0, *stream>>>(
lwe_out, lut_vector, lut_vector_indexes, lwe_in,
bootstrapping_key, d_mem,
input_lwe_dimension, polynomial_size,
base_log, l_gadget, lwe_idx, DM_FULL);
checkCudaErrors(
cudaMalloc((void **)&d_mem, DM_FULL * input_lwe_ciphertext_count));
device_bootstrap_amortized<Torus, params, NOSM><<<grid, thds, 0, *stream>>>(
lwe_out, lut_vector, lut_vector_indexes, lwe_in, bootstrapping_key,
d_mem, input_lwe_dimension, polynomial_size, base_log, l_gadget,
lwe_idx, DM_FULL);
} else if (max_shared_memory < SM_FULL) {
cudaFuncSetAttribute(device_bootstrap_amortized<Torus, params, PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
SM_PART);
cudaFuncSetCacheConfig(
device_bootstrap_amortized<Torus, params, PARTIALSM>,
cudaFuncCachePreferShared);
checkCudaErrors(cudaMalloc((void **)&d_mem, DM_PART * input_lwe_ciphertext_count));
cudaFuncAttributeMaxDynamicSharedMemorySize, SM_PART);
cudaFuncSetCacheConfig(device_bootstrap_amortized<Torus, params, PARTIALSM>,
cudaFuncCachePreferShared);
checkCudaErrors(
cudaMalloc((void **)&d_mem, DM_PART * input_lwe_ciphertext_count));
device_bootstrap_amortized<Torus, params, PARTIALSM>
<<<grid, thds, SM_PART, *stream>>>(
lwe_out, lut_vector, lut_vector_indexes,
lwe_in, bootstrapping_key,
d_mem, input_lwe_dimension, polynomial_size,
base_log, l_gadget, lwe_idx,
DM_PART);
<<<grid, thds, SM_PART, *stream>>>(
lwe_out, lut_vector, lut_vector_indexes, lwe_in, bootstrapping_key,
d_mem, input_lwe_dimension, polynomial_size, base_log, l_gadget,
lwe_idx, DM_PART);
} else {
// For devices with compute capability 7.x a single thread block can
// address the full capacity of shared memory. Shared memory on the
@@ -384,26 +361,22 @@ __host__ void host_bootstrap_amortized(
// just does nothing and the amount of shared memory used is 48 KB
checkCudaErrors(cudaFuncSetAttribute(
device_bootstrap_amortized<Torus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
SM_FULL));
cudaFuncAttributeMaxDynamicSharedMemorySize, SM_FULL));
checkCudaErrors(cudaFuncSetCacheConfig(
device_bootstrap_amortized<Torus, params, FULLSM>,
cudaFuncCachePreferShared));
checkCudaErrors(cudaMalloc((void **)&d_mem, 0));
device_bootstrap_amortized<Torus, params, FULLSM>
<<<grid, thds, SM_FULL, *stream>>>(
lwe_out, lut_vector, lut_vector_indexes,
lwe_in, bootstrapping_key,
d_mem, input_lwe_dimension, polynomial_size,
base_log, l_gadget, lwe_idx,
0);
<<<grid, thds, SM_FULL, *stream>>>(
lwe_out, lut_vector, lut_vector_indexes, lwe_in, bootstrapping_key,
d_mem, input_lwe_dimension, polynomial_size, base_log, l_gadget,
lwe_idx, 0);
}
// Synchronize the streams before copying the result to lwe_out at the right
// place
cudaStreamSynchronize(*stream);
cudaFree(d_mem);
}
template <typename Torus, class params>
@@ -415,8 +388,8 @@ int cuda_get_pbs_per_gpu(int polynomial_size) {
cudaDeviceProp device_properties;
cudaGetDeviceProperties(&device_properties, 0);
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&blocks_per_sm, device_bootstrap_amortized<Torus, params>,
num_threads, 0);
&blocks_per_sm, device_bootstrap_amortized<Torus, params>, num_threads,
0);
return device_properties.multiProcessorCount * blocks_per_sm;
}
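
The compute-capability 7.x remark above refers to the standard opt-in for dynamic shared memory beyond the 48 KB default. A generic, self-contained sketch of that pattern, with an assumed kernel and an assumed 96 KB request (illustrative, not code from this commit):

#include <cuda_runtime.h>

__global__ void large_smem_kernel(float *out) {
  extern __shared__ char smem[]; // dynamic shared memory
  smem[threadIdx.x] = (char)threadIdx.x;
  __syncthreads();
  if (threadIdx.x == 0)
    out[blockIdx.x] = (float)smem[blockDim.x - 1];
}

void launch_with_large_smem(float *d_out, cudaStream_t stream) {
  int bytes = 96 * 1024; // assumed: request 96 KB of dynamic shared memory
  // Without this opt-in, launches asking for more than 48 KB would fail.
  cudaFuncSetAttribute(large_smem_kernel,
                       cudaFuncAttributeMaxDynamicSharedMemorySize, bytes);
  cudaFuncSetCacheConfig(large_smem_kernel, cudaFuncCachePreferShared);
  large_smem_kernel<<<1, 256, bytes, stream>>>(d_out);
}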

View File

@@ -57,76 +57,66 @@
* values for the FFT
*/
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
void *v_stream,
void *lwe_out,
void *lut_vector,
void *lut_vector_indexes,
void *lwe_in,
void *bootstrapping_key,
uint32_t lwe_dimension,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t base_log,
uint32_t l_gadget,
uint32_t num_samples,
uint32_t num_lut_vectors,
uint32_t lwe_idx,
uint32_t max_shared_memory) {
void *v_stream, void *lwe_out, void *lut_vector, void *lut_vector_indexes,
void *lwe_in, void *bootstrapping_key, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t l_gadget, uint32_t num_samples, uint32_t num_lut_vectors,
uint32_t lwe_idx, uint32_t max_shared_memory) {
assert(("Error (GPU low latency PBS): base log should be <= 16", base_log <= 16));
assert(("Error (GPU low latency PBS): glwe_dimension should be equal to 1",
glwe_dimension == 1));
assert(("Error (GPU low latency PBS): polynomial size should be one of 512, 1024, 2048",
polynomial_size == 512 || polynomial_size == 1024 || polynomial_size == 2048));
// The number of samples should be lower than SM/(4 * (k + 1) * l) (the
// factor 4 being related to the occupancy of 50%). The only supported
// value for k is 1, so k + 1 = 2 for now.
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
assert(("Error (GPU low latency PBS): the number of input LWEs must be lower or equal to the "
"number of streaming multiprocessors on the device divided by 8 * l_gadget",
num_samples <= number_of_sm / 4. / 2. / l_gadget));
assert(("Error (GPU low latency PBS): base log should be <= 16",
base_log <= 16));
assert(("Error (GPU low latency PBS): glwe_dimension should be equal to 1",
glwe_dimension == 1));
assert(("Error (GPU low latency PBS): polynomial size should be one of 512, "
"1024, 2048",
polynomial_size == 512 || polynomial_size == 1024 ||
polynomial_size == 2048));
// The number of samples should be lower than SM/(4 * (k + 1) * l) (the
// factor 4 being related to the occupancy of 50%). The only supported
// value for k is 1, so k + 1 = 2 for now.
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
assert(("Error (GPU low latency PBS): the number of input LWEs must be lower "
"or equal to the "
"number of streaming multiprocessors on the device divided by 8 * "
"l_gadget",
num_samples <= number_of_sm / 4. / 2. / l_gadget));
switch (polynomial_size) {
case 512:
host_bootstrap_low_latency<uint32_t, Degree<512>>(
v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
base_log, l_gadget, num_samples,
num_lut_vectors);
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
l_gadget, num_samples, num_lut_vectors);
break;
case 1024:
host_bootstrap_low_latency<uint32_t, Degree<1024>>(
v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
base_log, l_gadget, num_samples,
num_lut_vectors);
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
l_gadget, num_samples, num_lut_vectors);
break;
case 2048:
host_bootstrap_low_latency<uint32_t, Degree<2048>>(
v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
base_log, l_gadget, num_samples,
num_lut_vectors);
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
l_gadget, num_samples, num_lut_vectors);
break;
case 4096:
host_bootstrap_low_latency<uint32_t, Degree<4096>>(
v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
base_log, l_gadget, num_samples,
num_lut_vectors);
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
l_gadget, num_samples, num_lut_vectors);
break;
case 8192:
host_bootstrap_low_latency<uint32_t, Degree<8192>>(
v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
base_log, l_gadget, num_samples,
num_lut_vectors);
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
l_gadget, num_samples, num_lut_vectors);
break;
default:
break;
@@ -134,79 +124,68 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
}
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
void *v_stream,
void *lwe_out,
void *lut_vector,
void *lut_vector_indexes,
void *lwe_in,
void *bootstrapping_key,
uint32_t lwe_dimension,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t base_log,
uint32_t l_gadget,
uint32_t num_samples,
uint32_t num_lut_vectors,
uint32_t lwe_idx,
uint32_t max_shared_memory) {
void *v_stream, void *lwe_out, void *lut_vector, void *lut_vector_indexes,
void *lwe_in, void *bootstrapping_key, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t l_gadget, uint32_t num_samples, uint32_t num_lut_vectors,
uint32_t lwe_idx, uint32_t max_shared_memory) {
assert(("Error (GPU low latency PBS): base log should be <= 16", base_log <= 16));
assert(("Error (GPU low latency PBS): glwe_dimension should be equal to 1",
glwe_dimension == 1));
assert(("Error (GPU low latency PBS): polynomial size should be one of 512, 1024, 2048",
polynomial_size == 512 || polynomial_size == 1024 || polynomial_size == 2048));
// The number of samples should be lower than SM/(4 * (k + 1) * l) (the
// factor 4 being related to the occupancy of 50%). The only supported
// value for k is 1, so k + 1 = 2 for now.
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
assert(("Error (GPU low latency PBS): the number of input LWEs must be lower or equal to the "
"number of streaming multiprocessors on the device divided by 8 * l_gadget",
num_samples <= number_of_sm / 4. / 2. / l_gadget));
assert(("Error (GPU low latency PBS): base log should be <= 16",
base_log <= 16));
assert(("Error (GPU low latency PBS): glwe_dimension should be equal to 1",
glwe_dimension == 1));
assert(("Error (GPU low latency PBS): polynomial size should be one of 512, "
"1024, 2048",
polynomial_size == 512 || polynomial_size == 1024 ||
polynomial_size == 2048));
// The number of samples should be lower than SM/(4 * (k + 1) * l) (the
// factor 4 being related to the occupancy of 50%). The only supported
// value for k is 1, so k + 1 = 2 for now.
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
assert(("Error (GPU low latency PBS): the number of input LWEs must be lower "
"or equal to the "
"number of streaming multiprocessors on the device divided by 8 * "
"l_gadget",
num_samples <= number_of_sm / 4. / 2. / l_gadget));
switch (polynomial_size) {
case 512:
host_bootstrap_low_latency<uint64_t, Degree<512>>(
v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
base_log, l_gadget, num_samples,
num_lut_vectors);
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
l_gadget, num_samples, num_lut_vectors);
break;
case 1024:
host_bootstrap_low_latency<uint64_t, Degree<1024>>(
v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
base_log, l_gadget, num_samples,
num_lut_vectors);
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
l_gadget, num_samples, num_lut_vectors);
break;
case 2048:
host_bootstrap_low_latency<uint64_t, Degree<2048>>(
v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
base_log, l_gadget, num_samples,
num_lut_vectors);
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
l_gadget, num_samples, num_lut_vectors);
break;
case 4096:
host_bootstrap_low_latency<uint64_t, Degree<4096>>(
v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
base_log, l_gadget, num_samples,
num_lut_vectors);
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
l_gadget, num_samples, num_lut_vectors);
break;
case 8192:
host_bootstrap_low_latency<uint64_t, Degree<8192>>(
v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
base_log, l_gadget, num_samples,
num_lut_vectors);
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
l_gadget, num_samples, num_lut_vectors);
break;
default:
break;
}
}
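
To make the batch-size bound in the comments above concrete, a small helper along these lines (illustrative only, not part of this commit) evaluates number_of_sm / 4 / 2 / l_gadget for the current device:

#include <cstdint>
#include <cuda_runtime.h>

// Largest batch accepted by the low latency PBS on device 0, following the
// bound used in the asserts above: num_samples <= number_of_sm / (4 * 2 *
// l_gadget), with factor 4 for 50% occupancy and factor 2 for k + 1, k = 1.
int max_low_latency_pbs_samples(uint32_t l_gadget) {
  int number_of_sm = 0;
  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
  return (int)(number_of_sm / 4. / 2. / l_gadget);
}
// Example: with 108 SMs and l_gadget = 2, at most
// 108 / (4 * 2 * 2) = 6 samples fit in a single call.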

View File

@@ -29,17 +29,13 @@ namespace cg = cooperative_groups;
template <typename Torus, class params>
__device__ void
mul_trgsw_trlwe(Torus *accumulator,
double2 *fft,
int16_t *trlwe_decomposed,
double2 *mask_join_buffer,
double2 *body_join_buffer,
double2 *bootstrapping_key,
int polynomial_size, int l_gadget, int iteration, grid_group &grid) {
mul_trgsw_trlwe(Torus *accumulator, double2 *fft, int16_t *trlwe_decomposed,
double2 *mask_join_buffer, double2 *body_join_buffer,
double2 *bootstrapping_key, int polynomial_size, int l_gadget,
int iteration, grid_group &grid) {
// Put the decomposed TRLWE sample in the Fourier domain
real_to_complex_compressed<int16_t, params>(trlwe_decomposed,
fft);
real_to_complex_compressed<int16_t, params>(trlwe_decomposed, fft);
synchronize_threads_in_block();
// Switch to the FFT space
@@ -49,52 +45,49 @@ mul_trgsw_trlwe(Torus *accumulator,
correction_direct_fft_inplace<params>(fft);
synchronize_threads_in_block();
// Get the pieces of the bootstrapping key that will be needed for the
// external product; blockIdx.x is the ID of the block that's executing
// this function, so we end up getting the lines of the bootstrapping key
// needed to perform the external product in this block (corresponding to
// the same decomposition level)
auto bsk_mask_slice = PolynomialFourier<double2, params>(
get_ith_mask_kth_block(
bootstrapping_key, iteration, blockIdx.y, blockIdx.x,
polynomial_size, 1, l_gadget));
auto bsk_body_slice = PolynomialFourier<double2, params>(
get_ith_body_kth_block(
bootstrapping_key, iteration, blockIdx.y, blockIdx.x,
polynomial_size, 1, l_gadget));
auto bsk_mask_slice = PolynomialFourier<double2, params>(
get_ith_mask_kth_block(bootstrapping_key, iteration, blockIdx.y,
blockIdx.x, polynomial_size, 1, l_gadget));
auto bsk_body_slice = PolynomialFourier<double2, params>(
get_ith_body_kth_block(bootstrapping_key, iteration, blockIdx.y,
blockIdx.x, polynomial_size, 1, l_gadget));
// Perform the matrix multiplication between the RGSW and the TRLWE,
// each block operating on a single level for mask and body
auto first_processed_bsk = (blockIdx.y == 0) ? bsk_mask_slice : bsk_body_slice;
auto second_processed_bsk = (blockIdx.y == 0) ? bsk_body_slice : bsk_mask_slice;
auto first_processed_bsk =
(blockIdx.y == 0) ? bsk_mask_slice : bsk_body_slice;
auto second_processed_bsk =
(blockIdx.y == 0) ? bsk_body_slice : bsk_mask_slice;
auto first_processed_acc = (blockIdx.y == 0) ?
&mask_join_buffer[params::degree / 2 * blockIdx.x] :
&body_join_buffer[params::degree / 2 * blockIdx.x];
auto second_processed_acc = (blockIdx.y == 0) ?
&body_join_buffer[params::degree / 2 * blockIdx.x] :
&mask_join_buffer[params::degree / 2 * blockIdx.x];
auto first_processed_acc =
(blockIdx.y == 0) ? &mask_join_buffer[params::degree / 2 * blockIdx.x]
: &body_join_buffer[params::degree / 2 * blockIdx.x];
auto second_processed_acc =
(blockIdx.y == 0) ? &body_join_buffer[params::degree / 2 * blockIdx.x]
: &mask_join_buffer[params::degree / 2 * blockIdx.x];
int tid = threadIdx.x;
//first product
for(int i = 0; i < params::opt / 2; i++) {
first_processed_acc[tid] = fft[tid] * first_processed_bsk.m_values[tid];
tid += params::degree / params::opt;
// first product
for (int i = 0; i < params::opt / 2; i++) {
first_processed_acc[tid] = fft[tid] * first_processed_bsk.m_values[tid];
tid += params::degree / params::opt;
}
grid.sync();
tid = threadIdx.x;
//second product
for(int i = 0; i < params::opt / 2; i++) {
second_processed_acc[tid] += fft[tid] * second_processed_bsk.m_values[tid];
tid += params::degree / params::opt;
}
// second product
for (int i = 0; i < params::opt / 2; i++) {
second_processed_acc[tid] += fft[tid] * second_processed_bsk.m_values[tid];
tid += params::degree / params::opt;
}
// -----------------------------------------------------------------
@@ -102,24 +95,24 @@ mul_trgsw_trlwe(Torus *accumulator,
// values needed from every other block
grid.sync();
auto src_acc = (blockIdx.y == 0) ? mask_join_buffer : body_join_buffer;
auto src_acc = (blockIdx.y == 0) ? mask_join_buffer : body_join_buffer;
// copy first product into fft buffer
tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
fft[tid] = src_acc[tid];
tid += params::degree / params::opt;
fft[tid] = src_acc[tid];
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
// accumulate rest of the products into fft buffer
for (int l = 1; l < gridDim.x; l++) {
auto cur_src_acc = &src_acc[l * params::degree / 2];
tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
fft[tid] += cur_src_acc[tid];
tid += params::degree / params::opt;
}
auto cur_src_acc = &src_acc[l * params::degree / 2];
tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
fft[tid] += cur_src_acc[tid];
tid += params::degree / params::opt;
}
}
synchronize_threads_in_block();
@@ -142,49 +135,46 @@ template <typename Torus, class params>
* Kernel launched by the low latency version of the
* bootstrapping, that uses cooperative groups
* lwe_out vector of output lwe s, with length (polynomial_size+1)*num_samples
* lut_vector - vector of look up tables with length polynomial_size * num_samples
* lut_vector_indexes - mapping between lwe_in and lut_vector
* lwe_in - vector of lwe inputs with length (lwe_mask_size + 1) * num_samples
* lut_vector - vector of look up tables with length polynomial_size *
* num_samples lut_vector_indexes - mapping between lwe_in and lut_vector lwe_in
* - vector of lwe inputs with length (lwe_mask_size + 1) * num_samples
*
*/
__global__ void device_bootstrap_low_latency(
Torus *lwe_out,
Torus *lut_vector,
Torus *lwe_in,
double2 *bootstrapping_key,
double2 *mask_join_buffer,
double2 *body_join_buffer,
uint32_t lwe_mask_size,
uint32_t polynomial_size, uint32_t base_log, uint32_t l_gadget
) {
Torus *lwe_out, Torus *lut_vector, Torus *lwe_in,
double2 *bootstrapping_key, double2 *mask_join_buffer,
double2 *body_join_buffer, uint32_t lwe_mask_size, uint32_t polynomial_size,
uint32_t base_log, uint32_t l_gadget) {
grid_group grid = this_grid();
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
// much faster than global memory
extern __shared__ char sharedmem[];
char* selected_memory = sharedmem;
char *selected_memory = sharedmem;
int16_t *accumulator_decomposed = (int16_t *)selected_memory;
Torus *accumulator = (Torus*)accumulator_decomposed +
polynomial_size / (sizeof(Torus) / sizeof(int16_t));
double2 *accumulator_fft = (double2*)accumulator +
polynomial_size / (sizeof(double2) / sizeof(Torus));
Torus *accumulator = (Torus *)accumulator_decomposed +
polynomial_size / (sizeof(Torus) / sizeof(int16_t));
double2 *accumulator_fft =
(double2 *)accumulator +
polynomial_size / (sizeof(double2) / sizeof(Torus));
// Reuse memory from accumulator_fft for accumulator_rotated
Torus* accumulator_rotated = (Torus*)accumulator_fft;
Torus *accumulator_rotated = (Torus *)accumulator_fft;
// The third dimension of the block is used to determine on which ciphertext
// this block is operating, in the case of batch bootstraps
auto block_lwe_in = &lwe_in[blockIdx.z * (lwe_mask_size + 1)];
auto block_lut_vector =
&lut_vector[blockIdx.z * params::degree * 2];
auto block_lut_vector = &lut_vector[blockIdx.z * params::degree * 2];
auto block_mask_join_buffer = &mask_join_buffer[blockIdx.z * l_gadget * params::degree / 2];
auto block_body_join_buffer = &body_join_buffer[blockIdx.z * l_gadget * params::degree / 2];
auto block_mask_join_buffer =
&mask_join_buffer[blockIdx.z * l_gadget * params::degree / 2];
auto block_body_join_buffer =
&body_join_buffer[blockIdx.z * l_gadget * params::degree / 2];
// Since the space is L1 cache is small, we use the same memory location for
// the rotated accumulator and the fft accumulator, since we know that the
@@ -192,19 +182,17 @@ __global__ void device_bootstrap_low_latency(
GadgetMatrix<Torus, params> gadget(base_log, l_gadget);
// Put "b" in [0, 2N[
Torus b_hat = rescale_torus_element(
block_lwe_in[lwe_mask_size],
2 * params::degree);
Torus b_hat =
rescale_torus_element(block_lwe_in[lwe_mask_size], 2 * params::degree);
if (blockIdx.y == 0) {
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, block_lut_vector, b_hat, false);
}
else {
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[params::degree], b_hat, false);
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, block_lut_vector, b_hat, false);
} else {
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[params::degree], b_hat, false);
}
for (int i = 0; i < lwe_mask_size; i++) {
@@ -217,15 +205,14 @@ __global__ void device_bootstrap_low_latency(
// Perform ACC * (X^â - 1)
multiply_by_monomial_negacyclic_and_sub_polynomial<
Torus, params::opt, params::degree / params::opt>(
accumulator, accumulator_rotated, a_hat);
Torus, params::opt, params::degree / params::opt>(
accumulator, accumulator_rotated, a_hat);
// Perform a rounding to increase the accuracy of the
// bootstrapped ciphertext
round_to_closest_multiple_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator_rotated, base_log, l_gadget);
params::degree / params::opt>(
accumulator_rotated, base_log, l_gadget);
// Decompose the accumulator. Each block gets one level of the
// decomposition, for the mask and the body (so block 0 will have the
@@ -239,15 +226,11 @@ __global__ void device_bootstrap_low_latency(
synchronize_threads_in_block();
// Perform G^-1(ACC) * RGSW -> TRLWE
mul_trgsw_trlwe<Torus, params>(
accumulator,
accumulator_fft,
accumulator_decomposed,
block_mask_join_buffer,
block_body_join_buffer,
bootstrapping_key,
accumulator, accumulator_fft, accumulator_decomposed,
block_mask_join_buffer, block_body_join_buffer, bootstrapping_key,
polynomial_size, l_gadget, i, grid);
}
auto block_lwe_out = &lwe_out[blockIdx.z * (polynomial_size + 1)];
if (blockIdx.x == 0 && blockIdx.y == 0) {
@@ -258,41 +241,31 @@ __global__ void device_bootstrap_low_latency(
} else if (blockIdx.x == 0 && blockIdx.y == 1) {
sample_extract_body<Torus, params>(block_lwe_out, accumulator);
}
}
/*
* Host wrapper to the low latency version
* of bootstrapping
*/
template <typename Torus, class params>
__host__ void host_bootstrap_low_latency(
void *v_stream,
Torus *lwe_out,
Torus *lut_vector,
uint32_t *lut_vector_indexes,
Torus *lwe_in,
double2 *bootstrapping_key,
uint32_t lwe_mask_size,
uint32_t polynomial_size,
uint32_t base_log,
uint32_t l_gadget,
uint32_t num_samples,
uint32_t num_lut_vectors) {
void *v_stream, Torus *lwe_out, Torus *lut_vector,
uint32_t *lut_vector_indexes, Torus *lwe_in, double2 *bootstrapping_key,
uint32_t lwe_mask_size, uint32_t polynomial_size, uint32_t base_log,
uint32_t l_gadget, uint32_t num_samples, uint32_t num_lut_vectors) {
auto stream = static_cast<cudaStream_t *>(v_stream);
int buffer_size_per_gpu = l_gadget * num_samples * polynomial_size / 2 * sizeof(double2);
int buffer_size_per_gpu =
l_gadget * num_samples * polynomial_size / 2 * sizeof(double2);
double2 *mask_buffer_fft;
double2 *body_buffer_fft;
checkCudaErrors(cudaMalloc((void **)&mask_buffer_fft, buffer_size_per_gpu));
checkCudaErrors(cudaMalloc((void **)&body_buffer_fft, buffer_size_per_gpu));
int bytes_needed =
sizeof(int16_t) * polynomial_size + // accumulator_decomp
sizeof(Torus) * polynomial_size + // accumulator
sizeof(double2) * polynomial_size / 2; // accumulator fft
int bytes_needed = sizeof(int16_t) * polynomial_size + // accumulator_decomp
sizeof(Torus) * polynomial_size + // accumulator
sizeof(double2) * polynomial_size / 2; // accumulator fft
int thds = polynomial_size / params::opt;
dim3 grid(l_gadget, 2, num_samples);
@@ -307,17 +280,18 @@ __host__ void host_bootstrap_low_latency(
kernel_args[6] = &lwe_mask_size;
kernel_args[7] = &polynomial_size;
kernel_args[8] = &base_log;
kernel_args[9] =&l_gadget;
kernel_args[9] = &l_gadget;
checkCudaErrors(cudaFuncSetAttribute(device_bootstrap_low_latency<Torus,
params>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
bytes_needed));
checkCudaErrors(cudaFuncSetAttribute(
device_bootstrap_low_latency<Torus, params>,
cudaFuncAttributeMaxDynamicSharedMemorySize, bytes_needed));
cudaFuncSetCacheConfig(device_bootstrap_low_latency<Torus, params>,
cudaFuncCachePreferShared);
checkCudaErrors(cudaLaunchCooperativeKernel ( (void *)device_bootstrap_low_latency<Torus, params>, grid, thds, (void**)kernel_args, bytes_needed, *stream )) ;
cudaFuncCachePreferShared);
checkCudaErrors(cudaLaunchCooperativeKernel(
(void *)device_bootstrap_low_latency<Torus, params>, grid, thds,
(void **)kernel_args, bytes_needed, *stream));
// Synchronize the streams before copying the result to lwe_out at the right
// place
cudaStreamSynchronize(*stream);
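
Since this path depends on cooperative groups, a minimal self-contained illustration of the launch pattern may help: a kernel that calls grid.sync() must be started through cudaLaunchCooperativeKernel with an array of argument pointers, as done above. The kernel, sizes and buffer names below are assumptions, not code from this commit.

// Illustrative cooperative-launch sketch; typically built with nvcc -rdc=true.
#include <cooperative_groups.h>
#include <cuda_runtime.h>

using namespace cooperative_groups;

// Each block writes a flag, then every block waits for all others before
// reading: this cross-block dependency is what requires a cooperative launch.
__global__ void grid_sync_demo(int *flags, int *out) {
  grid_group grid = this_grid();
  if (threadIdx.x == 0)
    flags[blockIdx.x] = 1;
  grid.sync(); // valid only for cooperatively launched kernels
  if (threadIdx.x == 0) {
    int sum = 0;
    for (int b = 0; b < gridDim.x; b++)
      sum += flags[b];
    out[blockIdx.x] = sum;
  }
}

void launch_demo(int *d_flags, int *d_out, cudaStream_t stream) {
  dim3 blocks(8), threads(128);      // assumed launch configuration
  void *args[] = {&d_flags, &d_out}; // pointers to the kernel arguments
  cudaLaunchCooperativeKernel((void *)grid_sync_demo, blocks, threads, args,
                              /*sharedMem=*/0, stream);
  cudaStreamSynchronize(stream);
}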

View File

@@ -1,167 +1,144 @@
#include "bootstrap_wop.cuh"
void cuda_cmux_tree_32(
void *v_stream,
void *glwe_out,
void *ggsw_in,
void *lut_vector,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t base_log,
uint32_t l_gadget,
uint32_t r,
uint32_t max_shared_memory) {
void cuda_cmux_tree_32(void *v_stream, void *glwe_out, void *ggsw_in,
void *lut_vector, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log,
uint32_t l_gadget, uint32_t r,
uint32_t max_shared_memory) {
assert(("Error (GPU Cmux tree): base log should be <= 16", base_log <= 16));
assert(("Error (GPU Cmux tree): polynomial size should be one of 512, 1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
// For larger k we will need to adjust the mask size
assert(("Error (GPU Cmux tree): glwe_dimension should be equal to 1", glwe_dimension == 1));
assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should be >= 1 ",
r >= 1));
assert(("Error (GPU Cmux tree): base log should be <= 16", base_log <= 16));
assert(("Error (GPU Cmux tree): polynomial size should be one of 512, 1024, "
"2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 ||
polynomial_size == 2048 || polynomial_size == 4096 ||
polynomial_size == 8192));
// For larger k we will need to adjust the mask size
assert(("Error (GPU Cmux tree): glwe_dimension should be equal to 1",
glwe_dimension == 1));
assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should "
"be >= 1 ",
r >= 1));
switch (polynomial_size) {
case 512:
host_cmux_tree<uint32_t, int32_t, Degree<512>>(
v_stream,
(uint32_t *) glwe_out, (uint32_t *) ggsw_in, (uint32_t *) lut_vector,
glwe_dimension, polynomial_size, base_log, l_gadget, r,
max_shared_memory);
break;
case 1024:
host_cmux_tree<uint32_t, int32_t, Degree<1024>>(
v_stream,
(uint32_t *) glwe_out, (uint32_t *) ggsw_in, (uint32_t *) lut_vector,
glwe_dimension, polynomial_size, base_log, l_gadget, r,
max_shared_memory);
break;
case 2048:
host_cmux_tree<uint32_t, int32_t, Degree<2048>>(
v_stream,
(uint32_t *) glwe_out, (uint32_t *) ggsw_in, (uint32_t *) lut_vector,
glwe_dimension, polynomial_size, base_log, l_gadget, r,
max_shared_memory);
break;
case 4096:
host_cmux_tree<uint32_t, int32_t, Degree<4096>>(
v_stream,
(uint32_t *) glwe_out, (uint32_t *) ggsw_in, (uint32_t *) lut_vector,
glwe_dimension, polynomial_size, base_log, l_gadget, r,
max_shared_memory);
break;
case 8192:
host_cmux_tree<uint32_t, int32_t, Degree<8192>>(
v_stream,
(uint32_t *) glwe_out, (uint32_t *) ggsw_in, (uint32_t *) lut_vector,
glwe_dimension, polynomial_size, base_log, l_gadget, r,
max_shared_memory);
break;
default:
break;
}
switch (polynomial_size) {
case 512:
host_cmux_tree<uint32_t, int32_t, Degree<512>>(
v_stream, (uint32_t *)glwe_out, (uint32_t *)ggsw_in,
(uint32_t *)lut_vector, glwe_dimension, polynomial_size, base_log,
l_gadget, r, max_shared_memory);
break;
case 1024:
host_cmux_tree<uint32_t, int32_t, Degree<1024>>(
v_stream, (uint32_t *)glwe_out, (uint32_t *)ggsw_in,
(uint32_t *)lut_vector, glwe_dimension, polynomial_size, base_log,
l_gadget, r, max_shared_memory);
break;
case 2048:
host_cmux_tree<uint32_t, int32_t, Degree<2048>>(
v_stream, (uint32_t *)glwe_out, (uint32_t *)ggsw_in,
(uint32_t *)lut_vector, glwe_dimension, polynomial_size, base_log,
l_gadget, r, max_shared_memory);
break;
case 4096:
host_cmux_tree<uint32_t, int32_t, Degree<4096>>(
v_stream, (uint32_t *)glwe_out, (uint32_t *)ggsw_in,
(uint32_t *)lut_vector, glwe_dimension, polynomial_size, base_log,
l_gadget, r, max_shared_memory);
break;
case 8192:
host_cmux_tree<uint32_t, int32_t, Degree<8192>>(
v_stream, (uint32_t *)glwe_out, (uint32_t *)ggsw_in,
(uint32_t *)lut_vector, glwe_dimension, polynomial_size, base_log,
l_gadget, r, max_shared_memory);
break;
default:
break;
}
}
void cuda_cmux_tree_64(
void *v_stream,
void *glwe_out,
void *ggsw_in,
void *lut_vector,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t base_log,
uint32_t l_gadget,
uint32_t r,
uint32_t max_shared_memory) {
void cuda_cmux_tree_64(void *v_stream, void *glwe_out, void *ggsw_in,
void *lut_vector, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log,
uint32_t l_gadget, uint32_t r,
uint32_t max_shared_memory) {
assert(("Error (GPU Cmux tree): base log should be <= 16", base_log <= 16));
assert(("Error (GPU Cmux tree): polynomial size should be one of 512, 1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
// For larger k we will need to adjust the mask size
assert(("Error (GPU Cmux tree): glwe_dimension should be equal to 1", glwe_dimension == 1));
assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should be >= 1 ",
r >= 1));
assert(("Error (GPU Cmux tree): base log should be <= 16", base_log <= 16));
assert(("Error (GPU Cmux tree): polynomial size should be one of 512, 1024, "
"2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 ||
polynomial_size == 2048 || polynomial_size == 4096 ||
polynomial_size == 8192));
// For larger k we will need to adjust the mask size
assert(("Error (GPU Cmux tree): glwe_dimension should be equal to 1",
glwe_dimension == 1));
assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should "
"be >= 1 ",
r >= 1));
switch (polynomial_size) {
case 512:
host_cmux_tree<uint64_t, int64_t, Degree<512>>(
v_stream,
(uint64_t *) glwe_out, (uint64_t *) ggsw_in,(uint64_t *) lut_vector,
glwe_dimension, polynomial_size, base_log, l_gadget, r,
max_shared_memory);
break;
case 1024:
host_cmux_tree<uint64_t, int64_t, Degree<1024>>(
v_stream,
(uint64_t *) glwe_out, (uint64_t *) ggsw_in,(uint64_t *) lut_vector,
glwe_dimension, polynomial_size, base_log, l_gadget, r,
max_shared_memory);
break;
case 2048:
host_cmux_tree<uint64_t, int64_t, Degree<2048>>(
v_stream,
(uint64_t *) glwe_out, (uint64_t *) ggsw_in,(uint64_t *) lut_vector,
glwe_dimension, polynomial_size, base_log, l_gadget, r,
max_shared_memory);
break;
case 4096:
host_cmux_tree<uint64_t, int64_t, Degree<4096>>(
v_stream,
(uint64_t *) glwe_out, (uint64_t *) ggsw_in,(uint64_t *) lut_vector,
glwe_dimension, polynomial_size, base_log, l_gadget, r,
max_shared_memory);
break;
case 8192:
host_cmux_tree<uint64_t, int64_t, Degree<8192>>(
v_stream,
(uint64_t *) glwe_out, (uint64_t *) ggsw_in,(uint64_t *) lut_vector,
glwe_dimension, polynomial_size, base_log, l_gadget, r,
max_shared_memory);
break;
default:
break;
}
switch (polynomial_size) {
case 512:
host_cmux_tree<uint64_t, int64_t, Degree<512>>(
v_stream, (uint64_t *)glwe_out, (uint64_t *)ggsw_in,
(uint64_t *)lut_vector, glwe_dimension, polynomial_size, base_log,
l_gadget, r, max_shared_memory);
break;
case 1024:
host_cmux_tree<uint64_t, int64_t, Degree<1024>>(
v_stream, (uint64_t *)glwe_out, (uint64_t *)ggsw_in,
(uint64_t *)lut_vector, glwe_dimension, polynomial_size, base_log,
l_gadget, r, max_shared_memory);
break;
case 2048:
host_cmux_tree<uint64_t, int64_t, Degree<2048>>(
v_stream, (uint64_t *)glwe_out, (uint64_t *)ggsw_in,
(uint64_t *)lut_vector, glwe_dimension, polynomial_size, base_log,
l_gadget, r, max_shared_memory);
break;
case 4096:
host_cmux_tree<uint64_t, int64_t, Degree<4096>>(
v_stream, (uint64_t *)glwe_out, (uint64_t *)ggsw_in,
(uint64_t *)lut_vector, glwe_dimension, polynomial_size, base_log,
l_gadget, r, max_shared_memory);
break;
case 8192:
host_cmux_tree<uint64_t, int64_t, Degree<8192>>(
v_stream, (uint64_t *)glwe_out, (uint64_t *)ggsw_in,
(uint64_t *)lut_vector, glwe_dimension, polynomial_size, base_log,
l_gadget, r, max_shared_memory);
break;
default:
break;
}
}
void cuda_extract_bits_32(
void *v_stream,
void *list_lwe_out,
void *lwe_in,
void *lwe_in_buffer,
void *lwe_in_shifted_buffer,
void *lwe_out_ks_buffer,
void *lwe_out_pbs_buffer,
void *lut_pbs,
void *lut_vector_indexes,
void *ksk,
void *fourier_bsk,
uint32_t number_of_bits,
uint32_t delta_log,
uint32_t lwe_dimension_before,
uint32_t lwe_dimension_after,
uint32_t glwe_dimension,
uint32_t base_log_bsk,
uint32_t l_gadget_bsk,
uint32_t base_log_ksk,
uint32_t l_gadget_ksk,
uint32_t number_of_samples)
{
assert(("Error (GPU extract bits): base log should be <= 16", base_log_bsk <= 16));
assert(("Error (GPU extract bits): glwe_dimension should be equal to 1", glwe_dimension == 1));
assert(("Error (GPU extract bits): lwe_dimension_before should be one of 512, 1024, 2048",
lwe_dimension_before == 512 || lwe_dimension_before == 1024 ||
lwe_dimension_before == 2048));
// The number of samples should be lower than the number of streaming
// multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
// to the occupancy of 50%). The only supported value for k is 1, so
// k + 1 = 2 for now.
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
assert(("Error (GPU extract bits): the number of input LWEs must be lower or equal to the "
"number of streaming multiprocessors on the device divided by 8 * l_gadget_bsk",
number_of_samples <= number_of_sm / 4. / 2. / l_gadget_bsk));
void cuda_extract_bits_32(void *v_stream, void *list_lwe_out, void *lwe_in,
void *lwe_in_buffer, void *lwe_in_shifted_buffer,
void *lwe_out_ks_buffer, void *lwe_out_pbs_buffer,
void *lut_pbs, void *lut_vector_indexes, void *ksk,
void *fourier_bsk, uint32_t number_of_bits,
uint32_t delta_log, uint32_t lwe_dimension_before,
uint32_t lwe_dimension_after, uint32_t glwe_dimension,
uint32_t base_log_bsk, uint32_t l_gadget_bsk,
uint32_t base_log_ksk, uint32_t l_gadget_ksk,
uint32_t number_of_samples) {
assert(("Error (GPU extract bits): base log should be <= 16",
base_log_bsk <= 16));
assert(("Error (GPU extract bits): glwe_dimension should be equal to 1",
glwe_dimension == 1));
assert(("Error (GPU extract bits): lwe_dimension_before should be one of "
"512, 1024, 2048",
lwe_dimension_before == 512 || lwe_dimension_before == 1024 ||
lwe_dimension_before == 2048));
// The number of samples should be lower than the number of streaming
// multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
// to the occupancy of 50%). The only supported value for k is 1, so
// k + 1 = 2 for now.
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
assert(("Error (GPU extract bits): the number of input LWEs must be lower or "
"equal to the "
"number of streaming multiprocessors on the device divided by 8 * "
"l_gadget_bsk",
number_of_samples <= number_of_sm / 4. / 2. / l_gadget_bsk));
switch (lwe_dimension_before) {
case 512:
@@ -170,9 +147,9 @@ void cuda_extract_bits_32(
(uint32_t *)lwe_in_buffer, (uint32_t *)lwe_in_shifted_buffer,
(uint32_t *)lwe_out_ks_buffer, (uint32_t *)lwe_out_pbs_buffer,
(uint32_t *)lut_pbs, (uint32_t *)lut_vector_indexes, (uint32_t *)ksk,
(double2 *)fourier_bsk, number_of_bits, delta_log,
lwe_dimension_before, lwe_dimension_after, base_log_bsk, l_gadget_bsk,
base_log_ksk, l_gadget_ksk, number_of_samples);
(double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_before,
lwe_dimension_after, base_log_bsk, l_gadget_bsk, base_log_ksk,
l_gadget_ksk, number_of_samples);
break;
case 1024:
host_extract_bits<uint32_t, Degree<1024>>(
@@ -180,9 +157,9 @@ void cuda_extract_bits_32(
(uint32_t *)lwe_in_buffer, (uint32_t *)lwe_in_shifted_buffer,
(uint32_t *)lwe_out_ks_buffer, (uint32_t *)lwe_out_pbs_buffer,
(uint32_t *)lut_pbs, (uint32_t *)lut_vector_indexes, (uint32_t *)ksk,
(double2 *)fourier_bsk, number_of_bits, delta_log,
lwe_dimension_before, lwe_dimension_after, base_log_bsk, l_gadget_bsk,
base_log_ksk, l_gadget_ksk, number_of_samples);
(double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_before,
lwe_dimension_after, base_log_bsk, l_gadget_bsk, base_log_ksk,
l_gadget_ksk, number_of_samples);
break;
case 2048:
host_extract_bits<uint32_t, Degree<2048>>(
@@ -190,55 +167,44 @@ void cuda_extract_bits_32(
(uint32_t *)lwe_in_buffer, (uint32_t *)lwe_in_shifted_buffer,
(uint32_t *)lwe_out_ks_buffer, (uint32_t *)lwe_out_pbs_buffer,
(uint32_t *)lut_pbs, (uint32_t *)lut_vector_indexes, (uint32_t *)ksk,
(double2 *)fourier_bsk, number_of_bits, delta_log,
lwe_dimension_before, lwe_dimension_after, base_log_bsk, l_gadget_bsk,
base_log_ksk, l_gadget_ksk, number_of_samples);
(double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_before,
lwe_dimension_after, base_log_bsk, l_gadget_bsk, base_log_ksk,
l_gadget_ksk, number_of_samples);
break;
default:
break;
}
}
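// Illustrative sketch (standalone C++, not part of the sources above): how the
// sample-count bound checked by the assert in cuda_extract_bits_32/64 can be
// derived on the host. number_of_sm = 108 and l_gadget_bsk = 3 are made-up
// example values; on a real device number_of_sm would come from
// cudaDeviceGetAttribute, and k is fixed to 1 as in the code above.
#include <cstdint>
#include <cstdio>

// Maximum number of input LWEs accepted by the bit-extract entry points,
// given the device SM count and the gadget decomposition level count.
// The factor 4 accounts for the targeted 50% occupancy; (k + 1) == 2.
static uint32_t max_extract_bits_samples(int number_of_sm,
                                         uint32_t l_gadget_bsk) {
  return static_cast<uint32_t>(number_of_sm / 4. / 2. / l_gadget_bsk);
}

int main() {
  // Example: a GPU with 108 SMs and l_gadget_bsk = 3 accepts up to 4 samples.
  std::printf("%u\n", max_extract_bits_samples(108, 3));
  return 0;
}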
void cuda_extract_bits_64(
void *v_stream,
void *list_lwe_out,
void *lwe_in,
void *lwe_in_buffer,
void *lwe_in_shifted_buffer,
void *lwe_out_ks_buffer,
void *lwe_out_pbs_buffer,
void *lut_pbs,
void *lut_vector_indexes,
void *ksk,
void *fourier_bsk,
uint32_t number_of_bits,
uint32_t delta_log,
uint32_t lwe_dimension_before,
uint32_t lwe_dimension_after,
uint32_t glwe_dimension,
uint32_t base_log_bsk,
uint32_t l_gadget_bsk,
uint32_t base_log_ksk,
uint32_t l_gadget_ksk,
uint32_t number_of_samples)
{
assert(("Error (GPU extract bits): base log should be <= 16", base_log_bsk <= 16));
assert(("Error (GPU extract bits): glwe_dimension should be equal to 1", glwe_dimension == 1));
assert(("Error (GPU extract bits): lwe_dimension_before should be one of 512, 1024, 2048",
lwe_dimension_before == 512 || lwe_dimension_before == 1024 ||
lwe_dimension_before == 2048));
// The number of samples should be lower than the number of streaming
// multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
// to the occupancy of 50%). The only supported value for k is 1, so
// k + 1 = 2 for now.
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
assert(("Error (GPU extract bits): the number of input LWEs must be lower or equal to the "
"number of streaming multiprocessors on the device divided by 8 * l_gadget_bsk",
number_of_samples <= number_of_sm / 4. / 2. / l_gadget_bsk));
void cuda_extract_bits_64(void *v_stream, void *list_lwe_out, void *lwe_in,
void *lwe_in_buffer, void *lwe_in_shifted_buffer,
void *lwe_out_ks_buffer, void *lwe_out_pbs_buffer,
void *lut_pbs, void *lut_vector_indexes, void *ksk,
void *fourier_bsk, uint32_t number_of_bits,
uint32_t delta_log, uint32_t lwe_dimension_before,
uint32_t lwe_dimension_after, uint32_t glwe_dimension,
uint32_t base_log_bsk, uint32_t l_gadget_bsk,
uint32_t base_log_ksk, uint32_t l_gadget_ksk,
uint32_t number_of_samples) {
assert(("Error (GPU extract bits): base log should be <= 16",
base_log_bsk <= 16));
assert(("Error (GPU extract bits): glwe_dimension should be equal to 1",
glwe_dimension == 1));
assert(("Error (GPU extract bits): lwe_dimension_before should be one of "
"512, 1024, 2048",
lwe_dimension_before == 512 || lwe_dimension_before == 1024 ||
lwe_dimension_before == 2048));
// The number of samples should be lower than the number of streaming
// multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
// to the occupancy of 50%). The only supported value for k is 1, so
// k + 1 = 2 for now.
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
assert(("Error (GPU extract bits): the number of input LWEs must be lower or "
"equal to the "
"number of streaming multiprocessors on the device divided by 8 * "
"l_gadget_bsk",
number_of_samples <= number_of_sm / 4. / 2. / l_gadget_bsk));
switch (lwe_dimension_before) {
case 512:
@@ -247,9 +213,9 @@ void cuda_extract_bits_64(
(uint64_t *)lwe_in_buffer, (uint64_t *)lwe_in_shifted_buffer,
(uint64_t *)lwe_out_ks_buffer, (uint64_t *)lwe_out_pbs_buffer,
(uint64_t *)lut_pbs, (uint32_t *)lut_vector_indexes, (uint64_t *)ksk,
(double2 *)fourier_bsk, number_of_bits, delta_log,
lwe_dimension_before, lwe_dimension_after, base_log_bsk, l_gadget_bsk,
base_log_ksk, l_gadget_ksk, number_of_samples);
(double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_before,
lwe_dimension_after, base_log_bsk, l_gadget_bsk, base_log_ksk,
l_gadget_ksk, number_of_samples);
break;
case 1024:
host_extract_bits<uint64_t, Degree<1024>>(
@@ -257,9 +223,9 @@ void cuda_extract_bits_64(
(uint64_t *)lwe_in_buffer, (uint64_t *)lwe_in_shifted_buffer,
(uint64_t *)lwe_out_ks_buffer, (uint64_t *)lwe_out_pbs_buffer,
(uint64_t *)lut_pbs, (uint32_t *)lut_vector_indexes, (uint64_t *)ksk,
(double2 *)fourier_bsk, number_of_bits, delta_log,
lwe_dimension_before, lwe_dimension_after, base_log_bsk, l_gadget_bsk,
base_log_ksk, l_gadget_ksk, number_of_samples);
(double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_before,
lwe_dimension_after, base_log_bsk, l_gadget_bsk, base_log_ksk,
l_gadget_ksk, number_of_samples);
break;
case 2048:
host_extract_bits<uint64_t, Degree<2048>>(
@@ -267,14 +233,11 @@ void cuda_extract_bits_64(
(uint64_t *)lwe_in_buffer, (uint64_t *)lwe_in_shifted_buffer,
(uint64_t *)lwe_out_ks_buffer, (uint64_t *)lwe_out_pbs_buffer,
(uint64_t *)lut_pbs, (uint32_t *)lut_vector_indexes, (uint64_t *)ksk,
(double2 *)fourier_bsk, number_of_bits, delta_log,
lwe_dimension_before, lwe_dimension_after, base_log_bsk, l_gadget_bsk,
base_log_ksk, l_gadget_ksk, number_of_samples);
(double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_before,
lwe_dimension_after, base_log_bsk, l_gadget_bsk, base_log_ksk,
l_gadget_ksk, number_of_samples);
break;
default:
break;
}
}

@@ -5,79 +5,80 @@
#include "../include/helper_cuda.h"
#include "bootstrap.h"
#include "bootstrap_low_latency.cuh"
#include "complex/operations.cuh"
#include "crypto/ggsw.cuh"
#include "crypto/torus.cuh"
#include "fft/bnsmfft.cuh"
#include "fft/smfft.cuh"
#include "fft/twiddles.cuh"
#include "keyswitch.cuh"
#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial.cuh"
#include "polynomial/polynomial_math.cuh"
#include "utils/memory.cuh"
#include "utils/timer.cuh"
#include "keyswitch.cuh"
#include "bootstrap_low_latency.cuh"
#include "crypto/ggsw.cuh"
template <typename T, class params>
__device__ void fft(double2 *output, T *input){
synchronize_threads_in_block();
__device__ void fft(double2 *output, T *input) {
synchronize_threads_in_block();
// Reduce the size of the FFT to be performed by storing
// the real-valued polynomial into a complex polynomial
real_to_complex_compressed<T, params>(input, output);
synchronize_threads_in_block();
// Reduce the size of the FFT to be performed by storing
// the real-valued polynomial into a complex polynomial
real_to_complex_compressed<T, params>(input, output);
synchronize_threads_in_block();
// Switch to the FFT space
NSMFFT_direct<HalfDegree<params>>(output);
synchronize_threads_in_block();
// Switch to the FFT space
NSMFFT_direct<HalfDegree<params>>(output);
synchronize_threads_in_block();
correction_direct_fft_inplace<params>(output);
synchronize_threads_in_block();
correction_direct_fft_inplace<params>(output);
synchronize_threads_in_block();
}
template <typename T, typename ST, class params>
__device__ void fft(double2 *output, T *input){
synchronize_threads_in_block();
__device__ void fft(double2 *output, T *input) {
synchronize_threads_in_block();
// Reduce the size of the FFT to be performed by storing
// the real-valued polynomial into a complex polynomial
real_to_complex_compressed<T, ST, params>(input, output);
synchronize_threads_in_block();
// Reduce the size of the FFT to be performed by storing
// the real-valued polynomial into a complex polynomial
real_to_complex_compressed<T, ST, params>(input, output);
synchronize_threads_in_block();
// Switch to the FFT space
NSMFFT_direct<HalfDegree<params>>(output);
synchronize_threads_in_block();
// Switch to the FFT space
NSMFFT_direct<HalfDegree<params>>(output);
synchronize_threads_in_block();
correction_direct_fft_inplace<params>(output);
synchronize_threads_in_block();
correction_direct_fft_inplace<params>(output);
synchronize_threads_in_block();
}
template <class params>
__device__ void ifft_inplace(double2 *data){
synchronize_threads_in_block();
template <class params> __device__ void ifft_inplace(double2 *data) {
synchronize_threads_in_block();
correction_inverse_fft_inplace<params>(data);
synchronize_threads_in_block();
correction_inverse_fft_inplace<params>(data);
synchronize_threads_in_block();
NSMFFT_inverse<HalfDegree<params>>(data);
synchronize_threads_in_block();
NSMFFT_inverse<HalfDegree<params>>(data);
synchronize_threads_in_block();
}
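// Illustrative sketch (standalone C++, not part of the sources above): the
// real_to_complex_compressed step used by fft() packs a real polynomial of
// size N into N/2 complex values so the negacyclic FFT runs on half as many
// points. The pairing below (consecutive coefficients) is one common layout;
// the exact layout used on the GPU may differ.
#include <complex>
#include <cstdio>
#include <vector>

// Pack consecutive pairs of real coefficients into one complex value.
std::vector<std::complex<double>> pack_real_polynomial(
    const std::vector<double> &poly) {
  std::vector<std::complex<double>> packed(poly.size() / 2);
  for (size_t i = 0; i < packed.size(); ++i)
    packed[i] = {poly[2 * i], poly[2 * i + 1]};
  return packed;
}

int main() {
  std::vector<double> poly = {1., 2., 3., 4.}; // 4-coefficient example
  auto packed = pack_real_polynomial(poly);    // 2 complex points
  std::printf("%zu complex points\n", packed.size());
  return 0;
}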
/*
* Receives an array of GLWE ciphertexts and two indexes to ciphertexts in this
* array, and an array of GGSW ciphertexts with an index to one ciphertext in
* it. Computes a CMUX with these operands and writes the output to a
* particular index of glwe_out.
*
* This function needs polynomial_size threads per block.
*
* - glwe_out: An array where the result should be written to.
* - glwe_in: An array where the GLWE inputs are stored.
* - ggsw_in: An array where the GGSW input is stored. In the fourier domain.
* - selected_memory: An array to be used for the accumulators. Can be in
* shared or global memory.
* - output_idx: The index of the output where the glwe ciphertext should be
* written.
* - input_idx1: The index of the first glwe ciphertext we will use.
* - input_idx2: The index of the second glwe ciphertext we will use.
* - glwe_dim: This is k.
@@ -87,154 +88,146 @@ __device__ void ifft_inplace(double2 *data){
* - ggsw_idx: The index of the GGSW we will use.
*/
template <typename Torus, typename STorus, class params>
__device__ void cmux(
Torus *glwe_out, Torus* glwe_in, double2 *ggsw_in, char *selected_memory,
uint32_t output_idx, uint32_t input_idx1, uint32_t input_idx2,
uint32_t glwe_dim, uint32_t polynomial_size, uint32_t base_log, uint32_t l_gadget,
uint32_t ggsw_idx){
__device__ void cmux(Torus *glwe_out, Torus *glwe_in, double2 *ggsw_in,
char *selected_memory, uint32_t output_idx,
uint32_t input_idx1, uint32_t input_idx2,
uint32_t glwe_dim, uint32_t polynomial_size,
uint32_t base_log, uint32_t l_gadget, uint32_t ggsw_idx) {
// Define glwe_sub
Torus *glwe_sub_mask = (Torus *) selected_memory;
Torus *glwe_sub_body = (Torus *) glwe_sub_mask + (ptrdiff_t)polynomial_size;
// Define glwe_sub
Torus *glwe_sub_mask = (Torus *)selected_memory;
Torus *glwe_sub_body = (Torus *)glwe_sub_mask + (ptrdiff_t)polynomial_size;
int16_t *glwe_mask_decomposed = (int16_t *)(glwe_sub_body + polynomial_size);
int16_t *glwe_body_decomposed =
int16_t *glwe_mask_decomposed = (int16_t *)(glwe_sub_body + polynomial_size);
int16_t *glwe_body_decomposed =
(int16_t *)glwe_mask_decomposed + (ptrdiff_t)polynomial_size;
double2 *mask_res_fft = (double2 *)(glwe_body_decomposed +
polynomial_size);
double2 *body_res_fft =
(double2 *)mask_res_fft + (ptrdiff_t)polynomial_size / 2;
double2 *mask_res_fft = (double2 *)(glwe_body_decomposed + polynomial_size);
double2 *body_res_fft =
(double2 *)mask_res_fft + (ptrdiff_t)polynomial_size / 2;
double2 *glwe_fft =
(double2 *)body_res_fft + (ptrdiff_t)(polynomial_size / 2);
double2 *glwe_fft =
(double2 *)body_res_fft + (ptrdiff_t)(polynomial_size / 2);
GadgetMatrix<Torus, params> gadget(base_log, l_gadget);
GadgetMatrix<Torus, params> gadget(base_log, l_gadget);
/////////////////////////////////////
/////////////////////////////////////
// glwe2-glwe1
// glwe2-glwe1
// Copy m0 to shared memory to preserve data
auto m0_mask = &glwe_in[input_idx1 * (glwe_dim + 1) * polynomial_size];
auto m0_body = m0_mask + polynomial_size;
// Copy m0 to shared memory to preserve data
auto m0_mask = &glwe_in[input_idx1 * (glwe_dim + 1) * polynomial_size];
auto m0_body = m0_mask + polynomial_size;
// Just gets the pointer for m1 on global memory
auto m1_mask = &glwe_in[input_idx2 * (glwe_dim + 1) * polynomial_size];
auto m1_body = m1_mask + polynomial_size;
// Just gets the pointer for m1 on global memory
auto m1_mask = &glwe_in[input_idx2 * (glwe_dim + 1) * polynomial_size];
auto m1_body = m1_mask + polynomial_size;
// Mask
sub_polynomial<Torus, params>(
glwe_sub_mask, m1_mask, m0_mask
);
// Body
sub_polynomial<Torus, params>(
glwe_sub_body, m1_body, m0_body
);
// Mask
sub_polynomial<Torus, params>(glwe_sub_mask, m1_mask, m0_mask);
// Body
sub_polynomial<Torus, params>(glwe_sub_body, m1_body, m0_body);
synchronize_threads_in_block();
// Initialize the polynomial multiplication via FFT arrays
// The polynomial multiplications happen at the block level
// and each thread handles two or more coefficients
int pos = threadIdx.x;
for (int j = 0; j < params::opt / 2; j++) {
mask_res_fft[pos].x = 0;
mask_res_fft[pos].y = 0;
body_res_fft[pos].x = 0;
body_res_fft[pos].y = 0;
pos += params::degree / params::opt;
}
// Subtract each glwe operand, decompose the resulting
// polynomial coefficients to multiply each decomposed level
// with the corresponding part of the LUT
for (int decomp_level = 0; decomp_level < l_gadget; decomp_level++) {
// Decomposition
gadget.decompose_one_level(glwe_mask_decomposed, glwe_sub_mask,
decomp_level);
gadget.decompose_one_level(glwe_body_decomposed, glwe_sub_body,
decomp_level);
// First, perform the polynomial multiplication for the mask
synchronize_threads_in_block();
fft<int16_t, params>(glwe_fft, glwe_mask_decomposed);
// External product and accumulate
// Get the piece necessary for the multiplication
auto mask_fourier =
get_ith_mask_kth_block(ggsw_in, ggsw_idx, 0, decomp_level,
polynomial_size, glwe_dim, l_gadget);
auto body_fourier =
get_ith_body_kth_block(ggsw_in, ggsw_idx, 0, decomp_level,
polynomial_size, glwe_dim, l_gadget);
synchronize_threads_in_block();
// Initialize the polynomial multiplication via FFT arrays
// The polynomial multiplications happen at the block level
// and each thread handles two or more coefficients
int pos = threadIdx.x;
for (int j = 0; j < params::opt / 2; j++) {
mask_res_fft[pos].x = 0;
mask_res_fft[pos].y = 0;
body_res_fft[pos].x = 0;
body_res_fft[pos].y = 0;
pos += params::degree / params::opt;
}
// Subtract each glwe operand, decompose the resulting
// polynomial coefficients to multiply each decomposed level
// with the corresponding part of the LUT
for (int decomp_level = 0; decomp_level < l_gadget; decomp_level++) {
// Decomposition
gadget.decompose_one_level(glwe_mask_decomposed,
glwe_sub_mask,
decomp_level);
gadget.decompose_one_level(glwe_body_decomposed,
glwe_sub_body,
decomp_level);
// First, perform the polynomial multiplication for the mask
synchronize_threads_in_block();
fft<int16_t, params>(glwe_fft, glwe_mask_decomposed);
// External product and accumulate
// Get the piece necessary for the multiplication
auto mask_fourier = get_ith_mask_kth_block(
ggsw_in, ggsw_idx, 0, decomp_level,
polynomial_size, glwe_dim, l_gadget);
auto body_fourier = get_ith_body_kth_block(
ggsw_in, ggsw_idx, 0, decomp_level,
polynomial_size, glwe_dim, l_gadget);
synchronize_threads_in_block();
// Perform the coefficient-wise product
synchronize_threads_in_block();
polynomial_product_accumulate_in_fourier_domain<params, double2>(
mask_res_fft, glwe_fft, mask_fourier);
polynomial_product_accumulate_in_fourier_domain<params, double2>(
body_res_fft, glwe_fft, body_fourier);
// Now handle the polynomial multiplication for the body
// in the same way
synchronize_threads_in_block();
fft<int16_t, params>(glwe_fft, glwe_body_decomposed);
// External product and accumulate
// Get the piece necessary for the multiplication
mask_fourier = get_ith_mask_kth_block(
ggsw_in, ggsw_idx, 1, decomp_level,
polynomial_size, glwe_dim, l_gadget);
body_fourier = get_ith_body_kth_block(
ggsw_in, ggsw_idx, 1, decomp_level,
polynomial_size, glwe_dim, l_gadget);
synchronize_threads_in_block();
polynomial_product_accumulate_in_fourier_domain<params, double2>(
mask_res_fft, glwe_fft, mask_fourier);
polynomial_product_accumulate_in_fourier_domain<params, double2>(
body_res_fft, glwe_fft, body_fourier);
}
// IFFT
// Perform the coefficient-wise product
synchronize_threads_in_block();
ifft_inplace<params>(mask_res_fft);
ifft_inplace<params>(body_res_fft);
polynomial_product_accumulate_in_fourier_domain<params, double2>(
mask_res_fft, glwe_fft, mask_fourier);
polynomial_product_accumulate_in_fourier_domain<params, double2>(
body_res_fft, glwe_fft, body_fourier);
// Now handle the polynomial multiplication for the body
// in the same way
synchronize_threads_in_block();
fft<int16_t, params>(glwe_fft, glwe_body_decomposed);
// External product and accumulate
// Get the piece necessary for the multiplication
mask_fourier = get_ith_mask_kth_block(ggsw_in, ggsw_idx, 1, decomp_level,
polynomial_size, glwe_dim, l_gadget);
body_fourier = get_ith_body_kth_block(ggsw_in, ggsw_idx, 1, decomp_level,
polynomial_size, glwe_dim, l_gadget);
synchronize_threads_in_block();
// Write the output
Torus *mb_mask = &glwe_out[output_idx * (glwe_dim + 1) * polynomial_size];
Torus *mb_body = mb_mask + polynomial_size;
polynomial_product_accumulate_in_fourier_domain<params, double2>(
mask_res_fft, glwe_fft, mask_fourier);
polynomial_product_accumulate_in_fourier_domain<params, double2>(
body_res_fft, glwe_fft, body_fourier);
}
int tid = threadIdx.x;
for(int i = 0; i < params::opt; i++){
mb_mask[tid] = m0_mask[tid];
mb_body[tid] = m0_body[tid];
tid += params::degree / params::opt;
}
// IFFT
synchronize_threads_in_block();
ifft_inplace<params>(mask_res_fft);
ifft_inplace<params>(body_res_fft);
synchronize_threads_in_block();
add_to_torus<Torus, params>(mask_res_fft, mb_mask);
add_to_torus<Torus, params>(body_res_fft, mb_body);
// Write the output
Torus *mb_mask = &glwe_out[output_idx * (glwe_dim + 1) * polynomial_size];
Torus *mb_body = mb_mask + polynomial_size;
int tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
mb_mask[tid] = m0_mask[tid];
mb_body[tid] = m0_body[tid];
tid += params::degree / params::opt;
}
add_to_torus<Torus, params>(mask_res_fft, mb_mask);
add_to_torus<Torus, params>(body_res_fft, mb_body);
}
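// Illustrative sketch (standalone C++, not part of the sources above): at the
// plaintext level the CMUX computed by the kernel above selects between two
// messages with a bit b, out = m0 + b * (m1 - m0), which is exactly the
// subtract / external-product / add structure implemented on ciphertexts.
#include <cassert>

// Plaintext analogue of the homomorphic CMUX: with a selector bit b,
// the output is m0 when b == 0 and m1 when b == 1.
static long cmux_plain(long m0, long m1, int b) { return m0 + b * (m1 - m0); }

int main() {
  assert(cmux_plain(7, 42, 0) == 7);
  assert(cmux_plain(7, 42, 1) == 42);
  return 0;
}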
/**
* Computes several CMUXes using an array of GLWE ciphertexts and a single GGSW
* ciphertext. The GLWE ciphertexts are picked two-by-two in sequence. Each
* thread block computes a single CMUX.
*
* - glwe_out: An array where the result should be written to.
* - glwe_in: An array where the GLWE inputs are stored.
* - ggsw_in: An array where the GGSW input is stored. In the fourier domain.
* - device_mem: A pointer to global memory, used in case the shared memory is
* not big enough to store the accumulators.
* - device_memory_size_per_block: Memory size needed to store all accumulators
* for a single block.
* - glwe_dim: This is k.
* - polynomial_size: size of the polynomials. This is N.
* - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
@@ -242,34 +235,29 @@ __device__ void cmux(
* - ggsw_idx: The index of the GGSW we will use.
*/
template <typename Torus, typename STorus, class params, sharedMemDegree SMD>
__global__ void device_batch_cmux(
Torus *glwe_out, Torus* glwe_in, double2 *ggsw_in,
char *device_mem, size_t device_memory_size_per_block,
uint32_t glwe_dim, uint32_t polynomial_size, uint32_t base_log, uint32_t l_gadget,
uint32_t ggsw_idx){
__global__ void
device_batch_cmux(Torus *glwe_out, Torus *glwe_in, double2 *ggsw_in,
char *device_mem, size_t device_memory_size_per_block,
uint32_t glwe_dim, uint32_t polynomial_size,
uint32_t base_log, uint32_t l_gadget, uint32_t ggsw_idx) {
int cmux_idx = blockIdx.x;
int output_idx = cmux_idx;
int input_idx1 = (cmux_idx << 1);
int input_idx2 = (cmux_idx << 1) + 1;
int cmux_idx = blockIdx.x;
int output_idx = cmux_idx;
int input_idx1 = (cmux_idx << 1);
int input_idx2 = (cmux_idx << 1) + 1;
// We use shared memory for intermediate result
extern __shared__ char sharedmem[];
char *selected_memory;
// We use shared memory for intermediate result
extern __shared__ char sharedmem[];
char *selected_memory;
if constexpr (SMD == FULLSM)
selected_memory = sharedmem;
else
selected_memory = &device_mem[blockIdx.x * device_memory_size_per_block];
cmux<Torus, STorus, params>(
glwe_out, glwe_in, ggsw_in,
selected_memory,
output_idx, input_idx1, input_idx2,
glwe_dim, polynomial_size,
base_log, l_gadget,
ggsw_idx);
if constexpr (SMD == FULLSM)
selected_memory = sharedmem;
else
selected_memory = &device_mem[blockIdx.x * device_memory_size_per_block];
cmux<Torus, STorus, params>(glwe_out, glwe_in, ggsw_in, selected_memory,
output_idx, input_idx1, input_idx2, glwe_dim,
polynomial_size, base_log, l_gadget, ggsw_idx);
}
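// Illustrative sketch (standalone C++, not part of the sources above): how
// device_batch_cmux pairs its inputs. Block i consumes GLWE ciphertexts 2i and
// 2i + 1 and writes output i, so a layer of 2^d inputs yields 2^(d-1) outputs.
// num_inputs = 8 is a made-up example value.
#include <cstdio>

int main() {
  int num_inputs = 8; // one layer of the tree, a power of two
  for (int cmux_idx = 0; cmux_idx < num_inputs / 2; ++cmux_idx) {
    int input_idx1 = (cmux_idx << 1);
    int input_idx2 = (cmux_idx << 1) + 1;
    std::printf("block %d: glwe_in[%d], glwe_in[%d] -> glwe_out[%d]\n",
                cmux_idx, input_idx1, input_idx2, cmux_idx);
  }
  return 0;
}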
/*
* This kernel executes the CMUX tree used by the hybrid packing of the WoPBS.
@@ -279,242 +267,222 @@ __global__ void device_batch_cmux(
* - v_stream: The CUDA stream that should be used.
* - glwe_out: A device array for the output GLWE ciphertext.
* - ggsw_in: A device array for the GGSW ciphertexts used in each layer.
* - lut_vector: A device array for the GLWE ciphertexts used in the first
* layer.
* - polynomial_size: size of the polynomials. This is N.
* - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
* - l_gadget: number of decomposition levels in the gadget matrix (~4)
* - r: Number of layers in the tree.
*/
template <typename Torus, typename STorus, class params>
void host_cmux_tree(
void *v_stream,
Torus *glwe_out,
Torus *ggsw_in,
Torus *lut_vector,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t base_log,
uint32_t l_gadget,
uint32_t r,
uint32_t max_shared_memory) {
void host_cmux_tree(void *v_stream, Torus *glwe_out, Torus *ggsw_in,
Torus *lut_vector, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log,
uint32_t l_gadget, uint32_t r, uint32_t max_shared_memory) {
auto stream = static_cast<cudaStream_t *>(v_stream);
int num_lut = (1<<r);
auto stream = static_cast<cudaStream_t *>(v_stream);
int num_lut = (1 << r);
cuda_initialize_twiddles(polynomial_size, 0);
cuda_initialize_twiddles(polynomial_size, 0);
int memory_needed_per_block =
sizeof(Torus) * polynomial_size + // glwe_sub_mask
sizeof(Torus) * polynomial_size + // glwe_sub_body
sizeof(int16_t) * polynomial_size + // glwe_mask_decomposed
sizeof(int16_t) * polynomial_size + // glwe_body_decomposed
sizeof(double2) * polynomial_size/2 + // mask_res_fft
sizeof(double2) * polynomial_size/2 + // body_res_fft
sizeof(double2) * polynomial_size/2; // glwe_fft
int memory_needed_per_block =
sizeof(Torus) * polynomial_size + // glwe_sub_mask
sizeof(Torus) * polynomial_size + // glwe_sub_body
sizeof(int16_t) * polynomial_size + // glwe_mask_decomposed
sizeof(int16_t) * polynomial_size + // glwe_body_decomposed
sizeof(double2) * polynomial_size / 2 + // mask_res_fft
sizeof(double2) * polynomial_size / 2 + // body_res_fft
sizeof(double2) * polynomial_size / 2; // glwe_fft
dim3 thds(polynomial_size / params::opt, 1, 1);
dim3 thds(polynomial_size / params::opt, 1, 1);
//////////////////////
// std::cout << "Applying the FFT on m^tree" << std::endl;
double2 *d_ggsw_fft_in;
int ggsw_size = r * polynomial_size * (glwe_dimension + 1) * (glwe_dimension + 1) * l_gadget;
//////////////////////
double2 *d_ggsw_fft_in;
int ggsw_size = r * polynomial_size * (glwe_dimension + 1) *
(glwe_dimension + 1) * l_gadget;
#if (CUDART_VERSION < 11020)
checkCudaErrors(cudaMalloc((void **)&d_ggsw_fft_in, ggsw_size * sizeof(double)));
#else
checkCudaErrors(cudaMallocAsync((void **)&d_ggsw_fft_in, ggsw_size * sizeof(double), *stream));
#endif
#if (CUDART_VERSION < 11020)
checkCudaErrors(
cudaMalloc((void **)&d_ggsw_fft_in, ggsw_size * sizeof(double)));
#else
checkCudaErrors(cudaMallocAsync((void **)&d_ggsw_fft_in,
ggsw_size * sizeof(double), *stream));
#endif
batch_fft_ggsw_vector<Torus, STorus, params>(
v_stream, d_ggsw_fft_in, ggsw_in, r, glwe_dimension, polynomial_size, l_gadget);
batch_fft_ggsw_vector<Torus, STorus, params>(v_stream, d_ggsw_fft_in, ggsw_in,
r, glwe_dimension,
polynomial_size, l_gadget);
//////////////////////
//////////////////////
// Allocate global memory in case parameters are too large
char *d_mem;
if (max_shared_memory < memory_needed_per_block) {
#if (CUDART_VERSION < 11020)
checkCudaErrors(cudaMalloc((void **) &d_mem, memory_needed_per_block * (1 << (r - 1))));
#else
checkCudaErrors(cudaMallocAsync((void **) &d_mem, memory_needed_per_block * (1 << (r - 1)), *stream));
#endif
}else{
checkCudaErrors(cudaFuncSetAttribute(
device_batch_cmux<Torus, STorus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
memory_needed_per_block));
checkCudaErrors(cudaFuncSetCacheConfig(
device_batch_cmux<Torus, STorus, params, FULLSM>,
cudaFuncCachePreferShared));
}
// Allocate global memory in case parameters are too large
char *d_mem;
if (max_shared_memory < memory_needed_per_block) {
#if (CUDART_VERSION < 11020)
checkCudaErrors(
cudaMalloc((void **)&d_mem, memory_needed_per_block * (1 << (r - 1))));
#else
checkCudaErrors(cudaMallocAsync(
(void **)&d_mem, memory_needed_per_block * (1 << (r - 1)), *stream));
#endif
} else {
checkCudaErrors(cudaFuncSetAttribute(
device_batch_cmux<Torus, STorus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, memory_needed_per_block));
checkCudaErrors(
cudaFuncSetCacheConfig(device_batch_cmux<Torus, STorus, params, FULLSM>,
cudaFuncCachePreferShared));
}
// Allocate buffers
int glwe_size = (glwe_dimension + 1) * polynomial_size;
Torus *d_buffer1, *d_buffer2;
// Allocate buffers
int glwe_size = (glwe_dimension + 1) * polynomial_size;
Torus *d_buffer1, *d_buffer2;
#if (CUDART_VERSION < 11020)
checkCudaErrors(cudaMalloc((void **)&d_buffer1, num_lut * glwe_size * sizeof(Torus)));
checkCudaErrors(cudaMalloc((void **)&d_buffer2, num_lut * glwe_size * sizeof(Torus)));
#else
checkCudaErrors(cudaMallocAsync((void **)&d_buffer1, num_lut * glwe_size * sizeof(Torus), *stream));
checkCudaErrors(cudaMallocAsync((void **)&d_buffer2, num_lut * glwe_size * sizeof(Torus), *stream));
#endif
checkCudaErrors(cudaMemcpyAsync(
d_buffer1, lut_vector,
num_lut * glwe_size * sizeof(Torus),
cudaMemcpyDeviceToDevice, *stream));
#if (CUDART_VERSION < 11020)
checkCudaErrors(
cudaMalloc((void **)&d_buffer1, num_lut * glwe_size * sizeof(Torus)));
checkCudaErrors(
cudaMalloc((void **)&d_buffer2, num_lut * glwe_size * sizeof(Torus)));
#else
checkCudaErrors(cudaMallocAsync(
(void **)&d_buffer1, num_lut * glwe_size * sizeof(Torus), *stream));
checkCudaErrors(cudaMallocAsync(
(void **)&d_buffer2, num_lut * glwe_size * sizeof(Torus), *stream));
#endif
checkCudaErrors(cudaMemcpyAsync(d_buffer1, lut_vector,
num_lut * glwe_size * sizeof(Torus),
cudaMemcpyDeviceToDevice, *stream));
Torus *output;
// Run the cmux tree
for(int layer_idx = 0; layer_idx < r; layer_idx++){
output = (layer_idx % 2? d_buffer1 : d_buffer2);
Torus *input = (layer_idx % 2? d_buffer2 : d_buffer1);
Torus *output;
// Run the cmux tree
for (int layer_idx = 0; layer_idx < r; layer_idx++) {
output = (layer_idx % 2 ? d_buffer1 : d_buffer2);
Torus *input = (layer_idx % 2 ? d_buffer2 : d_buffer1);
int num_cmuxes = (1<<(r-1-layer_idx));
dim3 grid(num_cmuxes, 1, 1);
int num_cmuxes = (1 << (r - 1 - layer_idx));
dim3 grid(num_cmuxes, 1, 1);
// Walks horizontally through the leaves
if(max_shared_memory < memory_needed_per_block)
device_batch_cmux<Torus, STorus, params, NOSM>
<<<grid, thds, memory_needed_per_block, *stream>>>(
output, input, d_ggsw_fft_in,
d_mem, memory_needed_per_block,
glwe_dimension, // k
polynomial_size, base_log, l_gadget,
layer_idx // r
);
else
device_batch_cmux<Torus, STorus, params, FULLSM>
<<<grid, thds, memory_needed_per_block, *stream>>>(
output, input, d_ggsw_fft_in,
d_mem, memory_needed_per_block,
glwe_dimension, // k
polynomial_size, base_log, l_gadget,
layer_idx // r
);
// Walks horizontally through the leaves
if (max_shared_memory < memory_needed_per_block)
device_batch_cmux<Torus, STorus, params, NOSM>
<<<grid, thds, memory_needed_per_block, *stream>>>(
output, input, d_ggsw_fft_in, d_mem, memory_needed_per_block,
glwe_dimension, // k
polynomial_size, base_log, l_gadget,
layer_idx // r
);
else
device_batch_cmux<Torus, STorus, params, FULLSM>
<<<grid, thds, memory_needed_per_block, *stream>>>(
output, input, d_ggsw_fft_in, d_mem, memory_needed_per_block,
glwe_dimension, // k
polynomial_size, base_log, l_gadget,
layer_idx // r
);
}
}
checkCudaErrors(cudaMemcpyAsync(
glwe_out, output, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
cudaMemcpyDeviceToDevice, *stream));
checkCudaErrors(cudaMemcpyAsync(
glwe_out, output,
(glwe_dimension+1) * polynomial_size * sizeof(Torus),
cudaMemcpyDeviceToDevice, *stream));
// We only need synchronization to assert that data is in glwe_out before
// returning. Memory release can be added to the stream and processed
// later.
checkCudaErrors(cudaStreamSynchronize(*stream));
// Free memory
#if (CUDART_VERSION < 11020)
checkCudaErrors(cudaFree(d_ggsw_fft_in));
checkCudaErrors(cudaFree(d_buffer1));
checkCudaErrors(cudaFree(d_buffer2));
if(max_shared_memory < memory_needed_per_block)
checkCudaErrors(cudaFree(d_mem));
#else
checkCudaErrors(cudaFreeAsync(d_ggsw_fft_in, *stream));
checkCudaErrors(cudaFreeAsync(d_buffer1, *stream));
checkCudaErrors(cudaFreeAsync(d_buffer2, *stream));
if(max_shared_memory < memory_needed_per_block)
checkCudaErrors(cudaFreeAsync(d_mem, *stream));
#endif
// We only need synchronization to assert that data is in glwe_out before
// returning. Memory release can be added to the stream and processed
// later.
checkCudaErrors(cudaStreamSynchronize(*stream));
// Free memory
#if (CUDART_VERSION < 11020)
checkCudaErrors(cudaFree(d_ggsw_fft_in));
checkCudaErrors(cudaFree(d_buffer1));
checkCudaErrors(cudaFree(d_buffer2));
if (max_shared_memory < memory_needed_per_block)
checkCudaErrors(cudaFree(d_mem));
#else
checkCudaErrors(cudaFreeAsync(d_ggsw_fft_in, *stream));
checkCudaErrors(cudaFreeAsync(d_buffer1, *stream));
checkCudaErrors(cudaFreeAsync(d_buffer2, *stream));
if (max_shared_memory < memory_needed_per_block)
checkCudaErrors(cudaFreeAsync(d_mem, *stream));
#endif
}
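// Illustrative sketch (standalone C++, not part of the sources above): the
// layer loop of host_cmux_tree halves the number of GLWE ciphertexts at each
// layer while ping-ponging between the two device buffers, and the scratch
// size compared against max_shared_memory can be reproduced on the host.
// r = 4 and polynomial_size = 2048 are made-up example values; Torus is
// assumed to be uint64_t here.
#include <cstdint>
#include <cstdio>

struct double2_like { double x, y; }; // stand-in for CUDA's double2

// Scratch bytes needed by one CMUX block, mirroring memory_needed_per_block.
static size_t cmux_scratch_bytes(uint32_t polynomial_size) {
  return 2 * sizeof(uint64_t) * polynomial_size          // glwe_sub mask+body
         + 2 * sizeof(int16_t) * polynomial_size         // decomposed mask+body
         + 3 * sizeof(double2_like) * polynomial_size / 2; // fft accumulators
}

int main() {
  uint32_t r = 4; // number of tree layers, 2^r input LUTs
  int num_glwes = 1 << r;
  for (uint32_t layer = 0; layer < r; ++layer) {
    // Buffer selection alternates between d_buffer1 and d_buffer2.
    const char *in = (layer % 2) ? "d_buffer2" : "d_buffer1";
    const char *out = (layer % 2) ? "d_buffer1" : "d_buffer2";
    std::printf("layer %u: %d cmuxes, %s -> %s\n", layer, num_glwes / 2, in,
                out);
    num_glwes /= 2;
  }
  std::printf("scratch per block (N=2048): %zu bytes\n",
              cmux_scratch_bytes(2048));
  return 0;
}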
// Only works for the big LWE in the ks+bs case
// state_lwe_buffer is copied from the big LWE input
// shifted_lwe_buffer is the scalar multiplication of the LWE input
// blockIdx.x refers to the input ciphertext id
template <typename Torus, class params>
__global__ void copy_and_shift_lwe(Torus *dst_copy, Torus *dst_shift,
Torus *src, Torus value)
{
int blockId = blockIdx.x;
int tid = threadIdx.x;
auto cur_dst_copy = &dst_copy[blockId * (params::degree + 1)];
auto cur_dst_shift = &dst_shift[blockId * (params::degree + 1)];
auto cur_src = &src[blockId * (params::degree + 1)];
Torus *src, Torus value) {
int blockId = blockIdx.x;
int tid = threadIdx.x;
auto cur_dst_copy = &dst_copy[blockId * (params::degree + 1)];
auto cur_dst_shift = &dst_shift[blockId * (params::degree + 1)];
auto cur_src = &src[blockId * (params::degree + 1)];
#pragma unroll
for (int i = 0; i < params::opt; i++) {
cur_dst_copy[tid] = cur_src[tid];
cur_dst_shift[tid] = cur_src[tid] * value;
tid += params::degree / params::opt;
}
for (int i = 0; i < params::opt; i++) {
cur_dst_copy[tid] = cur_src[tid];
cur_dst_shift[tid] = cur_src[tid] * value;
tid += params::degree / params::opt;
}
if (threadIdx.x == params::degree / params::opt - 1) {
cur_dst_copy[params::degree] = cur_src[params::degree];
cur_dst_shift[params::degree] = cur_src[params::degree] * value;
}
if (threadIdx.x == params::degree / params::opt - 1) {
cur_dst_copy[params::degree] = cur_src[params::degree];
cur_dst_shift[params::degree] = cur_src[params::degree] * value;
}
}
// Only works for the small LWE in the ks+bs case
// This function copies an LWE even when its length is not a power of two
template <typename Torus>
__global__ void copy_small_lwe(Torus *dst, Torus *src, uint32_t small_lwe_size, uint32_t number_of_bits,
uint32_t lwe_id)
{
__global__ void copy_small_lwe(Torus *dst, Torus *src, uint32_t small_lwe_size,
uint32_t number_of_bits, uint32_t lwe_id) {
size_t blockId = blockIdx.x;
size_t threads_per_block = blockDim.x;
size_t opt = small_lwe_size / threads_per_block;
size_t rem = small_lwe_size & (threads_per_block - 1);
auto cur_lwe_list = &dst[blockId * small_lwe_size * number_of_bits];
auto cur_dst = &cur_lwe_list[lwe_id * small_lwe_size];
auto cur_src = &src[blockId * small_lwe_size];
size_t blockId = blockIdx.x;
size_t threads_per_block = blockDim.x;
size_t opt = small_lwe_size / threads_per_block;
size_t rem = small_lwe_size & (threads_per_block - 1);
auto cur_lwe_list = &dst[blockId * small_lwe_size * number_of_bits];
auto cur_dst = &cur_lwe_list[lwe_id * small_lwe_size];
auto cur_src = &src[blockId * small_lwe_size];
size_t tid = threadIdx.x;
for (int i = 0; i < opt; i++) {
cur_dst[tid] = cur_src[tid];
tid += threads_per_block;
}
if (threadIdx.x < rem)
cur_dst[tid] = cur_src[tid];
size_t tid = threadIdx.x;
for (int i = 0; i < opt; i++) {
cur_dst[tid] = cur_src[tid];
tid += threads_per_block;
}
if (threadIdx.x < rem)
cur_dst[tid] = cur_src[tid];
}
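// Illustrative sketch (standalone C++, not part of the sources above): how
// copy_small_lwe covers an LWE whose size is not a multiple of the block size,
// using `opt` full passes plus a remainder handled by threads with id < rem.
// The bitmask used for rem relies on threads_per_block being a power of two
// (the kernel is launched with 256 threads); small_lwe_size = 601 is a made-up
// example value.
#include <cstdio>

int main() {
  unsigned threads_per_block = 256;
  unsigned small_lwe_size = 601; // e.g. lwe_dimension_after + 1
  unsigned opt = small_lwe_size / threads_per_block;
  unsigned rem = small_lwe_size & (threads_per_block - 1);
  // 2 full passes of 256 threads plus 89 remainder threads cover 601 elements.
  std::printf("full passes: %u, remainder threads: %u\n", opt, rem);
  return 0;
}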
// only used in extract bits for one ciphertext
// should be called with one block and one thread
// NOTE: check if putting this functionality in copy_small_lwe or
// fill_pbs_lut vector is faster
template <typename Torus>
__global__ void add_to_body(Torus *lwe, size_t lwe_dimension,
Torus value) {
lwe[blockIdx.x * (lwe_dimension + 1) + lwe_dimension] += value;
__global__ void add_to_body(Torus *lwe, size_t lwe_dimension, Torus value) {
lwe[blockIdx.x * (lwe_dimension + 1) + lwe_dimension] += value;
}
// Fill the LUT (body only) for the current bit (equivalent to a trivial
// encryption, as the mask is all 0s)
// The LUT is filled with -alpha in each coefficient, where alpha =
// delta*2^{bit_idx-1}
template <typename Torus, class params>
__global__ void fill_lut_body_for_current_bit(Torus *lut, Torus value)
{
Torus *cur_poly = &lut[params::degree];
size_t tid = threadIdx.x;
__global__ void fill_lut_body_for_current_bit(Torus *lut, Torus value) {
Torus *cur_poly = &lut[params::degree];
size_t tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
cur_poly[tid] = value;
tid += params::degree / params::opt;
}
for (int i = 0; i < params::opt; i++) {
cur_poly[tid] = value;
tid += params::degree / params::opt;
}
}
// Add alpha, where alpha = delta*2^{bit_idx-1}, to end up with an encryption
// of 0 if the extracted bit was 0, and of 1 otherwise
//
// Remove the extracted bit from the state LWE to get a 0 at the extracted bit
// location.
@@ -525,8 +493,7 @@ __global__ void fill_lut_body_for_current_bit(Torus *lut, Torus value)
template <typename Torus, class params>
__global__ void add_sub_and_mul_lwe(Torus *shifted_lwe, Torus *state_lwe,
Torus *pbs_lwe_out, Torus add_value,
Torus mul_value)
{
Torus mul_value) {
size_t tid = threadIdx.x;
size_t blockId = blockIdx.x;
auto cur_shifted_lwe = &shifted_lwe[blockId * (params::degree + 1)];
@@ -546,81 +513,57 @@ __global__ void add_sub_and_mul_lwe(Torus *shifted_lwe, Torus *state_lwe,
}
}
template <typename Torus, class params>
__host__ void host_extract_bits(
void *v_stream,
Torus *list_lwe_out,
Torus *lwe_in,
Torus *lwe_in_buffer,
Torus *lwe_in_shifted_buffer,
Torus *lwe_out_ks_buffer,
Torus *lwe_out_pbs_buffer,
Torus *lut_pbs,
uint32_t *lut_vector_indexes,
Torus *ksk,
double2 *fourier_bsk,
uint32_t number_of_bits,
uint32_t delta_log,
uint32_t lwe_dimension_before,
uint32_t lwe_dimension_after,
uint32_t base_log_bsk,
uint32_t l_gadget_bsk,
uint32_t base_log_ksk,
uint32_t l_gadget_ksk,
uint32_t number_of_samples)
{
auto stream = static_cast<cudaStream_t *>(v_stream);
uint32_t ciphertext_n_bits = sizeof(Torus) * 8;
void *v_stream, Torus *list_lwe_out, Torus *lwe_in, Torus *lwe_in_buffer,
Torus *lwe_in_shifted_buffer, Torus *lwe_out_ks_buffer,
Torus *lwe_out_pbs_buffer, Torus *lut_pbs, uint32_t *lut_vector_indexes,
Torus *ksk, double2 *fourier_bsk, uint32_t number_of_bits,
uint32_t delta_log, uint32_t lwe_dimension_before,
uint32_t lwe_dimension_after, uint32_t base_log_bsk, uint32_t l_gadget_bsk,
uint32_t base_log_ksk, uint32_t l_gadget_ksk, uint32_t number_of_samples) {
int blocks = 1;
int threads = params::degree / params::opt;
auto stream = static_cast<cudaStream_t *>(v_stream);
uint32_t ciphertext_n_bits = sizeof(Torus) * 8;
copy_and_shift_lwe<Torus, params><<<blocks, threads, 0, *stream>>>
(lwe_in_buffer, lwe_in_shifted_buffer, lwe_in,
1ll << (ciphertext_n_bits - delta_log - 1));
int blocks = 1;
int threads = params::degree / params::opt;
for (int bit_idx = 0; bit_idx < number_of_bits; bit_idx++) {
cuda_keyswitch_lwe_ciphertext_vector(v_stream, lwe_out_ks_buffer,
lwe_in_shifted_buffer, ksk,
lwe_dimension_before,
lwe_dimension_after, base_log_ksk,
l_gadget_ksk, 1);
copy_and_shift_lwe<Torus, params><<<blocks, threads, 0, *stream>>>(
lwe_in_buffer, lwe_in_shifted_buffer, lwe_in,
1ll << (ciphertext_n_bits - delta_log - 1));
copy_small_lwe<<<1, 256, 0, *stream>>>(list_lwe_out,
lwe_out_ks_buffer,
lwe_dimension_after + 1,
number_of_bits,
number_of_bits - bit_idx - 1);
for (int bit_idx = 0; bit_idx < number_of_bits; bit_idx++) {
cuda_keyswitch_lwe_ciphertext_vector(
v_stream, lwe_out_ks_buffer, lwe_in_shifted_buffer, ksk,
lwe_dimension_before, lwe_dimension_after, base_log_ksk, l_gadget_ksk,
1);
if (bit_idx == number_of_bits - 1) {
break;
}
copy_small_lwe<<<1, 256, 0, *stream>>>(
list_lwe_out, lwe_out_ks_buffer, lwe_dimension_after + 1,
number_of_bits, number_of_bits - bit_idx - 1);
add_to_body<Torus><<<1, 1, 0, *stream>>>(lwe_out_ks_buffer,
lwe_dimension_after,
1ll << (ciphertext_n_bits - 2));
fill_lut_body_for_current_bit<Torus, params>
<<<blocks, threads, 0,*stream>>> (lut_pbs, 0ll - 1ll << (
delta_log - 1 +
bit_idx));
host_bootstrap_low_latency<Torus, params>(v_stream, lwe_out_pbs_buffer,
lut_pbs, lut_vector_indexes,
lwe_out_ks_buffer, fourier_bsk,
lwe_dimension_after, lwe_dimension_before,
base_log_bsk, l_gadget_bsk, number_of_samples,
1);
add_sub_and_mul_lwe<Torus, params><<<1, threads, 0, *stream>>>(
lwe_in_shifted_buffer, lwe_in_buffer, lwe_out_pbs_buffer,
1ll << (delta_log - 1 + bit_idx),
1ll << (ciphertext_n_bits - delta_log - bit_idx - 2) );
if (bit_idx == number_of_bits - 1) {
break;
}
add_to_body<Torus><<<1, 1, 0, *stream>>>(
lwe_out_ks_buffer, lwe_dimension_after, 1ll << (ciphertext_n_bits - 2));
fill_lut_body_for_current_bit<Torus, params>
<<<blocks, threads, 0, *stream>>>(
lut_pbs, 0ll - 1ll << (delta_log - 1 + bit_idx));
host_bootstrap_low_latency<Torus, params>(
v_stream, lwe_out_pbs_buffer, lut_pbs, lut_vector_indexes,
lwe_out_ks_buffer, fourier_bsk, lwe_dimension_after,
lwe_dimension_before, base_log_bsk, l_gadget_bsk, number_of_samples, 1);
add_sub_and_mul_lwe<Torus, params><<<1, threads, 0, *stream>>>(
lwe_in_shifted_buffer, lwe_in_buffer, lwe_out_pbs_buffer,
1ll << (delta_log - 1 + bit_idx),
1ll << (ciphertext_n_bits - delta_log - bit_idx - 2));
}
}
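// Illustrative sketch (standalone C++, not part of the sources above): a rough
// integer analogue of the control flow of host_extract_bits, ignoring the
// keyswitch, the PBS and all noise management. Each iteration extracts the
// least significant remaining message bit, stores it so the output list ends
// up ordered MSB first (as list_lwe_out does), and removes its contribution
// before the next round. delta_log = 28, number_of_bits = 4 and the message
// are made-up example values.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  uint32_t delta_log = 28;     // message is encoded as m << delta_log
  uint32_t number_of_bits = 4; // bits to extract
  uint64_t m = 0b1011;         // example message
  uint64_t state = m << delta_log;

  std::vector<uint64_t> list(number_of_bits);
  for (uint32_t bit_idx = 0; bit_idx < number_of_bits; ++bit_idx) {
    // Extract the least significant remaining message bit ...
    uint64_t bit = (state >> (delta_log + bit_idx)) & 1;
    // ... store it so that the list ends up ordered MSB first ...
    list[number_of_bits - bit_idx - 1] = bit;
    // ... and remove its contribution, which is what the add_sub_and_mul_lwe
    // correction achieves on ciphertexts.
    state -= bit << (delta_log + bit_idx);
  }
  for (uint64_t b : list)
    std::printf("%llu", static_cast<unsigned long long>(b));
  std::printf("\n"); // prints 1011
  return 0;
}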
#endif //WO_PBS_H
#endif // WO_PBS_H

@@ -10,26 +10,29 @@
__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
int glwe_dimension,
uint32_t l_gadget) {
return i * polynomial_size / 2 * (glwe_dimension + 1) * (glwe_dimension + 1) * l_gadget;
return i * polynomial_size / 2 * (glwe_dimension + 1) * (glwe_dimension + 1) *
l_gadget;
}
template <typename T>
__device__ T*
get_ith_mask_kth_block(T* ptr, int i, int k, int level, uint32_t polynomial_size,
int glwe_dimension, uint32_t l_gadget) {
__device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
uint32_t polynomial_size,
int glwe_dimension, uint32_t l_gadget) {
return &ptr[get_start_ith_ggsw(i, polynomial_size, glwe_dimension, l_gadget) +
level * polynomial_size / 2 * (glwe_dimension + 1) * (glwe_dimension + 1) +
k * polynomial_size / 2 * (glwe_dimension + 1)];
level * polynomial_size / 2 * (glwe_dimension + 1) *
(glwe_dimension + 1) +
k * polynomial_size / 2 * (glwe_dimension + 1)];
}
template <typename T>
__device__ T*
get_ith_body_kth_block(T *ptr, int i, int k, int level, uint32_t polynomial_size,
int glwe_dimension, uint32_t l_gadget) {
return &ptr[get_start_ith_ggsw(i, polynomial_size, glwe_dimension, l_gadget) +
level * polynomial_size / 2 * (glwe_dimension + 1) * (glwe_dimension + 1) +
k * polynomial_size / 2 * (glwe_dimension + 1) +
polynomial_size / 2];
__device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level,
uint32_t polynomial_size,
int glwe_dimension, uint32_t l_gadget) {
return &ptr[get_start_ith_ggsw(i, polynomial_size, glwe_dimension, l_gadget) +
level * polynomial_size / 2 * (glwe_dimension + 1) *
(glwe_dimension + 1) +
k * polynomial_size / 2 * (glwe_dimension + 1) +
polynomial_size / 2];
}
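// Illustrative sketch (standalone C++, not part of the sources above): the
// offsets (in double2 elements) computed by get_start_ith_ggsw and
// get_ith_mask_kth_block / get_ith_body_kth_block, evaluated on the host.
// polynomial_size = 1024, l_gadget = 2 and glwe_dimension = 1 are example
// values.
#include <cstdint>
#include <cstdio>

static long mask_offset(int i, int k, int level, uint32_t N, int glwe_dim,
                        uint32_t l_gadget) {
  long half = N / 2; // polynomials are stored compressed, N/2 complex values
  long ggsw = half * (glwe_dim + 1) * (glwe_dim + 1) * l_gadget;
  return i * ggsw + level * half * (glwe_dim + 1) * (glwe_dim + 1) +
         k * half * (glwe_dim + 1);
}

int main() {
  uint32_t N = 1024, l = 2;
  int glwe_dim = 1;
  std::printf("ggsw 1, level 1, block 0: mask starts at %ld\n",
              mask_offset(1, 0, 1, N, glwe_dim, l));
  std::printf("...and its body starts %u elements later\n", N / 2);
  return 0;
}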
void cuda_initialize_twiddles(uint32_t polynomial_size, uint32_t gpu_index) {
@@ -65,21 +68,21 @@ void cuda_initialize_twiddles(uint32_t polynomial_size, uint32_t gpu_index) {
template <typename T, typename ST>
void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
uint32_t gpu_index, uint32_t input_lwe_dim, uint32_t glwe_dim,
uint32_t l_gadget, uint32_t polynomial_size) {
uint32_t gpu_index, uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t l_gadget,
uint32_t polynomial_size) {
cudaSetDevice(gpu_index);
int shared_memory_size = sizeof(double) * polynomial_size;
int total_polynomials =
input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) *
l_gadget;
input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * l_gadget;
// Here the buffer size is the size of double2 times the number of polynomials
// times the polynomial size over 2 because the polynomials are compressed
// into the complex domain to perform the FFT
size_t buffer_size = total_polynomials * polynomial_size / 2 * sizeof
(double2);
size_t buffer_size =
total_polynomials * polynomial_size / 2 * sizeof(double2);
int gridSize = total_polynomials;
int blockSize = polynomial_size / choose_opt(polynomial_size);
@@ -110,23 +113,23 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
switch (polynomial_size) {
case 512:
batch_NSMFFT<FFTDegree<Degree<512>, ForwardFFT>>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
break;
case 1024:
batch_NSMFFT<FFTDegree<Degree<1024>, ForwardFFT>>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
break;
case 2048:
batch_NSMFFT<FFTDegree<Degree<2048>, ForwardFFT>>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
break;
case 4096:
batch_NSMFFT<FFTDegree<Degree<4096>, ForwardFFT>>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
break;
case 8192:
batch_NSMFFT<FFTDegree<Degree<8192>, ForwardFFT>>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
break;
default:
break;
@@ -134,44 +137,58 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
cudaFree(d_bsk);
free(h_bsk);
}
void cuda_convert_lwe_bootstrap_key_32(void *dest, void *src, void *v_stream,
uint32_t gpu_index, uint32_t input_lwe_dim, uint32_t glwe_dim,
uint32_t l_gadget, uint32_t polynomial_size) {
cuda_convert_lwe_bootstrap_key<uint32_t, int32_t>((double2 *)dest, (int32_t *)src,
v_stream, gpu_index, input_lwe_dim,
glwe_dim, l_gadget, polynomial_size);
uint32_t gpu_index,
uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t l_gadget,
uint32_t polynomial_size) {
cuda_convert_lwe_bootstrap_key<uint32_t, int32_t>(
(double2 *)dest, (int32_t *)src, v_stream, gpu_index, input_lwe_dim,
glwe_dim, l_gadget, polynomial_size);
}
void cuda_convert_lwe_bootstrap_key_64(void *dest, void *src, void *v_stream,
uint32_t gpu_index, uint32_t input_lwe_dim, uint32_t glwe_dim,
uint32_t l_gadget, uint32_t polynomial_size) {
cuda_convert_lwe_bootstrap_key<uint64_t, int64_t>((double2 *)dest, (int64_t *)src,
v_stream, gpu_index, input_lwe_dim,
glwe_dim, l_gadget, polynomial_size);
uint32_t gpu_index,
uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t l_gadget,
uint32_t polynomial_size) {
cuda_convert_lwe_bootstrap_key<uint64_t, int64_t>(
(double2 *)dest, (int64_t *)src, v_stream, gpu_index, input_lwe_dim,
glwe_dim, l_gadget, polynomial_size);
}
// We need these lines so the compiler knows how to specialize these functions
template __device__ uint64_t*
get_ith_mask_kth_block(uint64_t* ptr, int i, int k, int level, uint32_t polynomial_size,
int glwe_dimension, uint32_t l_gadget);
template __device__ uint32_t*
get_ith_mask_kth_block(uint32_t* ptr, int i, int k, int level, uint32_t polynomial_size,
int glwe_dimension, uint32_t l_gadget);
template __device__ double2*
get_ith_mask_kth_block(double2* ptr, int i, int k, int level, uint32_t polynomial_size,
int glwe_dimension, uint32_t l_gadget);
template __device__ uint64_t*
get_ith_body_kth_block(uint64_t *ptr, int i, int k, int level, uint32_t polynomial_size,
int glwe_dimension, uint32_t l_gadget);
template __device__ uint32_t*
get_ith_body_kth_block(uint32_t *ptr, int i, int k, int level, uint32_t polynomial_size,
int glwe_dimension, uint32_t l_gadget);
template __device__ double2*
get_ith_body_kth_block(double2 *ptr, int i, int k, int level, uint32_t polynomial_size,
int glwe_dimension, uint32_t l_gadget);
template __device__ uint64_t *get_ith_mask_kth_block(uint64_t *ptr, int i,
int k, int level,
uint32_t polynomial_size,
int glwe_dimension,
uint32_t l_gadget);
template __device__ uint32_t *get_ith_mask_kth_block(uint32_t *ptr, int i,
int k, int level,
uint32_t polynomial_size,
int glwe_dimension,
uint32_t l_gadget);
template __device__ double2 *get_ith_mask_kth_block(double2 *ptr, int i, int k,
int level,
uint32_t polynomial_size,
int glwe_dimension,
uint32_t l_gadget);
template __device__ uint64_t *get_ith_body_kth_block(uint64_t *ptr, int i,
int k, int level,
uint32_t polynomial_size,
int glwe_dimension,
uint32_t l_gadget);
template __device__ uint32_t *get_ith_body_kth_block(uint32_t *ptr, int i,
int k, int level,
uint32_t polynomial_size,
int glwe_dimension,
uint32_t l_gadget);
template __device__ double2 *get_ith_body_kth_block(double2 *ptr, int i, int k,
int level,
uint32_t polynomial_size,
int glwe_dimension,
uint32_t l_gadget);
#endif // CNCRT_BSK_H

@@ -2,51 +2,49 @@
#define CONCRETE_CORE_GGSW_CUH
template <typename T, typename ST, class params>
__global__ void batch_fft_ggsw_vectors(double2 *dest, T *src){
__global__ void batch_fft_ggsw_vectors(double2 *dest, T *src) {
extern __shared__ char sharedmem[];
extern __shared__ char sharedmem[];
double2 *shared_output = (double2*) sharedmem;
double2 *shared_output = (double2 *)sharedmem;
// Compression
int offset = blockIdx.x * blockDim.x;
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt >> 1; i++) {
ST x = src[(2 * tid) + params::opt * offset];
ST y = src[(2 * tid + 1) + params::opt * offset];
shared_output[tid].x = x / (double)std::numeric_limits<T>::max();
shared_output[tid].y = y / (double)std::numeric_limits<T>::max();
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
// Compression
int offset = blockIdx.x * blockDim.x;
int tid = threadIdx.x;
int log_2_opt = params::opt >> 1;
#pragma unroll
for (int i = 0; i < log_2_opt; i++) {
ST x = src[(2 * tid) + params::opt * offset];
ST y = src[(2 * tid + 1) + params::opt * offset];
shared_output[tid].x = x / (double)std::numeric_limits<T>::max();
shared_output[tid].y = y / (double)std::numeric_limits<T>::max();
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
// Switch to the FFT space
NSMFFT_direct<HalfDegree<params>>(shared_output);
synchronize_threads_in_block();
// Switch to the FFT space
NSMFFT_direct<HalfDegree<params>>(shared_output);
synchronize_threads_in_block();
correction_direct_fft_inplace<params>(shared_output);
synchronize_threads_in_block();
correction_direct_fft_inplace<params>(shared_output);
synchronize_threads_in_block();
// Write the output to global memory
tid = threadIdx.x;
for (int j = 0; j < params::opt >> 1; j++) {
dest[tid + (params::opt >> 1) * offset] = shared_output[tid];
tid += params::degree / params::opt;
}
// Write the output to global memory
tid = threadIdx.x;
for (int j = 0; j < log_2_opt; j++) {
dest[tid + (params::opt >> 1) * offset] = shared_output[tid];
tid += params::degree / params::opt;
}
}
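// Illustrative sketch (standalone C++, not part of the sources above): the
// compression loop above reads torus coefficients as signed integers and
// scales them by the unsigned maximum, mapping them into roughly [-1, 1]
// before the FFT. The example coefficient below is made up.
#include <cstdint>
#include <cstdio>
#include <limits>

int main() {
  // A torus coefficient reinterpreted as a signed 64-bit integer.
  int64_t coeff = std::numeric_limits<int64_t>::min() / 2;
  double scaled = static_cast<double>(coeff) /
                  static_cast<double>(std::numeric_limits<uint64_t>::max());
  std::printf("%f\n", scaled); // approximately -0.25
  return 0;
}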
/**
* Applies the FFT transform to a sequence of GGSW ciphertexts already in
* global memory
*/
template <typename T, typename ST, class params>
void batch_fft_ggsw_vector(
void *v_stream,
double2 *dest, T *src,
uint32_t r,
uint32_t glwe_dim,
uint32_t polynomial_size,
uint32_t l_gadget) {
void batch_fft_ggsw_vector(void *v_stream, double2 *dest, T *src, uint32_t r,
uint32_t glwe_dim, uint32_t polynomial_size,
uint32_t l_gadget) {
auto stream = static_cast<cudaStream_t *>(v_stream);
@@ -56,11 +54,9 @@ void batch_fft_ggsw_vector(
int gridSize = total_polynomials;
int blockSize = polynomial_size / params::opt;
batch_fft_ggsw_vectors<T, ST, params><<<gridSize, blockSize, shared_memory_size, *stream>>>(dest,
src);
batch_fft_ggsw_vectors<T, ST, params>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(dest, src);
checkCudaErrors(cudaGetLastError());
}
#endif //CONCRETE_CORE_GGSW_CUH
#endif // CONCRETE_CORE_GGSW_CUH

@@ -62,7 +62,7 @@ int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
// error code: zero copy size
return -3;
}
if (gpu_index >= cuda_get_number_of_gpus()) {
// error code: invalid gpu_index
return -2;
@@ -75,8 +75,8 @@ int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
}
auto stream = static_cast<cudaStream_t *>(v_stream);
cudaSetDevice(gpu_index);
checkCudaErrors(cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice,
*stream));
checkCudaErrors(
cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice, *stream));
return 0;
}
@@ -117,8 +117,8 @@ int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
}
auto stream = static_cast<cudaStream_t *>(v_stream);
cudaSetDevice(gpu_index);
checkCudaErrors(cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost,
*stream));
checkCudaErrors(
cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, *stream));
return 0;
}

@@ -14,18 +14,15 @@
* This function calls a wrapper to a device kernel that performs the keyswitch
* - num_samples blocks of threads are launched
*/
void cuda_keyswitch_lwe_ciphertext_vector_32(void *v_stream, void *lwe_out, void *lwe_in,
void *ksk,
uint32_t lwe_dimension_before,
uint32_t lwe_dimension_after,
uint32_t base_log, uint32_t l_gadget,
uint32_t num_samples) {
cuda_keyswitch_lwe_ciphertext_vector(
v_stream, static_cast<uint32_t *>(lwe_out), static_cast<uint32_t *>(lwe_in),
static_cast<uint32_t*>(ksk),
lwe_dimension_before, lwe_dimension_after,
base_log, l_gadget,
num_samples);
void cuda_keyswitch_lwe_ciphertext_vector_32(
void *v_stream, void *lwe_out, void *lwe_in, void *ksk,
uint32_t lwe_dimension_before, uint32_t lwe_dimension_after,
uint32_t base_log, uint32_t l_gadget, uint32_t num_samples) {
cuda_keyswitch_lwe_ciphertext_vector(
v_stream, static_cast<uint32_t *>(lwe_out),
static_cast<uint32_t *>(lwe_in), static_cast<uint32_t *>(ksk),
lwe_dimension_before, lwe_dimension_after, base_log, l_gadget,
num_samples);
}
/* Perform keyswitch on a batch of input LWE ciphertexts for 64 bits
@@ -38,18 +35,13 @@ void cuda_keyswitch_lwe_ciphertext_vector_32(void *v_stream, void *lwe_out, void
* This function calls a wrapper to a device kernel that performs the keyswitch
* - num_samples blocks of threads are launched
*/
void cuda_keyswitch_lwe_ciphertext_vector_64(void *v_stream, void *lwe_out, void *lwe_in,
void *ksk,
uint32_t lwe_dimension_before,
uint32_t lwe_dimension_after,
uint32_t base_log, uint32_t l_gadget,
uint32_t num_samples) {
cuda_keyswitch_lwe_ciphertext_vector(
v_stream, static_cast<uint64_t *>(lwe_out), static_cast<uint64_t *> (lwe_in),
static_cast<uint64_t*>(ksk),
lwe_dimension_before, lwe_dimension_after,
base_log, l_gadget,
num_samples);
void cuda_keyswitch_lwe_ciphertext_vector_64(
void *v_stream, void *lwe_out, void *lwe_in, void *ksk,
uint32_t lwe_dimension_before, uint32_t lwe_dimension_after,
uint32_t base_log, uint32_t l_gadget, uint32_t num_samples) {
cuda_keyswitch_lwe_ciphertext_vector(
v_stream, static_cast<uint64_t *>(lwe_out),
static_cast<uint64_t *>(lwe_in), static_cast<uint64_t *>(ksk),
lwe_dimension_before, lwe_dimension_after, base_log, l_gadget,
num_samples);
}

@@ -9,24 +9,23 @@
template <typename Torus>
__device__ Torus *get_ith_block(Torus *ksk, int i, int level,
uint32_t lwe_dimension_after,
uint32_t l_gadget) {
int pos = i * l_gadget * (lwe_dimension_after + 1) +
level * (lwe_dimension_after + 1);
Torus *ptr = &ksk[pos];
return ptr;
uint32_t lwe_dimension_after,
uint32_t l_gadget) {
int pos = i * l_gadget * (lwe_dimension_after + 1) +
level * (lwe_dimension_after + 1);
Torus *ptr = &ksk[pos];
return ptr;
}
template <typename Torus>
__device__ Torus decompose_one(Torus &state, Torus mod_b_mask,
int base_log) {
Torus res = state & mod_b_mask;
state >>= base_log;
Torus carry = ((res - 1ll) | state) & res;
carry >>= base_log - 1;
state += carry;
res -= carry << base_log;
return res;
__device__ Torus decompose_one(Torus &state, Torus mod_b_mask, int base_log) {
Torus res = state & mod_b_mask;
state >>= base_log;
Torus carry = ((res - 1ll) | state) & res;
carry >>= base_log - 1;
state += carry;
res -= carry << base_log;
return res;
}
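// Illustrative sketch (standalone C++, not part of the sources above): a
// host-side copy of decompose_one, showing that the signed digits it produces,
// together with the leftover carry, reconstruct the (already rounded) top bits
// of the input. base_log = 4, l_gadget = 3 and the input are made-up example
// values.
#include <cstdint>
#include <cstdio>

// Produces signed digits in (-B/2, B/2] with B = 2^base_log.
static uint64_t decompose_one_step(uint64_t &state, uint64_t mod_b_mask,
                                   int base_log) {
  uint64_t res = state & mod_b_mask;
  state >>= base_log;
  uint64_t carry = ((res - 1ull) | state) & res;
  carry >>= base_log - 1;
  state += carry;
  res -= carry << base_log;
  return res;
}

int main() {
  int base_log = 4, l_gadget = 3;
  // As in the keyswitch kernel, only the top base_log * l_gadget bits of the
  // input are decomposed.
  uint64_t a_i = 0xABC0000000000000ull;
  uint64_t state = a_i >> (64 - base_log * l_gadget);
  uint64_t mask = (1ull << base_log) - 1ull;
  int64_t acc = 0, scale = 1;
  for (int j = 0; j < l_gadget; j++) {
    int64_t digit = (int64_t)decompose_one_step(state, mask, base_log);
    std::printf("digit %d = %lld\n", j, (long long)digit);
    acc += digit * scale;
    scale <<= base_log;
  }
  acc += (int64_t)state * scale; // leftover carry
  std::printf("reconstructed 0x%llX\n", (unsigned long long)acc); // 0xABC
  return 0;
}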
/*
@@ -43,23 +42,19 @@ __device__ Torus decompose_one(Torus &state, Torus mod_b_mask,
*
*/
template <typename Torus>
__global__ void keyswitch(Torus *lwe_out, Torus *lwe_in,
Torus *ksk,
__global__ void keyswitch(Torus *lwe_out, Torus *lwe_in, Torus *ksk,
uint32_t lwe_dimension_before,
uint32_t lwe_dimension_after,
uint32_t base_log,
uint32_t l_gadget,
int lwe_lower, int lwe_upper, int cutoff) {
uint32_t lwe_dimension_after, uint32_t base_log,
uint32_t l_gadget, int lwe_lower, int lwe_upper,
int cutoff) {
int tid = threadIdx.x;
extern __shared__ char sharedmem[];
Torus *local_lwe_out = (Torus *)sharedmem;
auto block_lwe_in =
get_chunk(lwe_in, blockIdx.x, lwe_dimension_before + 1);
auto block_lwe_out =
get_chunk(lwe_out, blockIdx.x, lwe_dimension_after + 1);
auto block_lwe_in = get_chunk(lwe_in, blockIdx.x, lwe_dimension_before + 1);
auto block_lwe_out = get_chunk(lwe_out, blockIdx.x, lwe_dimension_after + 1);
auto gadget = GadgetMatrixSingle<Torus>(base_log, l_gadget);
@@ -77,26 +72,22 @@ __global__ void keyswitch(Torus *lwe_out, Torus *lwe_in,
}
if (tid == 0) {
local_lwe_out[lwe_dimension_after] =
block_lwe_in[lwe_dimension_before];
local_lwe_out[lwe_dimension_after] = block_lwe_in[lwe_dimension_before];
}
for (int i = 0; i < lwe_dimension_before; i++) {
__syncthreads();
Torus a_i = round_to_closest_multiple(block_lwe_in[i], base_log,
l_gadget);
Torus a_i = round_to_closest_multiple(block_lwe_in[i], base_log, l_gadget);
Torus state = a_i >> (sizeof(Torus) * 8 - base_log * l_gadget);
Torus mod_b_mask = (1ll << base_log) - 1ll;
for (int j = 0; j < l_gadget; j++) {
auto ksk_block = get_ith_block(ksk, i, l_gadget - j - 1,
lwe_dimension_after,
l_gadget);
Torus decomposed = decompose_one<Torus>(state, mod_b_mask,
base_log);
lwe_dimension_after, l_gadget);
Torus decomposed = decompose_one<Torus>(state, mod_b_mask, base_log);
for (int k = 0; k < lwe_part_per_thd; k++) {
int idx = tid + k * blockDim.x;
local_lwe_out[idx] -= (Torus)ksk_block[idx] * decomposed;
@@ -112,13 +103,10 @@ __global__ void keyswitch(Torus *lwe_out, Torus *lwe_in,
/// assume lwe_in in the gpu
template <typename Torus>
__host__ void cuda_keyswitch_lwe_ciphertext_vector(void *v_stream, Torus *lwe_out, Torus *lwe_in,
Torus *ksk,
uint32_t lwe_dimension_before,
uint32_t lwe_dimension_after,
uint32_t base_log,
uint32_t l_gadget,
uint32_t num_samples) {
__host__ void cuda_keyswitch_lwe_ciphertext_vector(
void *v_stream, Torus *lwe_out, Torus *lwe_in, Torus *ksk,
uint32_t lwe_dimension_before, uint32_t lwe_dimension_after,
uint32_t base_log, uint32_t l_gadget, uint32_t num_samples) {
constexpr int ideal_threads = 128;
@@ -136,11 +124,9 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector(void *v_stream, Torus *lwe_ou
lwe_upper = (int)ceil((double)lwe_dim / (double)ideal_threads);
}
int lwe_size_after =
(lwe_dimension_after + 1) * num_samples;
int lwe_size_after = (lwe_dimension_after + 1) * num_samples;
int shared_mem =
sizeof(Torus) * (lwe_dimension_after + 1);
int shared_mem = sizeof(Torus) * (lwe_dimension_after + 1);
cudaMemset(lwe_out, 0, sizeof(Torus) * lwe_size_after);
@@ -156,7 +142,6 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector(void *v_stream, Torus *lwe_ou
l_gadget, lwe_lower, lwe_upper, cutoff);
cudaStreamSynchronize(*stream);
}
#endif

@@ -497,7 +497,6 @@ public:
}
synchronize_threads_in_block();
}
};
template <typename T, class params> class Vector {
public:

@@ -30,9 +30,10 @@ __device__ void polynomial_product_in_fourier_domain(FT *result, FT *first,
}
template <class params, typename FT>
__device__ void polynomial_product_in_fourier_domain(
PolynomialFourier<FT, params> &result, PolynomialFourier<FT, params> &first,
PolynomialFourier<FT, params> &second) {
__device__ void
polynomial_product_in_fourier_domain(PolynomialFourier<FT, params> &result,
PolynomialFourier<FT, params> &first,
PolynomialFourier<FT, params> &second) {
int tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
result[tid] = first[tid] * second[tid];
@@ -72,8 +73,9 @@ __device__ void polynomial_product_accumulate_in_fourier_domain(
}
template <class params, typename T>
__device__ void polynomial_product_accumulate_in_fourier_domain(
T *result, T *first, T *second) {
__device__ void polynomial_product_accumulate_in_fourier_domain(T *result,
T *first,
T *second) {
int tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
result[tid] += first[tid] * second[tid];