diff --git a/include/bootstrap.h b/include/bootstrap.h index 6177fdf65..b81798a45 100644 --- a/include/bootstrap.h +++ b/include/bootstrap.h @@ -8,171 +8,93 @@ extern "C" { void cuda_initialize_twiddles(uint32_t polynomial_size, uint32_t gpu_index); void cuda_convert_lwe_bootstrap_key_32(void *dest, void *src, void *v_stream, - uint32_t gpu_index, uint32_t input_lwe_dim, uint32_t glwe_dim, - uint32_t l_gadget, uint32_t polynomial_size); + uint32_t gpu_index, + uint32_t input_lwe_dim, + uint32_t glwe_dim, uint32_t l_gadget, + uint32_t polynomial_size); void cuda_convert_lwe_bootstrap_key_64(void *dest, void *src, void *v_stream, - uint32_t gpu_index, uint32_t input_lwe_dim, uint32_t glwe_dim, - uint32_t l_gadget, uint32_t polynomial_size); + uint32_t gpu_index, + uint32_t input_lwe_dim, + uint32_t glwe_dim, uint32_t l_gadget, + uint32_t polynomial_size); void cuda_bootstrap_amortized_lwe_ciphertext_vector_32( - void *v_stream, - void *lwe_out, - void *test_vector, - void *test_vector_indexes, - void *lwe_in, - void *bootstrapping_key, - uint32_t lwe_dimension, - uint32_t glwe_dimension, - uint32_t polynomial_size, - uint32_t base_log, - uint32_t l_gadget, - uint32_t num_samples, - uint32_t num_test_vectors, - uint32_t lwe_idx, - uint32_t max_shared_memory); + void *v_stream, void *lwe_out, void *test_vector, void *test_vector_indexes, + void *lwe_in, void *bootstrapping_key, uint32_t lwe_dimension, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, + uint32_t l_gadget, uint32_t num_samples, uint32_t num_test_vectors, + uint32_t lwe_idx, uint32_t max_shared_memory); void cuda_bootstrap_amortized_lwe_ciphertext_vector_64( - void *v_stream, - void *lwe_out, - void *test_vector, - void *test_vector_indexes, - void *lwe_in, - void *bootstrapping_key, - uint32_t lwe_dimension, - uint32_t glwe_dimension, - uint32_t polynomial_size, - uint32_t base_log, - uint32_t l_gadget, - uint32_t num_samples, - uint32_t num_test_vectors, - uint32_t lwe_idx, - uint32_t max_shared_memory); + void *v_stream, void *lwe_out, void *test_vector, void *test_vector_indexes, + void *lwe_in, void *bootstrapping_key, uint32_t lwe_dimension, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, + uint32_t l_gadget, uint32_t num_samples, uint32_t num_test_vectors, + uint32_t lwe_idx, uint32_t max_shared_memory); void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32( - void *v_stream, - void *lwe_out, - void *test_vector, - void *test_vector_indexes, - void *lwe_in, - void *bootstrapping_key, - uint32_t lwe_dimension, - uint32_t glwe_dimension, - uint32_t polynomial_size, - uint32_t base_log, - uint32_t l_gadget, - uint32_t num_samples, - uint32_t num_test_vectors, - uint32_t lwe_idx, - uint32_t max_shared_memory); + void *v_stream, void *lwe_out, void *test_vector, void *test_vector_indexes, + void *lwe_in, void *bootstrapping_key, uint32_t lwe_dimension, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, + uint32_t l_gadget, uint32_t num_samples, uint32_t num_test_vectors, + uint32_t lwe_idx, uint32_t max_shared_memory); void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64( - void *v_stream, - void *lwe_out, - void *test_vector, - void *test_vector_indexes, - void *lwe_in, - void *bootstrapping_key, - uint32_t lwe_dimension, - uint32_t glwe_dimension, - uint32_t polynomial_size, - uint32_t base_log, - uint32_t l_gadget, - uint32_t num_samples, - uint32_t num_test_vectors, - uint32_t lwe_idx, - uint32_t max_shared_memory); + void *v_stream, void *lwe_out, 
void *test_vector, void *test_vector_indexes, + void *lwe_in, void *bootstrapping_key, uint32_t lwe_dimension, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, + uint32_t l_gadget, uint32_t num_samples, uint32_t num_test_vectors, + uint32_t lwe_idx, uint32_t max_shared_memory); -void cuda_cmux_tree_32( - void *v_stream, - void *glwe_out, - void *ggsw_in, - void *lut_vector, - uint32_t glwe_dimension, - uint32_t polynomial_size, - uint32_t base_log, - uint32_t l_gadget, - uint32_t r, - uint32_t max_shared_memory); +void cuda_cmux_tree_32(void *v_stream, void *glwe_out, void *ggsw_in, + void *lut_vector, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t base_log, + uint32_t l_gadget, uint32_t r, + uint32_t max_shared_memory); -void cuda_cmux_tree_64( - void *v_stream, - void *glwe_out, - void *ggsw_in, - void *lut_vector, - uint32_t glwe_dimension, - uint32_t polynomial_size, - uint32_t base_log, - uint32_t l_gadget, - uint32_t r, - uint32_t max_shared_memory); - - - -void cuda_extract_bits_32( - void *v_stream, - void *list_lwe_out, - void *lwe_in, - void *lwe_in_buffer, - void *lwe_in_shifted_buffer, - void *lwe_out_ks_buffer, - void *lwe_out_pbs_buffer, - void *lut_pbs, - void *lut_vector_indexes, - void *ksk, - void *fourier_bsk, - uint32_t number_of_bits, - uint32_t delta_log, - uint32_t lwe_dimension_before, - uint32_t lwe_dimension_after, - uint32_t glwe_dimension, - uint32_t base_log_bsk, - uint32_t l_gadget_bsk, - uint32_t base_log_ksk, - uint32_t l_gadget_ksk, - uint32_t number_of_samples); - - -void cuda_extract_bits_64( - void *v_stream, - void *list_lwe_out, - void *lwe_in, - void *lwe_in_buffer, - void *lwe_in_shifted_buffer, - void *lwe_out_ks_buffer, - void *lwe_out_pbs_buffer, - void *lut_pbs, - void *lut_vector_indexes, - void *ksk, - void *fourier_bsk, - uint32_t number_of_bits, - uint32_t delta_log, - uint32_t lwe_dimension_before, - uint32_t lwe_dimension_after, - uint32_t glwe_dimension, - uint32_t base_log_bsk, - uint32_t l_gadget_bsk, - uint32_t base_log_ksk, - uint32_t l_gadget_ksk, - uint32_t number_of_samples); +void cuda_cmux_tree_64(void *v_stream, void *glwe_out, void *ggsw_in, + void *lut_vector, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t base_log, + uint32_t l_gadget, uint32_t r, + uint32_t max_shared_memory); +void cuda_extract_bits_32(void *v_stream, void *list_lwe_out, void *lwe_in, + void *lwe_in_buffer, void *lwe_in_shifted_buffer, + void *lwe_out_ks_buffer, void *lwe_out_pbs_buffer, + void *lut_pbs, void *lut_vector_indexes, void *ksk, + void *fourier_bsk, uint32_t number_of_bits, + uint32_t delta_log, uint32_t lwe_dimension_before, + uint32_t lwe_dimension_after, uint32_t glwe_dimension, + uint32_t base_log_bsk, uint32_t l_gadget_bsk, + uint32_t base_log_ksk, uint32_t l_gadget_ksk, + uint32_t number_of_samples); +void cuda_extract_bits_64(void *v_stream, void *list_lwe_out, void *lwe_in, + void *lwe_in_buffer, void *lwe_in_shifted_buffer, + void *lwe_out_ks_buffer, void *lwe_out_pbs_buffer, + void *lut_pbs, void *lut_vector_indexes, void *ksk, + void *fourier_bsk, uint32_t number_of_bits, + uint32_t delta_log, uint32_t lwe_dimension_before, + uint32_t lwe_dimension_after, uint32_t glwe_dimension, + uint32_t base_log_bsk, uint32_t l_gadget_bsk, + uint32_t base_log_ksk, uint32_t l_gadget_ksk, + uint32_t number_of_samples); }; #ifdef __CUDACC__ __device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size, - int glwe_dimension, - uint32_t l_gadget); + int glwe_dimension, uint32_t l_gadget); 
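For orientation, a caller drives the extern "C" entry points declared above directly; the device-side helpers that follow are internal. Below is a minimal sketch of one plausible call into cuda_bootstrap_amortized_lwe_ciphertext_vector_64, not part of the patch: the wrappers dereference v_stream as a cudaStream_t *, the d_* pointers stand for device buffers allocated and filled elsewhere, and the numeric parameters (lwe_dimension = 600, polynomial_size = 1024, base_log = 7, l_gadget = 3) are placeholders for illustration, not recommended values.

#include <cuda_runtime.h>
#include <cstdint>
#include "bootstrap.h"

void run_amortized_pbs_64(void *d_lwe_out, void *d_lut, void *d_lut_indexes,
                          void *d_lwe_in, void *d_bsk_fourier,
                          uint32_t num_samples) {
  // Placeholder parameters; real values come from the cryptographic context.
  uint32_t lwe_dimension = 600, glwe_dimension = 1, polynomial_size = 1024;
  uint32_t base_log = 7, l_gadget = 3;

  // The wrappers cast v_stream to cudaStream_t *, so pass the stream's address.
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // Upper bound on dynamic shared memory a block may opt into on this device.
  int max_shared_memory = 0;
  cudaDeviceGetAttribute(&max_shared_memory,
                         cudaDevAttrMaxSharedMemoryPerBlockOptin, 0);

  // One LUT per sample here, starting at index 0 of the batch.
  cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
      &stream, d_lwe_out, d_lut, d_lut_indexes, d_lwe_in, d_bsk_fourier,
      lwe_dimension, glwe_dimension, polynomial_size, base_log, l_gadget,
      num_samples, /*num_test_vectors=*/num_samples, /*lwe_idx=*/0,
      (uint32_t)max_shared_memory);

  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);
}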
template -__device__ T* -get_ith_mask_kth_block(T* ptr, int i, int k, int level, uint32_t polynomial_size, - int glwe_dimension, uint32_t l_gadget); +__device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level, + uint32_t polynomial_size, + int glwe_dimension, uint32_t l_gadget); template -__device__ T* -get_ith_body_kth_block(T *ptr, int i, int k, int level, uint32_t polynomial_size, - int glwe_dimension, uint32_t l_gadget); +__device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level, + uint32_t polynomial_size, + int glwe_dimension, uint32_t l_gadget); #endif #endif // CUDA_BOOTSTRAP_H diff --git a/include/keyswitch.h b/include/keyswitch.h index d74a0f543..da43bc353 100644 --- a/include/keyswitch.h +++ b/include/keyswitch.h @@ -5,20 +5,15 @@ extern "C" { -void cuda_keyswitch_lwe_ciphertext_vector_32(void *v_stream, void *lwe_out, void *lwe_in, - void *ksk, - uint32_t lwe_dimension_before, - uint32_t lwe_dimension_after, - uint32_t base_log, uint32_t l_gadget, - uint32_t num_samples); - -void cuda_keyswitch_lwe_ciphertext_vector_64(void *v_stream, void *lwe_out, void *lwe_in, - void *ksk, - uint32_t lwe_dimension_before, - uint32_t lwe_dimension_after, - uint32_t base_log, uint32_t l_gadget, - uint32_t num_samples); +void cuda_keyswitch_lwe_ciphertext_vector_32( + void *v_stream, void *lwe_out, void *lwe_in, void *ksk, + uint32_t lwe_dimension_before, uint32_t lwe_dimension_after, + uint32_t base_log, uint32_t l_gadget, uint32_t num_samples); +void cuda_keyswitch_lwe_ciphertext_vector_64( + void *v_stream, void *lwe_out, void *lwe_in, void *ksk, + uint32_t lwe_dimension_before, uint32_t lwe_dimension_after, + uint32_t base_log, uint32_t l_gadget, uint32_t num_samples); } #endif // CNCRT_KS_H_ diff --git a/src/bootstrap_amortized.cu b/src/bootstrap_amortized.cu index 5f9af6d76..727848de1 100644 --- a/src/bootstrap_amortized.cu +++ b/src/bootstrap_amortized.cu @@ -58,64 +58,57 @@ */ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32( - void *v_stream, - void *lwe_out, - void *lut_vector, - void *lut_vector_indexes, - void *lwe_in, - void *bootstrapping_key, - uint32_t lwe_dimension, - uint32_t glwe_dimension, - uint32_t polynomial_size, - uint32_t base_log, - uint32_t l_gadget, - uint32_t num_samples, - uint32_t num_lut_vectors, - uint32_t lwe_idx, - uint32_t max_shared_memory) { + void *v_stream, void *lwe_out, void *lut_vector, void *lut_vector_indexes, + void *lwe_in, void *bootstrapping_key, uint32_t lwe_dimension, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, + uint32_t l_gadget, uint32_t num_samples, uint32_t num_lut_vectors, + uint32_t lwe_idx, uint32_t max_shared_memory) { - assert(("Error (GPU amortized PBS): base log should be <= 16", base_log <= 16)); - assert(("Error (GPU amortized PBS): glwe_dimension should be equal to 1", glwe_dimension == 1)); - assert(("Error (GPU amortized PBS): polynomial size should be one of 512, 1024, 2048, 4096, 8192", - polynomial_size == 512 || polynomial_size == 1024 || polynomial_size == 2048 || - polynomial_size == 4096 || polynomial_size == 8192)); + assert( + ("Error (GPU amortized PBS): base log should be <= 16", base_log <= 16)); + assert(("Error (GPU amortized PBS): glwe_dimension should be equal to 1", + glwe_dimension == 1)); + assert(("Error (GPU amortized PBS): polynomial size should be one of 512, " + "1024, 2048, 4096, 8192", + polynomial_size == 512 || polynomial_size == 1024 || + polynomial_size == 2048 || polynomial_size == 4096 || + polynomial_size == 8192)); switch (polynomial_size) 
{ case 512: host_bootstrap_amortized>( v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in, - (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, - base_log, l_gadget, num_samples, - num_lut_vectors, lwe_idx, max_shared_memory); - break; + (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, + l_gadget, num_samples, num_lut_vectors, lwe_idx, max_shared_memory); + break; case 1024: host_bootstrap_amortized>( v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in, - (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, l_gadget, num_samples, - num_lut_vectors, lwe_idx, max_shared_memory); + (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, + l_gadget, num_samples, num_lut_vectors, lwe_idx, max_shared_memory); break; case 2048: host_bootstrap_amortized>( v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in, - (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, l_gadget, num_samples, - num_lut_vectors, lwe_idx, max_shared_memory); + (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, + l_gadget, num_samples, num_lut_vectors, lwe_idx, max_shared_memory); break; case 4096: host_bootstrap_amortized>( v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in, - (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, l_gadget, num_samples, - num_lut_vectors, lwe_idx, max_shared_memory); + (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, + l_gadget, num_samples, num_lut_vectors, lwe_idx, max_shared_memory); break; case 8192: host_bootstrap_amortized>( v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in, - (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, l_gadget, num_samples, - num_lut_vectors, lwe_idx, max_shared_memory); + (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, + l_gadget, num_samples, num_lut_vectors, lwe_idx, max_shared_memory); break; default: break; @@ -123,64 +116,57 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32( } void cuda_bootstrap_amortized_lwe_ciphertext_vector_64( - void *v_stream, - void *lwe_out, - void *lut_vector, - void *lut_vector_indexes, - void *lwe_in, - void *bootstrapping_key, - uint32_t lwe_dimension, - uint32_t glwe_dimension, - uint32_t polynomial_size, - uint32_t base_log, - uint32_t l_gadget, - uint32_t num_samples, - uint32_t num_lut_vectors, - uint32_t lwe_idx, - uint32_t max_shared_memory) { + void *v_stream, void *lwe_out, void *lut_vector, void *lut_vector_indexes, + void *lwe_in, void *bootstrapping_key, uint32_t lwe_dimension, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, + uint32_t l_gadget, uint32_t num_samples, uint32_t num_lut_vectors, + uint32_t lwe_idx, uint32_t max_shared_memory) { + + assert( + ("Error (GPU amortized PBS): base log should be <= 16", base_log <= 16)); + assert(("Error (GPU amortized PBS): glwe_dimension should be equal to 1", + glwe_dimension == 1)); + assert(("Error (GPU amortized PBS): polynomial size should be one of 512, " + "1024, 2048, 4096, 8192", + polynomial_size == 512 || polynomial_size == 1024 || + polynomial_size == 2048 || polynomial_size == 4096 || + polynomial_size == 8192)); - assert(("Error (GPU amortized PBS): base log should be 
<= 16", base_log <= 16)); - assert(("Error (GPU amortized PBS): glwe_dimension should be equal to 1", glwe_dimension == 1)); - assert(("Error (GPU amortized PBS): polynomial size should be one of 512, 1024, 2048, 4096, 8192", - polynomial_size == 512 || polynomial_size == 1024 || polynomial_size == 2048 || - polynomial_size == 4096 || polynomial_size == 8192)); - switch (polynomial_size) { case 512: host_bootstrap_amortized>( v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in, - (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, - base_log, l_gadget, num_samples, - num_lut_vectors, lwe_idx, max_shared_memory); + (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, + l_gadget, num_samples, num_lut_vectors, lwe_idx, max_shared_memory); break; case 1024: host_bootstrap_amortized>( v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in, - (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, l_gadget, num_samples, - num_lut_vectors, lwe_idx, max_shared_memory); + (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, + l_gadget, num_samples, num_lut_vectors, lwe_idx, max_shared_memory); break; case 2048: host_bootstrap_amortized>( v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in, - (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, l_gadget, num_samples, - num_lut_vectors, lwe_idx, max_shared_memory); + (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, + l_gadget, num_samples, num_lut_vectors, lwe_idx, max_shared_memory); break; case 4096: host_bootstrap_amortized>( v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in, - (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, l_gadget, num_samples, - num_lut_vectors, lwe_idx, max_shared_memory); + (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, + l_gadget, num_samples, num_lut_vectors, lwe_idx, max_shared_memory); break; case 8192: host_bootstrap_amortized>( v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in, - (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, l_gadget, num_samples, - num_lut_vectors, lwe_idx, max_shared_memory); + (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, + l_gadget, num_samples, num_lut_vectors, lwe_idx, max_shared_memory); break; default: break; diff --git a/src/bootstrap_amortized.cuh b/src/bootstrap_amortized.cuh index 8c4e9eec6..b7b85798b 100644 --- a/src/bootstrap_amortized.cuh +++ b/src/bootstrap_amortized.cuh @@ -54,18 +54,10 @@ template * is not FULLSM */ __global__ void device_bootstrap_amortized( - Torus *lwe_out, - Torus *lut_vector, - uint32_t *lut_vector_indexes, - Torus *lwe_in, - double2 *bootstrapping_key, - char *device_mem, - uint32_t lwe_mask_size, - uint32_t polynomial_size, - uint32_t base_log, - uint32_t l_gadget, - uint32_t lwe_idx, - size_t device_memory_size_per_sample) { + Torus *lwe_out, Torus *lut_vector, uint32_t *lut_vector_indexes, + Torus *lwe_in, double2 *bootstrapping_key, char *device_mem, + uint32_t lwe_mask_size, uint32_t polynomial_size, uint32_t base_log, + uint32_t l_gadget, uint32_t lwe_idx, size_t device_memory_size_per_sample) { // We use shared memory for the polynomials that are used often during the // bootstrap, 
since shared memory is kept in L1 cache and accessing it is // much faster than global memory @@ -103,8 +95,8 @@ __global__ void device_bootstrap_amortized( auto block_lwe_in = &lwe_in[blockIdx.x * (lwe_mask_size + 1)]; Torus *block_lut_vector = - &lut_vector[lut_vector_indexes[lwe_idx + blockIdx.x] * params::degree * 2]; - + &lut_vector[lut_vector_indexes[lwe_idx + blockIdx.x] * params::degree * + 2]; GadgetMatrix gadget(base_log, l_gadget); @@ -114,11 +106,11 @@ __global__ void device_bootstrap_amortized( 2 * params::degree); // 2 * params::log2_degree + 1); divide_by_monomial_negacyclic_inplace( + params::degree / params::opt>( accumulator_mask, block_lut_vector, b_hat, false); divide_by_monomial_negacyclic_inplace( + params::degree / params::opt>( accumulator_body, &block_lut_vector[params::degree], b_hat, false); // Loop over all the mask elements of the sample to accumulate @@ -147,11 +139,11 @@ __global__ void device_bootstrap_amortized( // Perform a rounding to increase the accuracy of the // bootstrapped ciphertext round_to_closest_multiple_inplace( + params::degree / params::opt>( accumulator_mask_rotated, base_log, l_gadget); round_to_closest_multiple_inplace( + params::degree / params::opt>( accumulator_body_rotated, base_log, l_gadget); // Initialize the polynomial multiplication via FFT arrays // The polynomial multiplications happens at the block level @@ -195,13 +187,11 @@ __global__ void device_bootstrap_amortized( // Get the bootstrapping key piece necessary for the multiplication // It is already in the Fourier domain auto bsk_mask_slice = PolynomialFourier( - get_ith_mask_kth_block( - bootstrapping_key, iteration, 0, decomp_level, - polynomial_size, 1, l_gadget)); + get_ith_mask_kth_block(bootstrapping_key, iteration, 0, decomp_level, + polynomial_size, 1, l_gadget)); auto bsk_body_slice = PolynomialFourier( - get_ith_body_kth_block( - bootstrapping_key, iteration, 0, decomp_level, - polynomial_size, 1, l_gadget)); + get_ith_body_kth_block(bootstrapping_key, iteration, 0, decomp_level, + polynomial_size, 1, l_gadget)); synchronize_threads_in_block(); @@ -230,7 +220,7 @@ __global__ void device_bootstrap_amortized( polynomial_size, 1, l_gadget)); auto bsk_body_slice_2 = PolynomialFourier( get_ith_body_kth_block(bootstrapping_key, iteration, 1, decomp_level, - polynomial_size, 1, l_gadget)); + polynomial_size, 1, l_gadget)); synchronize_threads_in_block(); @@ -305,20 +295,11 @@ __global__ void device_bootstrap_amortized( template __host__ void host_bootstrap_amortized( - void *v_stream, - Torus *lwe_out, - Torus *lut_vector, - uint32_t *lut_vector_indexes, - Torus *lwe_in, - double2 *bootstrapping_key, - uint32_t input_lwe_dimension, - uint32_t polynomial_size, - uint32_t base_log, - uint32_t l_gadget, - uint32_t input_lwe_ciphertext_count, - uint32_t num_lut_vectors, - uint32_t lwe_idx, - uint32_t max_shared_memory) { + void *v_stream, Torus *lwe_out, Torus *lut_vector, + uint32_t *lut_vector_indexes, Torus *lwe_in, double2 *bootstrapping_key, + uint32_t input_lwe_dimension, uint32_t polynomial_size, uint32_t base_log, + uint32_t l_gadget, uint32_t input_lwe_ciphertext_count, + uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory) { int SM_FULL = sizeof(Torus) * polynomial_size + // accumulator mask sizeof(Torus) * polynomial_size + // accumulator body @@ -354,28 +335,24 @@ __host__ void host_bootstrap_amortized( // from one of three templates (no use, partial use or full use // of shared memory) if (max_shared_memory < SM_PART) { - 
checkCudaErrors(cudaMalloc((void **)&d_mem, DM_FULL * input_lwe_ciphertext_count)); - device_bootstrap_amortized - <<>>( - lwe_out, lut_vector, lut_vector_indexes, lwe_in, - bootstrapping_key, d_mem, - input_lwe_dimension, polynomial_size, - base_log, l_gadget, lwe_idx, DM_FULL); + checkCudaErrors( + cudaMalloc((void **)&d_mem, DM_FULL * input_lwe_ciphertext_count)); + device_bootstrap_amortized<<>>( + lwe_out, lut_vector, lut_vector_indexes, lwe_in, bootstrapping_key, + d_mem, input_lwe_dimension, polynomial_size, base_log, l_gadget, + lwe_idx, DM_FULL); } else if (max_shared_memory < SM_FULL) { cudaFuncSetAttribute(device_bootstrap_amortized, - cudaFuncAttributeMaxDynamicSharedMemorySize, - SM_PART); - cudaFuncSetCacheConfig( - device_bootstrap_amortized, - cudaFuncCachePreferShared); - checkCudaErrors(cudaMalloc((void **)&d_mem, DM_PART * input_lwe_ciphertext_count)); + cudaFuncAttributeMaxDynamicSharedMemorySize, SM_PART); + cudaFuncSetCacheConfig(device_bootstrap_amortized, + cudaFuncCachePreferShared); + checkCudaErrors( + cudaMalloc((void **)&d_mem, DM_PART * input_lwe_ciphertext_count)); device_bootstrap_amortized - <<>>( - lwe_out, lut_vector, lut_vector_indexes, - lwe_in, bootstrapping_key, - d_mem, input_lwe_dimension, polynomial_size, - base_log, l_gadget, lwe_idx, - DM_PART); + <<>>( + lwe_out, lut_vector, lut_vector_indexes, lwe_in, bootstrapping_key, + d_mem, input_lwe_dimension, polynomial_size, base_log, l_gadget, + lwe_idx, DM_PART); } else { // For devices with compute capability 7.x a single thread block can // address the full capacity of shared memory. Shared memory on the @@ -384,26 +361,22 @@ __host__ void host_bootstrap_amortized( // just does nothing and the amount of shared memory used is 48 KB checkCudaErrors(cudaFuncSetAttribute( device_bootstrap_amortized, - cudaFuncAttributeMaxDynamicSharedMemorySize, - SM_FULL)); + cudaFuncAttributeMaxDynamicSharedMemorySize, SM_FULL)); checkCudaErrors(cudaFuncSetCacheConfig( device_bootstrap_amortized, cudaFuncCachePreferShared)); checkCudaErrors(cudaMalloc((void **)&d_mem, 0)); device_bootstrap_amortized - <<>>( - lwe_out, lut_vector, lut_vector_indexes, - lwe_in, bootstrapping_key, - d_mem, input_lwe_dimension, polynomial_size, - base_log, l_gadget, lwe_idx, - 0); + <<>>( + lwe_out, lut_vector, lut_vector_indexes, lwe_in, bootstrapping_key, + d_mem, input_lwe_dimension, polynomial_size, base_log, l_gadget, + lwe_idx, 0); } // Synchronize the streams before copying the result to lwe_out at the right // place cudaStreamSynchronize(*stream); cudaFree(d_mem); - } template @@ -415,8 +388,8 @@ int cuda_get_pbs_per_gpu(int polynomial_size) { cudaDeviceProp device_properties; cudaGetDeviceProperties(&device_properties, 0); cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &blocks_per_sm, device_bootstrap_amortized, - num_threads, 0); + &blocks_per_sm, device_bootstrap_amortized, num_threads, + 0); return device_properties.multiProcessorCount * blocks_per_sm; } diff --git a/src/bootstrap_low_latency.cu b/src/bootstrap_low_latency.cu index 848d4ddaf..b6e6d5f78 100644 --- a/src/bootstrap_low_latency.cu +++ b/src/bootstrap_low_latency.cu @@ -57,76 +57,66 @@ * values for the FFT */ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32( - void *v_stream, - void *lwe_out, - void *lut_vector, - void *lut_vector_indexes, - void *lwe_in, - void *bootstrapping_key, - uint32_t lwe_dimension, - uint32_t glwe_dimension, - uint32_t polynomial_size, - uint32_t base_log, - uint32_t l_gadget, - uint32_t num_samples, - uint32_t num_lut_vectors, - 
uint32_t lwe_idx, - uint32_t max_shared_memory) { + void *v_stream, void *lwe_out, void *lut_vector, void *lut_vector_indexes, + void *lwe_in, void *bootstrapping_key, uint32_t lwe_dimension, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, + uint32_t l_gadget, uint32_t num_samples, uint32_t num_lut_vectors, + uint32_t lwe_idx, uint32_t max_shared_memory) { - assert(("Error (GPU low latency PBS): base log should be <= 16", base_log <= 16)); - assert(("Error (GPU low latency PBS): glwe_dimension should be equal to 1", - glwe_dimension == 1)); - assert(("Error (GPU low latency PBS): polynomial size should be one of 512, 1024, 2048", - polynomial_size == 512 || polynomial_size == 1024 || polynomial_size == 2048)); - // The number of samples should be lower than SM/(4 * (k + 1) * l) (the - // factor 4 being related to the occupancy of 50%). The only supported - // value for k is 1, so k + 1 = 2 for now. - int number_of_sm = 0; - cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0); - assert(("Error (GPU low latency PBS): the number of input LWEs must be lower or equal to the " - "number of streaming multiprocessors on the device divided by 8 * l_gadget", - num_samples <= number_of_sm / 4. / 2. / l_gadget)); + assert(("Error (GPU low latency PBS): base log should be <= 16", + base_log <= 16)); + assert(("Error (GPU low latency PBS): glwe_dimension should be equal to 1", + glwe_dimension == 1)); + assert(("Error (GPU low latency PBS): polynomial size should be one of 512, " + "1024, 2048", + polynomial_size == 512 || polynomial_size == 1024 || + polynomial_size == 2048)); + // The number of samples should be lower than SM/(4 * (k + 1) * l) (the + // factor 4 being related to the occupancy of 50%). The only supported + // value for k is 1, so k + 1 = 2 for now. + int number_of_sm = 0; + cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0); + assert(("Error (GPU low latency PBS): the number of input LWEs must be lower " + "or equal to the " + "number of streaming multiprocessors on the device divided by 8 * " + "l_gadget", + num_samples <= number_of_sm / 4. / 2. 
/ l_gadget)); switch (polynomial_size) { case 512: host_bootstrap_low_latency>( v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in, - (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, - base_log, l_gadget, num_samples, - num_lut_vectors); + (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, + l_gadget, num_samples, num_lut_vectors); break; case 1024: host_bootstrap_low_latency>( v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in, - (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, - base_log, l_gadget, num_samples, - num_lut_vectors); + (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, + l_gadget, num_samples, num_lut_vectors); break; case 2048: host_bootstrap_low_latency>( v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in, - (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, - base_log, l_gadget, num_samples, - num_lut_vectors); + (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, + l_gadget, num_samples, num_lut_vectors); break; case 4096: host_bootstrap_low_latency>( v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in, - (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, - base_log, l_gadget, num_samples, - num_lut_vectors); + (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, + l_gadget, num_samples, num_lut_vectors); break; case 8192: host_bootstrap_low_latency>( v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in, - (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, - base_log, l_gadget, num_samples, - num_lut_vectors); + (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, + l_gadget, num_samples, num_lut_vectors); break; default: break; @@ -134,79 +124,68 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32( } void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64( - void *v_stream, - void *lwe_out, - void *lut_vector, - void *lut_vector_indexes, - void *lwe_in, - void *bootstrapping_key, - uint32_t lwe_dimension, - uint32_t glwe_dimension, - uint32_t polynomial_size, - uint32_t base_log, - uint32_t l_gadget, - uint32_t num_samples, - uint32_t num_lut_vectors, - uint32_t lwe_idx, - uint32_t max_shared_memory) { + void *v_stream, void *lwe_out, void *lut_vector, void *lut_vector_indexes, + void *lwe_in, void *bootstrapping_key, uint32_t lwe_dimension, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, + uint32_t l_gadget, uint32_t num_samples, uint32_t num_lut_vectors, + uint32_t lwe_idx, uint32_t max_shared_memory) { - assert(("Error (GPU low latency PBS): base log should be <= 16", base_log <= 16)); - assert(("Error (GPU low latency PBS): glwe_dimension should be equal to 1", - glwe_dimension == 1)); - assert(("Error (GPU low latency PBS): polynomial size should be one of 512, 1024, 2048", - polynomial_size == 512 || polynomial_size == 1024 || polynomial_size == 2048)); - // The number of samples should be lower than SM/(4 * (k + 1) * l) (the - // factor 4 being related to the occupancy of 50%). The only supported - // value for k is 1, so k + 1 = 2 for now. 
- int number_of_sm = 0; - cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0); - assert(("Error (GPU low latency PBS): the number of input LWEs must be lower or equal to the " - "number of streaming multiprocessors on the device divided by 8 * l_gadget", - num_samples <= number_of_sm / 4. / 2. / l_gadget)); + assert(("Error (GPU low latency PBS): base log should be <= 16", + base_log <= 16)); + assert(("Error (GPU low latency PBS): glwe_dimension should be equal to 1", + glwe_dimension == 1)); + assert(("Error (GPU low latency PBS): polynomial size should be one of 512, " + "1024, 2048", + polynomial_size == 512 || polynomial_size == 1024 || + polynomial_size == 2048)); + // The number of samples should be lower than SM/(4 * (k + 1) * l) (the + // factor 4 being related to the occupancy of 50%). The only supported + // value for k is 1, so k + 1 = 2 for now. + int number_of_sm = 0; + cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0); + assert(("Error (GPU low latency PBS): the number of input LWEs must be lower " + "or equal to the " + "number of streaming multiprocessors on the device divided by 8 * " + "l_gadget", + num_samples <= number_of_sm / 4. / 2. / l_gadget)); switch (polynomial_size) { case 512: host_bootstrap_low_latency>( v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in, - (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, - base_log, l_gadget, num_samples, - num_lut_vectors); + (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, + l_gadget, num_samples, num_lut_vectors); break; case 1024: host_bootstrap_low_latency>( v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in, - (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, - base_log, l_gadget, num_samples, - num_lut_vectors); + (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, + l_gadget, num_samples, num_lut_vectors); break; case 2048: host_bootstrap_low_latency>( v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in, - (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, - base_log, l_gadget, num_samples, - num_lut_vectors); + (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, + l_gadget, num_samples, num_lut_vectors); break; case 4096: host_bootstrap_low_latency>( v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in, - (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, - base_log, l_gadget, num_samples, - num_lut_vectors); + (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, + l_gadget, num_samples, num_lut_vectors); break; case 8192: host_bootstrap_low_latency>( v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in, - (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, - base_log, l_gadget, num_samples, - num_lut_vectors); + (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, + l_gadget, num_samples, num_lut_vectors); break; default: break; } } - diff --git a/src/bootstrap_low_latency.cuh b/src/bootstrap_low_latency.cuh index 8b845aed5..00c0b7ed4 100644 --- a/src/bootstrap_low_latency.cuh +++ b/src/bootstrap_low_latency.cuh @@ -29,17 +29,13 @@ namespace cg = cooperative_groups; template __device__ void -mul_trgsw_trlwe(Torus *accumulator, - double2 *fft, - int16_t 
*trlwe_decomposed, - double2 *mask_join_buffer, - double2 *body_join_buffer, - double2 *bootstrapping_key, - int polynomial_size, int l_gadget, int iteration, grid_group &grid) { +mul_trgsw_trlwe(Torus *accumulator, double2 *fft, int16_t *trlwe_decomposed, + double2 *mask_join_buffer, double2 *body_join_buffer, + double2 *bootstrapping_key, int polynomial_size, int l_gadget, + int iteration, grid_group &grid) { // Put the decomposed TRLWE sample in the Fourier domain - real_to_complex_compressed(trlwe_decomposed, - fft); + real_to_complex_compressed(trlwe_decomposed, fft); synchronize_threads_in_block(); // Switch to the FFT space @@ -49,52 +45,49 @@ mul_trgsw_trlwe(Torus *accumulator, correction_direct_fft_inplace(fft); synchronize_threads_in_block(); - - // Get the pieces of the bootstrapping key that will be needed for the // external product; blockIdx.x is the ID of the block that's executing // this function, so we end up getting the lines of the bootstrapping key // needed to perform the external product in this block (corresponding to // the same decomposition level) - auto bsk_mask_slice = PolynomialFourier( - get_ith_mask_kth_block( - bootstrapping_key, iteration, blockIdx.y, blockIdx.x, - polynomial_size, 1, l_gadget)); - auto bsk_body_slice = PolynomialFourier( - get_ith_body_kth_block( - bootstrapping_key, iteration, blockIdx.y, blockIdx.x, - polynomial_size, 1, l_gadget)); + auto bsk_mask_slice = PolynomialFourier( + get_ith_mask_kth_block(bootstrapping_key, iteration, blockIdx.y, + blockIdx.x, polynomial_size, 1, l_gadget)); + auto bsk_body_slice = PolynomialFourier( + get_ith_body_kth_block(bootstrapping_key, iteration, blockIdx.y, + blockIdx.x, polynomial_size, 1, l_gadget)); // Perform the matrix multiplication between the RGSW and the TRLWE, // each block operating on a single level for mask and body - auto first_processed_bsk = (blockIdx.y == 0) ? bsk_mask_slice : bsk_body_slice; - auto second_processed_bsk = (blockIdx.y == 0) ? bsk_body_slice : bsk_mask_slice; + auto first_processed_bsk = + (blockIdx.y == 0) ? bsk_mask_slice : bsk_body_slice; + auto second_processed_bsk = + (blockIdx.y == 0) ? bsk_body_slice : bsk_mask_slice; - auto first_processed_acc = (blockIdx.y == 0) ? - &mask_join_buffer[params::degree / 2 * blockIdx.x] : - &body_join_buffer[params::degree / 2 * blockIdx.x]; - auto second_processed_acc = (blockIdx.y == 0) ? - &body_join_buffer[params::degree / 2 * blockIdx.x] : - &mask_join_buffer[params::degree / 2 * blockIdx.x]; + auto first_processed_acc = + (blockIdx.y == 0) ? &mask_join_buffer[params::degree / 2 * blockIdx.x] + : &body_join_buffer[params::degree / 2 * blockIdx.x]; + auto second_processed_acc = + (blockIdx.y == 0) ? 
&body_join_buffer[params::degree / 2 * blockIdx.x] + : &mask_join_buffer[params::degree / 2 * blockIdx.x]; int tid = threadIdx.x; - //first product - for(int i = 0; i < params::opt / 2; i++) { - first_processed_acc[tid] = fft[tid] * first_processed_bsk.m_values[tid]; - tid += params::degree / params::opt; + // first product + for (int i = 0; i < params::opt / 2; i++) { + first_processed_acc[tid] = fft[tid] * first_processed_bsk.m_values[tid]; + tid += params::degree / params::opt; } grid.sync(); tid = threadIdx.x; - //second product - for(int i = 0; i < params::opt / 2; i++) { - second_processed_acc[tid] += fft[tid] * second_processed_bsk.m_values[tid]; - tid += params::degree / params::opt; - } - + // second product + for (int i = 0; i < params::opt / 2; i++) { + second_processed_acc[tid] += fft[tid] * second_processed_bsk.m_values[tid]; + tid += params::degree / params::opt; + } // ----------------------------------------------------------------- @@ -102,24 +95,24 @@ mul_trgsw_trlwe(Torus *accumulator, // values needed from every other block grid.sync(); - auto src_acc = (blockIdx.y == 0) ? mask_join_buffer : body_join_buffer; + auto src_acc = (blockIdx.y == 0) ? mask_join_buffer : body_join_buffer; // copy first product into fft buffer tid = threadIdx.x; for (int i = 0; i < params::opt / 2; i++) { - fft[tid] = src_acc[tid]; - tid += params::degree / params::opt; + fft[tid] = src_acc[tid]; + tid += params::degree / params::opt; } synchronize_threads_in_block(); // accumulate rest of the products into fft buffer for (int l = 1; l < gridDim.x; l++) { - auto cur_src_acc = &src_acc[l * params::degree / 2]; - tid = threadIdx.x; - for (int i = 0; i < params::opt / 2; i++) { - fft[tid] += cur_src_acc[tid]; - tid += params::degree / params::opt; - } + auto cur_src_acc = &src_acc[l * params::degree / 2]; + tid = threadIdx.x; + for (int i = 0; i < params::opt / 2; i++) { + fft[tid] += cur_src_acc[tid]; + tid += params::degree / params::opt; + } } synchronize_threads_in_block(); @@ -142,49 +135,46 @@ template * Kernel launched by the low latency version of the * bootstrapping, that uses cooperative groups * lwe_out - vector of output LWEs, with length (polynomial_size+1)*num_samples - * lut_vector - vector of look up tables with length polynomial_size * num_samples - * lut_vector_indexes - mapping between lwe_in and lut_vector - * lwe_in - vector of lwe inputs with length (lwe_mask_size + 1) * num_samples + * lut_vector - vector of look up tables with length polynomial_size * + * num_samples + * lut_vector_indexes - mapping between lwe_in and lut_vector + * lwe_in - vector of lwe inputs with length (lwe_mask_size + 1) * num_samples * */ __global__ void device_bootstrap_low_latency( - Torus *lwe_out, - Torus *lut_vector, - Torus *lwe_in, - double2 *bootstrapping_key, - double2 *mask_join_buffer, - double2 *body_join_buffer, - uint32_t lwe_mask_size, - uint32_t polynomial_size, uint32_t base_log, uint32_t l_gadget - ) { + Torus *lwe_out, Torus *lut_vector, Torus *lwe_in, + double2 *bootstrapping_key, double2 *mask_join_buffer, + double2 *body_join_buffer, uint32_t lwe_mask_size, uint32_t polynomial_size, + uint32_t base_log, uint32_t l_gadget) { grid_group grid = this_grid(); - + // We use shared memory for the polynomials that are used often during the // bootstrap, since shared memory is kept in L1 cache and accessing it is // much faster than global memory extern __shared__ char sharedmem[]; - char* selected_memory = sharedmem; + char *selected_memory = sharedmem; int16_t *accumulator_decomposed = 
(int16_t *)selected_memory; - Torus *accumulator = (Torus*)accumulator_decomposed + - polynomial_size / (sizeof(Torus) / sizeof(int16_t)); - double2 *accumulator_fft = (double2*)accumulator + - polynomial_size / (sizeof(double2) / sizeof(Torus)); + Torus *accumulator = (Torus *)accumulator_decomposed + + polynomial_size / (sizeof(Torus) / sizeof(int16_t)); + double2 *accumulator_fft = + (double2 *)accumulator + + polynomial_size / (sizeof(double2) / sizeof(Torus)); // Reuse memory from accumulator_fft for accumulator_rotated - Torus* accumulator_rotated = (Torus*)accumulator_fft; + Torus *accumulator_rotated = (Torus *)accumulator_fft; // The third dimension of the block is used to determine on which ciphertext // this block is operating, in the case of batch bootstraps auto block_lwe_in = &lwe_in[blockIdx.z * (lwe_mask_size + 1)]; - auto block_lut_vector = - &lut_vector[blockIdx.z * params::degree * 2]; + auto block_lut_vector = &lut_vector[blockIdx.z * params::degree * 2]; - auto block_mask_join_buffer = &mask_join_buffer[blockIdx.z * l_gadget * params::degree / 2]; - auto block_body_join_buffer = &body_join_buffer[blockIdx.z * l_gadget * params::degree / 2]; + auto block_mask_join_buffer = + &mask_join_buffer[blockIdx.z * l_gadget * params::degree / 2]; + auto block_body_join_buffer = + &body_join_buffer[blockIdx.z * l_gadget * params::degree / 2]; // Since the space in L1 cache is small, we use the same memory location for // the rotated accumulator and the fft accumulator, since we know that the @@ -192,19 +182,17 @@ __global__ void device_bootstrap_low_latency( GadgetMatrix gadget(base_log, l_gadget); // Put "b" in [0, 2N[ - Torus b_hat = rescale_torus_element( - block_lwe_in[lwe_mask_size], - 2 * params::degree); + Torus b_hat = + rescale_torus_element(block_lwe_in[lwe_mask_size], 2 * params::degree); if (blockIdx.y == 0) { - divide_by_monomial_negacyclic_inplace( - accumulator, block_lut_vector, b_hat, false); - } - else { - divide_by_monomial_negacyclic_inplace( - accumulator, &block_lut_vector[params::degree], b_hat, false); + divide_by_monomial_negacyclic_inplace( + accumulator, block_lut_vector, b_hat, false); + } else { + divide_by_monomial_negacyclic_inplace( + accumulator, &block_lut_vector[params::degree], b_hat, false); } for (int i = 0; i < lwe_mask_size; i++) { @@ -217,15 +205,14 @@ __global__ void device_bootstrap_low_latency( // Perform ACC * (X^{a_hat} - 1) multiply_by_monomial_negacyclic_and_sub_polynomial< - Torus, params::opt, params::degree / params::opt>( - accumulator, accumulator_rotated, a_hat); - + Torus, params::opt, params::degree / params::opt>( + accumulator, accumulator_rotated, a_hat); // Perform a rounding to increase the accuracy of the // bootstrapped ciphertext round_to_closest_multiple_inplace( - accumulator_rotated, base_log, l_gadget); + params::degree / params::opt>( + accumulator_rotated, base_log, l_gadget); // Decompose the accumulator. 
Each block gets one level of the // decomposition, for the mask and the body (so block 0 will have the @@ -239,15 +226,11 @@ __global__ void device_bootstrap_low_latency( synchronize_threads_in_block(); // Perform G^-1(ACC) * RGSW -> TRLWE mul_trgsw_trlwe( - accumulator, - accumulator_fft, - accumulator_decomposed, - block_mask_join_buffer, - block_body_join_buffer, - bootstrapping_key, + accumulator, accumulator_fft, accumulator_decomposed, + block_mask_join_buffer, block_body_join_buffer, bootstrapping_key, polynomial_size, l_gadget, i, grid); } - + auto block_lwe_out = &lwe_out[blockIdx.z * (polynomial_size + 1)]; if (blockIdx.x == 0 && blockIdx.y == 0) { @@ -258,41 +241,31 @@ __global__ void device_bootstrap_low_latency( } else if (blockIdx.x == 0 && blockIdx.y == 1) { sample_extract_body(block_lwe_out, accumulator); } - } - /* * Host wrapper to the low latency version * of bootstrapping */ template __host__ void host_bootstrap_low_latency( - void *v_stream, - Torus *lwe_out, - Torus *lut_vector, - uint32_t *lut_vector_indexes, - Torus *lwe_in, - double2 *bootstrapping_key, - uint32_t lwe_mask_size, - uint32_t polynomial_size, - uint32_t base_log, - uint32_t l_gadget, - uint32_t num_samples, - uint32_t num_lut_vectors) { + void *v_stream, Torus *lwe_out, Torus *lut_vector, + uint32_t *lut_vector_indexes, Torus *lwe_in, double2 *bootstrapping_key, + uint32_t lwe_mask_size, uint32_t polynomial_size, uint32_t base_log, + uint32_t l_gadget, uint32_t num_samples, uint32_t num_lut_vectors) { + auto stream = static_cast(v_stream); - int buffer_size_per_gpu = l_gadget * num_samples * polynomial_size / 2 * sizeof(double2); + int buffer_size_per_gpu = + l_gadget * num_samples * polynomial_size / 2 * sizeof(double2); double2 *mask_buffer_fft; double2 *body_buffer_fft; checkCudaErrors(cudaMalloc((void **)&mask_buffer_fft, buffer_size_per_gpu)); checkCudaErrors(cudaMalloc((void **)&body_buffer_fft, buffer_size_per_gpu)); - - int bytes_needed = - sizeof(int16_t) * polynomial_size + // accumulator_decomp - sizeof(Torus) * polynomial_size + // accumulator - sizeof(double2) * polynomial_size / 2; // accumulator fft + int bytes_needed = sizeof(int16_t) * polynomial_size + // accumulator_decomp + sizeof(Torus) * polynomial_size + // accumulator + sizeof(double2) * polynomial_size / 2; // accumulator fft int thds = polynomial_size / params::opt; dim3 grid(l_gadget, 2, num_samples); @@ -307,17 +280,18 @@ __host__ void host_bootstrap_low_latency( kernel_args[6] = &lwe_mask_size; kernel_args[7] = &polynomial_size; kernel_args[8] = &base_log; - kernel_args[9] =&l_gadget; + kernel_args[9] = &l_gadget; - checkCudaErrors(cudaFuncSetAttribute(device_bootstrap_low_latency, - cudaFuncAttributeMaxDynamicSharedMemorySize, - bytes_needed)); + checkCudaErrors(cudaFuncSetAttribute( + device_bootstrap_low_latency, + cudaFuncAttributeMaxDynamicSharedMemorySize, bytes_needed)); cudaFuncSetCacheConfig(device_bootstrap_low_latency, - cudaFuncCachePreferShared); - - checkCudaErrors(cudaLaunchCooperativeKernel ( (void *)device_bootstrap_low_latency, grid, thds, (void**)kernel_args, bytes_needed, *stream )) ; - + cudaFuncCachePreferShared); + + checkCudaErrors(cudaLaunchCooperativeKernel( + (void *)device_bootstrap_low_latency, grid, thds, + (void **)kernel_args, bytes_needed, *stream)); + // Synchronize the streams before copying the result to lwe_out at the right // place cudaStreamSynchronize(*stream); diff --git a/src/bootstrap_wop.cu b/src/bootstrap_wop.cu index 1220ae3c1..d327a0120 100644 --- a/src/bootstrap_wop.cu +++ 
b/src/bootstrap_wop.cu @@ -1,167 +1,144 @@ #include "bootstrap_wop.cuh" -void cuda_cmux_tree_32( - void *v_stream, - void *glwe_out, - void *ggsw_in, - void *lut_vector, - uint32_t glwe_dimension, - uint32_t polynomial_size, - uint32_t base_log, - uint32_t l_gadget, - uint32_t r, - uint32_t max_shared_memory) { +void cuda_cmux_tree_32(void *v_stream, void *glwe_out, void *ggsw_in, + void *lut_vector, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t base_log, + uint32_t l_gadget, uint32_t r, + uint32_t max_shared_memory) { - assert(("Error (GPU Cmux tree): base log should be <= 16", base_log <= 16)); - assert(("Error (GPU Cmux tree): polynomial size should be one of 512, 1024, 2048, 4096, 8192", - polynomial_size == 512 || polynomial_size == 1024 || polynomial_size == 2048 || - polynomial_size == 4096 || polynomial_size == 8192)); - // For larger k we will need to adjust the mask size - assert(("Error (GPU Cmux tree): glwe_dimension should be equal to 1", glwe_dimension == 1)); - assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should be >= 1 ", - r >= 1)); + assert(("Error (GPU Cmux tree): base log should be <= 16", base_log <= 16)); + assert(("Error (GPU Cmux tree): polynomial size should be one of 512, 1024, " + "2048, 4096, 8192", + polynomial_size == 512 || polynomial_size == 1024 || + polynomial_size == 2048 || polynomial_size == 4096 || + polynomial_size == 8192)); + // For larger k we will need to adjust the mask size + assert(("Error (GPU Cmux tree): glwe_dimension should be equal to 1", + glwe_dimension == 1)); + assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should " + "be >= 1 ", + r >= 1)); - switch (polynomial_size) { - case 512: - host_cmux_tree>( - v_stream, - (uint32_t *) glwe_out, (uint32_t *) ggsw_in, (uint32_t *) lut_vector, - glwe_dimension, polynomial_size, base_log, l_gadget, r, - max_shared_memory); - break; - case 1024: - host_cmux_tree>( - v_stream, - (uint32_t *) glwe_out, (uint32_t *) ggsw_in, (uint32_t *) lut_vector, - glwe_dimension, polynomial_size, base_log, l_gadget, r, - max_shared_memory); - break; - case 2048: - host_cmux_tree>( - v_stream, - (uint32_t *) glwe_out, (uint32_t *) ggsw_in, (uint32_t *) lut_vector, - glwe_dimension, polynomial_size, base_log, l_gadget, r, - max_shared_memory); - break; - case 4096: - host_cmux_tree>( - v_stream, - (uint32_t *) glwe_out, (uint32_t *) ggsw_in, (uint32_t *) lut_vector, - glwe_dimension, polynomial_size, base_log, l_gadget, r, - max_shared_memory); - break; - case 8192: - host_cmux_tree>( - v_stream, - (uint32_t *) glwe_out, (uint32_t *) ggsw_in, (uint32_t *) lut_vector, - glwe_dimension, polynomial_size, base_log, l_gadget, r, - max_shared_memory); - break; - default: - break; - } + switch (polynomial_size) { + case 512: + host_cmux_tree>( + v_stream, (uint32_t *)glwe_out, (uint32_t *)ggsw_in, + (uint32_t *)lut_vector, glwe_dimension, polynomial_size, base_log, + l_gadget, r, max_shared_memory); + break; + case 1024: + host_cmux_tree>( + v_stream, (uint32_t *)glwe_out, (uint32_t *)ggsw_in, + (uint32_t *)lut_vector, glwe_dimension, polynomial_size, base_log, + l_gadget, r, max_shared_memory); + break; + case 2048: + host_cmux_tree>( + v_stream, (uint32_t *)glwe_out, (uint32_t *)ggsw_in, + (uint32_t *)lut_vector, glwe_dimension, polynomial_size, base_log, + l_gadget, r, max_shared_memory); + break; + case 4096: + host_cmux_tree>( + v_stream, (uint32_t *)glwe_out, (uint32_t *)ggsw_in, + (uint32_t *)lut_vector, glwe_dimension, polynomial_size, base_log, + 
l_gadget, r, max_shared_memory); + break; + case 8192: + host_cmux_tree>( + v_stream, (uint32_t *)glwe_out, (uint32_t *)ggsw_in, + (uint32_t *)lut_vector, glwe_dimension, polynomial_size, base_log, + l_gadget, r, max_shared_memory); + break; + default: + break; + } } -void cuda_cmux_tree_64( - void *v_stream, - void *glwe_out, - void *ggsw_in, - void *lut_vector, - uint32_t glwe_dimension, - uint32_t polynomial_size, - uint32_t base_log, - uint32_t l_gadget, - uint32_t r, - uint32_t max_shared_memory) { +void cuda_cmux_tree_64(void *v_stream, void *glwe_out, void *ggsw_in, + void *lut_vector, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t base_log, + uint32_t l_gadget, uint32_t r, + uint32_t max_shared_memory) { - assert(("Error (GPU Cmux tree): base log should be <= 16", base_log <= 16)); - assert(("Error (GPU Cmux tree): polynomial size should be one of 512, 1024, 2048, 4096, 8192", - polynomial_size == 512 || polynomial_size == 1024 || polynomial_size == 2048 || - polynomial_size == 4096 || polynomial_size == 8192)); - // For larger k we will need to adjust the mask size - assert(("Error (GPU Cmux tree): glwe_dimension should be equal to 1", glwe_dimension == 1)); - assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should be >= 1 ", - r >= 1)); + assert(("Error (GPU Cmux tree): base log should be <= 16", base_log <= 16)); + assert(("Error (GPU Cmux tree): polynomial size should be one of 512, 1024, " + "2048, 4096, 8192", + polynomial_size == 512 || polynomial_size == 1024 || + polynomial_size == 2048 || polynomial_size == 4096 || + polynomial_size == 8192)); + // For larger k we will need to adjust the mask size + assert(("Error (GPU Cmux tree): glwe_dimension should be equal to 1", + glwe_dimension == 1)); + assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should " + "be >= 1 ", + r >= 1)); - switch (polynomial_size) { - case 512: - host_cmux_tree>( - v_stream, - (uint64_t *) glwe_out, (uint64_t *) ggsw_in,(uint64_t *) lut_vector, - glwe_dimension, polynomial_size, base_log, l_gadget, r, - max_shared_memory); - break; - case 1024: - host_cmux_tree>( - v_stream, - (uint64_t *) glwe_out, (uint64_t *) ggsw_in,(uint64_t *) lut_vector, - glwe_dimension, polynomial_size, base_log, l_gadget, r, - max_shared_memory); - break; - case 2048: - host_cmux_tree>( - v_stream, - (uint64_t *) glwe_out, (uint64_t *) ggsw_in,(uint64_t *) lut_vector, - glwe_dimension, polynomial_size, base_log, l_gadget, r, - max_shared_memory); - break; - case 4096: - host_cmux_tree>( - v_stream, - (uint64_t *) glwe_out, (uint64_t *) ggsw_in,(uint64_t *) lut_vector, - glwe_dimension, polynomial_size, base_log, l_gadget, r, - max_shared_memory); - break; - case 8192: - host_cmux_tree>( - v_stream, - (uint64_t *) glwe_out, (uint64_t *) ggsw_in,(uint64_t *) lut_vector, - glwe_dimension, polynomial_size, base_log, l_gadget, r, - max_shared_memory); - break; - default: - break; - } + switch (polynomial_size) { + case 512: + host_cmux_tree>( + v_stream, (uint64_t *)glwe_out, (uint64_t *)ggsw_in, + (uint64_t *)lut_vector, glwe_dimension, polynomial_size, base_log, + l_gadget, r, max_shared_memory); + break; + case 1024: + host_cmux_tree>( + v_stream, (uint64_t *)glwe_out, (uint64_t *)ggsw_in, + (uint64_t *)lut_vector, glwe_dimension, polynomial_size, base_log, + l_gadget, r, max_shared_memory); + break; + case 2048: + host_cmux_tree>( + v_stream, (uint64_t *)glwe_out, (uint64_t *)ggsw_in, + (uint64_t *)lut_vector, glwe_dimension, polynomial_size, base_log, + l_gadget, r, 
max_shared_memory); + break; + case 4096: + host_cmux_tree>( + v_stream, (uint64_t *)glwe_out, (uint64_t *)ggsw_in, + (uint64_t *)lut_vector, glwe_dimension, polynomial_size, base_log, + l_gadget, r, max_shared_memory); + break; + case 8192: + host_cmux_tree>( + v_stream, (uint64_t *)glwe_out, (uint64_t *)ggsw_in, + (uint64_t *)lut_vector, glwe_dimension, polynomial_size, base_log, + l_gadget, r, max_shared_memory); + break; + default: + break; + } } - -void cuda_extract_bits_32( - void *v_stream, - void *list_lwe_out, - void *lwe_in, - void *lwe_in_buffer, - void *lwe_in_shifted_buffer, - void *lwe_out_ks_buffer, - void *lwe_out_pbs_buffer, - void *lut_pbs, - void *lut_vector_indexes, - void *ksk, - void *fourier_bsk, - uint32_t number_of_bits, - uint32_t delta_log, - uint32_t lwe_dimension_before, - uint32_t lwe_dimension_after, - uint32_t glwe_dimension, - uint32_t base_log_bsk, - uint32_t l_gadget_bsk, - uint32_t base_log_ksk, - uint32_t l_gadget_ksk, - uint32_t number_of_samples) -{ - assert(("Error (GPU extract bits): base log should be <= 16", base_log_bsk <= 16)); - assert(("Error (GPU extract bits): glwe_dimension should be equal to 1", glwe_dimension == 1)); - assert(("Error (GPU extract bits): lwe_dimension_before should be one of 512, 1024, 2048", - lwe_dimension_before == 512 || lwe_dimension_before == 1024 || - lwe_dimension_before == 2048)); - // The number of samples should be lower than the number of streaming - // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related - // to the occupancy of 50%). The only supported value for k is 1, so - // k + 1 = 2 for now. - int number_of_sm = 0; - cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0); - assert(("Error (GPU extract bits): the number of input LWEs must be lower or equal to the " - "number of streaming multiprocessors on the device divided by 8 * l_gadget_bsk", - number_of_samples <= number_of_sm / 4. / 2. / l_gadget_bsk)); +void cuda_extract_bits_32(void *v_stream, void *list_lwe_out, void *lwe_in, + void *lwe_in_buffer, void *lwe_in_shifted_buffer, + void *lwe_out_ks_buffer, void *lwe_out_pbs_buffer, + void *lut_pbs, void *lut_vector_indexes, void *ksk, + void *fourier_bsk, uint32_t number_of_bits, + uint32_t delta_log, uint32_t lwe_dimension_before, + uint32_t lwe_dimension_after, uint32_t glwe_dimension, + uint32_t base_log_bsk, uint32_t l_gadget_bsk, + uint32_t base_log_ksk, uint32_t l_gadget_ksk, + uint32_t number_of_samples) { + assert(("Error (GPU extract bits): base log should be <= 16", + base_log_bsk <= 16)); + assert(("Error (GPU extract bits): glwe_dimension should be equal to 1", + glwe_dimension == 1)); + assert(("Error (GPU extract bits): lwe_dimension_before should be one of " + "512, 1024, 2048", + lwe_dimension_before == 512 || lwe_dimension_before == 1024 || + lwe_dimension_before == 2048)); + // The number of samples should be lower than the number of streaming + // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related + // to the occupancy of 50%). The only supported value for k is 1, so + // k + 1 = 2 for now. + int number_of_sm = 0; + cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0); + assert(("Error (GPU extract bits): the number of input LWEs must be lower or " + "equal to the " + "number of streaming multiprocessors on the device divided by 8 * " + "l_gadget_bsk", + number_of_samples <= number_of_sm / 4. / 2. 
/ l_gadget_bsk)); switch (lwe_dimension_before) { case 512: @@ -170,9 +147,9 @@ void cuda_extract_bits_32( (uint32_t *)lwe_in_buffer, (uint32_t *)lwe_in_shifted_buffer, (uint32_t *)lwe_out_ks_buffer, (uint32_t *)lwe_out_pbs_buffer, (uint32_t *)lut_pbs, (uint32_t *)lut_vector_indexes, (uint32_t *)ksk, - (double2 *)fourier_bsk, number_of_bits, delta_log, - lwe_dimension_before, lwe_dimension_after, base_log_bsk, l_gadget_bsk, - base_log_ksk, l_gadget_ksk, number_of_samples); + (double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_before, + lwe_dimension_after, base_log_bsk, l_gadget_bsk, base_log_ksk, + l_gadget_ksk, number_of_samples); break; case 1024: host_extract_bits>( @@ -180,9 +157,9 @@ void cuda_extract_bits_32( (uint32_t *)lwe_in_buffer, (uint32_t *)lwe_in_shifted_buffer, (uint32_t *)lwe_out_ks_buffer, (uint32_t *)lwe_out_pbs_buffer, (uint32_t *)lut_pbs, (uint32_t *)lut_vector_indexes, (uint32_t *)ksk, - (double2 *)fourier_bsk, number_of_bits, delta_log, - lwe_dimension_before, lwe_dimension_after, base_log_bsk, l_gadget_bsk, - base_log_ksk, l_gadget_ksk, number_of_samples); + (double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_before, + lwe_dimension_after, base_log_bsk, l_gadget_bsk, base_log_ksk, + l_gadget_ksk, number_of_samples); break; case 2048: host_extract_bits>( @@ -190,55 +167,44 @@ void cuda_extract_bits_32( (uint32_t *)lwe_in_buffer, (uint32_t *)lwe_in_shifted_buffer, (uint32_t *)lwe_out_ks_buffer, (uint32_t *)lwe_out_pbs_buffer, (uint32_t *)lut_pbs, (uint32_t *)lut_vector_indexes, (uint32_t *)ksk, - (double2 *)fourier_bsk, number_of_bits, delta_log, - lwe_dimension_before, lwe_dimension_after, base_log_bsk, l_gadget_bsk, - base_log_ksk, l_gadget_ksk, number_of_samples); + (double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_before, + lwe_dimension_after, base_log_bsk, l_gadget_bsk, base_log_ksk, + l_gadget_ksk, number_of_samples); break; default: break; } - } - - -void cuda_extract_bits_64( - void *v_stream, - void *list_lwe_out, - void *lwe_in, - void *lwe_in_buffer, - void *lwe_in_shifted_buffer, - void *lwe_out_ks_buffer, - void *lwe_out_pbs_buffer, - void *lut_pbs, - void *lut_vector_indexes, - void *ksk, - void *fourier_bsk, - uint32_t number_of_bits, - uint32_t delta_log, - uint32_t lwe_dimension_before, - uint32_t lwe_dimension_after, - uint32_t glwe_dimension, - uint32_t base_log_bsk, - uint32_t l_gadget_bsk, - uint32_t base_log_ksk, - uint32_t l_gadget_ksk, - uint32_t number_of_samples) -{ - assert(("Error (GPU extract bits): base log should be <= 16", base_log_bsk <= 16)); - assert(("Error (GPU extract bits): glwe_dimension should be equal to 1", glwe_dimension == 1)); - assert(("Error (GPU extract bits): lwe_dimension_before should be one of 512, 1024, 2048", - lwe_dimension_before == 512 || lwe_dimension_before == 1024 || - lwe_dimension_before == 2048)); - // The number of samples should be lower than the number of streaming - // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related - // to the occupancy of 50%). The only supported value for k is 1, so - // k + 1 = 2 for now. - int number_of_sm = 0; - cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0); - assert(("Error (GPU extract bits): the number of input LWEs must be lower or equal to the " - "number of streaming multiprocessors on the device divided by 8 * l_gadget_bsk", - number_of_samples <= number_of_sm / 4. / 2. 
/ l_gadget_bsk)); +void cuda_extract_bits_64(void *v_stream, void *list_lwe_out, void *lwe_in, + void *lwe_in_buffer, void *lwe_in_shifted_buffer, + void *lwe_out_ks_buffer, void *lwe_out_pbs_buffer, + void *lut_pbs, void *lut_vector_indexes, void *ksk, + void *fourier_bsk, uint32_t number_of_bits, + uint32_t delta_log, uint32_t lwe_dimension_before, + uint32_t lwe_dimension_after, uint32_t glwe_dimension, + uint32_t base_log_bsk, uint32_t l_gadget_bsk, + uint32_t base_log_ksk, uint32_t l_gadget_ksk, + uint32_t number_of_samples) { + assert(("Error (GPU extract bits): base log should be <= 16", + base_log_bsk <= 16)); + assert(("Error (GPU extract bits): glwe_dimension should be equal to 1", + glwe_dimension == 1)); + assert(("Error (GPU extract bits): lwe_dimension_before should be one of " + "512, 1024, 2048", + lwe_dimension_before == 512 || lwe_dimension_before == 1024 || + lwe_dimension_before == 2048)); + // The number of samples should be lower than the number of streaming + // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related + // to the occupancy of 50%). The only supported value for k is 1, so + // k + 1 = 2 for now. + int number_of_sm = 0; + cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0); + assert(("Error (GPU extract bits): the number of input LWEs must be lower or " + "equal to the " + "number of streaming multiprocessors on the device divided by 8 * " + "l_gadget_bsk", + number_of_samples <= number_of_sm / 4. / 2. / l_gadget_bsk)); switch (lwe_dimension_before) { case 512: @@ -247,9 +213,9 @@ void cuda_extract_bits_64( (uint64_t *)lwe_in_buffer, (uint64_t *)lwe_in_shifted_buffer, (uint64_t *)lwe_out_ks_buffer, (uint64_t *)lwe_out_pbs_buffer, (uint64_t *)lut_pbs, (uint32_t *)lut_vector_indexes, (uint64_t *)ksk, - (double2 *)fourier_bsk, number_of_bits, delta_log, - lwe_dimension_before, lwe_dimension_after, base_log_bsk, l_gadget_bsk, - base_log_ksk, l_gadget_ksk, number_of_samples); + (double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_before, + lwe_dimension_after, base_log_bsk, l_gadget_bsk, base_log_ksk, + l_gadget_ksk, number_of_samples); break; case 1024: host_extract_bits>( @@ -257,9 +223,9 @@ void cuda_extract_bits_64( (uint64_t *)lwe_in_buffer, (uint64_t *)lwe_in_shifted_buffer, (uint64_t *)lwe_out_ks_buffer, (uint64_t *)lwe_out_pbs_buffer, (uint64_t *)lut_pbs, (uint32_t *)lut_vector_indexes, (uint64_t *)ksk, - (double2 *)fourier_bsk, number_of_bits, delta_log, - lwe_dimension_before, lwe_dimension_after, base_log_bsk, l_gadget_bsk, - base_log_ksk, l_gadget_ksk, number_of_samples); + (double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_before, + lwe_dimension_after, base_log_bsk, l_gadget_bsk, base_log_ksk, + l_gadget_ksk, number_of_samples); break; case 2048: host_extract_bits>( @@ -267,14 +233,11 @@ void cuda_extract_bits_64( (uint64_t *)lwe_in_buffer, (uint64_t *)lwe_in_shifted_buffer, (uint64_t *)lwe_out_ks_buffer, (uint64_t *)lwe_out_pbs_buffer, (uint64_t *)lut_pbs, (uint32_t *)lut_vector_indexes, (uint64_t *)ksk, - (double2 *)fourier_bsk, number_of_bits, delta_log, - lwe_dimension_before, lwe_dimension_after, base_log_bsk, l_gadget_bsk, - base_log_ksk, l_gadget_ksk, number_of_samples); + (double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_before, + lwe_dimension_after, base_log_bsk, l_gadget_bsk, base_log_ksk, + l_gadget_ksk, number_of_samples); break; default: break; } - } - - diff --git a/src/bootstrap_wop.cuh b/src/bootstrap_wop.cuh index 964d55071..2034e7255 100644 --- 
a/src/bootstrap_wop.cuh +++ b/src/bootstrap_wop.cuh @@ -5,79 +5,80 @@ #include "../include/helper_cuda.h" #include "bootstrap.h" +#include "bootstrap_low_latency.cuh" #include "complex/operations.cuh" +#include "crypto/ggsw.cuh" #include "crypto/torus.cuh" #include "fft/bnsmfft.cuh" #include "fft/smfft.cuh" #include "fft/twiddles.cuh" +#include "keyswitch.cuh" #include "polynomial/functions.cuh" #include "polynomial/parameters.cuh" #include "polynomial/polynomial.cuh" #include "polynomial/polynomial_math.cuh" #include "utils/memory.cuh" #include "utils/timer.cuh" -#include "keyswitch.cuh" -#include "bootstrap_low_latency.cuh" -#include "crypto/ggsw.cuh" template -__device__ void fft(double2 *output, T *input){ - synchronize_threads_in_block(); +__device__ void fft(double2 *output, T *input) { + synchronize_threads_in_block(); - // Reduce the size of the FFT to be performed by storing - // the real-valued polynomial into a complex polynomial - real_to_complex_compressed(input, output); - synchronize_threads_in_block(); + // Reduce the size of the FFT to be performed by storing + // the real-valued polynomial into a complex polynomial + real_to_complex_compressed(input, output); + synchronize_threads_in_block(); - // Switch to the FFT space - NSMFFT_direct>(output); - synchronize_threads_in_block(); + // Switch to the FFT space + NSMFFT_direct>(output); + synchronize_threads_in_block(); - correction_direct_fft_inplace(output); - synchronize_threads_in_block(); + correction_direct_fft_inplace(output); + synchronize_threads_in_block(); } template -__device__ void fft(double2 *output, T *input){ - synchronize_threads_in_block(); +__device__ void fft(double2 *output, T *input) { + synchronize_threads_in_block(); - // Reduce the size of the FFT to be performed by storing - // the real-valued polynomial into a complex polynomial - real_to_complex_compressed(input, output); - synchronize_threads_in_block(); + // Reduce the size of the FFT to be performed by storing + // the real-valued polynomial into a complex polynomial + real_to_complex_compressed(input, output); + synchronize_threads_in_block(); - // Switch to the FFT space - NSMFFT_direct>(output); - synchronize_threads_in_block(); + // Switch to the FFT space + NSMFFT_direct>(output); + synchronize_threads_in_block(); - correction_direct_fft_inplace(output); - synchronize_threads_in_block(); + correction_direct_fft_inplace(output); + synchronize_threads_in_block(); } -template -__device__ void ifft_inplace(double2 *data){ - synchronize_threads_in_block(); +template __device__ void ifft_inplace(double2 *data) { + synchronize_threads_in_block(); - correction_inverse_fft_inplace(data); - synchronize_threads_in_block(); + correction_inverse_fft_inplace(data); + synchronize_threads_in_block(); - NSMFFT_inverse>(data); - synchronize_threads_in_block(); + NSMFFT_inverse>(data); + synchronize_threads_in_block(); } /* - * Receives an array of GLWE ciphertexts and two indexes to ciphertexts in this array, - * and an array of GGSW ciphertexts with a index to one ciphertext in it. Compute a CMUX with these - * operands and writes the output to a particular index of glwe_out. + * Receives an array of GLWE ciphertexts and two indexes to ciphertexts in this + * array, and an array of GGSW ciphertexts with a index to one ciphertext in it. + * Compute a CMUX with these operands and writes the output to a particular + * index of glwe_out. * * This function needs polynomial_size threads per block. * * - glwe_out: An array where the result should be written to. 
* - glwe_in: An array where the GLWE inputs are stored. * - ggsw_in: An array where the GGSW input is stored. In the fourier domain. - * - selected_memory: An array to be used for the accumulators. Can be in the shared memory or - * global memory. - * - output_idx: The index of the output where the glwe ciphertext should be written. + * - selected_memory: An array to be used for the accumulators. Can be in the + * shared memory or global memory. + * - output_idx: The index of the output where the glwe ciphertext should be + * written. * - input_idx1: The index of the first glwe ciphertext we will use. * - input_idx2: The index of the second glwe ciphertext we will use. * - glwe_dim: This is k. @@ -87,154 +88,146 @@ __device__ void ifft_inplace(double2 *data){ * - ggsw_idx: The index of the GGSW we will use. */ template -__device__ void cmux( - Torus *glwe_out, Torus* glwe_in, double2 *ggsw_in, char *selected_memory, - uint32_t output_idx, uint32_t input_idx1, uint32_t input_idx2, - uint32_t glwe_dim, uint32_t polynomial_size, uint32_t base_log, uint32_t l_gadget, - uint32_t ggsw_idx){ +__device__ void cmux(Torus *glwe_out, Torus *glwe_in, double2 *ggsw_in, + char *selected_memory, uint32_t output_idx, + uint32_t input_idx1, uint32_t input_idx2, + uint32_t glwe_dim, uint32_t polynomial_size, + uint32_t base_log, uint32_t l_gadget, uint32_t ggsw_idx) { - // Define glwe_sub - Torus *glwe_sub_mask = (Torus *) selected_memory; - Torus *glwe_sub_body = (Torus *) glwe_sub_mask + (ptrdiff_t)polynomial_size; + // Define glwe_sub + Torus *glwe_sub_mask = (Torus *)selected_memory; + Torus *glwe_sub_body = (Torus *)glwe_sub_mask + (ptrdiff_t)polynomial_size; - int16_t *glwe_mask_decomposed = (int16_t *)(glwe_sub_body + polynomial_size); - int16_t *glwe_body_decomposed = + int16_t *glwe_mask_decomposed = (int16_t *)(glwe_sub_body + polynomial_size); + int16_t *glwe_body_decomposed = (int16_t *)glwe_mask_decomposed + (ptrdiff_t)polynomial_size; - double2 *mask_res_fft = (double2 *)(glwe_body_decomposed + - polynomial_size); - double2 *body_res_fft = - (double2 *)mask_res_fft + (ptrdiff_t)polynomial_size / 2; + double2 *mask_res_fft = (double2 *)(glwe_body_decomposed + polynomial_size); + double2 *body_res_fft = + (double2 *)mask_res_fft + (ptrdiff_t)polynomial_size / 2; - double2 *glwe_fft = - (double2 *)body_res_fft + (ptrdiff_t)(polynomial_size / 2); + double2 *glwe_fft = + (double2 *)body_res_fft + (ptrdiff_t)(polynomial_size / 2); - GadgetMatrix gadget(base_log, l_gadget); + GadgetMatrix gadget(base_log, l_gadget); - ///////////////////////////////////// + ///////////////////////////////////// - // glwe2-glwe1 + // glwe2-glwe1 - // Copy m0 to shared memory to preserve data - auto m0_mask = &glwe_in[input_idx1 * (glwe_dim + 1) * polynomial_size]; - auto m0_body = m0_mask + polynomial_size; + // Copy m0 to shared memory to preserve data + auto m0_mask = &glwe_in[input_idx1 * (glwe_dim + 1) * polynomial_size]; + auto m0_body = m0_mask + polynomial_size; - // Just gets the pointer for m1 on global memory - auto m1_mask = &glwe_in[input_idx2 * (glwe_dim + 1) * polynomial_size]; - auto m1_body = m1_mask + polynomial_size; + // Just gets the pointer for m1 on global memory + auto m1_mask = &glwe_in[input_idx2 * (glwe_dim + 1) * polynomial_size]; + auto m1_body = m1_mask + polynomial_size; - // Mask - sub_polynomial( - glwe_sub_mask, m1_mask, m0_mask - ); - // Body - sub_polynomial( - glwe_sub_body, m1_body, m0_body - ); + // Mask + sub_polynomial(glwe_sub_mask, m1_mask, m0_mask); + // Body + 
sub_polynomial(glwe_sub_body, m1_body, m0_body); + + synchronize_threads_in_block(); + + // Initialize the polynomial multiplication via FFT arrays + // The polynomial multiplications happens at the block level + // and each thread handles two or more coefficients + int pos = threadIdx.x; + for (int j = 0; j < params::opt / 2; j++) { + mask_res_fft[pos].x = 0; + mask_res_fft[pos].y = 0; + body_res_fft[pos].x = 0; + body_res_fft[pos].y = 0; + pos += params::degree / params::opt; + } + + // Subtract each glwe operand, decompose the resulting + // polynomial coefficients to multiply each decomposed level + // with the corresponding part of the LUT + for (int decomp_level = 0; decomp_level < l_gadget; decomp_level++) { + + // Decomposition + gadget.decompose_one_level(glwe_mask_decomposed, glwe_sub_mask, + decomp_level); + gadget.decompose_one_level(glwe_body_decomposed, glwe_sub_body, + decomp_level); + + // First, perform the polynomial multiplication for the mask + synchronize_threads_in_block(); + fft(glwe_fft, glwe_mask_decomposed); + + // External product and accumulate + // Get the piece necessary for the multiplication + auto mask_fourier = + get_ith_mask_kth_block(ggsw_in, ggsw_idx, 0, decomp_level, + polynomial_size, glwe_dim, l_gadget); + auto body_fourier = + get_ith_body_kth_block(ggsw_in, ggsw_idx, 0, decomp_level, + polynomial_size, glwe_dim, l_gadget); synchronize_threads_in_block(); - // Initialize the polynomial multiplication via FFT arrays - // The polynomial multiplications happens at the block level - // and each thread handles two or more coefficients - int pos = threadIdx.x; - for (int j = 0; j < params::opt / 2; j++) { - mask_res_fft[pos].x = 0; - mask_res_fft[pos].y = 0; - body_res_fft[pos].x = 0; - body_res_fft[pos].y = 0; - pos += params::degree / params::opt; - } - - // Subtract each glwe operand, decompose the resulting - // polynomial coefficients to multiply each decomposed level - // with the corresponding part of the LUT - for (int decomp_level = 0; decomp_level < l_gadget; decomp_level++) { - - // Decomposition - gadget.decompose_one_level(glwe_mask_decomposed, - glwe_sub_mask, - decomp_level); - gadget.decompose_one_level(glwe_body_decomposed, - glwe_sub_body, - decomp_level); - - // First, perform the polynomial multiplication for the mask - synchronize_threads_in_block(); - fft(glwe_fft, glwe_mask_decomposed); - - // External product and accumulate - // Get the piece necessary for the multiplication - auto mask_fourier = get_ith_mask_kth_block( - ggsw_in, ggsw_idx, 0, decomp_level, - polynomial_size, glwe_dim, l_gadget); - auto body_fourier = get_ith_body_kth_block( - ggsw_in, ggsw_idx, 0, decomp_level, - polynomial_size, glwe_dim, l_gadget); - - synchronize_threads_in_block(); - - // Perform the coefficient-wise product - synchronize_threads_in_block(); - polynomial_product_accumulate_in_fourier_domain( - mask_res_fft, glwe_fft, mask_fourier); - polynomial_product_accumulate_in_fourier_domain( - body_res_fft, glwe_fft, body_fourier); - - // Now handle the polynomial multiplication for the body - // in the same way - synchronize_threads_in_block(); - fft(glwe_fft, glwe_body_decomposed); - - // External product and accumulate - // Get the piece necessary for the multiplication - mask_fourier = get_ith_mask_kth_block( - ggsw_in, ggsw_idx, 1, decomp_level, - polynomial_size, glwe_dim, l_gadget); - body_fourier = get_ith_body_kth_block( - ggsw_in, ggsw_idx, 1, decomp_level, - polynomial_size, glwe_dim, l_gadget); - - synchronize_threads_in_block(); - - 
polynomial_product_accumulate_in_fourier_domain( - mask_res_fft, glwe_fft, mask_fourier); - polynomial_product_accumulate_in_fourier_domain( - body_res_fft, glwe_fft, body_fourier); - - } - - // IFFT + // Perform the coefficient-wise product synchronize_threads_in_block(); - ifft_inplace(mask_res_fft); - ifft_inplace(body_res_fft); + polynomial_product_accumulate_in_fourier_domain( + mask_res_fft, glwe_fft, mask_fourier); + polynomial_product_accumulate_in_fourier_domain( + body_res_fft, glwe_fft, body_fourier); + + // Now handle the polynomial multiplication for the body + // in the same way + synchronize_threads_in_block(); + fft(glwe_fft, glwe_body_decomposed); + + // External product and accumulate + // Get the piece necessary for the multiplication + mask_fourier = get_ith_mask_kth_block(ggsw_in, ggsw_idx, 1, decomp_level, + polynomial_size, glwe_dim, l_gadget); + body_fourier = get_ith_body_kth_block(ggsw_in, ggsw_idx, 1, decomp_level, + polynomial_size, glwe_dim, l_gadget); + synchronize_threads_in_block(); - // Write the output - Torus *mb_mask = &glwe_out[output_idx * (glwe_dim + 1) * polynomial_size]; - Torus *mb_body = mb_mask + polynomial_size; + polynomial_product_accumulate_in_fourier_domain( + mask_res_fft, glwe_fft, mask_fourier); + polynomial_product_accumulate_in_fourier_domain( + body_res_fft, glwe_fft, body_fourier); + } - int tid = threadIdx.x; - for(int i = 0; i < params::opt; i++){ - mb_mask[tid] = m0_mask[tid]; - mb_body[tid] = m0_body[tid]; - tid += params::degree / params::opt; - } + // IFFT + synchronize_threads_in_block(); + ifft_inplace(mask_res_fft); + ifft_inplace(body_res_fft); + synchronize_threads_in_block(); - add_to_torus(mask_res_fft, mb_mask); - add_to_torus(body_res_fft, mb_body); + // Write the output + Torus *mb_mask = &glwe_out[output_idx * (glwe_dim + 1) * polynomial_size]; + Torus *mb_body = mb_mask + polynomial_size; + + int tid = threadIdx.x; + for (int i = 0; i < params::opt; i++) { + mb_mask[tid] = m0_mask[tid]; + mb_body[tid] = m0_body[tid]; + tid += params::degree / params::opt; + } + + add_to_torus(mask_res_fft, mb_mask); + add_to_torus(body_res_fft, mb_body); } /** - * Computes several CMUXes using an array of GLWE ciphertexts and a single GGSW ciphertext. - * The GLWE ciphertexts are picked two-by-two in sequence. Each thread block computes a single CMUX. + * Computes several CMUXes using an array of GLWE ciphertexts and a single GGSW + * ciphertext. The GLWE ciphertexts are picked two-by-two in sequence. Each + * thread block computes a single CMUX. * * - glwe_out: An array where the result should be written to. * - glwe_in: An array where the GLWE inputs are stored. * - ggsw_in: An array where the GGSW input is stored. In the fourier domain. - * - device_mem: An pointer for the global memory in case the shared memory is not big enough to - * store the accumulators. - * - device_memory_size_per_block: Memory size needed to store all accumulators for a single block. + * - device_mem: An pointer for the global memory in case the shared memory is + * not big enough to store the accumulators. + * - device_memory_size_per_block: Memory size needed to store all accumulators + * for a single block. * - glwe_dim: This is k. * - polynomial_size: size of the polynomials. This is N. * - base_log: log base used for the gadget matrix - B = 2^base_log (~8) @@ -242,34 +235,29 @@ __device__ void cmux( * - ggsw_idx: The index of the GGSW we will use. 
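 *
 * At the plaintext level, each CMUX in this batch computes m0 + b * (m1 - m0),
 * i.e. it selects m1 when the GGSW-encrypted bit b is 1 and m0 otherwise
 * (the kernel subtracts the two GLWEs, external-products with the GGSW, and
 * adds m0 back). A minimal host-side sketch of that identity, on clear
 * values; cmux_plain is an illustrative name, not a library function, and
 * torus arithmetic wraps mod 2^64, so the identity holds even when m1 - m0
 * underflows:
 *
 *   uint64_t cmux_plain(uint64_t b, uint64_t m0, uint64_t m1) {
 *     return m0 + b * (m1 - m0); // == b ? m1 : m0 for b in {0, 1}
 *   }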
*/ template -__global__ void device_batch_cmux( - Torus *glwe_out, Torus* glwe_in, double2 *ggsw_in, - char *device_mem, size_t device_memory_size_per_block, - uint32_t glwe_dim, uint32_t polynomial_size, uint32_t base_log, uint32_t l_gadget, - uint32_t ggsw_idx){ +__global__ void +device_batch_cmux(Torus *glwe_out, Torus *glwe_in, double2 *ggsw_in, + char *device_mem, size_t device_memory_size_per_block, + uint32_t glwe_dim, uint32_t polynomial_size, + uint32_t base_log, uint32_t l_gadget, uint32_t ggsw_idx) { - int cmux_idx = blockIdx.x; - int output_idx = cmux_idx; - int input_idx1 = (cmux_idx << 1); - int input_idx2 = (cmux_idx << 1) + 1; + int cmux_idx = blockIdx.x; + int output_idx = cmux_idx; + int input_idx1 = (cmux_idx << 1); + int input_idx2 = (cmux_idx << 1) + 1; - // We use shared memory for intermediate result - extern __shared__ char sharedmem[]; - char *selected_memory; + // We use shared memory for intermediate result + extern __shared__ char sharedmem[]; + char *selected_memory; - if constexpr (SMD == FULLSM) - selected_memory = sharedmem; - else - selected_memory = &device_mem[blockIdx.x * device_memory_size_per_block]; - - cmux( - glwe_out, glwe_in, ggsw_in, - selected_memory, - output_idx, input_idx1, input_idx2, - glwe_dim, polynomial_size, - base_log, l_gadget, - ggsw_idx); + if constexpr (SMD == FULLSM) + selected_memory = sharedmem; + else + selected_memory = &device_mem[blockIdx.x * device_memory_size_per_block]; + cmux(glwe_out, glwe_in, ggsw_in, selected_memory, + output_idx, input_idx1, input_idx2, glwe_dim, + polynomial_size, base_log, l_gadget, ggsw_idx); } /* * This kernel executes the CMUX tree used by the hybrid packing of the WoPBS. @@ -279,242 +267,222 @@ __global__ void device_batch_cmux( * - v_stream: The CUDA stream that should be used. * - glwe_out: A device array for the output GLWE ciphertext. * - ggsw_in: A device array for the GGSW ciphertexts used in each layer. - * - lut_vector: A device array for the GLWE ciphertexts used in the first layer. + * - lut_vector: A device array for the GLWE ciphertexts used in the first + * layer. * - polynomial_size: size of the polynomials. This is N. * - base_log: log base used for the gadget matrix - B = 2^base_log (~8) * - l_gadget: number of decomposition levels in the gadget matrix (~4) * - r: Number of layers in the tree. 
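 *
 * Layer i of the tree halves the number of GLWE ciphertexts, using GGSW bit i
 * to choose between each adjacent pair, so the ciphertext that survives after
 * r layers is lut_vector[b_0 + 2*b_1 + ... + 2^(r-1)*b_{r-1}]. A
 * plaintext-level sketch of that selection (illustrative helper, not part of
 * the library):
 *
 *   uint64_t cmux_tree_plain(std::vector<uint64_t> layer,
 *                            const std::vector<int> &bits) {
 *     for (int b : bits) { // one layer per GGSW bit, least significant first
 *       for (size_t i = 0; i < layer.size() / 2; i++)
 *         layer[i] = b ? layer[2 * i + 1] : layer[2 * i];
 *       layer.resize(layer.size() / 2);
 *     }
 *     return layer[0]; // e.g. {10, 11, 12, 13} with bits {0, 1} yields 12
 *   }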
*/ template -void host_cmux_tree( - void *v_stream, - Torus *glwe_out, - Torus *ggsw_in, - Torus *lut_vector, - uint32_t glwe_dimension, - uint32_t polynomial_size, - uint32_t base_log, - uint32_t l_gadget, - uint32_t r, - uint32_t max_shared_memory) { +void host_cmux_tree(void *v_stream, Torus *glwe_out, Torus *ggsw_in, + Torus *lut_vector, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t base_log, + uint32_t l_gadget, uint32_t r, uint32_t max_shared_memory) { - auto stream = static_cast(v_stream); - int num_lut = (1<(v_stream); + int num_lut = (1 << r); - cuda_initialize_twiddles(polynomial_size, 0); + cuda_initialize_twiddles(polynomial_size, 0); - int memory_needed_per_block = - sizeof(Torus) * polynomial_size + // glwe_sub_mask - sizeof(Torus) * polynomial_size + // glwe_sub_body - sizeof(int16_t) * polynomial_size + // glwe_mask_decomposed - sizeof(int16_t) * polynomial_size + // glwe_body_decomposed - sizeof(double2) * polynomial_size/2 + // mask_res_fft - sizeof(double2) * polynomial_size/2 + // body_res_fft - sizeof(double2) * polynomial_size/2; // glwe_fft + int memory_needed_per_block = + sizeof(Torus) * polynomial_size + // glwe_sub_mask + sizeof(Torus) * polynomial_size + // glwe_sub_body + sizeof(int16_t) * polynomial_size + // glwe_mask_decomposed + sizeof(int16_t) * polynomial_size + // glwe_body_decomposed + sizeof(double2) * polynomial_size / 2 + // mask_res_fft + sizeof(double2) * polynomial_size / 2 + // body_res_fft + sizeof(double2) * polynomial_size / 2; // glwe_fft - dim3 thds(polynomial_size / params::opt, 1, 1); + dim3 thds(polynomial_size / params::opt, 1, 1); - ////////////////////// -// std::cout << "Applying the FFT on m^tree" << std::endl; - double2 *d_ggsw_fft_in; - int ggsw_size = r * polynomial_size * (glwe_dimension + 1) * (glwe_dimension + 1) * l_gadget; + ////////////////////// + double2 *d_ggsw_fft_in; + int ggsw_size = r * polynomial_size * (glwe_dimension + 1) * + (glwe_dimension + 1) * l_gadget; - #if (CUDART_VERSION < 11020) - checkCudaErrors(cudaMalloc((void **)&d_ggsw_fft_in, ggsw_size * sizeof(double))); - #else - checkCudaErrors(cudaMallocAsync((void **)&d_ggsw_fft_in, ggsw_size * sizeof(double), *stream)); - #endif +#if (CUDART_VERSION < 11020) + checkCudaErrors( + cudaMalloc((void **)&d_ggsw_fft_in, ggsw_size * sizeof(double))); +#else + checkCudaErrors(cudaMallocAsync((void **)&d_ggsw_fft_in, + ggsw_size * sizeof(double), *stream)); +#endif - batch_fft_ggsw_vector( - v_stream, d_ggsw_fft_in, ggsw_in, r, glwe_dimension, polynomial_size, l_gadget); + batch_fft_ggsw_vector(v_stream, d_ggsw_fft_in, ggsw_in, + r, glwe_dimension, + polynomial_size, l_gadget); - ////////////////////// + ////////////////////// - // Allocate global memory in case parameters are too large - char *d_mem; - if (max_shared_memory < memory_needed_per_block) { - #if (CUDART_VERSION < 11020) - checkCudaErrors(cudaMalloc((void **) &d_mem, memory_needed_per_block * (1 << (r - 1)))); - #else - checkCudaErrors(cudaMallocAsync((void **) &d_mem, memory_needed_per_block * (1 << (r - 1)), *stream)); - #endif - }else{ - checkCudaErrors(cudaFuncSetAttribute( - device_batch_cmux, - cudaFuncAttributeMaxDynamicSharedMemorySize, - memory_needed_per_block)); - checkCudaErrors(cudaFuncSetCacheConfig( - device_batch_cmux, - cudaFuncCachePreferShared)); - } + // Allocate global memory in case parameters are too large + char *d_mem; + if (max_shared_memory < memory_needed_per_block) { +#if (CUDART_VERSION < 11020) + checkCudaErrors( + cudaMalloc((void **)&d_mem, 
memory_needed_per_block * (1 << (r - 1)))); +#else + checkCudaErrors(cudaMallocAsync( + (void **)&d_mem, memory_needed_per_block * (1 << (r - 1)), *stream)); +#endif + } else { + checkCudaErrors(cudaFuncSetAttribute( + device_batch_cmux, + cudaFuncAttributeMaxDynamicSharedMemorySize, memory_needed_per_block)); + checkCudaErrors( + cudaFuncSetCacheConfig(device_batch_cmux, + cudaFuncCachePreferShared)); + } - // Allocate buffers - int glwe_size = (glwe_dimension + 1) * polynomial_size; - Torus *d_buffer1, *d_buffer2; + // Allocate buffers + int glwe_size = (glwe_dimension + 1) * polynomial_size; + Torus *d_buffer1, *d_buffer2; - #if (CUDART_VERSION < 11020) - checkCudaErrors(cudaMalloc((void **)&d_buffer1, num_lut * glwe_size * sizeof(Torus))); - checkCudaErrors(cudaMalloc((void **)&d_buffer2, num_lut * glwe_size * sizeof(Torus))); - #else - checkCudaErrors(cudaMallocAsync((void **)&d_buffer1, num_lut * glwe_size * sizeof(Torus), *stream)); - checkCudaErrors(cudaMallocAsync((void **)&d_buffer2, num_lut * glwe_size * sizeof(Torus), *stream)); - #endif - checkCudaErrors(cudaMemcpyAsync( - d_buffer1, lut_vector, - num_lut * glwe_size * sizeof(Torus), - cudaMemcpyDeviceToDevice, *stream)); +#if (CUDART_VERSION < 11020) + checkCudaErrors( + cudaMalloc((void **)&d_buffer1, num_lut * glwe_size * sizeof(Torus))); + checkCudaErrors( + cudaMalloc((void **)&d_buffer2, num_lut * glwe_size * sizeof(Torus))); +#else + checkCudaErrors(cudaMallocAsync( + (void **)&d_buffer1, num_lut * glwe_size * sizeof(Torus), *stream)); + checkCudaErrors(cudaMallocAsync( + (void **)&d_buffer2, num_lut * glwe_size * sizeof(Torus), *stream)); +#endif + checkCudaErrors(cudaMemcpyAsync(d_buffer1, lut_vector, + num_lut * glwe_size * sizeof(Torus), + cudaMemcpyDeviceToDevice, *stream)); - Torus *output; - // Run the cmux tree - for(int layer_idx = 0; layer_idx < r; layer_idx++){ - output = (layer_idx % 2? d_buffer1 : d_buffer2); - Torus *input = (layer_idx % 2? d_buffer2 : d_buffer1); + Torus *output; + // Run the cmux tree + for (int layer_idx = 0; layer_idx < r; layer_idx++) { + output = (layer_idx % 2 ? d_buffer1 : d_buffer2); + Torus *input = (layer_idx % 2 ? 
d_buffer2 : d_buffer1); - int num_cmuxes = (1<<(r-1-layer_idx)); - dim3 grid(num_cmuxes, 1, 1); + int num_cmuxes = (1 << (r - 1 - layer_idx)); + dim3 grid(num_cmuxes, 1, 1); - // walks horizontally through the leafs - if(max_shared_memory < memory_needed_per_block) - device_batch_cmux - <<>>( - output, input, d_ggsw_fft_in, - d_mem, memory_needed_per_block, - glwe_dimension, // k - polynomial_size, base_log, l_gadget, - layer_idx // r - ); - else - device_batch_cmux - <<>>( - output, input, d_ggsw_fft_in, - d_mem, memory_needed_per_block, - glwe_dimension, // k - polynomial_size, base_log, l_gadget, - layer_idx // r - ); + // walks horizontally through the leafs + if (max_shared_memory < memory_needed_per_block) + device_batch_cmux + <<>>( + output, input, d_ggsw_fft_in, d_mem, memory_needed_per_block, + glwe_dimension, // k + polynomial_size, base_log, l_gadget, + layer_idx // r + ); + else + device_batch_cmux + <<>>( + output, input, d_ggsw_fft_in, d_mem, memory_needed_per_block, + glwe_dimension, // k + polynomial_size, base_log, l_gadget, + layer_idx // r + ); + } - } + checkCudaErrors(cudaMemcpyAsync( + glwe_out, output, (glwe_dimension + 1) * polynomial_size * sizeof(Torus), + cudaMemcpyDeviceToDevice, *stream)); - checkCudaErrors(cudaMemcpyAsync( - glwe_out, output, - (glwe_dimension+1) * polynomial_size * sizeof(Torus), - cudaMemcpyDeviceToDevice, *stream)); - - // We only need synchronization to assert that data is in glwe_out before - // returning. Memory release can be added to the stream and processed - // later. - checkCudaErrors(cudaStreamSynchronize(*stream)); - - // Free memory - #if (CUDART_VERSION < 11020) - checkCudaErrors(cudaFree(d_ggsw_fft_in)); - checkCudaErrors(cudaFree(d_buffer1)); - checkCudaErrors(cudaFree(d_buffer2)); - if(max_shared_memory < memory_needed_per_block) - checkCudaErrors(cudaFree(d_mem)); - #else - checkCudaErrors(cudaFreeAsync(d_ggsw_fft_in, *stream)); - checkCudaErrors(cudaFreeAsync(d_buffer1, *stream)); - checkCudaErrors(cudaFreeAsync(d_buffer2, *stream)); - if(max_shared_memory < memory_needed_per_block) - checkCudaErrors(cudaFreeAsync(d_mem, *stream)); - #endif + // We only need synchronization to assert that data is in glwe_out before + // returning. Memory release can be added to the stream and processed + // later. 
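+  // For reference, the shared-memory budget chosen above, assuming the
+  // 64-bit instantiation (Torus = uint64_t, double2 = 16 bytes):
+  //   memory_needed_per_block = 2*8*N + 2*2*N + 3*16*(N/2) = 44*N bytes,
+  // so N = 1024 needs 44 KiB and fits the 48 KiB default dynamic limit,
+  // N = 2048 needs 88 KiB and relies on the
+  // cudaFuncAttributeMaxDynamicSharedMemorySize opt-in, and N >= 4096
+  // exceeds the opt-in limit of current GPUs (e.g. ~163 KiB on A100), so
+  // those sizes take the global-memory d_mem path.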
+ checkCudaErrors(cudaStreamSynchronize(*stream)); +// Free memory +#if (CUDART_VERSION < 11020) + checkCudaErrors(cudaFree(d_ggsw_fft_in)); + checkCudaErrors(cudaFree(d_buffer1)); + checkCudaErrors(cudaFree(d_buffer2)); + if (max_shared_memory < memory_needed_per_block) + checkCudaErrors(cudaFree(d_mem)); +#else + checkCudaErrors(cudaFreeAsync(d_ggsw_fft_in, *stream)); + checkCudaErrors(cudaFreeAsync(d_buffer1, *stream)); + checkCudaErrors(cudaFreeAsync(d_buffer2, *stream)); + if (max_shared_memory < memory_needed_per_block) + checkCudaErrors(cudaFreeAsync(d_mem, *stream)); +#endif } - - // only works for big lwe for ks+bs case // state_lwe_buffer is copied from big lwe input // shifted_lwe_buffer is scalar multiplication of lwe input // blockIdx.x refers to input ciphertext id template __global__ void copy_and_shift_lwe(Torus *dst_copy, Torus *dst_shift, - Torus *src, Torus value) -{ - int blockId = blockIdx.x; - int tid = threadIdx.x; - auto cur_dst_copy = &dst_copy[blockId * (params::degree + 1)]; - auto cur_dst_shift = &dst_shift[blockId * (params::degree + 1)]; - auto cur_src = &src[blockId * (params::degree + 1)]; - + Torus *src, Torus value) { + int blockId = blockIdx.x; + int tid = threadIdx.x; + auto cur_dst_copy = &dst_copy[blockId * (params::degree + 1)]; + auto cur_dst_shift = &dst_shift[blockId * (params::degree + 1)]; + auto cur_src = &src[blockId * (params::degree + 1)]; + #pragma unroll - for (int i = 0; i < params::opt; i++) { - cur_dst_copy[tid] = cur_src[tid]; - cur_dst_shift[tid] = cur_src[tid] * value; - tid += params::degree / params::opt; - } + for (int i = 0; i < params::opt; i++) { + cur_dst_copy[tid] = cur_src[tid]; + cur_dst_shift[tid] = cur_src[tid] * value; + tid += params::degree / params::opt; + } - if (threadIdx.x == params::degree / params::opt - 1) { - cur_dst_copy[params::degree] = cur_src[params::degree]; - cur_dst_shift[params::degree] = cur_src[params::degree] * value; - } + if (threadIdx.x == params::degree / params::opt - 1) { + cur_dst_copy[params::degree] = cur_src[params::degree]; + cur_dst_shift[params::degree] = cur_src[params::degree] * value; + } } - // only works for small lwe in ks+bs case // function copies lwe when length is not a power of two template -__global__ void copy_small_lwe(Torus *dst, Torus *src, uint32_t small_lwe_size, uint32_t number_of_bits, - uint32_t lwe_id) -{ +__global__ void copy_small_lwe(Torus *dst, Torus *src, uint32_t small_lwe_size, + uint32_t number_of_bits, uint32_t lwe_id) { + size_t blockId = blockIdx.x; + size_t threads_per_block = blockDim.x; + size_t opt = small_lwe_size / threads_per_block; + size_t rem = small_lwe_size & (threads_per_block - 1); + auto cur_lwe_list = &dst[blockId * small_lwe_size * number_of_bits]; + auto cur_dst = &cur_lwe_list[lwe_id * small_lwe_size]; + auto cur_src = &src[blockId * small_lwe_size]; - size_t blockId = blockIdx.x; - size_t threads_per_block = blockDim.x; - size_t opt = small_lwe_size / threads_per_block; - size_t rem = small_lwe_size & (threads_per_block - 1); - - auto cur_lwe_list = &dst[blockId * small_lwe_size * number_of_bits]; - auto cur_dst = &cur_lwe_list[lwe_id * small_lwe_size]; - auto cur_src = &src[blockId * small_lwe_size]; - - size_t tid = threadIdx.x; - for (int i = 0; i < opt; i++) { - cur_dst[tid] = cur_src[tid]; - tid += threads_per_block; - } - - if (threadIdx.x < rem) - cur_dst[tid] = cur_src[tid]; - + size_t tid = threadIdx.x; + for (int i = 0; i < opt; i++) { + cur_dst[tid] = cur_src[tid]; + tid += threads_per_block; + } + if (threadIdx.x < rem) + 
cur_dst[tid] = cur_src[tid]; } - // only used in extract bits for one ciphertext // should be called with one block and one thread // NOTE: check if putting this functionality in copy_small_lwe or // fill_pbs_lut vector is faster template -__global__ void add_to_body(Torus *lwe, size_t lwe_dimension, - Torus value) { - lwe[blockIdx.x * (lwe_dimension + 1) + lwe_dimension] += value; - +__global__ void add_to_body(Torus *lwe, size_t lwe_dimension, Torus value) { + lwe[blockIdx.x * (lwe_dimension + 1) + lwe_dimension] += value; } - - // Fill lut(only body) for the current bit (equivalent to trivial encryption as // mask is 0s) -// The LUT is filled with -alpha in each coefficient where alpha = delta*2^{bit_idx-1} +// The LUT is filled with -alpha in each coefficient where alpha = +// delta*2^{bit_idx-1} template -__global__ void fill_lut_body_for_current_bit(Torus *lut, Torus value) -{ - Torus *cur_poly = &lut[params::degree]; - size_t tid = threadIdx.x; +__global__ void fill_lut_body_for_current_bit(Torus *lut, Torus value) { + Torus *cur_poly = &lut[params::degree]; + size_t tid = threadIdx.x; #pragma unroll - for (int i = 0; i < params::opt; i++) { - cur_poly[tid] = value; - tid += params::degree / params::opt; - } + for (int i = 0; i < params::opt; i++) { + cur_poly[tid] = value; + tid += params::degree / params::opt; + } } - - -// Add alpha where alpha = delta*2^{bit_idx-1} to end up with an encryption of 0 if the -// extracted bit was 0 and 1 in the other case +// Add alpha where alpha = delta*2^{bit_idx-1} to end up with an encryption of 0 +// if the extracted bit was 0 and 1 in the other case // // Remove the extracted bit from the state LWE to get a 0 at the extracted bit // location. @@ -525,8 +493,7 @@ __global__ void fill_lut_body_for_current_bit(Torus *lut, Torus value) template __global__ void add_sub_and_mul_lwe(Torus *shifted_lwe, Torus *state_lwe, Torus *pbs_lwe_out, Torus add_value, - Torus mul_value) -{ + Torus mul_value) { size_t tid = threadIdx.x; size_t blockId = blockIdx.x; auto cur_shifted_lwe = &shifted_lwe[blockId * (params::degree + 1)]; @@ -546,81 +513,57 @@ __global__ void add_sub_and_mul_lwe(Torus *shifted_lwe, Torus *state_lwe, } } - template __host__ void host_extract_bits( - void *v_stream, - Torus *list_lwe_out, - Torus *lwe_in, - Torus *lwe_in_buffer, - Torus *lwe_in_shifted_buffer, - Torus *lwe_out_ks_buffer, - Torus *lwe_out_pbs_buffer, - Torus *lut_pbs, - uint32_t *lut_vector_indexes, - Torus *ksk, - double2 *fourier_bsk, - uint32_t number_of_bits, - uint32_t delta_log, - uint32_t lwe_dimension_before, - uint32_t lwe_dimension_after, - uint32_t base_log_bsk, - uint32_t l_gadget_bsk, - uint32_t base_log_ksk, - uint32_t l_gadget_ksk, - uint32_t number_of_samples) -{ - auto stream = static_cast(v_stream); - uint32_t ciphertext_n_bits = sizeof(Torus) * 8; + void *v_stream, Torus *list_lwe_out, Torus *lwe_in, Torus *lwe_in_buffer, + Torus *lwe_in_shifted_buffer, Torus *lwe_out_ks_buffer, + Torus *lwe_out_pbs_buffer, Torus *lut_pbs, uint32_t *lut_vector_indexes, + Torus *ksk, double2 *fourier_bsk, uint32_t number_of_bits, + uint32_t delta_log, uint32_t lwe_dimension_before, + uint32_t lwe_dimension_after, uint32_t base_log_bsk, uint32_t l_gadget_bsk, + uint32_t base_log_ksk, uint32_t l_gadget_ksk, uint32_t number_of_samples) { - int blocks = 1; - int threads = params::degree / params::opt; + auto stream = static_cast(v_stream); + uint32_t ciphertext_n_bits = sizeof(Torus) * 8; - copy_and_shift_lwe<<>> - (lwe_in_buffer, lwe_in_shifted_buffer, lwe_in, - 1ll << 
(ciphertext_n_bits - delta_log - 1)); + int blocks = 1; + int threads = params::degree / params::opt; - for (int bit_idx = 0; bit_idx < number_of_bits; bit_idx++) { - cuda_keyswitch_lwe_ciphertext_vector(v_stream, lwe_out_ks_buffer, - lwe_in_shifted_buffer, ksk, - lwe_dimension_before, - lwe_dimension_after, base_log_ksk, - l_gadget_ksk, 1); + copy_and_shift_lwe<<>>( + lwe_in_buffer, lwe_in_shifted_buffer, lwe_in, + 1ll << (ciphertext_n_bits - delta_log - 1)); - copy_small_lwe<<<1, 256, 0, *stream>>>(list_lwe_out, - lwe_out_ks_buffer, - lwe_dimension_after + 1, - number_of_bits, - number_of_bits - bit_idx - 1); + for (int bit_idx = 0; bit_idx < number_of_bits; bit_idx++) { + cuda_keyswitch_lwe_ciphertext_vector( + v_stream, lwe_out_ks_buffer, lwe_in_shifted_buffer, ksk, + lwe_dimension_before, lwe_dimension_after, base_log_ksk, l_gadget_ksk, + 1); - if (bit_idx == number_of_bits - 1) { - break; - } + copy_small_lwe<<<1, 256, 0, *stream>>>( + list_lwe_out, lwe_out_ks_buffer, lwe_dimension_after + 1, + number_of_bits, number_of_bits - bit_idx - 1); - add_to_body<<<1, 1, 0, *stream>>>(lwe_out_ks_buffer, - lwe_dimension_after, - 1ll << (ciphertext_n_bits - 2)); - - - fill_lut_body_for_current_bit - <<>> (lut_pbs, 0ll - 1ll << ( - delta_log - 1 + - bit_idx)); - - host_bootstrap_low_latency(v_stream, lwe_out_pbs_buffer, - lut_pbs, lut_vector_indexes, - lwe_out_ks_buffer, fourier_bsk, - lwe_dimension_after, lwe_dimension_before, - base_log_bsk, l_gadget_bsk, number_of_samples, - 1); - - add_sub_and_mul_lwe<<<1, threads, 0, *stream>>>( - lwe_in_shifted_buffer, lwe_in_buffer, lwe_out_pbs_buffer, - 1ll << (delta_log - 1 + bit_idx), - 1ll << (ciphertext_n_bits - delta_log - bit_idx - 2) ); + if (bit_idx == number_of_bits - 1) { + break; } + add_to_body<<<1, 1, 0, *stream>>>( + lwe_out_ks_buffer, lwe_dimension_after, 1ll << (ciphertext_n_bits - 2)); + + fill_lut_body_for_current_bit + <<>>( + lut_pbs, 0ll - 1ll << (delta_log - 1 + bit_idx)); + + host_bootstrap_low_latency( + v_stream, lwe_out_pbs_buffer, lut_pbs, lut_vector_indexes, + lwe_out_ks_buffer, fourier_bsk, lwe_dimension_after, + lwe_dimension_before, base_log_bsk, l_gadget_bsk, number_of_samples, 1); + + add_sub_and_mul_lwe<<<1, threads, 0, *stream>>>( + lwe_in_shifted_buffer, lwe_in_buffer, lwe_out_pbs_buffer, + 1ll << (delta_log - 1 + bit_idx), + 1ll << (ciphertext_n_bits - delta_log - bit_idx - 2)); + } } - -#endif //WO_PBS_H +#endif // WO_PBS_H diff --git a/src/crypto/bootstrapping_key.cuh b/src/crypto/bootstrapping_key.cuh index 528fa6796..52545e05a 100644 --- a/src/crypto/bootstrapping_key.cuh +++ b/src/crypto/bootstrapping_key.cuh @@ -10,26 +10,29 @@ __device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size, int glwe_dimension, uint32_t l_gadget) { - return i * polynomial_size / 2 * (glwe_dimension + 1) * (glwe_dimension + 1) * l_gadget; + return i * polynomial_size / 2 * (glwe_dimension + 1) * (glwe_dimension + 1) * + l_gadget; } template -__device__ T* -get_ith_mask_kth_block(T* ptr, int i, int k, int level, uint32_t polynomial_size, - int glwe_dimension, uint32_t l_gadget) { +__device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level, + uint32_t polynomial_size, + int glwe_dimension, uint32_t l_gadget) { return &ptr[get_start_ith_ggsw(i, polynomial_size, glwe_dimension, l_gadget) + - level * polynomial_size / 2 * (glwe_dimension + 1) * (glwe_dimension + 1) + - k * polynomial_size / 2 * (glwe_dimension + 1)]; + level * polynomial_size / 2 * (glwe_dimension + 1) * + (glwe_dimension + 1) + + k * 
polynomial_size / 2 * (glwe_dimension + 1)]; } template -__device__ T* -get_ith_body_kth_block(T *ptr, int i, int k, int level, uint32_t polynomial_size, - int glwe_dimension, uint32_t l_gadget) { - return &ptr[get_start_ith_ggsw(i, polynomial_size, glwe_dimension, l_gadget) + - level * polynomial_size / 2 * (glwe_dimension + 1) * (glwe_dimension + 1) + - k * polynomial_size / 2 * (glwe_dimension + 1) + - polynomial_size / 2]; +__device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level, + uint32_t polynomial_size, + int glwe_dimension, uint32_t l_gadget) { + return &ptr[get_start_ith_ggsw(i, polynomial_size, glwe_dimension, l_gadget) + + level * polynomial_size / 2 * (glwe_dimension + 1) * + (glwe_dimension + 1) + + k * polynomial_size / 2 * (glwe_dimension + 1) + + polynomial_size / 2]; } void cuda_initialize_twiddles(uint32_t polynomial_size, uint32_t gpu_index) { @@ -65,21 +68,21 @@ void cuda_initialize_twiddles(uint32_t polynomial_size, uint32_t gpu_index) { template void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream, - uint32_t gpu_index, uint32_t input_lwe_dim, uint32_t glwe_dim, - uint32_t l_gadget, uint32_t polynomial_size) { + uint32_t gpu_index, uint32_t input_lwe_dim, + uint32_t glwe_dim, uint32_t l_gadget, + uint32_t polynomial_size) { cudaSetDevice(gpu_index); int shared_memory_size = sizeof(double) * polynomial_size; int total_polynomials = - input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * - l_gadget; + input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * l_gadget; // Here the buffer size is the size of double2 times the number of polynomials // times the polynomial size over 2 because the polynomials are compressed // into the complex domain to perform the FFT - size_t buffer_size = total_polynomials * polynomial_size / 2 * sizeof - (double2); + size_t buffer_size = + total_polynomials * polynomial_size / 2 * sizeof(double2); int gridSize = total_polynomials; int blockSize = polynomial_size / choose_opt(polynomial_size); @@ -110,23 +113,23 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream, switch (polynomial_size) { case 512: batch_NSMFFT, ForwardFFT>> - <<>>(d_bsk, dest); + <<>>(d_bsk, dest); break; case 1024: batch_NSMFFT, ForwardFFT>> - <<>>(d_bsk, dest); + <<>>(d_bsk, dest); break; case 2048: batch_NSMFFT, ForwardFFT>> - <<>>(d_bsk, dest); + <<>>(d_bsk, dest); break; case 4096: batch_NSMFFT, ForwardFFT>> - <<>>(d_bsk, dest); + <<>>(d_bsk, dest); break; case 8192: batch_NSMFFT, ForwardFFT>> - <<>>(d_bsk, dest); + <<>>(d_bsk, dest); break; default: break; @@ -134,44 +137,58 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream, cudaFree(d_bsk); free(h_bsk); - } void cuda_convert_lwe_bootstrap_key_32(void *dest, void *src, void *v_stream, - uint32_t gpu_index, uint32_t input_lwe_dim, uint32_t glwe_dim, - uint32_t l_gadget, uint32_t polynomial_size) { - cuda_convert_lwe_bootstrap_key((double2 *)dest, (int32_t *)src, - v_stream, gpu_index, input_lwe_dim, - glwe_dim, l_gadget, polynomial_size); + uint32_t gpu_index, + uint32_t input_lwe_dim, + uint32_t glwe_dim, uint32_t l_gadget, + uint32_t polynomial_size) { + cuda_convert_lwe_bootstrap_key( + (double2 *)dest, (int32_t *)src, v_stream, gpu_index, input_lwe_dim, + glwe_dim, l_gadget, polynomial_size); } void cuda_convert_lwe_bootstrap_key_64(void *dest, void *src, void *v_stream, - uint32_t gpu_index, uint32_t input_lwe_dim, uint32_t glwe_dim, - uint32_t l_gadget, uint32_t polynomial_size) { - cuda_convert_lwe_bootstrap_key((double2 *)dest, 
(int64_t *)src, - v_stream, gpu_index, input_lwe_dim, - glwe_dim, l_gadget, polynomial_size); + uint32_t gpu_index, + uint32_t input_lwe_dim, + uint32_t glwe_dim, uint32_t l_gadget, + uint32_t polynomial_size) { + cuda_convert_lwe_bootstrap_key( + (double2 *)dest, (int64_t *)src, v_stream, gpu_index, input_lwe_dim, + glwe_dim, l_gadget, polynomial_size); } - // We need these lines so the compiler knows how to specialize these functions -template __device__ uint64_t* -get_ith_mask_kth_block(uint64_t* ptr, int i, int k, int level, uint32_t polynomial_size, - int glwe_dimension, uint32_t l_gadget); -template __device__ uint32_t* -get_ith_mask_kth_block(uint32_t* ptr, int i, int k, int level, uint32_t polynomial_size, - int glwe_dimension, uint32_t l_gadget); -template __device__ double2* -get_ith_mask_kth_block(double2* ptr, int i, int k, int level, uint32_t polynomial_size, - int glwe_dimension, uint32_t l_gadget); -template __device__ uint64_t* -get_ith_body_kth_block(uint64_t *ptr, int i, int k, int level, uint32_t polynomial_size, - int glwe_dimension, uint32_t l_gadget); -template __device__ uint32_t* -get_ith_body_kth_block(uint32_t *ptr, int i, int k, int level, uint32_t polynomial_size, - int glwe_dimension, uint32_t l_gadget); -template __device__ double2* -get_ith_body_kth_block(double2 *ptr, int i, int k, int level, uint32_t polynomial_size, - int glwe_dimension, uint32_t l_gadget); +template __device__ uint64_t *get_ith_mask_kth_block(uint64_t *ptr, int i, + int k, int level, + uint32_t polynomial_size, + int glwe_dimension, + uint32_t l_gadget); +template __device__ uint32_t *get_ith_mask_kth_block(uint32_t *ptr, int i, + int k, int level, + uint32_t polynomial_size, + int glwe_dimension, + uint32_t l_gadget); +template __device__ double2 *get_ith_mask_kth_block(double2 *ptr, int i, int k, + int level, + uint32_t polynomial_size, + int glwe_dimension, + uint32_t l_gadget); +template __device__ uint64_t *get_ith_body_kth_block(uint64_t *ptr, int i, + int k, int level, + uint32_t polynomial_size, + int glwe_dimension, + uint32_t l_gadget); +template __device__ uint32_t *get_ith_body_kth_block(uint32_t *ptr, int i, + int k, int level, + uint32_t polynomial_size, + int glwe_dimension, + uint32_t l_gadget); +template __device__ double2 *get_ith_body_kth_block(double2 *ptr, int i, int k, + int level, + uint32_t polynomial_size, + int glwe_dimension, + uint32_t l_gadget); #endif // CNCRT_BSK_H diff --git a/src/crypto/ggsw.cuh b/src/crypto/ggsw.cuh index ca7305bfc..a5b5ec567 100644 --- a/src/crypto/ggsw.cuh +++ b/src/crypto/ggsw.cuh @@ -2,51 +2,49 @@ #define CONCRETE_CORE_GGSW_CUH template -__global__ void batch_fft_ggsw_vectors(double2 *dest, T *src){ +__global__ void batch_fft_ggsw_vectors(double2 *dest, T *src) { - extern __shared__ char sharedmem[]; + extern __shared__ char sharedmem[]; - double2 *shared_output = (double2*) sharedmem; + double2 *shared_output = (double2 *)sharedmem; - // Compression - int offset = blockIdx.x * blockDim.x; - int tid = threadIdx.x; - #pragma unroll - for (int i = 0; i < params::opt >> 1; i++) { - ST x = src[(2 * tid) + params::opt * offset]; - ST y = src[(2 * tid + 1) + params::opt * offset]; - shared_output[tid].x = x / (double)std::numeric_limits::max(); - shared_output[tid].y = y / (double)std::numeric_limits::max(); - tid += params::degree / params::opt; - } - synchronize_threads_in_block(); + // Compression + int offset = blockIdx.x * blockDim.x; + int tid = threadIdx.x; + int log_2_opt = params::opt >> 1; +#pragma unroll + for (int i = 0; i < 
log_2_opt; i++) { + ST x = src[(2 * tid) + params::opt * offset]; + ST y = src[(2 * tid + 1) + params::opt * offset]; + shared_output[tid].x = x / (double)std::numeric_limits::max(); + shared_output[tid].y = y / (double)std::numeric_limits::max(); + tid += params::degree / params::opt; + } + synchronize_threads_in_block(); - // Switch to the FFT space - NSMFFT_direct>(shared_output); - synchronize_threads_in_block(); + // Switch to the FFT space + NSMFFT_direct>(shared_output); + synchronize_threads_in_block(); - correction_direct_fft_inplace(shared_output); - synchronize_threads_in_block(); + correction_direct_fft_inplace(shared_output); + synchronize_threads_in_block(); - // Write the output to global memory - tid = threadIdx.x; - for (int j = 0; j < params::opt >> 1; j++) { - dest[tid + (params::opt >> 1) * offset] = shared_output[tid]; - tid += params::degree / params::opt; - } + // Write the output to global memory + tid = threadIdx.x; + for (int j = 0; j < log_2_opt; j++) { + dest[tid + (params::opt >> 1) * offset] = shared_output[tid]; + tid += params::degree / params::opt; + } } /** - * Applies the FFT transform on sequence of GGSW ciphertexts already in the global memory + * Applies the FFT transform on sequence of GGSW ciphertexts already in the + * global memory */ template -void batch_fft_ggsw_vector( - void *v_stream, - double2 *dest, T *src, - uint32_t r, - uint32_t glwe_dim, - uint32_t polynomial_size, - uint32_t l_gadget) { +void batch_fft_ggsw_vector(void *v_stream, double2 *dest, T *src, uint32_t r, + uint32_t glwe_dim, uint32_t polynomial_size, + uint32_t l_gadget) { auto stream = static_cast(v_stream); @@ -56,11 +54,9 @@ void batch_fft_ggsw_vector( int gridSize = total_polynomials; int blockSize = polynomial_size / params::opt; - batch_fft_ggsw_vectors<<>>(dest, - src); + batch_fft_ggsw_vectors + <<>>(dest, src); checkCudaErrors(cudaGetLastError()); - } - -#endif //CONCRETE_CORE_GGSW_CUH +#endif // CONCRETE_CORE_GGSW_CUH diff --git a/src/device.cu b/src/device.cu index d8a1f7b1b..c3f8dd11b 100644 --- a/src/device.cu +++ b/src/device.cu @@ -62,7 +62,7 @@ int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size, // error code: zero copy size return -3; } - + if (gpu_index >= cuda_get_number_of_gpus()) { // error code: invalid gpu_index return -2; @@ -75,8 +75,8 @@ int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size, } auto stream = static_cast(v_stream); cudaSetDevice(gpu_index); - checkCudaErrors(cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice, - *stream)); + checkCudaErrors( + cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice, *stream)); return 0; } @@ -117,8 +117,8 @@ int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size, } auto stream = static_cast(v_stream); cudaSetDevice(gpu_index); - checkCudaErrors(cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, - *stream)); + checkCudaErrors( + cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, *stream)); return 0; } diff --git a/src/keyswitch.cu b/src/keyswitch.cu index d713aa839..e7b1179aa 100644 --- a/src/keyswitch.cu +++ b/src/keyswitch.cu @@ -14,18 +14,15 @@ * This function calls a wrapper to a device kernel that performs the keyswitch * - num_samples blocks of threads are launched */ -void cuda_keyswitch_lwe_ciphertext_vector_32(void *v_stream, void *lwe_out, void *lwe_in, - void *ksk, - uint32_t lwe_dimension_before, - uint32_t lwe_dimension_after, - uint32_t base_log, uint32_t l_gadget, - uint32_t num_samples) { - cuda_keyswitch_lwe_ciphertext_vector( - 
v_stream, static_cast<uint32_t *>(lwe_out), static_cast<uint32_t *>(lwe_in),
-      static_cast<uint32_t *>(ksk),
-      lwe_dimension_before, lwe_dimension_after,
-      base_log, l_gadget,
-      num_samples);
+void cuda_keyswitch_lwe_ciphertext_vector_32(
+    void *v_stream, void *lwe_out, void *lwe_in, void *ksk,
+    uint32_t lwe_dimension_before, uint32_t lwe_dimension_after,
+    uint32_t base_log, uint32_t l_gadget, uint32_t num_samples) {
+  cuda_keyswitch_lwe_ciphertext_vector<uint32_t>(
+      v_stream, static_cast<uint32_t *>(lwe_out),
+      static_cast<uint32_t *>(lwe_in), static_cast<uint32_t *>(ksk),
+      lwe_dimension_before, lwe_dimension_after, base_log, l_gadget,
+      num_samples);
 }

/* Perform keyswitch on a batch of input LWE ciphertexts for 64 bits
@@ -38,18 +35,13 @@
 * This function calls a wrapper to a device kernel that performs the keyswitch
 * - num_samples blocks of threads are launched
 */
-void cuda_keyswitch_lwe_ciphertext_vector_64(void *v_stream, void *lwe_out, void *lwe_in,
-                                             void *ksk,
-                                             uint32_t lwe_dimension_before,
-                                             uint32_t lwe_dimension_after,
-                                             uint32_t base_log, uint32_t l_gadget,
-                                             uint32_t num_samples) {
-  cuda_keyswitch_lwe_ciphertext_vector<uint64_t>(
-      v_stream, static_cast<uint64_t *>(lwe_out), static_cast<uint64_t *>(lwe_in),
-      static_cast<uint64_t *>(ksk),
-      lwe_dimension_before, lwe_dimension_after,
-      base_log, l_gadget,
-      num_samples);
+void cuda_keyswitch_lwe_ciphertext_vector_64(
+    void *v_stream, void *lwe_out, void *lwe_in, void *ksk,
+    uint32_t lwe_dimension_before, uint32_t lwe_dimension_after,
+    uint32_t base_log, uint32_t l_gadget, uint32_t num_samples) {
+  cuda_keyswitch_lwe_ciphertext_vector<uint64_t>(
+      v_stream, static_cast<uint64_t *>(lwe_out),
+      static_cast<uint64_t *>(lwe_in), static_cast<uint64_t *>(ksk),
+      lwe_dimension_before, lwe_dimension_after, base_log, l_gadget,
+      num_samples);
 }
-
-
diff --git a/src/keyswitch.cuh b/src/keyswitch.cuh
index d3d45d60b..2524bccce 100644
--- a/src/keyswitch.cuh
+++ b/src/keyswitch.cuh
@@ -9,24 +9,23 @@
 template <typename Torus>
 __device__ Torus *get_ith_block(Torus *ksk, int i, int level,
-                                uint32_t lwe_dimension_after,
-                                uint32_t l_gadget) {
-  int pos = i * l_gadget * (lwe_dimension_after + 1) +
-            level * (lwe_dimension_after + 1);
-  Torus *ptr = &ksk[pos];
-  return ptr;
+                                uint32_t lwe_dimension_after,
+                                uint32_t l_gadget) {
+  int pos = i * l_gadget * (lwe_dimension_after + 1) +
+            level * (lwe_dimension_after + 1);
+  Torus *ptr = &ksk[pos];
+  return ptr;
 }

 template <typename Torus>
-__device__ Torus decompose_one(Torus &state, Torus mod_b_mask,
-                               int base_log) {
-  Torus res = state & mod_b_mask;
-  state >>= base_log;
-  Torus carry = ((res - 1ll) | state) & res;
-  carry >>= base_log - 1;
-  state += carry;
-  res -= carry << base_log;
-  return res;
+__device__ Torus decompose_one(Torus &state, Torus mod_b_mask, int base_log) {
+  Torus res = state & mod_b_mask;
+  state >>= base_log;
+  Torus carry = ((res - 1ll) | state) & res;
+  carry >>= base_log - 1;
+  state += carry;
+  res -= carry << base_log;
+  return res;
 }

/*
@@ -43,23 +42,19 @@ __device__ Torus decompose_one(Torus &state, Torus mod_b_mask,
 *
 */
 template <typename Torus>
-__global__ void keyswitch(Torus *lwe_out, Torus *lwe_in,
-                          Torus *ksk,
+__global__ void keyswitch(Torus *lwe_out, Torus *lwe_in, Torus *ksk,
                           uint32_t lwe_dimension_before,
-                          uint32_t lwe_dimension_after,
-                          uint32_t base_log,
-                          uint32_t l_gadget,
-                          int lwe_lower, int lwe_upper, int cutoff) {
+                          uint32_t lwe_dimension_after, uint32_t base_log,
+                          uint32_t l_gadget, int lwe_lower, int lwe_upper,
+                          int cutoff) {

   int tid = threadIdx.x;

   extern __shared__ char sharedmem[];
   Torus *local_lwe_out = (Torus *)sharedmem;

-  auto block_lwe_in =
-      get_chunk(lwe_in,
blockIdx.x, lwe_dimension_before + 1); - auto block_lwe_out = - get_chunk(lwe_out, blockIdx.x, lwe_dimension_after + 1); + auto block_lwe_in = get_chunk(lwe_in, blockIdx.x, lwe_dimension_before + 1); + auto block_lwe_out = get_chunk(lwe_out, blockIdx.x, lwe_dimension_after + 1); auto gadget = GadgetMatrixSingle(base_log, l_gadget); @@ -77,26 +72,22 @@ __global__ void keyswitch(Torus *lwe_out, Torus *lwe_in, } if (tid == 0) { - local_lwe_out[lwe_dimension_after] = - block_lwe_in[lwe_dimension_before]; + local_lwe_out[lwe_dimension_after] = block_lwe_in[lwe_dimension_before]; } for (int i = 0; i < lwe_dimension_before; i++) { __syncthreads(); - Torus a_i = round_to_closest_multiple(block_lwe_in[i], base_log, - l_gadget); + Torus a_i = round_to_closest_multiple(block_lwe_in[i], base_log, l_gadget); Torus state = a_i >> (sizeof(Torus) * 8 - base_log * l_gadget); Torus mod_b_mask = (1ll << base_log) - 1ll; for (int j = 0; j < l_gadget; j++) { auto ksk_block = get_ith_block(ksk, i, l_gadget - j - 1, - lwe_dimension_after, - l_gadget); - Torus decomposed = decompose_one(state, mod_b_mask, - base_log); + lwe_dimension_after, l_gadget); + Torus decomposed = decompose_one(state, mod_b_mask, base_log); for (int k = 0; k < lwe_part_per_thd; k++) { int idx = tid + k * blockDim.x; local_lwe_out[idx] -= (Torus)ksk_block[idx] * decomposed; @@ -112,13 +103,10 @@ __global__ void keyswitch(Torus *lwe_out, Torus *lwe_in, /// assume lwe_in in the gpu template -__host__ void cuda_keyswitch_lwe_ciphertext_vector(void *v_stream, Torus *lwe_out, Torus *lwe_in, - Torus *ksk, - uint32_t lwe_dimension_before, - uint32_t lwe_dimension_after, - uint32_t base_log, - uint32_t l_gadget, - uint32_t num_samples) { +__host__ void cuda_keyswitch_lwe_ciphertext_vector( + void *v_stream, Torus *lwe_out, Torus *lwe_in, Torus *ksk, + uint32_t lwe_dimension_before, uint32_t lwe_dimension_after, + uint32_t base_log, uint32_t l_gadget, uint32_t num_samples) { constexpr int ideal_threads = 128; @@ -136,11 +124,9 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector(void *v_stream, Torus *lwe_ou lwe_upper = (int)ceil((double)lwe_dim / (double)ideal_threads); } - int lwe_size_after = - (lwe_dimension_after + 1) * num_samples; + int lwe_size_after = (lwe_dimension_after + 1) * num_samples; - int shared_mem = - sizeof(Torus) * (lwe_dimension_after + 1); + int shared_mem = sizeof(Torus) * (lwe_dimension_after + 1); cudaMemset(lwe_out, 0, sizeof(Torus) * lwe_size_after); @@ -156,7 +142,6 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector(void *v_stream, Torus *lwe_ou l_gadget, lwe_lower, lwe_upper, cutoff); cudaStreamSynchronize(*stream); - } #endif diff --git a/src/polynomial/polynomial.cuh b/src/polynomial/polynomial.cuh index 5606e093a..87d59ab94 100644 --- a/src/polynomial/polynomial.cuh +++ b/src/polynomial/polynomial.cuh @@ -497,7 +497,6 @@ public: } synchronize_threads_in_block(); } - }; template class Vector { public: diff --git a/src/polynomial/polynomial_math.cuh b/src/polynomial/polynomial_math.cuh index 487030775..a36667b9a 100644 --- a/src/polynomial/polynomial_math.cuh +++ b/src/polynomial/polynomial_math.cuh @@ -30,9 +30,10 @@ __device__ void polynomial_product_in_fourier_domain(FT *result, FT *first, } template -__device__ void polynomial_product_in_fourier_domain( - PolynomialFourier &result, PolynomialFourier &first, - PolynomialFourier &second) { +__device__ void +polynomial_product_in_fourier_domain(PolynomialFourier &result, + PolynomialFourier &first, + PolynomialFourier &second) { int tid = threadIdx.x; for (int i 
= 0; i < params::opt / 2; i++) {
     result[tid] = first[tid] * second[tid];
@@ -72,8 +73,9 @@ __device__ void polynomial_product_accumulate_in_fourier_domain(
 }

 template <class params, typename T>
-__device__ void polynomial_product_accumulate_in_fourier_domain(
-    T *result, T *first, T *second) {
+__device__ void polynomial_product_accumulate_in_fourier_domain(T *result,
+                                                                T *first,
+                                                                T *second) {
   int tid = threadIdx.x;
   for (int i = 0; i < params::opt / 2; i++) {
     result[tid] += first[tid] * second[tid];
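A minimal host-side sketch of the gadget decomposition driven by the keyswitch
kernel above, mirroring decompose_one and the closest-multiple rounding on one
sample value. It assumes a 64-bit torus with base_log = 4 and l_gadget = 3;
decompose_one_host and the main routine are illustrative names, not part of
the library. Recombining the digits must reproduce the rounded input mod 2^64,
which is exactly the invariant the kernel relies on when it accumulates
ksk_block[idx] * decomposed per level.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Host-side mirror of decompose_one from src/keyswitch.cuh: extracts one
// base-2^base_log digit (roughly balanced around 0) and carries into state.
static uint64_t decompose_one_host(uint64_t &state, uint64_t mod_b_mask,
                                   int base_log) {
  uint64_t res = state & mod_b_mask;
  state >>= base_log;
  uint64_t carry = ((res - 1ull) | state) & res;
  carry >>= base_log - 1;
  state += carry;
  res -= carry << base_log; // balanced digit, wraps mod 2^64 when negative
  return res;
}

int main() {
  const int base_log = 4, l_gadget = 3, bits = 64;
  const int shift = bits - base_log * l_gadget; // 52
  uint64_t a = 0x0123456789abcdefull;

  // round_to_closest_multiple: keep the top base_log * l_gadget bits, rounded.
  uint64_t rounded = ((a + (1ull << (shift - 1))) >> shift) << shift;

  // Extract digits least-significant first, as the kernel does, and recombine.
  uint64_t state = rounded >> shift;
  uint64_t acc = 0;
  for (int j = 0; j < l_gadget; j++) {
    uint64_t d = decompose_one_host(state, (1ull << base_log) - 1, base_log);
    acc += d << (shift + base_log * j); // final carry falls off mod 2^64
  }
  assert(acc == rounded); // decomposition is exact on the rounded value
  printf("decomposition ok: %#018llx\n", (unsigned long long)acc);
  return 0;
}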