From bfb07b961dc861e4659417e052a29b458fab15c5 Mon Sep 17 00:00:00 2001 From: Pedro Alves Date: Mon, 13 Feb 2023 19:58:46 -0300 Subject: [PATCH] feat(cuda): Add support for the classical PBS for polynomial_size=256. --- src/bootstrap_amortized.cu | 38 +++++++++++++++++++++++--------- src/bootstrap_low_latency.cu | 38 +++++++++++++++++++++++--------- src/crypto/bootstrapping_key.cuh | 19 ++++++++++++++++ 3 files changed, 75 insertions(+), 20 deletions(-) diff --git a/src/bootstrap_amortized.cu b/src/bootstrap_amortized.cu index 61e94ca94..79da3e8f9 100644 --- a/src/bootstrap_amortized.cu +++ b/src/bootstrap_amortized.cu @@ -13,13 +13,22 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32( assert( ("Error (GPU amortized PBS): base log should be <= 32", base_log <= 32)); - assert(("Error (GPU amortized PBS): polynomial size should be one of 512, " - "1024, 2048, 4096, 8192", - polynomial_size == 512 || polynomial_size == 1024 || - polynomial_size == 2048 || polynomial_size == 4096 || - polynomial_size == 8192)); + assert( + ("Error (GPU amortized PBS): polynomial size should be one of 256, 512, " + "1024, 2048, 4096, 8192", + polynomial_size == 256 || polynomial_size == 512 || + polynomial_size == 1024 || polynomial_size == 2048 || + polynomial_size == 4096 || polynomial_size == 8192)); switch (polynomial_size) { + case 256: + host_bootstrap_amortized>( + v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector, + (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, + (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension, + polynomial_size, base_log, level_count, num_samples, num_lut_vectors, + lwe_idx, max_shared_memory); + break; case 512: host_bootstrap_amortized>( v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector, @@ -139,13 +148,22 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64( assert( ("Error (GPU amortized PBS): base log should be <= 64", base_log <= 64)); - assert(("Error (GPU amortized PBS): polynomial size should be one of 512, " - "1024, 2048, 4096, 8192", - polynomial_size == 512 || polynomial_size == 1024 || - polynomial_size == 2048 || polynomial_size == 4096 || - polynomial_size == 8192)); + assert( + ("Error (GPU amortized PBS): polynomial size should be one of 256, 512, " + "1024, 2048, 4096, 8192", + polynomial_size == 256 || polynomial_size == 512 || + polynomial_size == 1024 || polynomial_size == 2048 || + polynomial_size == 4096 || polynomial_size == 8192)); switch (polynomial_size) { + case 256: + host_bootstrap_amortized>( + v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector, + (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, + (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension, + polynomial_size, base_log, level_count, num_samples, num_lut_vectors, + lwe_idx, max_shared_memory); + break; case 512: host_bootstrap_amortized>( v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector, diff --git a/src/bootstrap_low_latency.cu b/src/bootstrap_low_latency.cu index 662f29779..e9e16bf8e 100644 --- a/src/bootstrap_low_latency.cu +++ b/src/bootstrap_low_latency.cu @@ -16,11 +16,12 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32( assert(("Error (GPU low latency PBS): base log should be <= 32", base_log <= 32)); - assert(("Error (GPU low latency PBS): polynomial size should be one of 512, " - "1024, 2048, 4096, 8192", - polynomial_size == 512 || polynomial_size == 1024 || - polynomial_size == 2048 || polynomial_size == 4096 || - polynomial_size == 8192)); + assert(( + "Error (GPU low latency PBS): polynomial size should be one of 256, 512, " + "1024, 2048, 4096, 8192", + polynomial_size == 256 || polynomial_size == 512 || + polynomial_size == 1024 || polynomial_size == 2048 || + polynomial_size == 4096 || polynomial_size == 8192)); // The number of samples should be lower than 4 * SM/((k + 1) * l) (the // factor 4 being related to the occupancy of 50%). The only supported // value for k is 1, so k + 1 = 2 for now. @@ -33,6 +34,14 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32( num_samples <= number_of_sm * 4. / 2. / level_count)); switch (polynomial_size) { + case 256: + host_bootstrap_low_latency>( + v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector, + (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, + (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension, + polynomial_size, base_log, level_count, num_samples, num_lut_vectors, + max_shared_memory); + break; case 512: host_bootstrap_low_latency>( v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector, @@ -163,11 +172,12 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64( assert(("Error (GPU low latency PBS): base log should be <= 64", base_log <= 64)); - assert(("Error (GPU low latency PBS): polynomial size should be one of 512, " - "1024, 2048, 4096, 8192", - polynomial_size == 512 || polynomial_size == 1024 || - polynomial_size == 2048 || polynomial_size == 4096 || - polynomial_size == 8192)); + assert(( + "Error (GPU low latency PBS): polynomial size should be one of 256, 512, " + "1024, 2048, 4096, 8192", + polynomial_size == 256 || polynomial_size == 512 || + polynomial_size == 1024 || polynomial_size == 2048 || + polynomial_size == 4096 || polynomial_size == 8192)); // The number of samples should be lower than 4 * SM/((k + 1) * l) (the // factor 4 being related to the occupancy of 50%). The only supported // value for k is 1, so k + 1 = 2 for now. @@ -180,6 +190,14 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64( num_samples <= number_of_sm * 4. / 2. / level_count)); switch (polynomial_size) { + case 256: + host_bootstrap_low_latency>( + v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector, + (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, + (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension, + polynomial_size, base_log, level_count, num_samples, num_lut_vectors, + max_shared_memory); + break; case 512: host_bootstrap_low_latency>( v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector, diff --git a/src/crypto/bootstrapping_key.cuh b/src/crypto/bootstrapping_key.cuh index 4ed497e4b..51d5f247a 100644 --- a/src/crypto/bootstrapping_key.cuh +++ b/src/crypto/bootstrapping_key.cuh @@ -84,6 +84,25 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream, double2 *buffer; switch (polynomial_size) { + case 256: + if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) { + buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); + check_cuda_error(cudaFuncSetAttribute( + batch_NSMFFT, ForwardFFT>, FULLSM>, + cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); + check_cuda_error(cudaFuncSetCacheConfig( + batch_NSMFFT, ForwardFFT>, FULLSM>, + cudaFuncCachePreferShared)); + batch_NSMFFT, ForwardFFT>, FULLSM> + <<>>(d_bsk, dest, + buffer); + } else { + buffer = (double2 *)cuda_malloc_async( + shared_memory_size * total_polynomials, stream, gpu_index); + batch_NSMFFT, ForwardFFT>, NOSM> + <<>>(d_bsk, dest, buffer); + } + break; case 512: if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) { buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);