diff --git a/include/bootstrap.h b/include/bootstrap.h index fc26c9325..cd76c8ca5 100644 --- a/include/bootstrap.h +++ b/include/bootstrap.h @@ -73,8 +73,8 @@ void cuda_extract_bits_32( void *ksk, void *fourier_bsk, uint32_t number_of_bits, uint32_t delta_log, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t glwe_dimension, uint32_t base_log_bsk, uint32_t level_count_bsk, - uint32_t base_log_ksk, uint32_t level_count_ksk, - uint32_t number_of_samples); + uint32_t base_log_ksk, uint32_t level_count_ksk, uint32_t number_of_samples, + uint32_t max_shared_memory); void cuda_extract_bits_64( void *v_stream, uint32_t gpu_index, void *list_lwe_array_out, @@ -84,8 +84,8 @@ void cuda_extract_bits_64( void *ksk, void *fourier_bsk, uint32_t number_of_bits, uint32_t delta_log, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t glwe_dimension, uint32_t base_log_bsk, uint32_t level_count_bsk, - uint32_t base_log_ksk, uint32_t level_count_ksk, - uint32_t number_of_samples); + uint32_t base_log_ksk, uint32_t level_count_ksk, uint32_t number_of_samples, + uint32_t max_shared_memory); } #ifdef __CUDACC__ diff --git a/src/bootstrap_low_latency.cu b/src/bootstrap_low_latency.cu index 3cae50c9d..7adc8da00 100644 --- a/src/bootstrap_low_latency.cu +++ b/src/bootstrap_low_latency.cu @@ -68,9 +68,10 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32( assert(("Error (GPU low latency PBS): glwe_dimension should be equal to 1", glwe_dimension == 1)); assert(("Error (GPU low latency PBS): polynomial size should be one of 512, " - "1024, 2048", + "1024, 2048, 4096, 8192", polynomial_size == 512 || polynomial_size == 1024 || - polynomial_size == 2048)); + polynomial_size == 2048 || polynomial_size == 4096 || + polynomial_size == 8192)); // The number of samples should be lower than SM/(4 * (k + 1) * l) (the // factor 4 being related to the occupancy of 50%). The only supported // value for k is 1, so k + 1 = 2 for now. @@ -88,21 +89,35 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32( v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, - level_count, num_samples, num_lut_vectors); + level_count, num_samples, num_lut_vectors, max_shared_memory); break; case 1024: host_bootstrap_low_latency>( v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, - level_count, num_samples, num_lut_vectors); + level_count, num_samples, num_lut_vectors, max_shared_memory); break; case 2048: host_bootstrap_low_latency>( v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, - level_count, num_samples, num_lut_vectors); + level_count, num_samples, num_lut_vectors, max_shared_memory); + break; + case 4096: + host_bootstrap_low_latency>( + v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector, + (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, + (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, + level_count, num_samples, num_lut_vectors, max_shared_memory); + break; + case 8192: + host_bootstrap_low_latency>( + v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector, + (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, + (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, + level_count, num_samples, num_lut_vectors, max_shared_memory); break; default: break; @@ -121,9 +136,10 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64( assert(("Error (GPU low latency PBS): glwe_dimension should be equal to 1", glwe_dimension == 1)); assert(("Error (GPU low latency PBS): polynomial size should be one of 512, " - "1024, 2048", + "1024, 2048, 4096, 8192", polynomial_size == 512 || polynomial_size == 1024 || - polynomial_size == 2048)); + polynomial_size == 2048 || polynomial_size == 4096 || + polynomial_size == 8192)); // The number of samples should be lower than SM/(4 * (k + 1) * l) (the // factor 4 being related to the occupancy of 50%). The only supported // value for k is 1, so k + 1 = 2 for now. @@ -141,21 +157,35 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64( v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, - level_count, num_samples, num_lut_vectors); + level_count, num_samples, num_lut_vectors, max_shared_memory); break; case 1024: host_bootstrap_low_latency>( v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, - level_count, num_samples, num_lut_vectors); + level_count, num_samples, num_lut_vectors, max_shared_memory); break; case 2048: host_bootstrap_low_latency>( v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, - level_count, num_samples, num_lut_vectors); + level_count, num_samples, num_lut_vectors, max_shared_memory); + break; + case 4096: + host_bootstrap_low_latency>( + v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector, + (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, + (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, + level_count, num_samples, num_lut_vectors, max_shared_memory); + break; + case 8192: + host_bootstrap_low_latency>( + v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector, + (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, + (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, + level_count, num_samples, num_lut_vectors, max_shared_memory); break; default: break; diff --git a/src/bootstrap_low_latency.cuh b/src/bootstrap_low_latency.cuh index 8a521a86c..64c0cdce6 100644 --- a/src/bootstrap_low_latency.cuh +++ b/src/bootstrap_low_latency.cuh @@ -117,7 +117,7 @@ mul_ggsw_glwe(Torus *accumulator, double2 *fft, double2 *mask_join_buffer, correction_inverse_fft_inplace(fft); synchronize_threads_in_block(); - // Perform the inverse FFT on the result of the GGSW x GWE and add to the + // Perform the inverse FFT on the result of the GGSW x GLWE and add to the // accumulator NSMFFT_inverse>(fft); synchronize_threads_in_block(); @@ -127,7 +127,7 @@ mul_ggsw_glwe(Torus *accumulator, double2 *fft, double2 *mask_join_buffer, __syncthreads(); } -template +template /* * Kernel launched by the low latency version of the * bootstrapping, that uses cooperative groups @@ -142,7 +142,8 @@ __global__ void device_bootstrap_low_latency( Torus *lwe_array_out, Torus *lut_vector, Torus *lwe_array_in, double2 *bootstrapping_key, double2 *mask_join_buffer, double2 *body_join_buffer, uint32_t lwe_dimension, uint32_t polynomial_size, - uint32_t base_log, uint32_t level_count) { + uint32_t base_log, uint32_t level_count, char *device_mem, + int device_memory_size_per_block) { grid_group grid = this_grid(); @@ -150,8 +151,14 @@ __global__ void device_bootstrap_low_latency( // bootstrap, since shared memory is kept in L1 cache and accessing it is // much faster than global memory extern __shared__ char sharedmem[]; + char *selected_memory; + int block_index = + blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y; - char *selected_memory = sharedmem; + if constexpr (SMD == FULLSM) + selected_memory = sharedmem; + else + selected_memory = &device_mem[block_index * device_memory_size_per_block]; Torus *accumulator = (Torus *)selected_memory; Torus *accumulator_rotated = @@ -159,6 +166,8 @@ __global__ void device_bootstrap_low_latency( double2 *accumulator_fft = (double2 *)accumulator_rotated + polynomial_size / (sizeof(double2) / sizeof(Torus)); + if constexpr (SMD == PARTIALSM) + accumulator_fft = (double2 *)sharedmem; // The third dimension of the block is used to determine on which ciphertext // this block is operating, in the case of batch bootstraps @@ -251,25 +260,37 @@ __host__ void host_bootstrap_low_latency( uint32_t *lut_vector_indexes, Torus *lwe_array_in, double2 *bootstrapping_key, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, - uint32_t num_samples, uint32_t num_lut_vectors) { + uint32_t input_lwe_ciphertext_count, uint32_t num_lut_vectors, + uint32_t max_shared_memory) { auto stream = static_cast(v_stream); - int buffer_size_per_gpu = - level_count * num_samples * polynomial_size / 2 * sizeof(double2); + int buffer_size_per_gpu = level_count * input_lwe_ciphertext_count * + polynomial_size / 2 * sizeof(double2); double2 *mask_buffer_fft = (double2 *)cuda_malloc_async(buffer_size_per_gpu, *stream, gpu_index); double2 *body_buffer_fft = (double2 *)cuda_malloc_async(buffer_size_per_gpu, *stream, gpu_index); - int bytes_needed = sizeof(Torus) * polynomial_size + // accumulator_rotated - sizeof(Torus) * polynomial_size + // accumulator - sizeof(double2) * polynomial_size / 2; // accumulator fft + // With SM each block corresponds to either the mask or body, no need to + // duplicate data for each + int SM_FULL = sizeof(Torus) * polynomial_size + // accumulator_rotated + sizeof(Torus) * polynomial_size + // accumulator + sizeof(double2) * polynomial_size / 2; // accumulator fft + + int SM_PART = + sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body + + int DM_FULL = SM_FULL; + + int DM_PART = DM_FULL - SM_PART; + + char *d_mem; int thds = polynomial_size / params::opt; - dim3 grid(level_count, 2, num_samples); + dim3 grid(level_count, 2, input_lwe_ciphertext_count); - void *kernel_args[10]; + void *kernel_args[12]; kernel_args[0] = &lwe_array_out; kernel_args[1] = &lut_vector; kernel_args[2] = &lwe_array_in; @@ -280,22 +301,55 @@ __host__ void host_bootstrap_low_latency( kernel_args[7] = &polynomial_size; kernel_args[8] = &base_log; kernel_args[9] = &level_count; + kernel_args[10] = &d_mem; - checkCudaErrors(cudaFuncSetAttribute( - device_bootstrap_low_latency, - cudaFuncAttributeMaxDynamicSharedMemorySize, bytes_needed)); - cudaFuncSetCacheConfig(device_bootstrap_low_latency, - cudaFuncCachePreferShared); + if (max_shared_memory < SM_PART) { + kernel_args[11] = &DM_FULL; + checkCudaErrors(cudaGetLastError()); + d_mem = (char *)cuda_malloc_async(DM_FULL * input_lwe_ciphertext_count * + level_count * 2, + *stream, gpu_index); + checkCudaErrors(cudaGetLastError()); + checkCudaErrors(cudaLaunchCooperativeKernel( + (void *)device_bootstrap_low_latency, grid, thds, + (void **)kernel_args, 0, *stream)); + } else if (max_shared_memory < SM_FULL) { + kernel_args[11] = &DM_PART; + d_mem = (char *)cuda_malloc_async(DM_PART * input_lwe_ciphertext_count * + level_count * 2, + *stream, gpu_index); + checkCudaErrors(cudaFuncSetAttribute( + device_bootstrap_low_latency, + cudaFuncAttributeMaxDynamicSharedMemorySize, SM_PART)); + cudaFuncSetCacheConfig( + device_bootstrap_low_latency, + cudaFuncCachePreferShared); + checkCudaErrors(cudaGetLastError()); + checkCudaErrors(cudaLaunchCooperativeKernel( + (void *)device_bootstrap_low_latency, grid, + thds, (void **)kernel_args, SM_PART, *stream)); - checkCudaErrors(cudaLaunchCooperativeKernel( - (void *)device_bootstrap_low_latency, grid, thds, - (void **)kernel_args, bytes_needed, *stream)); + } else { + int DM_NONE = 0; + kernel_args[11] = &DM_NONE; + d_mem = (char *)cuda_malloc_async(0, *stream, gpu_index); + checkCudaErrors(cudaFuncSetAttribute( + device_bootstrap_low_latency, + cudaFuncAttributeMaxDynamicSharedMemorySize, SM_FULL)); + cudaFuncSetCacheConfig(device_bootstrap_low_latency, + cudaFuncCachePreferShared); + checkCudaErrors(cudaLaunchCooperativeKernel( + (void *)device_bootstrap_low_latency, grid, thds, + (void **)kernel_args, SM_FULL, *stream)); + } + checkCudaErrors(cudaGetLastError()); // Synchronize the streams before copying the result to lwe_array_out at the // right place cudaStreamSynchronize(*stream); cuda_drop_async(mask_buffer_fft, *stream, gpu_index); cuda_drop_async(body_buffer_fft, *stream, gpu_index); + cuda_drop_async(d_mem, *stream, gpu_index); } #endif // LOWLAT_PBS_H diff --git a/src/bootstrap_wop.cu b/src/bootstrap_wop.cu index 0eeba20c7..24d9f6901 100644 --- a/src/bootstrap_wop.cu +++ b/src/bootstrap_wop.cu @@ -118,16 +118,17 @@ void cuda_extract_bits_32( void *ksk, void *fourier_bsk, uint32_t number_of_bits, uint32_t delta_log, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t glwe_dimension, uint32_t base_log_bsk, uint32_t level_count_bsk, - uint32_t base_log_ksk, uint32_t level_count_ksk, - uint32_t number_of_samples) { + uint32_t base_log_ksk, uint32_t level_count_ksk, uint32_t number_of_samples, + uint32_t max_shared_memory) { assert(("Error (GPU extract bits): base log should be <= 32", base_log_bsk <= 32)); assert(("Error (GPU extract bits): glwe_dimension should be equal to 1", glwe_dimension == 1)); assert(("Error (GPU extract bits): lwe_dimension_in should be one of " - "512, 1024, 2048", + "512, 1024, 2048, 4096, 8192", lwe_dimension_in == 512 || lwe_dimension_in == 1024 || - lwe_dimension_in == 2048)); + lwe_dimension_in == 2048 || lwe_dimension_in == 4096 || + lwe_dimension_in == 8192)); // The number of samples should be lower than the number of streaming // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related // to the occupancy of 50%). The only supported value for k is 1, so @@ -151,7 +152,7 @@ void cuda_extract_bits_32( (uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out, base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk, - number_of_samples); + number_of_samples, max_shared_memory); break; case 1024: host_extract_bits>( @@ -163,7 +164,7 @@ void cuda_extract_bits_32( (uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out, base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk, - number_of_samples); + number_of_samples, max_shared_memory); break; case 2048: host_extract_bits>( @@ -175,7 +176,31 @@ void cuda_extract_bits_32( (uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out, base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk, - number_of_samples); + number_of_samples, max_shared_memory); + break; + case 4096: + host_extract_bits>( + v_stream, gpu_index, (uint32_t *)list_lwe_array_out, + (uint32_t *)lwe_array_in, (uint32_t *)lwe_array_in_buffer, + (uint32_t *)lwe_array_in_shifted_buffer, + (uint32_t *)lwe_array_out_ks_buffer, + (uint32_t *)lwe_array_out_pbs_buffer, (uint32_t *)lut_pbs, + (uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk, + number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out, + base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk, + number_of_samples, max_shared_memory); + break; + case 8192: + host_extract_bits>( + v_stream, gpu_index, (uint32_t *)list_lwe_array_out, + (uint32_t *)lwe_array_in, (uint32_t *)lwe_array_in_buffer, + (uint32_t *)lwe_array_in_shifted_buffer, + (uint32_t *)lwe_array_out_ks_buffer, + (uint32_t *)lwe_array_out_pbs_buffer, (uint32_t *)lut_pbs, + (uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk, + number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out, + base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk, + number_of_samples, max_shared_memory); break; default: break; @@ -190,16 +215,17 @@ void cuda_extract_bits_64( void *ksk, void *fourier_bsk, uint32_t number_of_bits, uint32_t delta_log, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t glwe_dimension, uint32_t base_log_bsk, uint32_t level_count_bsk, - uint32_t base_log_ksk, uint32_t level_count_ksk, - uint32_t number_of_samples) { + uint32_t base_log_ksk, uint32_t level_count_ksk, uint32_t number_of_samples, + uint32_t max_shared_memory) { assert(("Error (GPU extract bits): base log should be <= 64", base_log_bsk <= 64)); assert(("Error (GPU extract bits): glwe_dimension should be equal to 1", glwe_dimension == 1)); assert(("Error (GPU extract bits): lwe_dimension_in should be one of " - "512, 1024, 2048", + "512, 1024, 2048, 4096, 8192", lwe_dimension_in == 512 || lwe_dimension_in == 1024 || - lwe_dimension_in == 2048)); + lwe_dimension_in == 2048 || lwe_dimension_in == 4096 || + lwe_dimension_in == 8192)); // The number of samples should be lower than the number of streaming // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related // to the occupancy of 50%). The only supported value for k is 1, so @@ -223,7 +249,7 @@ void cuda_extract_bits_64( (uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out, base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk, - number_of_samples); + number_of_samples, max_shared_memory); break; case 1024: host_extract_bits>( @@ -235,7 +261,7 @@ void cuda_extract_bits_64( (uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out, base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk, - number_of_samples); + number_of_samples, max_shared_memory); break; case 2048: host_extract_bits>( @@ -247,7 +273,31 @@ void cuda_extract_bits_64( (uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out, base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk, - number_of_samples); + number_of_samples, max_shared_memory); + break; + case 4096: + host_extract_bits>( + v_stream, gpu_index, (uint64_t *)list_lwe_array_out, + (uint64_t *)lwe_array_in, (uint64_t *)lwe_array_in_buffer, + (uint64_t *)lwe_array_in_shifted_buffer, + (uint64_t *)lwe_array_out_ks_buffer, + (uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs, + (uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk, + number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out, + base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk, + number_of_samples, max_shared_memory); + break; + case 8192: + host_extract_bits>( + v_stream, gpu_index, (uint64_t *)list_lwe_array_out, + (uint64_t *)lwe_array_in, (uint64_t *)lwe_array_in_buffer, + (uint64_t *)lwe_array_in_shifted_buffer, + (uint64_t *)lwe_array_out_ks_buffer, + (uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs, + (uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk, + number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out, + base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk, + number_of_samples, max_shared_memory); break; default: break; diff --git a/src/bootstrap_wop.cuh b/src/bootstrap_wop.cuh index 45638d62d..d1ccb7105 100644 --- a/src/bootstrap_wop.cuh +++ b/src/bootstrap_wop.cuh @@ -468,8 +468,8 @@ __host__ void host_extract_bits( uint32_t *lut_vector_indexes, Torus *ksk, double2 *fourier_bsk, uint32_t number_of_bits, uint32_t delta_log, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log_bsk, uint32_t level_count_bsk, - uint32_t base_log_ksk, uint32_t level_count_ksk, - uint32_t number_of_samples) { + uint32_t base_log_ksk, uint32_t level_count_ksk, uint32_t number_of_samples, + uint32_t max_shared_memory) { auto stream = static_cast(v_stream); uint32_t ciphertext_n_bits = sizeof(Torus) * 8; @@ -511,7 +511,7 @@ __host__ void host_extract_bits( v_stream, gpu_index, lwe_array_out_pbs_buffer, lut_pbs, lut_vector_indexes, lwe_array_out_ks_buffer, fourier_bsk, lwe_dimension_out, lwe_dimension_in, base_log_bsk, level_count_bsk, - number_of_samples, 1); + number_of_samples, 1, max_shared_memory); add_sub_and_mul_lwe<<<1, threads, 0, *stream>>>( lwe_array_in_shifted_buffer, lwe_array_in_buffer,