diff --git a/src/bit_extraction.cu b/src/bit_extraction.cu
index 5d9f29065..6b0fe8b53 100644
--- a/src/bit_extraction.cu
+++ b/src/bit_extraction.cu
@@ -15,8 +15,6 @@ void cuda_extract_bits_32(
     uint32_t max_shared_memory) {
   assert(("Error (GPU extract bits): base log should be <= 32",
           base_log_bsk <= 32));
-  assert(("Error (GPU extract bits): glwe_dimension should be equal to 1",
-          glwe_dimension == 1));
   assert(("Error (GPU extract bits): lwe_dimension_in should be one of "
           "512, 1024, 2048, 4096, 8192",
           lwe_dimension_in == 512 || lwe_dimension_in == 1024 ||
@@ -44,8 +42,8 @@ void cuda_extract_bits_32(
         (uint32_t *)lwe_array_out_pbs_buffer, (uint32_t *)lut_pbs,
         (uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk,
         number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
-        base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
-        number_of_samples, max_shared_memory);
+        glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
+        level_count_ksk, number_of_samples, max_shared_memory);
     break;
   case 1024:
     host_extract_bits<uint32_t, Degree<1024>>(
@@ -56,8 +54,8 @@ void cuda_extract_bits_32(
         (uint32_t *)lwe_array_out_pbs_buffer, (uint32_t *)lut_pbs,
         (uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk,
         number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
-        base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
-        number_of_samples, max_shared_memory);
+        glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
+        level_count_ksk, number_of_samples, max_shared_memory);
     break;
   case 2048:
     host_extract_bits<uint32_t, Degree<2048>>(
@@ -68,8 +66,8 @@ void cuda_extract_bits_32(
         (uint32_t *)lwe_array_out_pbs_buffer, (uint32_t *)lut_pbs,
         (uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk,
         number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
-        base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
-        number_of_samples, max_shared_memory);
+        glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
+        level_count_ksk, number_of_samples, max_shared_memory);
     break;
   case 4096:
     host_extract_bits<uint32_t, Degree<4096>>(
@@ -80,8 +78,8 @@ void cuda_extract_bits_32(
         (uint32_t *)lwe_array_out_pbs_buffer, (uint32_t *)lut_pbs,
         (uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk,
         number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
-        base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
-        number_of_samples, max_shared_memory);
+        glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
+        level_count_ksk, number_of_samples, max_shared_memory);
     break;
   case 8192:
     host_extract_bits<uint32_t, Degree<8192>>(
@@ -92,8 +90,8 @@ void cuda_extract_bits_32(
         (uint32_t *)lwe_array_out_pbs_buffer, (uint32_t *)lut_pbs,
         (uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk,
         number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
-        base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
-        number_of_samples, max_shared_memory);
+        glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
+        level_count_ksk, number_of_samples, max_shared_memory);
     break;
   default:
     break;
@@ -157,8 +155,6 @@ void cuda_extract_bits_64(
     uint32_t max_shared_memory) {
   assert(("Error (GPU extract bits): base log should be <= 64",
           base_log_bsk <= 64));
-  assert(("Error (GPU extract bits): glwe_dimension should be equal to 1",
-          glwe_dimension == 1));
   assert(("Error (GPU extract bits): lwe_dimension_in should be one of "
           "512, 1024, 2048, 4096, 8192",
           lwe_dimension_in == 512 || lwe_dimension_in == 1024 ||
@@ -186,8 +182,8 @@ void cuda_extract_bits_64(
         (uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs,
         (uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk,
         number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
-        base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
-        number_of_samples, max_shared_memory);
+        glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
+        level_count_ksk, number_of_samples, max_shared_memory);
     break;
   case 1024:
     host_extract_bits<uint64_t, Degree<1024>>(
@@ -198,8 +194,8 @@ void cuda_extract_bits_64(
         (uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs,
         (uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk,
         number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
-        base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
-        number_of_samples, max_shared_memory);
+        glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
+        level_count_ksk, number_of_samples, max_shared_memory);
     break;
   case 2048:
     host_extract_bits<uint64_t, Degree<2048>>(
@@ -210,8 +206,8 @@ void cuda_extract_bits_64(
         (uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs,
         (uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk,
         number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
-        base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
-        number_of_samples, max_shared_memory);
+        glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
+        level_count_ksk, number_of_samples, max_shared_memory);
     break;
   case 4096:
     host_extract_bits<uint64_t, Degree<4096>>(
@@ -222,8 +218,8 @@ void cuda_extract_bits_64(
         (uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs,
         (uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk,
         number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
-        base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
-        number_of_samples, max_shared_memory);
+        glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
+        level_count_ksk, number_of_samples, max_shared_memory);
     break;
   case 8192:
     host_extract_bits<uint64_t, Degree<8192>>(
@@ -234,8 +230,8 @@ void cuda_extract_bits_64(
         (uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs,
         (uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk,
         number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
-        base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
-        number_of_samples, max_shared_memory);
+        glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
+        level_count_ksk, number_of_samples, max_shared_memory);
     break;
   default:
     break;
diff --git a/src/bit_extraction.cuh b/src/bit_extraction.cuh
index f68dfab3b..4bb0aa9a8 100644
--- a/src/bit_extraction.cuh
+++ b/src/bit_extraction.cuh
@@ -140,9 +140,9 @@ __host__ void host_extract_bits(
     Torus *lwe_array_out_pbs_buffer, Torus *lut_pbs,
     uint32_t *lut_vector_indexes, Torus *ksk, double2 *fourier_bsk,
     uint32_t number_of_bits, uint32_t delta_log, uint32_t lwe_dimension_in,
-    uint32_t lwe_dimension_out, uint32_t base_log_bsk, uint32_t level_count_bsk,
-    uint32_t base_log_ksk, uint32_t level_count_ksk, uint32_t number_of_samples,
-    uint32_t max_shared_memory) {
+    uint32_t lwe_dimension_out, uint32_t glwe_dimension, uint32_t base_log_bsk,
+    uint32_t level_count_bsk, uint32_t base_log_ksk, uint32_t level_count_ksk,
+    uint32_t number_of_samples, uint32_t max_shared_memory) {
   cudaSetDevice(gpu_index);
   auto stream = static_cast<cudaStream_t *>(v_stream);

@@ -189,8 +189,8 @@ __host__ void host_extract_bits(
     host_bootstrap_low_latency<Torus, params>(
         v_stream, gpu_index, lwe_array_out_pbs_buffer, lut_pbs,
         lut_vector_indexes, lwe_array_out_ks_buffer, fourier_bsk,
-        lwe_dimension_out, lwe_dimension_in, base_log_bsk, level_count_bsk,
-        number_of_samples, 1, max_shared_memory);
+        glwe_dimension, lwe_dimension_out, lwe_dimension_in, base_log_bsk,
+        level_count_bsk, number_of_samples, 1, max_shared_memory);

     // Add alpha where alpha = delta*2^{bit_idx-1} to end up with an encryption
     // of 0 if the extracted bit was 0 and 1 in the other case
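A side note on the assertion style seen in both files above: `assert(("message", condition))` relies on the comma operator, which evaluates and discards the string literal, so the text only survives because the whole failing expression is stringified into the abort message. A minimal sketch of the same idiom behind a named macro — the macro and function below are illustrative, not part of this patch:

```cpp
// Hypothetical wrapper, not library code: same comma-operator trick as the
// inline asserts above, with the message kept next to the condition.
#include <cassert>
#include <cstdint>

#define GPU_ASSERT(cond, msg) assert(((void)(msg), (cond)))

void check_pbs_params(uint32_t polynomial_size) {
  // Mirrors the surviving checks; note there is no longer any
  // glwe_dimension == 1 restriction after this patch.
  GPU_ASSERT(polynomial_size == 512 || polynomial_size == 1024 ||
                 polynomial_size == 2048 || polynomial_size == 4096 ||
                 polynomial_size == 8192,
             "polynomial_size should be one of 512, 1024, 2048, 4096, 8192");
}
```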
diff --git a/src/bootstrap_low_latency.cu b/src/bootstrap_low_latency.cu
index 246963314..8651b2456 100644
--- a/src/bootstrap_low_latency.cu
+++ b/src/bootstrap_low_latency.cu
@@ -16,8 +16,6 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
   assert(("Error (GPU low latency PBS): base log should be <= 32",
           base_log <= 32));
-  assert(("Error (GPU low latency PBS): glwe_dimension should be equal to 1",
-          glwe_dimension == 1));
   assert(("Error (GPU low latency PBS): polynomial size should be one of 512, "
          "1024, 2048, 4096, 8192",
          polynomial_size == 512 || polynomial_size == 1024 ||
@@ -39,36 +37,41 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
     host_bootstrap_low_latency<uint32_t, Degree<512>>(
         v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
         (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
-        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
-        level_count, num_samples, num_lut_vectors, max_shared_memory);
+        (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
+        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
+        max_shared_memory);
     break;
   case 1024:
     host_bootstrap_low_latency<uint32_t, Degree<1024>>(
         v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
         (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
-        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
-        level_count, num_samples, num_lut_vectors, max_shared_memory);
+        (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
+        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
+        max_shared_memory);
     break;
   case 2048:
     host_bootstrap_low_latency<uint32_t, Degree<2048>>(
         v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
         (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
-        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
-        level_count, num_samples, num_lut_vectors, max_shared_memory);
+        (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
+        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
+        max_shared_memory);
     break;
   case 4096:
     host_bootstrap_low_latency<uint32_t, Degree<4096>>(
         v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
         (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
-        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
-        level_count, num_samples, num_lut_vectors, max_shared_memory);
+        (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
+        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
+        max_shared_memory);
     break;
   case 8192:
     host_bootstrap_low_latency<uint32_t, Degree<8192>>(
         v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
         (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
-        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
-        level_count, num_samples, num_lut_vectors, max_shared_memory);
+        (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
+        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
+        max_shared_memory);
     break;
   default:
     break;
@@ -160,8 +163,6 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
   assert(("Error (GPU low latency PBS): base log should be <= 64",
           base_log <= 64));
-  assert(("Error (GPU low latency PBS): glwe_dimension should be equal to 1",
-          glwe_dimension == 1));
   assert(("Error (GPU low latency PBS): polynomial size should be one of 512, "
          "1024, 2048, 4096, 8192",
          polynomial_size == 512 || polynomial_size == 1024 ||
@@ -183,36 +184,41 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
     host_bootstrap_low_latency<uint64_t, Degree<512>>(
         v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
         (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
-        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
-        level_count, num_samples, num_lut_vectors, max_shared_memory);
+        (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
+        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
+        max_shared_memory);
     break;
   case 1024:
     host_bootstrap_low_latency<uint64_t, Degree<1024>>(
         v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
         (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
-        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
-        level_count, num_samples, num_lut_vectors, max_shared_memory);
+        (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
+        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
+        max_shared_memory);
     break;
   case 2048:
     host_bootstrap_low_latency<uint64_t, Degree<2048>>(
         v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
         (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
-        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
-        level_count, num_samples, num_lut_vectors, max_shared_memory);
+        (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
+        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
+        max_shared_memory);
     break;
   case 4096:
     host_bootstrap_low_latency<uint64_t, Degree<4096>>(
         v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
         (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
-        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
-        level_count, num_samples, num_lut_vectors, max_shared_memory);
+        (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
+        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
+        max_shared_memory);
     break;
   case 8192:
     host_bootstrap_low_latency<uint64_t, Degree<8192>>(
         v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
         (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
-        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
-        level_count, num_samples, num_lut_vectors, max_shared_memory);
+        (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
+        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
+        max_shared_memory);
     break;
   default:
     break;
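The host-side change that pairs with these dispatch updates is in the scratch allocation: instead of one mask buffer and one body buffer, `host_bootstrap_low_latency` now allocates a single FFT join buffer scaled by `glwe_dimension + 1` (see the .cuh diff below). A back-of-the-envelope helper for its size, assuming the layout in this patch — the helper name is mine, not the library's:

```cpp
// Sketch (not library code): byte count of the buffer_fft scratch that
// host_bootstrap_low_latency allocates after this patch. Each sample needs
// level_count * (glwe_dimension + 1) polynomials in the FFT domain, i.e.
// polynomial_size / 2 complex coefficients (one double2, 16 bytes, each).
#include <cstddef>
#include <cstdint>

size_t join_buffer_bytes(uint32_t glwe_dimension, uint32_t level_count,
                         uint32_t polynomial_size, uint32_t num_samples) {
  size_t per_poly = (polynomial_size / 2) * 2 * sizeof(double); // sizeof(double2)
  return (size_t)num_samples * level_count * (glwe_dimension + 1) * per_poly;
}
// For k = 1, level_count = 2, N = 1024 and 100 samples this is
// 100 * 2 * 2 * 512 * 16 B = 3,276,800 B -- exactly the size of the old
// mask + body buffer pair, so k = 1 keeps its memory footprint.
```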
diff --git a/src/bootstrap_low_latency.cuh b/src/bootstrap_low_latency.cuh
index 693884cf2..ac24ba474 100644
--- a/src/bootstrap_low_latency.cuh
+++ b/src/bootstrap_low_latency.cuh
@@ -25,11 +25,11 @@ using namespace cooperative_groups;
 namespace cg = cooperative_groups;

 template <typename Torus, class params>
-__device__ void
-mul_ggsw_glwe(Torus *accumulator, double2 *fft, double2 *mask_join_buffer,
-              double2 *body_join_buffer, double2 *bootstrapping_key,
-              int polynomial_size, int level_count, int iteration,
-              grid_group &grid) {
+__device__ void mul_ggsw_glwe(Torus *accumulator, double2 *fft,
+                              double2 *join_buffer, double2 *bootstrapping_key,
+                              int polynomial_size, uint32_t glwe_dimension,
+                              int level_count, int iteration,
+                              grid_group &grid) {

   // Switch to the FFT space
   NSMFFT_direct<HalfDegree<params>>(fft);
@@ -40,52 +40,50 @@ mul_ggsw_glwe(Torus *accumulator, double2 *fft, double2 *mask_join_buffer,
   // this function, so we end up getting the lines of the bootstrapping key
   // needed to perform the external product in this block (corresponding to
   // the same decomposition level)
+  auto bsk_slice = get_ith_mask_kth_block(
+      bootstrapping_key, iteration, blockIdx.y, blockIdx.x, polynomial_size,
+      glwe_dimension, level_count);
-  auto bsk_mask_slice =
-      get_ith_mask_kth_block(bootstrapping_key, iteration, blockIdx.y,
-                             blockIdx.x, polynomial_size, 1, level_count);
-  auto bsk_body_slice =
-      get_ith_body_kth_block(bootstrapping_key, iteration, blockIdx.y,
-                             blockIdx.x, polynomial_size, 1, level_count);
+  // Selects all GLWEs in a particular decomposition level
+  auto level_join_buffer =
+      join_buffer + blockIdx.x * (glwe_dimension + 1) * params::degree / 2;

   // Perform the matrix multiplication between the GGSW and the GLWE,
   // each block operating on a single level for mask and body
-  auto first_processed_bsk =
-      (blockIdx.y == 0) ? bsk_mask_slice : bsk_body_slice;
-  auto second_processed_bsk =
-      (blockIdx.y == 0) ? bsk_body_slice : bsk_mask_slice;
-
-  auto first_processed_acc =
-      (blockIdx.y == 0) ? &mask_join_buffer[params::degree / 2 * blockIdx.x]
-                        : &body_join_buffer[params::degree / 2 * blockIdx.x];
-  auto second_processed_acc =
-      (blockIdx.y == 0) ? &body_join_buffer[params::degree / 2 * blockIdx.x]
-                        : &mask_join_buffer[params::degree / 2 * blockIdx.x];
+  // The first product is used to initialize level_join_buffer
+  auto bsk_poly = bsk_slice + blockIdx.y * params::degree / 2;
+  auto buffer_slice = level_join_buffer + blockIdx.y * params::degree / 2;

   int tid = threadIdx.x;
-
-  // first product
   for (int i = 0; i < params::opt / 2; i++) {
-    first_processed_acc[tid] = fft[tid] * first_processed_bsk[tid];
+    buffer_slice[tid] = fft[tid] * bsk_poly[tid];
     tid += params::degree / params::opt;
   }

   grid.sync();
-  tid = threadIdx.x;
-  // second product
-  for (int i = 0; i < params::opt / 2; i++) {
-    second_processed_acc[tid] += fft[tid] * second_processed_bsk[tid];
-    tid += params::degree / params::opt;
+
+  // Continues multiplying fft by every polynomial in that particular bsk level
+  // Each y-block accumulates in a different polynomial at each iteration
+  for (int j = 1; j < (glwe_dimension + 1); j++) {
+    int idx = (j + blockIdx.y) % (glwe_dimension + 1);
+
+    auto bsk_poly = bsk_slice + idx * params::degree / 2;
+    auto buffer_slice = level_join_buffer + idx * params::degree / 2;
+
+    int tid = threadIdx.x;
+    for (int i = 0; i < params::opt / 2; i++) {
+      buffer_slice[tid] += fft[tid] * bsk_poly[tid];
+      tid += params::degree / params::opt;
+    }
+    grid.sync();
   }

   // -----------------------------------------------------------------
+  // All blocks are synchronized here; after this sync, level_join_buffer has
+  // the values needed from every other block

-  // All blocks are synchronized here; after this sync, *_join_buffer has the
-  // values needed from every other block
-  grid.sync();
-
-  auto src_acc = (blockIdx.y == 0) ? mask_join_buffer : body_join_buffer;
+  auto src_acc = join_buffer + blockIdx.y * params::degree / 2;

   // copy first product into fft buffer
   tid = threadIdx.x;
@@ -97,7 +95,7 @@ mul_ggsw_glwe(Torus *accumulator, double2 *fft, double2 *mask_join_buffer,

   // accumulate rest of the products into fft buffer
   for (int l = 1; l < gridDim.x; l++) {
-    auto cur_src_acc = &src_acc[l * params::degree / 2];
+    auto cur_src_acc = &src_acc[l * (glwe_dimension + 1) * params::degree / 2];
     tid = threadIdx.x;
     for (int i = 0; i < params::opt / 2; i++) {
       fft[tid] += cur_src_acc[tid];
@@ -121,19 +119,22 @@ template <typename Torus, class params, sharedMemDegree SMD>
 /*
  * Kernel launched by the low latency version of the
  * bootstrapping, that uses cooperative groups
- * lwe_array_out vector of output lwe s, with length
- * (polynomial_size+1)*num_samples lut_vector - vector of look up tables with
- * length polynomial_size * num_samples lut_vector_indexes - mapping between
- * lwe_array_in and lut_vector lwe_array_in
- * - vector of lwe inputs with length (lwe_dimension + 1) * num_samples
+ * - lwe_array_out: vector of output LWEs, with length
+ *   (glwe_dimension * polynomial_size + 1) * num_samples
+ * - lut_vector: vector of look up tables with
+ *   length (glwe_dimension + 1) * polynomial_size * num_samples
+ * - lut_vector_indexes: mapping between lwe_array_in and lut_vector
+ * - lwe_array_in: vector of LWE inputs with length (lwe_dimension + 1) *
+ *   num_samples
+ *
+ * Each y-block computes one element of the lwe_array_out.
  */
 __global__ void device_bootstrap_low_latency(
     Torus *lwe_array_out, Torus *lut_vector, Torus *lwe_array_in,
-    double2 *bootstrapping_key, double2 *mask_join_buffer,
-    double2 *body_join_buffer, uint32_t lwe_dimension, uint32_t polynomial_size,
-    uint32_t base_log, uint32_t level_count, char *device_mem,
-    int device_memory_size_per_block) {
+    double2 *bootstrapping_key, double2 *join_buffer, uint32_t lwe_dimension,
+    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
+    char *device_mem, int device_memory_size_per_block) {

   grid_group grid = this_grid();
@@ -144,6 +145,7 @@ __global__ void device_bootstrap_low_latency(
   char *selected_memory;
   int block_index = blockIdx.x + blockIdx.y * gridDim.x +
                     blockIdx.z * gridDim.x * gridDim.y;
+  uint32_t glwe_dimension = gridDim.y - 1;

   if constexpr (SMD == FULLSM)
     selected_memory = sharedmem;
@@ -163,12 +165,12 @@ __global__ void device_bootstrap_low_latency(
   // this block is operating, in the case of batch bootstraps
   auto block_lwe_array_in = &lwe_array_in[blockIdx.z * (lwe_dimension + 1)];

-  auto block_lut_vector = &lut_vector[blockIdx.z * params::degree * 2];
+  auto block_lut_vector =
+      &lut_vector[blockIdx.z * params::degree * (glwe_dimension + 1)];

-  auto block_mask_join_buffer =
-      &mask_join_buffer[blockIdx.z * level_count * params::degree / 2];
-  auto block_body_join_buffer =
-      &body_join_buffer[blockIdx.z * level_count * params::degree / 2];
+  auto block_join_buffer =
+      &join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
+                   params::degree / 2];

   // Since the space is L1 cache is small, we use the same memory location for
   // the rotated accumulator and the fft accumulator, since we know that the
@@ -179,15 +181,10 @@ __global__ void device_bootstrap_low_latency(
   rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
                         2 * params::degree);

-  if (blockIdx.y == 0) {
-    divide_by_monomial_negacyclic_inplace<Torus, params::opt,
-                                          params::degree / params::opt>(
-        accumulator, block_lut_vector, b_hat, false, 1);
-  } else {
-    divide_by_monomial_negacyclic_inplace<Torus, params::opt,
-                                          params::degree / params::opt>(
-        accumulator, &block_lut_vector[params::degree], b_hat, false, 1);
-  }
+  divide_by_monomial_negacyclic_inplace<Torus, params::opt,
+                                        params::degree / params::opt>(
+      accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
+      false);

   for (int i = 0; i < lwe_dimension; i++) {
     synchronize_threads_in_block();
@@ -200,13 +197,13 @@ __global__ void device_bootstrap_low_latency(
     // Perform ACC * (X^ä - 1)
     multiply_by_monomial_negacyclic_and_sub_polynomial<
         Torus, params::opt, params::degree / params::opt>(
-        accumulator, accumulator_rotated, a_hat, 1);
+        accumulator, accumulator_rotated, a_hat);

     // Perform a rounding to increase the accuracy of the
     // bootstrapped ciphertext
     round_to_closest_multiple_inplace<Torus, params::opt,
                                       params::degree / params::opt>(
-        accumulator_rotated, base_log, level_count, 1);
+        accumulator_rotated, base_log, level_count);

     synchronize_threads_in_block();
@@ -214,29 +211,33 @@ __global__ void device_bootstrap_low_latency(
     // decomposition, for the mask and the body (so block 0 will have the
     // accumulator decomposed at level 0, 1 at 1, etc.)
     GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
-                                           accumulator_rotated, 1);
+                                           accumulator_rotated);
     gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);

     // We are using the same memory space for accumulator_fft and
     // accumulator_rotated, so we need to synchronize here to make sure they
     // don't modify the same memory space at the same time
     synchronize_threads_in_block();
+
     // Perform G^-1(ACC) * GGSW -> GLWE
-    mul_ggsw_glwe<Torus, params>(accumulator, accumulator_fft,
-                                 block_mask_join_buffer, block_body_join_buffer,
-                                 bootstrapping_key, polynomial_size,
-                                 level_count, i, grid);
+    mul_ggsw_glwe<Torus, params>(
+        accumulator, accumulator_fft, block_join_buffer, bootstrapping_key,
+        polynomial_size, glwe_dimension, level_count, i, grid);
+
+    synchronize_threads_in_block();
   }

-  auto block_lwe_array_out = &lwe_array_out[blockIdx.z * (polynomial_size + 1)];
+  auto block_lwe_array_out =
+      &lwe_array_out[blockIdx.z * (glwe_dimension * polynomial_size + 1) +
+                     blockIdx.y * polynomial_size];

-  if (blockIdx.x == 0 && blockIdx.y == 0) {
+  if (blockIdx.x == 0 && blockIdx.y < glwe_dimension) {
     // Perform a sample extract. At this point, all blocks have the result, but
     // we do the computation at block 0 to avoid waiting for extra blocks, in
     // case they're not synchronized
-    sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator, 1);
-  } else if (blockIdx.x == 0 && blockIdx.y == 1) {
-    block_lwe_array_out[params::degree] = accumulator[0];
+    sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
+  } else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
+    sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
   }
 }
@@ -248,7 +249,7 @@ template <typename Torus, class params>
 __host__ void host_bootstrap_low_latency(
     void *v_stream, uint32_t gpu_index, Torus *lwe_array_out, Torus *lut_vector,
     uint32_t *lut_vector_indexes, Torus *lwe_array_in,
-    double2 *bootstrapping_key, uint32_t lwe_dimension,
+    double2 *bootstrapping_key, uint32_t glwe_dimension, uint32_t lwe_dimension,
     uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
     uint32_t input_lwe_ciphertext_count, uint32_t num_lut_vectors,
     uint32_t max_shared_memory) {
@@ -258,10 +259,8 @@ __host__ void host_bootstrap_low_latency(

   int buffer_size_per_gpu = level_count * input_lwe_ciphertext_count *
                             polynomial_size / 2 * sizeof(double2);
-  double2 *mask_buffer_fft =
-      (double2 *)cuda_malloc_async(buffer_size_per_gpu, stream, gpu_index);
-  double2 *body_buffer_fft =
-      (double2 *)cuda_malloc_async(buffer_size_per_gpu, stream, gpu_index);
+  double2 *buffer_fft = (double2 *)cuda_malloc_async(
+      (glwe_dimension + 1) * buffer_size_per_gpu, stream, gpu_index);

   // With SM each block corresponds to either the mask or body, no need to
   // duplicate data for each
@@ -279,35 +278,34 @@ __host__ void host_bootstrap_low_latency(
   char *d_mem;

   int thds = polynomial_size / params::opt;
-  dim3 grid(level_count, 2, input_lwe_ciphertext_count);
+  dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count);

-  void *kernel_args[12];
+  void *kernel_args[11];
   kernel_args[0] = &lwe_array_out;
   kernel_args[1] = &lut_vector;
   kernel_args[2] = &lwe_array_in;
   kernel_args[3] = &bootstrapping_key;
-  kernel_args[4] = &mask_buffer_fft;
-  kernel_args[5] = &body_buffer_fft;
-  kernel_args[6] = &lwe_dimension;
-  kernel_args[7] = &polynomial_size;
-  kernel_args[8] = &base_log;
-  kernel_args[9] = &level_count;
-  kernel_args[10] = &d_mem;
+  kernel_args[4] = &buffer_fft;
+  kernel_args[5] = &lwe_dimension;
+  kernel_args[6] = &polynomial_size;
+  kernel_args[7] = &base_log;
+  kernel_args[8] = &level_count;
+  kernel_args[9] = &d_mem;

   if (max_shared_memory < SM_PART) {
-    kernel_args[11] = &DM_FULL;
+    kernel_args[10] = &DM_FULL;
     check_cuda_error(cudaGetLastError());
     d_mem = (char *)cuda_malloc_async(DM_FULL * input_lwe_ciphertext_count *
-                                          level_count * 2,
+                                          level_count * (glwe_dimension + 1),
                                       stream, gpu_index);
     check_cuda_error(cudaGetLastError());
     check_cuda_error(cudaLaunchCooperativeKernel(
         (void *)device_bootstrap_low_latency<Torus, params, NOSM>, grid, thds,
         (void **)kernel_args, 0, *stream));
   } else if (max_shared_memory < SM_FULL) {
-    kernel_args[11] = &DM_PART;
+    kernel_args[10] = &DM_PART;
     d_mem = (char *)cuda_malloc_async(DM_PART * input_lwe_ciphertext_count *
-                                          level_count * 2,
+                                          level_count * (glwe_dimension + 1),
                                       stream, gpu_index);
     check_cuda_error(cudaFuncSetAttribute(
         device_bootstrap_low_latency<Torus, params, PARTIALSM>,
@@ -322,7 +320,7 @@ __host__ void host_bootstrap_low_latency(
   } else {
     int DM_NONE = 0;
-    kernel_args[11] = &DM_NONE;
+    kernel_args[10] = &DM_NONE;
     d_mem = (char *)cuda_malloc_async(0, stream, gpu_index);
     check_cuda_error(cudaFuncSetAttribute(
         device_bootstrap_low_latency<Torus, params, FULLSM>,
@@ -337,8 +335,7 @@ __host__ void host_bootstrap_low_latency(
   check_cuda_error(cudaGetLastError());

   // Synchronize the streams before copying the result to lwe_array_out at the
   // right place
-  cuda_drop_async(mask_buffer_fft, stream, gpu_index);
-  cuda_drop_async(body_buffer_fft, stream, gpu_index);
+  cuda_drop_async(buffer_fft, stream, gpu_index);
   cuda_drop_async(d_mem, stream, gpu_index);
 }
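The subtle part of the rewritten `mul_ggsw_glwe` is the index rotation `(j + blockIdx.y) % (glwe_dimension + 1)`: between two `grid.sync()` calls, the k + 1 y-blocks of a decomposition level write to pairwise distinct polynomials of `level_join_buffer`, so the (k + 1)^2 GGSW-row-by-GLWE-polynomial products are accumulated without atomics. A cleartext check of that schedule (my own sketch, not code from the patch):

```cpp
// Verifies the round-robin write schedule used by mul_ggsw_glwe: at step j,
// y-block b targets polynomial (j + b) % (k + 1). Every step covers all
// targets exactly once, so accumulation between grid syncs is race-free.
#include <cassert>
#include <set>

int main() {
  const int k_plus_1 = 3; // glwe_dimension + 1, e.g. k = 2
  for (int j = 0; j < k_plus_1; j++) { // steps separated by grid.sync()
    std::set<int> targets;
    for (int b = 0; b < k_plus_1; b++) // one y-block per GGSW row
      targets.insert((j + b) % k_plus_1);
    assert((int)targets.size() == k_plus_1); // pairwise-disjoint writes
  }
  return 0;
}
```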
diff --git a/src/circuit_bootstrap.cu b/src/circuit_bootstrap.cu
index df3622ec7..417084a9c 100644
--- a/src/circuit_bootstrap.cu
+++ b/src/circuit_bootstrap.cu
@@ -13,8 +13,6 @@ void cuda_circuit_bootstrap_32(
     uint32_t level_bsk, uint32_t base_log_bsk, uint32_t level_pksk,
     uint32_t base_log_pksk, uint32_t level_cbs, uint32_t base_log_cbs,
     uint32_t number_of_samples, uint32_t max_shared_memory) {
-  assert(("Error (GPU circuit bootstrap): glwe_dimension should be equal to 1",
-          glwe_dimension == 1));
   assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
          "512, 1024, 2048, 4096, 8192",
          polynomial_size == 512 || polynomial_size == 1024 ||
@@ -137,8 +135,6 @@ void cuda_circuit_bootstrap_64(
     uint32_t level_bsk, uint32_t base_log_bsk, uint32_t level_pksk,
     uint32_t base_log_pksk, uint32_t level_cbs, uint32_t base_log_cbs,
     uint32_t number_of_samples, uint32_t max_shared_memory) {
-  assert(("Error (GPU circuit bootstrap): glwe_dimension should be equal to 1",
-          glwe_dimension == 1));
   assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
          "512, 1024, 2048, 4096, 8192",
          polynomial_size == 512 || polynomial_size == 1024 ||
diff --git a/src/crypto/gadget.cuh b/src/crypto/gadget.cuh
index 863288884..51fe9d2eb 100644
--- a/src/crypto/gadget.cuh
+++ b/src/crypto/gadget.cuh
@@ -12,6 +12,7 @@
 * arbitrary amount of levels by using decompose_and_compress_level().
 *
 * This class always decomposes the entire set of num_poly polynomials.
+ * By default, it works on a single polynomial.
 */
 #pragma once
 template <typename T, class params> class GadgetMatrix {
@@ -28,7 +29,7 @@ private:

 public:
   __device__ GadgetMatrix(uint32_t base_log, uint32_t level_count, T *state,
-                          uint32_t num_poly)
+                          uint32_t num_poly = 1)
       : base_log(base_log), level_count(level_count), num_poly(num_poly),
         state(state) {
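`GadgetMatrix`, with its new `num_poly = 1` default, still performs the usual balanced signed decomposition underneath. For reference, a scalar model of what one coefficient goes through — closest-multiple rounding as in `round_to_closest_multiple_inplace`, then one signed digit per level. This is my own sketch of the standard technique, not code from the repository:

```cpp
// Scalar sketch of gadget decomposition: round a 64-bit torus value to the
// closest multiple of 2^(64 - B*L), then emit L signed digits in
// [-2^(B-1), 2^(B-1)) for base 2^B. All arithmetic is mod 2^64.
#include <cassert>
#include <cstdint>
#include <vector>

std::vector<int64_t> gadget_decompose(uint64_t v, int base_log, int levels) {
  const int shift = 64 - base_log * levels;
  uint64_t state = (v + (1ull << (shift - 1))) >> shift; // closest multiple
  const uint64_t base = 1ull << base_log, half = base / 2;
  std::vector<int64_t> digits(levels);
  uint64_t carry = 0;
  for (int i = levels - 1; i >= 0; --i) { // least significant level first
    uint64_t d = (state & (base - 1)) + carry;
    state >>= base_log;
    carry = d >= half;                                    // balance the digit
    digits[i] = (int64_t)d - (carry ? (int64_t)base : 0); // in [-B/2, B/2)
  }
  return digits; // digits[i] scales 2^(64 - B*(i+1))
}

int main() {
  const int B = 4, L = 3;
  uint64_t v = 0x123456789abcdef0ull;
  auto d = gadget_decompose(v, B, L);
  uint64_t recon = 0;
  for (int i = 0; i < L; ++i)
    recon += (uint64_t)d[i] << (64 - B * (i + 1));
  uint64_t err = recon - v; // mod 2^64, so check both directions
  uint64_t bound = 1ull << (64 - B * L - 1);
  assert(err <= bound || (0 - err) <= bound); // only rounding error remains
  return 0;
}
```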
diff --git a/src/polynomial/functions.cuh b/src/polynomial/functions.cuh
index 1076bf755..dc86be2e5 100644
--- a/src/polynomial/functions.cuh
+++ b/src/polynomial/functions.cuh
@@ -61,11 +61,13 @@ __device__ void add_polynomial_inplace_low_lat(T *source, T *dst, int p_id) {
 * Performs acc = acc * (X^ä + 1) if zeroAcc = false
 * Performs acc = 0 if zeroAcc
 * takes single buffer and calculates inplace.
+ *
+ * By default, it works on a single polynomial.
 */
 template <typename T, int elems_per_thread, int block_size>
 __device__ void divide_by_monomial_negacyclic_inplace(T *accumulator, T *input,
                                                       uint32_t j, bool zeroAcc,
-                                                      uint32_t num_poly) {
+                                                      uint32_t num_poly = 1) {
   constexpr int degree = block_size * elems_per_thread;
   for (int z = 0; z < num_poly; z++) {
     T *accumulator_slice = (T *)accumulator + (ptrdiff_t)(z * degree);
@@ -105,10 +107,12 @@ __device__ void divide_by_monomial_negacyclic_inplace(T *accumulator, T *input,
 *
 * Performs result_acc = acc * (X^ä - 1) - acc
 * takes single buffer as input and returns a single rotated buffer
+ *
+ * By default, it works on a single polynomial.
 */
 template <typename T, int elems_per_thread, int block_size>
 __device__ void multiply_by_monomial_negacyclic_and_sub_polynomial(
-    T *acc, T *result_acc, uint32_t j, uint32_t num_poly) {
+    T *acc, T *result_acc, uint32_t j, uint32_t num_poly = 1) {
   constexpr int degree = block_size * elems_per_thread;
   for (int z = 0; z < num_poly; z++) {
     T *acc_slice = (T *)acc + (ptrdiff_t)(z * degree);
@@ -138,11 +142,13 @@ __device__ void multiply_by_monomial_negacyclic_and_sub_polynomial(
 /*
 * Receives num_poly concatenated polynomials of type T. For each performs a
 * rounding to increase accuracy of the PBS. Calculates inplace.
+ *
+ * By default, it works on a single polynomial.
 */
 template <typename T, int elems_per_thread, int block_size>
 __device__ void round_to_closest_multiple_inplace(T *rotated_acc, int base_log,
                                                   int level_count,
-                                                  uint32_t num_poly) {
+                                                  uint32_t num_poly = 1) {
   constexpr int degree = block_size * elems_per_thread;
   for (int z = 0; z < num_poly; z++) {
     T *rotated_acc_slice = (T *)rotated_acc + (ptrdiff_t)(z * degree);
@@ -192,20 +198,21 @@ __device__ void add_to_torus(double2 *m_values, Torus *result) {
   }
 }

-// Extracts the body of a GLWE with dimension glwe_dimension
+// Extracts the body of a GLWE.
+// k is the offset to find the body element / polynomial in the lwe_array_out /
+// accumulator
 template <typename Torus, class params>
 __device__ void sample_extract_body(Torus *lwe_array_out, Torus *accumulator,
-                                    uint32_t glwe_dimension) {
+                                    uint32_t k) {
   // Set first coefficient of the accumulator as the body of the LWE sample
-  lwe_array_out[glwe_dimension * params::degree] =
-      accumulator[glwe_dimension * params::degree];
+  lwe_array_out[k * params::degree] = accumulator[k * params::degree];
 }

-// Extracts the mask of a GLWE with dimension glwe_dimension
+// Extracts the mask from num_poly polynomials individually
 template <typename Torus, class params>
 __device__ void sample_extract_mask(Torus *lwe_array_out, Torus *accumulator,
-                                    uint32_t glwe_dimension) {
-  for (int z = 0; z < glwe_dimension; z++) {
+                                    uint32_t num_poly = 1) {
+  for (int z = 0; z < num_poly; z++) {
     Torus *lwe_array_out_slice =
         (Torus *)lwe_array_out + (ptrdiff_t)(z * params::degree);
     Torus *accumulator_slice =
diff --git a/src/vertical_packing.cu b/src/vertical_packing.cu
index 9f148f9fd..6c6a9fb2d 100644
--- a/src/vertical_packing.cu
+++ b/src/vertical_packing.cu
@@ -17,8 +17,6 @@ void cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
          polynomial_size == 2048 || polynomial_size == 4096 ||
          polynomial_size == 8192));
   // For larger k we will need to adjust the mask size
-  assert(("Error (GPU Cmux tree): glwe_dimension should be equal to 1",
-          glwe_dimension == 1));
   assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should "
          "be >= 1 ",
          r >= 1));
@@ -99,8 +97,6 @@ void cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
          polynomial_size == 2048 || polynomial_size == 4096 ||
          polynomial_size == 8192));
   // For larger k we will need to adjust the mask size
-  assert(("Error (GPU Cmux tree): glwe_dimension should be equal to 1",
-          glwe_dimension == 1));
   assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should "
          "be >= 1 ",
          r >= 1));
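For context on the `r >= 1` check kept above: the cmux tree consumes 2^r lookup-table polynomials and r selector bits (GGSW ciphertexts on the GPU), halving the candidate list at each level. A cleartext analogue of that data flow, with plain integer polynomials standing in for ciphertexts — my own sketch, not library code:

```cpp
// Cleartext cmux tree: r LSB-first selector bits pick one of 2^r LUT
// polynomials by folding the list pairwise, level by level. On the GPU each
// fold is an encrypted CMUX; the selection logic is the same.
#include <cassert>
#include <cstdint>
#include <vector>

using Poly = std::vector<uint64_t>;

Poly cmux_tree(std::vector<Poly> luts, const std::vector<int> &bits) {
  assert(luts.size() == (size_t)1 << bits.size()); // needs 2^r inputs, r >= 1
  for (int b : bits) { // one tree level per selector bit
    std::vector<Poly> next;
    for (size_t i = 0; i + 1 < luts.size(); i += 2)
      next.push_back(b ? luts[i + 1] : luts[i]); // CMUX(b, hi, lo)
    luts = std::move(next);
  }
  return luts[0]; // the LUT whose index has binary digits `bits`
}
```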
assert(("Error (GPU circuit bootstrap): polynomial_size should be one of " "512, 1024, 2048, 4096, 8192", polynomial_size == 512 || polynomial_size == 1024 || @@ -149,8 +147,6 @@ void cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index, void *lwe_array_out, uint32_t number_of_bits_of_message_including_padding, uint32_t number_of_bits_to_extract, uint32_t number_of_inputs, uint32_t max_shared_memory) { - assert(("Error (GPU WOP PBS): glwe_dimension should be equal to 1", - glwe_dimension == 1)); assert(("Error (GPU WOP PBS): polynomial_size should be one of " "512, 1024, 2048, 4096, 8192", polynomial_size == 512 || polynomial_size == 1024 || diff --git a/src/wop_bootstrap.cuh b/src/wop_bootstrap.cuh index 9a8bb9acb..1760cad22 100644 --- a/src/wop_bootstrap.cuh +++ b/src/wop_bootstrap.cuh @@ -174,8 +174,8 @@ __host__ void host_wop_pbs( lwe_array_in_buffer, lwe_array_in_shifted_buffer, lwe_array_out_ks_buffer, lwe_array_out_pbs_buffer, lut_pbs, lut_vector_indexes, ksk, fourier_bsk, number_of_bits_to_extract, delta_log, polynomial_size, lwe_dimension, - base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk, - number_of_inputs, max_shared_memory); + glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk, + level_count_ksk, number_of_inputs, max_shared_memory); check_cuda_error(cudaGetLastError()); cuda_drop_async(lut_pbs, stream, gpu_index); cuda_drop_async(lut_vector_indexes, stream, gpu_index);