diff --git a/src/bootstrap_amortized.cu b/src/bootstrap_amortized.cu index 612f5c33b..ead137428 100644 --- a/src/bootstrap_amortized.cu +++ b/src/bootstrap_amortized.cu @@ -13,8 +13,6 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32( assert( ("Error (GPU amortized PBS): base log should be <= 32", base_log <= 32)); - assert(("Error (GPU amortized PBS): glwe_dimension should be equal to 1", - glwe_dimension == 1)); assert(("Error (GPU amortized PBS): polynomial size should be one of 512, " "1024, 2048, 4096, 8192", polynomial_size == 512 || polynomial_size == 1024 || @@ -26,36 +24,41 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32( host_bootstrap_amortized>( v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, - (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, - level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory); + (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension, + polynomial_size, base_log, level_count, num_samples, num_lut_vectors, + lwe_idx, max_shared_memory); break; case 1024: host_bootstrap_amortized>( v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, - (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, - level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory); + (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension, + polynomial_size, base_log, level_count, num_samples, num_lut_vectors, + lwe_idx, max_shared_memory); break; case 2048: host_bootstrap_amortized>( v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, - (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, - level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory); + (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension, + polynomial_size, base_log, level_count, num_samples, num_lut_vectors, + lwe_idx, max_shared_memory); break; case 4096: host_bootstrap_amortized>( v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, - (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, - level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory); + (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension, + polynomial_size, base_log, level_count, num_samples, num_lut_vectors, + lwe_idx, max_shared_memory); break; case 8192: host_bootstrap_amortized>( v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, - (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, - level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory); + (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension, + polynomial_size, base_log, level_count, num_samples, num_lut_vectors, + lwe_idx, max_shared_memory); break; default: break; @@ -136,8 +139,6 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64( assert( ("Error (GPU amortized PBS): base log should be <= 64", base_log <= 64)); - assert(("Error (GPU amortized PBS): glwe_dimension should be equal to 1", - glwe_dimension == 1)); assert(("Error (GPU amortized PBS): polynomial size should be one of 512, " "1024, 2048, 4096, 8192", polynomial_size == 512 || polynomial_size == 1024 || @@ -149,36 +150,41 @@ void 
cuda_bootstrap_amortized_lwe_ciphertext_vector_64( host_bootstrap_amortized>( v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, - (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, - level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory); + (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension, + polynomial_size, base_log, level_count, num_samples, num_lut_vectors, + lwe_idx, max_shared_memory); break; case 1024: host_bootstrap_amortized>( v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, - (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, - level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory); + (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension, + polynomial_size, base_log, level_count, num_samples, num_lut_vectors, + lwe_idx, max_shared_memory); break; case 2048: host_bootstrap_amortized>( v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, - (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, - level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory); + (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension, + polynomial_size, base_log, level_count, num_samples, num_lut_vectors, + lwe_idx, max_shared_memory); break; case 4096: host_bootstrap_amortized>( v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, - (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, - level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory); + (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension, + polynomial_size, base_log, level_count, num_samples, num_lut_vectors, + lwe_idx, max_shared_memory); break; case 8192: host_bootstrap_amortized>( v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector, (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, - (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log, - level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory); + (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension, + polynomial_size, base_log, level_count, num_samples, num_lut_vectors, + lwe_idx, max_shared_memory); break; default: break; diff --git a/src/bootstrap_amortized.cuh b/src/bootstrap_amortized.cuh index c937f3c32..4e13dc1c4 100644 --- a/src/bootstrap_amortized.cuh +++ b/src/bootstrap_amortized.cuh @@ -53,8 +53,8 @@ template __global__ void device_bootstrap_amortized( Torus *lwe_array_out, Torus *lut_vector, uint32_t *lut_vector_indexes, Torus *lwe_array_in, double2 *bootstrapping_key, char *device_mem, - uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t lwe_idx, + uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, + uint32_t base_log, uint32_t level_count, uint32_t lwe_idx, size_t device_memory_size_per_sample) { // We use shared memory for the polynomials that are used often during the // bootstrap, since shared memory is kept in L1 cache and accessing it is @@ -69,26 +69,22 @@ __global__ void device_bootstrap_amortized( // For GPU bootstrapping the GLWE dimension is hard-set to 1: there is only // one mask polynomial and 1 body to handle. 
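// Illustration (host-side sketch, not part of the device code below): the
// accumulator becomes a single buffer of glwe_dimension + 1 concatenated
// polynomials instead of separate mask/body buffers. The example values
// (64-bit torus, glwe_dimension = 1, polynomial_size = 1024) are assumptions
// chosen only to make the offsets concrete; an FFT coefficient is a double2,
// i.e. two doubles. The layout shown is the one used when everything lives in
// a single memory space (FULLSM / NOSM).
#include <cstddef>
#include <cstdint>
#include <cstdio>

int main() {
  const size_t torus_bytes = sizeof(uint64_t);      // Torus
  const size_t fft_coef_bytes = 2 * sizeof(double); // one double2
  const size_t glwe_dimension = 1;                  // k
  const size_t polynomial_size = 1024;              // N
  const size_t glwe_size = glwe_dimension + 1;      // polynomials per GLWE

  const size_t acc = glwe_size * polynomial_size * torus_bytes;
  const size_t fft = glwe_size * (polynomial_size / 2) * fft_coef_bytes;

  printf("accumulator         : offset %6zu, size %6zu bytes\n", (size_t)0, acc);
  printf("accumulator_rotated : offset %6zu, size %6zu bytes\n", acc, acc);
  printf("res_fft             : offset %6zu, size %6zu bytes\n", 2 * acc, fft);
  printf("accumulator_fft     : offset %6zu, size %6zu bytes\n", 2 * acc + fft, fft);
  printf("total (SM_FULL)     : %zu bytes\n", 2 * acc + 2 * fft);
  return 0;
}
// With these example values the four regions total 64 KB, which is why
// host_bootstrap_amortized falls back to the PARTIALSM / NOSM variants when
// max_shared_memory is smaller than SM_FULL.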
- Torus *accumulator_mask = (Torus *)selected_memory; - Torus *accumulator_body = - (Torus *)accumulator_mask + (ptrdiff_t)polynomial_size; - Torus *accumulator_mask_rotated = - (Torus *)accumulator_body + (ptrdiff_t)polynomial_size; - Torus *accumulator_body_rotated = - (Torus *)accumulator_mask_rotated + (ptrdiff_t)polynomial_size; - double2 *mask_res_fft = (double2 *)accumulator_body_rotated + - polynomial_size / (sizeof(double2) / sizeof(Torus)); - double2 *body_res_fft = - (double2 *)mask_res_fft + (ptrdiff_t)polynomial_size / 2; + Torus *accumulator = (Torus *)selected_memory; + Torus *accumulator_rotated = + (Torus *)accumulator + + (ptrdiff_t)((glwe_dimension + 1) * polynomial_size); + double2 *res_fft = + (double2 *)accumulator_rotated + (glwe_dimension + 1) * polynomial_size / + (sizeof(double2) / sizeof(Torus)); double2 *accumulator_fft = (double2 *)sharedmem; if constexpr (SMD != PARTIALSM) - accumulator_fft = - (double2 *)body_res_fft + (ptrdiff_t)(polynomial_size / 2); + accumulator_fft = (double2 *)res_fft + + (ptrdiff_t)((glwe_dimension + 1) * polynomial_size / 2); auto block_lwe_array_in = &lwe_array_in[blockIdx.x * (lwe_dimension + 1)]; Torus *block_lut_vector = &lut_vector[lut_vector_indexes[lwe_idx + blockIdx.x] * params::degree * - 2]; + (glwe_dimension + 1)]; // Put "b", the body, in [0, 2N[ Torus b_hat = 0; @@ -97,11 +93,7 @@ __global__ void device_bootstrap_amortized( divide_by_monomial_negacyclic_inplace( - accumulator_mask, block_lut_vector, b_hat, false); - - divide_by_monomial_negacyclic_inplace( - accumulator_body, &block_lut_vector[params::degree], b_hat, false); + accumulator, block_lut_vector, b_hat, false, glwe_dimension + 1); // Loop over all the mask elements of the sample to accumulate // (X^a_i-1) multiplication, decomposition of the resulting polynomial @@ -118,11 +110,7 @@ __global__ void device_bootstrap_amortized( // Perform ACC * (X^ä - 1) multiply_by_monomial_negacyclic_and_sub_polynomial< Torus, params::opt, params::degree / params::opt>( - accumulator_mask, accumulator_mask_rotated, a_hat); - - multiply_by_monomial_negacyclic_and_sub_polynomial< - Torus, params::opt, params::degree / params::opt>( - accumulator_body, accumulator_body_rotated, a_hat); + accumulator, accumulator_rotated, a_hat, glwe_dimension + 1); synchronize_threads_in_block(); @@ -130,153 +118,122 @@ __global__ void device_bootstrap_amortized( // bootstrapped ciphertext round_to_closest_multiple_inplace( - accumulator_mask_rotated, base_log, level_count); + accumulator_rotated, base_log, level_count, glwe_dimension + 1); - round_to_closest_multiple_inplace( - accumulator_body_rotated, base_log, level_count); // Initialize the polynomial multiplication via FFT arrays // The polynomial multiplications happens at the block level // and each thread handles two or more coefficients int pos = threadIdx.x; - for (int j = 0; j < params::opt / 2; j++) { - mask_res_fft[pos].x = 0; - mask_res_fft[pos].y = 0; - body_res_fft[pos].x = 0; - body_res_fft[pos].y = 0; + for (int j = 0; j < (glwe_dimension + 1) * params::opt / 2; j++) { + res_fft[pos].x = 0; + res_fft[pos].y = 0; pos += params::degree / params::opt; } - GadgetMatrix gadget_mask(base_log, level_count, - accumulator_mask_rotated); - GadgetMatrix gadget_body(base_log, level_count, - accumulator_body_rotated); - + GadgetMatrix gadget(base_log, level_count, + accumulator_rotated, glwe_dimension + 1); // Now that the rotation is done, decompose the resulting polynomial // coefficients so as to multiply each decomposed level with the // 
corresponding part of the bootstrapping key for (int level = level_count - 1; level >= 0; level--) { + gadget.decompose_and_compress_next(accumulator_fft); + for (int i = 0; i < (glwe_dimension + 1); i++) { + auto accumulator_fft_slice = accumulator_fft + i * params::degree / 2; - gadget_mask.decompose_and_compress_next(accumulator_fft); - // Switch to the FFT space - NSMFFT_direct>(accumulator_fft); + // Switch to the FFT space + NSMFFT_direct>(accumulator_fft_slice); + synchronize_threads_in_block(); + + // Get the bootstrapping key piece necessary for the multiplication + // It is already in the Fourier domain + auto bsk_slice = get_ith_mask_kth_block(bootstrapping_key, iteration, i, + level, polynomial_size, + glwe_dimension, level_count); + synchronize_threads_in_block(); + + // Perform the coefficient-wise product with the two pieces of + // bootstrapping key + for (int j = 0; j < (glwe_dimension + 1); j++) { + auto bsk_poly = bsk_slice + j * params::degree / 2; + auto res_fft_poly = res_fft + j * params::degree / 2; + polynomial_product_accumulate_in_fourier_domain( + res_fft_poly, accumulator_fft_slice, bsk_poly); + } + } synchronize_threads_in_block(); - - // Get the bootstrapping key piece necessary for the multiplication - // It is already in the Fourier domain - auto bsk_mask_slice = - get_ith_mask_kth_block(bootstrapping_key, iteration, 0, level, - polynomial_size, 1, level_count); - auto bsk_body_slice = - get_ith_body_kth_block(bootstrapping_key, iteration, 0, level, - polynomial_size, 1, level_count); - - synchronize_threads_in_block(); - - // Perform the coefficient-wise product with the two pieces of - // bootstrapping key - polynomial_product_accumulate_in_fourier_domain( - mask_res_fft, accumulator_fft, bsk_mask_slice); - polynomial_product_accumulate_in_fourier_domain( - body_res_fft, accumulator_fft, bsk_body_slice); - - synchronize_threads_in_block(); - gadget_body.decompose_and_compress_next(accumulator_fft); - - // Now handle the polynomial multiplication for the body - // in the same way - - NSMFFT_direct>(accumulator_fft); - synchronize_threads_in_block(); - - auto bsk_mask_slice_2 = - get_ith_mask_kth_block(bootstrapping_key, iteration, 1, level, - polynomial_size, 1, level_count); - auto bsk_body_slice_2 = - get_ith_body_kth_block(bootstrapping_key, iteration, 1, level, - polynomial_size, 1, level_count); - - synchronize_threads_in_block(); - - polynomial_product_accumulate_in_fourier_domain( - mask_res_fft, accumulator_fft, bsk_mask_slice_2); - polynomial_product_accumulate_in_fourier_domain( - body_res_fft, accumulator_fft, bsk_body_slice_2); } // Come back to the coefficient representation if constexpr (SMD == FULLSM || SMD == NOSM) { synchronize_threads_in_block(); - NSMFFT_inverse>(mask_res_fft); - NSMFFT_inverse>(body_res_fft); - + for (int i = 0; i < (glwe_dimension + 1); i++) { + auto res_fft_slice = res_fft + i * params::degree / 2; + NSMFFT_inverse>(res_fft_slice); + } synchronize_threads_in_block(); - add_to_torus(mask_res_fft, accumulator_mask); - add_to_torus(body_res_fft, accumulator_body); + for (int i = 0; i < (glwe_dimension + 1); i++) { + auto accumulator_slice = accumulator + i * params::degree; + auto res_fft_slice = res_fft + i * params::degree / 2; + add_to_torus(res_fft_slice, accumulator_slice); + } synchronize_threads_in_block(); - } else { - int tid = threadIdx.x; #pragma unroll - for (int i = 0; i < params::opt / 2; i++) { - accumulator_fft[tid] = mask_res_fft[tid]; - tid = tid + params::degree / params::opt; + for (int i = 0; i < 
(glwe_dimension + 1); i++) { + auto accumulator_slice = accumulator + i * params::degree; + auto res_fft_slice = res_fft + i * params::degree / 2; + int tid = threadIdx.x; + for (int i = 0; i < params::opt / 2; i++) { + accumulator_fft[tid] = res_fft_slice[tid]; + tid = tid + params::degree / params::opt; + } + synchronize_threads_in_block(); + + NSMFFT_inverse>(accumulator_fft); + synchronize_threads_in_block(); + + add_to_torus(accumulator_fft, accumulator_slice); } synchronize_threads_in_block(); - - NSMFFT_inverse>(accumulator_fft); - synchronize_threads_in_block(); - - add_to_torus(accumulator_fft, accumulator_mask); - synchronize_threads_in_block(); - - tid = threadIdx.x; -#pragma unroll - for (int i = 0; i < params::opt / 2; i++) { - accumulator_fft[tid] = body_res_fft[tid]; - tid = tid + params::degree / params::opt; - } - synchronize_threads_in_block(); - - NSMFFT_inverse>(accumulator_fft); - synchronize_threads_in_block(); - - add_to_torus(accumulator_fft, accumulator_body); - synchronize_threads_in_block(); } } - auto block_lwe_array_out = &lwe_array_out[blockIdx.x * (polynomial_size + 1)]; + auto block_lwe_array_out = + &lwe_array_out[blockIdx.x * (glwe_dimension * polynomial_size + 1)]; // The blind rotation for this block is over // Now we can perform the sample extraction: for the body it's just // the resulting constant coefficient of the accumulator // For the mask it's more complicated - sample_extract_mask(block_lwe_array_out, accumulator_mask); - sample_extract_body(block_lwe_array_out, accumulator_body); + sample_extract_mask(block_lwe_array_out, accumulator, + glwe_dimension); + sample_extract_body(block_lwe_array_out, accumulator, + glwe_dimension); } template __host__ void host_bootstrap_amortized( void *v_stream, uint32_t gpu_index, Torus *lwe_array_out, Torus *lut_vector, uint32_t *lut_vector_indexes, Torus *lwe_array_in, - double2 *bootstrapping_key, uint32_t input_lwe_dimension, + double2 *bootstrapping_key, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t input_lwe_ciphertext_count, uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory) { cudaSetDevice(gpu_index); - int SM_FULL = sizeof(Torus) * polynomial_size + // accumulator mask - sizeof(Torus) * polynomial_size + // accumulator body - sizeof(Torus) * polynomial_size + // accumulator mask rotated - sizeof(Torus) * polynomial_size + // accumulator body rotated - sizeof(double2) * polynomial_size / 2 + // accumulator fft mask - sizeof(double2) * polynomial_size / 2 + // accumulator fft body - sizeof(double2) * polynomial_size / 2; // calculate buffer fft + int SM_FULL = + sizeof(Torus) * polynomial_size * (glwe_dimension + 1) + // accumulator + sizeof(Torus) * polynomial_size * + (glwe_dimension + 1) + // accumulator rotated + sizeof(double2) * polynomial_size / 2 * + (glwe_dimension + 1) + // accumulator fft + sizeof(double2) * polynomial_size / 2 * + (glwe_dimension + 1); // calculate buffer fft - int SM_PART = sizeof(double2) * polynomial_size / 2; // calculate buffer fft + int SM_PART = sizeof(double2) * polynomial_size / 2 * + (glwe_dimension + 1); // calculate buffer fft int DM_PART = SM_FULL - SM_PART; @@ -304,8 +261,8 @@ __host__ void host_bootstrap_amortized( stream, gpu_index); device_bootstrap_amortized<<>>( lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in, - bootstrapping_key, d_mem, input_lwe_dimension, polynomial_size, - base_log, level_count, lwe_idx, DM_FULL); + bootstrapping_key, d_mem, 
glwe_dimension, lwe_dimension, + polynomial_size, base_log, level_count, lwe_idx, DM_FULL); } else if (max_shared_memory < SM_FULL) { cudaFuncSetAttribute(device_bootstrap_amortized, cudaFuncAttributeMaxDynamicSharedMemorySize, SM_PART); @@ -316,8 +273,8 @@ __host__ void host_bootstrap_amortized( device_bootstrap_amortized <<>>( lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in, - bootstrapping_key, d_mem, input_lwe_dimension, polynomial_size, - base_log, level_count, lwe_idx, DM_PART); + bootstrapping_key, d_mem, glwe_dimension, lwe_dimension, + polynomial_size, base_log, level_count, lwe_idx, DM_PART); } else { // For devices with compute capability 7.x a single thread block can // address the full capacity of shared memory. Shared memory on the @@ -335,11 +292,10 @@ __host__ void host_bootstrap_amortized( device_bootstrap_amortized <<>>( lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in, - bootstrapping_key, d_mem, input_lwe_dimension, polynomial_size, - base_log, level_count, lwe_idx, 0); + bootstrapping_key, d_mem, glwe_dimension, lwe_dimension, + polynomial_size, base_log, level_count, lwe_idx, 0); } check_cuda_error(cudaGetLastError()); - cuda_drop_async(d_mem, stream, gpu_index); } diff --git a/src/bootstrap_low_latency.cuh b/src/bootstrap_low_latency.cuh index 877950b1f..693884cf2 100644 --- a/src/bootstrap_low_latency.cuh +++ b/src/bootstrap_low_latency.cuh @@ -182,11 +182,11 @@ __global__ void device_bootstrap_low_latency( if (blockIdx.y == 0) { divide_by_monomial_negacyclic_inplace( - accumulator, block_lut_vector, b_hat, false); + accumulator, block_lut_vector, b_hat, false, 1); } else { divide_by_monomial_negacyclic_inplace( - accumulator, &block_lut_vector[params::degree], b_hat, false); + accumulator, &block_lut_vector[params::degree], b_hat, false, 1); } for (int i = 0; i < lwe_dimension; i++) { @@ -200,13 +200,13 @@ __global__ void device_bootstrap_low_latency( // Perform ACC * (X^ä - 1) multiply_by_monomial_negacyclic_and_sub_polynomial< Torus, params::opt, params::degree / params::opt>( - accumulator, accumulator_rotated, a_hat); + accumulator, accumulator_rotated, a_hat, 1); // Perform a rounding to increase the accuracy of the // bootstrapped ciphertext round_to_closest_multiple_inplace( - accumulator_rotated, base_log, level_count); + accumulator_rotated, base_log, level_count, 1); synchronize_threads_in_block(); @@ -214,7 +214,7 @@ __global__ void device_bootstrap_low_latency( // decomposition, for the mask and the body (so block 0 will have the // accumulator decomposed at level 0, 1 at 1, etc.) GadgetMatrix gadget_acc(base_log, level_count, - accumulator_rotated); + accumulator_rotated, 1); gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x); // We are using the same memory space for accumulator_fft and @@ -234,9 +234,9 @@ __global__ void device_bootstrap_low_latency( // Perform a sample extract. 
At this point, all blocks have the result, but // we do the computation at block 0 to avoid waiting for extra blocks, in // case they're not synchronized - sample_extract_mask(block_lwe_array_out, accumulator); + sample_extract_mask(block_lwe_array_out, accumulator, 1); } else if (blockIdx.x == 0 && blockIdx.y == 1) { - sample_extract_body(block_lwe_array_out, accumulator); + block_lwe_array_out[params::degree] = accumulator[0]; } } diff --git a/src/circuit_bootstrap.cuh b/src/circuit_bootstrap.cuh index dbca600a2..9bb713000 100644 --- a/src/circuit_bootstrap.cuh +++ b/src/circuit_bootstrap.cuh @@ -144,8 +144,8 @@ __host__ void host_circuit_bootstrap( host_bootstrap_amortized( v_stream, gpu_index, lwe_array_out_pbs_buffer, lut_vector, lut_vector_indexes, lwe_array_in_shifted_buffer, fourier_bsk, - lwe_dimension, polynomial_size, base_log_bsk, level_bsk, pbs_count, - level_cbs, 0, max_shared_memory); + glwe_dimension, lwe_dimension, polynomial_size, base_log_bsk, level_bsk, + pbs_count, level_cbs, 0, max_shared_memory); dim3 copy_grid(pbs_count * (glwe_dimension + 1), 1, 1); dim3 copy_block(params::degree / params::opt, 1, 1); diff --git a/src/crypto/bootstrapping_key.cuh b/src/crypto/bootstrapping_key.cuh index 71e40f7ce..4ed497e4b 100644 --- a/src/crypto/bootstrapping_key.cuh +++ b/src/crypto/bootstrapping_key.cuh @@ -15,6 +15,7 @@ __device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size, level_count; } +//////////////////////////////////////////////// template __device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level, uint32_t polynomial_size, @@ -35,8 +36,9 @@ __device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level, level * polynomial_size / 2 * (glwe_dimension + 1) * (glwe_dimension + 1) + k * polynomial_size / 2 * (glwe_dimension + 1) + - polynomial_size / 2]; + glwe_dimension * polynomial_size / 2]; } +//////////////////////////////////////////////// template void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream, diff --git a/src/crypto/gadget.cuh b/src/crypto/gadget.cuh index cd0918239..863288884 100644 --- a/src/crypto/gadget.cuh +++ b/src/crypto/gadget.cuh @@ -4,6 +4,15 @@ #include "polynomial/polynomial.cuh" #include +/** + * GadgetMatrix implements the iterator design pattern to decompose a set of + * num_poly consecutive polynomials with degree params::degree. A total of + * level_count levels is expected and each call to decompose_and_compress_next() + * writes to the result the next level. It is also possible to advance an + * arbitrary amount of levels by using decompose_and_compress_level(). + * + * This class always decomposes the entire set of num_poly polynomials. 
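 *
 * Usage sketch (mirroring the call sites in device_bootstrap_amortized and
 * cmux; the buffer names are taken from those kernels):
 *
 *   GadgetMatrix gadget(base_log, level_count, accumulator_rotated,
 *                       glwe_dimension + 1);
 *   for (int level = level_count - 1; level >= 0; level--) {
 *     gadget.decompose_and_compress_next(accumulator_fft);
 *     // ... multiply accumulator_fft with the matching level of the BSK ...
 *   }
 *
 * Note that the constructor shifts the state buffer in place, so the state
 * is consumed as the levels are produced.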
+ */ #pragma once template class GadgetMatrix { private: @@ -11,19 +20,22 @@ private: uint32_t base_log; uint32_t mask; uint32_t halfbg; + uint32_t num_poly; T offset; int current_level; T mask_mod_b; T *state; public: - __device__ GadgetMatrix(uint32_t base_log, uint32_t level_count, T *state) - : base_log(base_log), level_count(level_count), state(state) { + __device__ GadgetMatrix(uint32_t base_log, uint32_t level_count, T *state, + uint32_t num_poly) + : base_log(base_log), level_count(level_count), num_poly(num_poly), + state(state) { mask_mod_b = (1ll << base_log) - 1ll; current_level = level_count; int tid = threadIdx.x; - for (int i = 0; i < params::opt; i++) { + for (int i = 0; i < num_poly * params::opt; i++) { state[tid] >>= (sizeof(T) * 8 - base_log * level_count); tid += params::degree / params::opt; } @@ -31,26 +43,31 @@ public: } __device__ void decompose_and_compress_next(double2 *result) { - int tid = threadIdx.x; current_level -= 1; - for (int i = 0; i < params::opt / 2; i++) { - T res_re = state[tid] & mask_mod_b; - T res_im = state[tid + params::degree / 2] & mask_mod_b; - state[tid] >>= base_log; - state[tid + params::degree / 2] >>= base_log; - T carry_re = ((res_re - 1ll) | state[tid]) & res_re; - T carry_im = ((res_im - 1ll) | state[tid + params::degree / 2]) & res_im; - carry_re >>= (base_log - 1); - carry_im >>= (base_log - 1); - state[tid] += carry_re; - state[tid + params::degree / 2] += carry_im; - res_re -= carry_re << base_log; - res_im -= carry_im << base_log; + for (int j = 0; j < num_poly; j++) { + int tid = threadIdx.x; + auto state_slice = state + j * params::degree; + auto result_slice = result + j * params::degree / 2; + for (int i = 0; i < params::opt / 2; i++) { + T res_re = state_slice[tid] & mask_mod_b; + T res_im = state_slice[tid + params::degree / 2] & mask_mod_b; + state_slice[tid] >>= base_log; + state_slice[tid + params::degree / 2] >>= base_log; + T carry_re = ((res_re - 1ll) | state_slice[tid]) & res_re; + T carry_im = + ((res_im - 1ll) | state_slice[tid + params::degree / 2]) & res_im; + carry_re >>= (base_log - 1); + carry_im >>= (base_log - 1); + state_slice[tid] += carry_re; + state_slice[tid + params::degree / 2] += carry_im; + res_re -= carry_re << base_log; + res_im -= carry_im << base_log; - result[tid].x = (int32_t)res_re; - result[tid].y = (int32_t)res_im; + result_slice[tid].x = (int32_t)res_re; + result_slice[tid].y = (int32_t)res_im; - tid += params::degree / params::opt; + tid += params::degree / params::opt; + } } synchronize_threads_in_block(); } diff --git a/src/polynomial/functions.cuh b/src/polynomial/functions.cuh index 6cc1e49aa..1076bf755 100644 --- a/src/polynomial/functions.cuh +++ b/src/polynomial/functions.cuh @@ -56,36 +56,78 @@ __device__ void add_polynomial_inplace_low_lat(T *source, T *dst, int p_id) { } /* + * Receives num_poly concatenated polynomials of type T. For each: + * * Performs acc = acc * (X^ä + 1) if zeroAcc = false * Performs acc = 0 if zeroAcc * takes single buffer and calculates inplace. 
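 *
 * In the negacyclic ring (X^N = -1) dividing by a monomial is a rotation
 * with sign flips. Worked example with N = 4 and j = 1:
 *   input = [a0, a1, a2, a3]  ->  acc = [a1, a2, a3, -a0]
 * i.e. acc(X) = input(X) * X^(-j).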
*/ template __device__ void divide_by_monomial_negacyclic_inplace(T *accumulator, T *input, - uint32_t j, - bool zeroAcc) { - int tid = threadIdx.x; + uint32_t j, bool zeroAcc, + uint32_t num_poly) { constexpr int degree = block_size * elems_per_thread; - if (zeroAcc) { - for (int i = 0; i < elems_per_thread; i++) { - accumulator[tid] = 0; - tid += block_size; + for (int z = 0; z < num_poly; z++) { + T *accumulator_slice = (T *)accumulator + (ptrdiff_t)(z * degree); + T *input_slice = (T *)input + (ptrdiff_t)(z * degree); + + int tid = threadIdx.x; + if (zeroAcc) { + for (int i = 0; i < elems_per_thread; i++) { + accumulator_slice[tid] = 0; + tid += block_size; + } + } else { + tid = threadIdx.x; + for (int i = 0; i < elems_per_thread; i++) { + if (j < degree) { + if (tid < degree - j) { + accumulator_slice[tid] = input_slice[tid + j]; + } else { + accumulator_slice[tid] = -input_slice[tid - degree + j]; + } + } else { + uint32_t jj = j - degree; + if (tid < degree - jj) { + accumulator_slice[tid] = -input_slice[tid + jj]; + } else { + accumulator_slice[tid] = input_slice[tid - degree + jj]; + } + } + tid += block_size; + } } - } else { - tid = threadIdx.x; + } +} + +/* + * Receives num_poly concatenated polynomials of type T. For each: + * + * Performs result_acc = acc * (X^ä - 1) - acc + * takes single buffer as input and returns a single rotated buffer + */ +template +__device__ void multiply_by_monomial_negacyclic_and_sub_polynomial( + T *acc, T *result_acc, uint32_t j, uint32_t num_poly) { + constexpr int degree = block_size * elems_per_thread; + for (int z = 0; z < num_poly; z++) { + T *acc_slice = (T *)acc + (ptrdiff_t)(z * degree); + T *result_acc_slice = (T *)result_acc + (ptrdiff_t)(z * degree); + int tid = threadIdx.x; for (int i = 0; i < elems_per_thread; i++) { if (j < degree) { - if (tid < degree - j) { - accumulator[tid] = input[tid + j]; + if (tid < j) { + result_acc_slice[tid] = -acc_slice[tid - j + degree] - acc_slice[tid]; } else { - accumulator[tid] = -input[tid - degree + j]; + result_acc_slice[tid] = acc_slice[tid - j] - acc_slice[tid]; } } else { uint32_t jj = j - degree; - if (tid < degree - jj) { - accumulator[tid] = -input[tid + jj]; + if (tid < jj) { + result_acc_slice[tid] = acc_slice[tid - jj + degree] - acc_slice[tid]; + } else { - accumulator[tid] = input[tid - degree + jj]; + result_acc_slice[tid] = -acc_slice[tid - jj] - acc_slice[tid]; } } tid += block_size; @@ -94,54 +136,28 @@ __device__ void divide_by_monomial_negacyclic_inplace(T *accumulator, T *input, } /* - * Performs result_acc = acc * (X^ä - 1) - acc - * takes single buffer as input returns single rotated buffer - */ -template -__device__ void -multiply_by_monomial_negacyclic_and_sub_polynomial(T *acc, T *result_acc, - uint32_t j) { - int tid = threadIdx.x; - constexpr int degree = block_size * elems_per_thread; - for (int i = 0; i < elems_per_thread; i++) { - if (j < degree) { - if (tid < j) { - result_acc[tid] = -acc[tid - j + degree] - acc[tid]; - } else { - result_acc[tid] = acc[tid - j] - acc[tid]; - } - } else { - uint32_t jj = j - degree; - if (tid < jj) { - result_acc[tid] = acc[tid - jj + degree] - acc[tid]; - - } else { - result_acc[tid] = -acc[tid - jj] - acc[tid]; - } - } - tid += block_size; - } -} - -/* - * performs a rounding to increase accuracy of the PBS - * calculates inplace. + * Receives num_poly concatenated polynomials of type T. For each performs a + * rounding to increase accuracy of the PBS. Calculates inplace. 
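 *
 * Each coefficient is rounded to the closest multiple of
 * 2^(sizeof(T) * 8 - base_log * level_count): the bit just below that
 * position is added back before the low bits are cleared. For instance,
 * with a 64-bit torus, base_log = 8 and level_count = 3 (example values),
 * coefficients are rounded to the nearest multiple of 2^40, keeping only
 * the 24 bits that the decomposition will consume.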
*/ template __device__ void round_to_closest_multiple_inplace(T *rotated_acc, int base_log, - int level_count) { - int tid = threadIdx.x; - for (int i = 0; i < elems_per_thread; i++) { - - T x_acc = rotated_acc[tid]; - T shift = sizeof(T) * 8 - level_count * base_log; - T mask = 1ll << (shift - 1); - T b_acc = (x_acc & mask) >> (shift - 1); - T res_acc = x_acc >> shift; - res_acc += b_acc; - res_acc <<= shift; - rotated_acc[tid] = res_acc; - tid = tid + block_size; + int level_count, + uint32_t num_poly) { + constexpr int degree = block_size * elems_per_thread; + for (int z = 0; z < num_poly; z++) { + T *rotated_acc_slice = (T *)rotated_acc + (ptrdiff_t)(z * degree); + int tid = threadIdx.x; + for (int i = 0; i < elems_per_thread; i++) { + T x_acc = rotated_acc_slice[tid]; + T shift = sizeof(T) * 8 - level_count * base_log; + T mask = 1ll << (shift - 1); + T b_acc = (x_acc & mask) >> (shift - 1); + T res_acc = x_acc >> shift; + res_acc += b_acc; + res_acc <<= shift; + rotated_acc_slice[tid] = res_acc; + tid = tid + block_size; + } } } @@ -176,80 +192,85 @@ __device__ void add_to_torus(double2 *m_values, Torus *result) { } } +// Extracts the body of a GLWE with dimension glwe_dimension template -__device__ void sample_extract_body(Torus *lwe_array_out, Torus *accumulator) { +__device__ void sample_extract_body(Torus *lwe_array_out, Torus *accumulator, + uint32_t glwe_dimension) { // Set first coefficient of the accumulator as the body of the LWE sample - lwe_array_out[params::degree] = accumulator[0]; + lwe_array_out[glwe_dimension * params::degree] = + accumulator[glwe_dimension * params::degree]; } +// Extracts the mask of a GLWE with dimension glwe_dimension template -__device__ void sample_extract_mask(Torus *lwe_array_out, Torus *accumulator) { - // Set ACC = -ACC - // accumulator.negate_inplace(); +__device__ void sample_extract_mask(Torus *lwe_array_out, Torus *accumulator, + uint32_t glwe_dimension) { + for (int z = 0; z < glwe_dimension; z++) { + Torus *lwe_array_out_slice = + (Torus *)lwe_array_out + (ptrdiff_t)(z * params::degree); + Torus *accumulator_slice = + (Torus *)accumulator + (ptrdiff_t)(z * params::degree); - int tid = threadIdx.x; + // Set ACC = -ACC + int tid = threadIdx.x; #pragma unroll - for (int i = 0; i < params::opt; i++) { - accumulator[tid] = -accumulator[tid]; - tid = tid + params::degree / params::opt; - } - synchronize_threads_in_block(); - - // Reverse the accumulator - // accumulator.reverse_inplace(); - - tid = threadIdx.x; - Torus result[params::opt]; -#pragma unroll - for (int i = 0; i < params::opt; i++) { - result[i] = accumulator[params::degree - tid - 1]; - tid = tid + params::degree / params::opt; - } - synchronize_threads_in_block(); - tid = threadIdx.x; -#pragma unroll - for (int i = 0; i < params::opt; i++) { - accumulator[tid] = result[i]; - tid = tid + params::degree / params::opt; - } - synchronize_threads_in_block(); - - // Perform ACC * X - // accumulator.multiply_by_monomial_negacyclic_inplace(1); - - tid = threadIdx.x; - result[params::opt]; - for (int i = 0; i < params::opt; i++) { - if (1 < params::degree) { - if (tid < 1) - result[i] = -accumulator[tid - 1 + params::degree]; - else - result[i] = accumulator[tid - 1]; - } else { - uint32_t jj = 1 - (uint32_t)params::degree; - if (tid < jj) - result[i] = accumulator[tid - jj + params::degree]; - else - result[i] = -accumulator[tid - jj]; + for (int i = 0; i < params::opt; i++) { + accumulator_slice[tid] = -accumulator_slice[tid]; + tid = tid + params::degree / params::opt; } - tid += 
params::degree / params::opt; - } - synchronize_threads_in_block(); - tid = threadIdx.x; - for (int i = 0; i < params::opt; i++) { - accumulator[tid] = result[i]; - tid += params::degree / params::opt; - } - synchronize_threads_in_block(); + synchronize_threads_in_block(); - // Copy to the mask of the LWE sample - // accumulator.copy_into(lwe_array_out); - - tid = threadIdx.x; + // Reverse the accumulator + tid = threadIdx.x; + Torus result[params::opt]; #pragma unroll - for (int i = 0; i < params::opt; i++) { - lwe_array_out[tid] = accumulator[tid]; - tid = tid + params::degree / params::opt; + for (int i = 0; i < params::opt; i++) { + result[i] = accumulator_slice[params::degree - tid - 1]; + tid = tid + params::degree / params::opt; + } + synchronize_threads_in_block(); + tid = threadIdx.x; +#pragma unroll + for (int i = 0; i < params::opt; i++) { + accumulator_slice[tid] = result[i]; + tid = tid + params::degree / params::opt; + } + synchronize_threads_in_block(); + + // Perform ACC * X + // (equivalent to multiply_by_monomial_negacyclic_inplace(1)) + tid = threadIdx.x; + result[params::opt]; + for (int i = 0; i < params::opt; i++) { + if (1 < params::degree) { + if (tid < 1) + result[i] = -accumulator_slice[tid - 1 + params::degree]; + else + result[i] = accumulator_slice[tid - 1]; + } else { + uint32_t jj = 1 - (uint32_t)params::degree; + if (tid < jj) + result[i] = accumulator_slice[tid - jj + params::degree]; + else + result[i] = -accumulator_slice[tid - jj]; + } + tid += params::degree / params::opt; + } + synchronize_threads_in_block(); + tid = threadIdx.x; + for (int i = 0; i < params::opt; i++) { + accumulator_slice[tid] = result[i]; + tid += params::degree / params::opt; + } + synchronize_threads_in_block(); + + // Copy to the mask of the LWE sample + tid = threadIdx.x; +#pragma unroll + for (int i = 0; i < params::opt; i++) { + lwe_array_out_slice[tid] = accumulator_slice[tid]; + tid = tid + params::degree / params::opt; + } } } diff --git a/src/vertical_packing.cuh b/src/vertical_packing.cuh index 917273ecd..bb9b5a22b 100644 --- a/src/vertical_packing.cuh +++ b/src/vertical_packing.cuh @@ -101,8 +101,10 @@ cmux(Torus *glwe_array_out, Torus *glwe_array_in, double2 *ggsw_in, pos += params::degree / params::opt; } - GadgetMatrix gadget_mask(base_log, level_count, glwe_sub_mask); - GadgetMatrix gadget_body(base_log, level_count, glwe_sub_body); + GadgetMatrix gadget_mask(base_log, level_count, glwe_sub_mask, + 1); + GadgetMatrix gadget_body(base_log, level_count, glwe_sub_body, + 1); // Subtract each glwe operand, decompose the resulting // polynomial coefficients to multiply each decomposed level // with the corresponding part of the LUT @@ -420,12 +422,12 @@ __global__ void device_blind_rotation_and_sample_extraction( // Body divide_by_monomial_negacyclic_inplace( - accumulator_c1, accumulator_c0, (1 << monomial_degree), false); + accumulator_c1, accumulator_c0, (1 << monomial_degree), false, 1); // Mask divide_by_monomial_negacyclic_inplace( accumulator_c1 + polynomial_size, accumulator_c0 + polynomial_size, - (1 << monomial_degree), false); + (1 << monomial_degree), false, 1); monomial_degree += 1; @@ -445,9 +447,8 @@ __global__ void device_blind_rotation_and_sample_extraction( // Now we can perform the sample extraction: for the body it's just // the resulting constant coefficient of the accumulator // For the mask it's more complicated - sample_extract_mask(block_lwe_out, accumulator_c0); - sample_extract_body(block_lwe_out, - accumulator_c0 + polynomial_size); + 
sample_extract_mask(block_lwe_out, accumulator_c0, 1); + sample_extract_body(block_lwe_out, accumulator_c0, 1); } template
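// Reference sketch of the sample extraction at index 0 that the kernels above
// implement with the negate / reverse / multiply-by-X sequence. This is a
// plain CPU illustration under assumed toy parameters, not the GPU code: the
// GLWE is glwe_dimension mask polynomials followed by one body polynomial,
// stored back to back, and the torus is modeled as int64_t.
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

// Extracts LWE(m_0) from GLWE = (A_0(X), ..., A_{k-1}(X), B(X)): the LWE mask
// is (a_{z,0}, -a_{z,N-1}, ..., -a_{z,1}) for each mask polynomial A_z, and
// the LWE body is the constant coefficient of B(X).
std::vector<int64_t> sample_extract_index_0(const std::vector<int64_t> &glwe,
                                            size_t glwe_dimension, size_t N) {
  std::vector<int64_t> lwe(glwe_dimension * N + 1);
  for (size_t z = 0; z < glwe_dimension; z++) {
    const int64_t *a = &glwe[z * N];
    int64_t *out = &lwe[z * N];
    out[0] = a[0];
    for (size_t i = 1; i < N; i++)
      out[i] = -a[N - i];
  }
  lwe[glwe_dimension * N] = glwe[glwe_dimension * N]; // body coefficient 0
  return lwe;
}

int main() {
  const size_t k = 1, N = 4; // toy sizes, for illustration only
  std::vector<int64_t> glwe = {1, 2, 3, 4, /* body */ 10, 20, 30, 40};
  for (int64_t v : sample_extract_index_0(glwe, k, N))
    printf("%lld ", (long long)v); // prints: 1 -4 -3 -2 10
  printf("\n");
  return 0;
}
// The negate / reverse / multiply-by-X steps in sample_extract_mask compute
// exactly this permutation in place, one mask polynomial at a time, and
// sample_extract_body copies the body's constant coefficient.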