From eb8aeb5a01b9d8a3c77cc84107f07f9e43137fdf Mon Sep 17 00:00:00 2001
From: Pedro Alves
Date: Wed, 22 Feb 2023 13:03:36 -0300
Subject: [PATCH] refactor(cuda): Implement support for k > 1 in the cmux tree

---
 src/vertical_packing.cuh | 205 ++++++++++++++++-----------------------
 1 file changed, 83 insertions(+), 122 deletions(-)

diff --git a/src/vertical_packing.cuh b/src/vertical_packing.cuh
index 96a83c147..a1ce2fc2c 100644
--- a/src/vertical_packing.cuh
+++ b/src/vertical_packing.cuh
@@ -15,19 +15,6 @@
 #include "polynomial/polynomial_math.cuh"
 #include "utils/timer.cuh"
 
-template <class params> __device__ void fft(double2 *output) {
-  // Switch to the FFT space
-  NSMFFT_direct<HalfDegree<params>>(output);
-  synchronize_threads_in_block();
-}
-
-template <class params> __device__ void ifft_inplace(double2 *data) {
-  synchronize_threads_in_block();
-
-  NSMFFT_inverse<HalfDegree<params>>(data);
-  synchronize_threads_in_block();
-}
-
 /*
  * Receives an array of GLWE ciphertexts and two indexes to ciphertexts in this
  * array, and an array of GGSW ciphertexts with an index to one ciphertext in it.
@@ -59,121 +46,96 @@ cmux(Torus *glwe_array_out, Torus *glwe_array_in, double2 *ggsw_in,
      uint32_t base_log, uint32_t level_count, uint32_t ggsw_idx) {
   // Define glwe_sub
-  Torus *glwe_sub_mask = (Torus *)selected_memory;
-  Torus *glwe_sub_body = (Torus *)glwe_sub_mask + (ptrdiff_t)polynomial_size;
+  Torus *glwe_sub = (Torus *)selected_memory;
 
-  double2 *mask_res_fft = (double2 *)glwe_sub_body +
-                          polynomial_size / (sizeof(double2) / sizeof(Torus));
-  double2 *body_res_fft =
-      (double2 *)mask_res_fft + (ptrdiff_t)polynomial_size / 2;
+  double2 *res_fft =
+      (double2 *)glwe_sub +
+      (glwe_dim + 1) * polynomial_size / (sizeof(double2) / sizeof(Torus));
 
   double2 *glwe_fft =
-      (double2 *)body_res_fft + (ptrdiff_t)(polynomial_size / 2);
+      (double2 *)res_fft + (ptrdiff_t)((glwe_dim + 1) * polynomial_size / 2);
 
   /////////////////////////////////////
   //  glwe2-glwe1
 
-  // Copy m0 to shared memory to preserve data
-  auto m0_mask = &glwe_array_in[input_idx1 * (glwe_dim + 1) * polynomial_size];
-  auto m0_body = m0_mask + polynomial_size;
+  // Get the pointers to the operands in global memory
+  auto m0 = &glwe_array_in[input_idx1 * (glwe_dim + 1) * polynomial_size];
+  auto m1 = &glwe_array_in[input_idx2 * (glwe_dim + 1) * polynomial_size];
 
-  // Just gets the pointer for m1 on global memory
-  auto m1_mask = &glwe_array_in[input_idx2 * (glwe_dim + 1) * polynomial_size];
-  auto m1_body = m1_mask + polynomial_size;
-
-  // Mask
-  sub_polynomial(glwe_sub_mask, m1_mask, m0_mask);
-  // Body
-  sub_polynomial(glwe_sub_body, m1_body, m0_body);
-
-  synchronize_threads_in_block();
+  // Subtraction: m1 - m0
+  for (int i = 0; i < (glwe_dim + 1); i++) {
+    auto glwe_sub_slice = glwe_sub + i * params::degree;
+    auto m0_slice = m0 + i * params::degree;
+    auto m1_slice = m1 + i * params::degree;
+    sub_polynomial(glwe_sub_slice, m1_slice, m0_slice);
+  }
 
   // Initialize the polynomial multiplication via FFT arrays
   // The polynomial multiplications happen at the block level
   // and each thread handles two or more coefficients
   int pos = threadIdx.x;
-  for (int j = 0; j < params::opt / 2; j++) {
-    mask_res_fft[pos].x = 0;
-    mask_res_fft[pos].y = 0;
-    body_res_fft[pos].x = 0;
-    body_res_fft[pos].y = 0;
+  for (int j = 0; j < (glwe_dim + 1) * params::opt / 2; j++) {
+    res_fft[pos].x = 0;
+    res_fft[pos].y = 0;
     pos += params::degree / params::opt;
   }
 
-  GadgetMatrix<Torus, params> gadget_mask(base_log, level_count, glwe_sub_mask,
-                                          1);
-  GadgetMatrix<Torus, params> gadget_body(base_log, level_count, glwe_sub_body,
-                                          1);
+  synchronize_threads_in_block();
+
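+  // Decompose all (glwe_dim + 1) polynomials of glwe_sub at once: each call
+  // to decompose_and_compress_next() below yields the next decomposition
+  // level of every polynomial, packed for the half-size forward FFT.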
+  GadgetMatrix<Torus, params> gadget(base_log, level_count, glwe_sub,
+                                     glwe_dim + 1);
 
   // Subtract each glwe operand, decompose the resulting
   // polynomial coefficients, and multiply each decomposed level
   // with the corresponding part of the LUT
   for (int level = level_count - 1; level >= 0; level--) {
-    // Decomposition
-    gadget_mask.decompose_and_compress_next(glwe_fft);
-
-    // First, perform the polynomial multiplication for the mask
-    fft<params>(glwe_fft);
-
-    // External product and accumulate
-    // Get the piece necessary for the multiplication
-    auto mask_fourier = get_ith_mask_kth_block(
-        ggsw_in, ggsw_idx, 0, level, polynomial_size, glwe_dim, level_count);
-    auto body_fourier = get_ith_body_kth_block(
-        ggsw_in, ggsw_idx, 0, level, polynomial_size, glwe_dim, level_count);
-
+    gadget.decompose_and_compress_next(glwe_fft);
     synchronize_threads_in_block();
+    for (int i = 0; i < (glwe_dim + 1); i++) {
+      auto glwe_fft_slice = glwe_fft + i * params::degree / 2;
 
-    // Perform the coefficient-wise product
+      // First, perform the polynomial multiplication
+      NSMFFT_direct<HalfDegree<params>>(glwe_fft_slice);
+
+      // External product and accumulate
+      // Get the piece necessary for the multiplication
+      auto bsk_slice = get_ith_mask_kth_block(
+          ggsw_in, ggsw_idx, i, level, polynomial_size, glwe_dim, level_count);
+
+      synchronize_threads_in_block();
+      // Perform the coefficient-wise product
+      for (int j = 0; j < (glwe_dim + 1); j++) {
+        auto bsk_poly = bsk_slice + j * params::degree / 2;
+        auto res_fft_poly = res_fft + j * params::degree / 2;
+        polynomial_product_accumulate_in_fourier_domain(
+            res_fft_poly, glwe_fft_slice, bsk_poly);
+      }
+    }
     synchronize_threads_in_block();
-    polynomial_product_accumulate_in_fourier_domain(
-        mask_res_fft, glwe_fft, mask_fourier);
-    polynomial_product_accumulate_in_fourier_domain(
-        body_res_fft, glwe_fft, body_fourier);
-
-    // Now handle the polynomial multiplication for the body
-    // in the same way
-    synchronize_threads_in_block();
-
-    gadget_body.decompose_and_compress_next(glwe_fft);
-    fft<params>(glwe_fft);
-
-    // External product and accumulate
-    // Get the piece necessary for the multiplication
-    mask_fourier = get_ith_mask_kth_block(
-        ggsw_in, ggsw_idx, 1, level, polynomial_size, glwe_dim, level_count);
-    body_fourier = get_ith_body_kth_block(
-        ggsw_in, ggsw_idx, 1, level, polynomial_size, glwe_dim, level_count);
-
-    synchronize_threads_in_block();
-
-    polynomial_product_accumulate_in_fourier_domain(
-        mask_res_fft, glwe_fft, mask_fourier);
-    polynomial_product_accumulate_in_fourier_domain(
-        body_res_fft, glwe_fft, body_fourier);
   }
 
   // IFFT
   synchronize_threads_in_block();
-  ifft_inplace<params>(mask_res_fft);
-  ifft_inplace<params>(body_res_fft);
+  for (int i = 0; i < (glwe_dim + 1); i++) {
+    auto res_fft_slice = res_fft + i * params::degree / 2;
+    NSMFFT_inverse<HalfDegree<params>>(res_fft_slice);
+  }
   synchronize_threads_in_block();
 
   // Write the output
-  Torus *mb_mask =
-      &glwe_array_out[output_idx * (glwe_dim + 1) * polynomial_size];
-  Torus *mb_body = mb_mask + polynomial_size;
+  Torus *mb = &glwe_array_out[output_idx * (glwe_dim + 1) * polynomial_size];
 
   int tid = threadIdx.x;
-  for (int i = 0; i < params::opt; i++) {
-    mb_mask[tid] = m0_mask[tid];
-    mb_body[tid] = m0_body[tid];
+  for (int i = 0; i < (glwe_dim + 1) * params::opt; i++) {
+    mb[tid] = m0[tid];
     tid += params::degree / params::opt;
   }
 
-  add_to_torus(mask_res_fft, mb_mask);
-  add_to_torus(body_res_fft, mb_body);
+  for (int i = 0; i < (glwe_dim + 1); i++) {
+    auto res_fft_slice = res_fft + i * params::degree / 2;
+    auto mb_slice = mb + i * params::degree;
+
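+    // Round the i-th inverse-FFT result back to the torus and add it onto
+    // the i-th polynomial of the output (initialized with m0 above),
+    // completing out = m0 + external_product(GGSW, m1 - m0)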
+    add_to_torus(res_fft_slice, mb_slice);
+  }
 }
 
 // Appends zeroed padding between each LUT
@@ -187,8 +149,9 @@ __host__ void add_padding_to_lut_async(Torus *lut_out, Torus *lut_in,
                                        *stream));
   for (int i = 0; i < num_lut; i++)
     check_cuda_error(cudaMemcpyAsync(
-        lut_out + (2 * i + 1) * params::degree, lut_in + i * params::degree,
-        params::degree * sizeof(Torus), cudaMemcpyDeviceToDevice, *stream));
+        lut_out + ((glwe_dimension + 1) * i + glwe_dimension) * params::degree,
+        lut_in + i * params::degree, params::degree * sizeof(Torus),
+        cudaMemcpyDeviceToDevice, *stream));
 }
 
 /**
@@ -221,6 +184,9 @@ __global__ void device_batch_cmux(Torus *glwe_array_out, Torus *glwe_array_in,
   int tree_idx = blockIdx.y;
   int tree_offset = tree_idx * num_lut * (glwe_dim + 1) * polynomial_size;
 
+  auto block_glwe_array_out = glwe_array_out + tree_offset;
+  auto block_glwe_array_in = glwe_array_in + tree_offset;
+
   // The x-axis handles a single cmux tree. Each block computes one cmux.
   int cmux_idx = blockIdx.x;
   int output_idx = cmux_idx;
@@ -237,20 +203,21 @@ __global__ void device_batch_cmux(Torus *glwe_array_out, Torus *glwe_array_in,
     selected_memory = &device_mem[(blockIdx.x + blockIdx.y * gridDim.x) *
                                   device_memory_size_per_block];
 
-  cmux<Torus, STorus, params>(
-      glwe_array_out + tree_offset, glwe_array_in + tree_offset, ggsw_in,
-      selected_memory, output_idx, input_idx1, input_idx2, glwe_dim,
-      polynomial_size, base_log, level_count, ggsw_idx);
+  cmux<Torus, STorus, params>(block_glwe_array_out, block_glwe_array_in,
+                              ggsw_in, selected_memory, output_idx, input_idx1,
+                              input_idx2, glwe_dim, polynomial_size, base_log,
+                              level_count, ggsw_idx);
 }
 
 template <typename Torus>
 __host__ __device__ int
-get_memory_needed_per_block_cmux_tree(uint32_t polynomial_size) {
-  return sizeof(Torus) * polynomial_size +       // glwe_sub_mask
-         sizeof(Torus) * polynomial_size +       // glwe_sub_body
-         sizeof(double2) * polynomial_size / 2 + // mask_res_fft
-         sizeof(double2) * polynomial_size / 2 + // body_res_fft
-         sizeof(double2) * polynomial_size / 2;  // glwe_fft
+get_memory_needed_per_block_cmux_tree(uint32_t glwe_dimension,
+                                      uint32_t polynomial_size) {
+  return sizeof(Torus) * polynomial_size * (glwe_dimension + 1) + // glwe_sub
+         sizeof(double2) * polynomial_size / 2 *
+             (glwe_dimension + 1) + // res_fft
+         sizeof(double2) * polynomial_size / 2 *
+             (glwe_dimension + 1); // glwe_fft
 }
 
 template <typename Torus>
@@ -259,8 +226,8 @@ get_buffer_size_cmux_tree(uint32_t glwe_dimension, uint32_t polynomial_size,
                           uint32_t level_count, uint32_t r, uint32_t tau,
                           uint32_t max_shared_memory) {
 
-  int memory_needed_per_block =
-      get_memory_needed_per_block_cmux_tree<Torus>(polynomial_size);
+  int memory_needed_per_block = get_memory_needed_per_block_cmux_tree<Torus>(
+      glwe_dimension, polynomial_size);
   int num_lut = (1 << r);
   int ggsw_size = polynomial_size * (glwe_dimension + 1) *
                   (glwe_dimension + 1) * level_count;
@@ -286,8 +253,8 @@ scratch_cmux_tree(void *v_stream, uint32_t gpu_index, int8_t **cmux_tree_buffer,
   cudaSetDevice(gpu_index);
   auto stream = static_cast<cudaStream_t *>(v_stream);
 
-  int memory_needed_per_block =
-      get_memory_needed_per_block_cmux_tree<Torus>(polynomial_size);
+  int memory_needed_per_block = get_memory_needed_per_block_cmux_tree<Torus>(
+      glwe_dimension, polynomial_size);
   if (max_shared_memory >= memory_needed_per_block) {
     check_cuda_error(cudaFuncSetAttribute(
         device_batch_cmux<Torus, STorus, params, FULLSM>,
@@ -341,8 +308,8 @@ host_cmux_tree(void *v_stream, uint32_t gpu_index, Torus *glwe_array_out,
     return;
   }
 
-  int memory_needed_per_block =
-      get_memory_needed_per_block_cmux_tree<Torus>(polynomial_size);
+  int memory_needed_per_block = get_memory_needed_per_block_cmux_tree<Torus>(
+      glwe_dimension, polynomial_size);
 
   dim3 thds(polynomial_size / params::opt, 1, 1);
 
@@ -504,16 +471,10 @@ __global__ void device_blind_rotation_and_sample_extraction(
 template <typename Torus>
 __host__ __device__ int
 get_memory_needed_per_block_blind_rotation_sample_extraction(
-    uint32_t polynomial_size) {
-  return sizeof(Torus) * polynomial_size +       // accumulator_c0 mask
-         sizeof(Torus) * polynomial_size +       // accumulator_c0 body
-         sizeof(Torus) * polynomial_size +       // accumulator_c1 mask
-         sizeof(Torus) * polynomial_size +       // accumulator_c1 body
-         sizeof(Torus) * polynomial_size +       // glwe_sub_mask
-         sizeof(Torus) * polynomial_size +       // glwe_sub_body
-         sizeof(double2) * polynomial_size / 2 + // mask_res_fft
-         sizeof(double2) * polynomial_size / 2 + // body_res_fft
-         sizeof(double2) * polynomial_size / 2;  // glwe_fft
+    uint32_t glwe_dimension, uint32_t polynomial_size) {
+  return sizeof(Torus) * polynomial_size * (glwe_dimension + 1) + // accumulator_c0
+         sizeof(Torus) * polynomial_size * (glwe_dimension + 1) + // accumulator_c1
+         get_memory_needed_per_block_cmux_tree<Torus>(glwe_dimension, polynomial_size);
 }
 
 template <typename Torus>
@@ -523,7 +484,7 @@ __host__ __device__ int get_buffer_size_blind_rotation_sample_extraction(
 
   int memory_needed_per_block =
       get_memory_needed_per_block_blind_rotation_sample_extraction<Torus>(
-          polynomial_size);
+          glwe_dimension, polynomial_size);
   int device_mem = 0;
   if (max_shared_memory < memory_needed_per_block) {
     device_mem = memory_needed_per_block * tau;
@@ -548,7 +509,7 @@ __host__ void scratch_blind_rotation_sample_extraction(
 
   int memory_needed_per_block =
      get_memory_needed_per_block_blind_rotation_sample_extraction<Torus>(
-          polynomial_size);
+          glwe_dimension, polynomial_size);
   if (max_shared_memory >= memory_needed_per_block) {
     check_cuda_error(cudaFuncSetAttribute(
@@ ... @@ __global__ void device_blind_rotation_and_sample_extraction(
       get_memory_needed_per_block_blind_rotation_sample_extraction<Torus>(
-          polynomial_size);
+          glwe_dimension, polynomial_size);
 
   // Prepare the buffers
   int ggsw_size = polynomial_size * (glwe_dimension + 1) *
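
For context, a minimal host-side sketch of the per-block scratch accounting
this patch introduces (a standalone illustration, not part of the patch; the
Torus = uint64_t choice and the double2 stand-in are assumptions made for the
example):

#include <cstdint>
#include <cstdio>

// Host stand-in for CUDA's double2: one complex FFT coefficient.
struct double2_t {
  double x, y;
};

using Torus = std::uint64_t; // assumed torus representation

// Mirrors get_memory_needed_per_block_cmux_tree from the patch: one
// (glwe_dimension + 1)-polynomial GLWE in the standard domain (glwe_sub),
// plus two buffers of (glwe_dimension + 1) * polynomial_size / 2 complex
// coefficients each (res_fft and glwe_fft).
int memory_needed_per_block(std::uint32_t glwe_dimension,
                            std::uint32_t polynomial_size) {
  return sizeof(Torus) * polynomial_size * (glwe_dimension + 1) +
         sizeof(double2_t) * polynomial_size / 2 * (glwe_dimension + 1) +
         sizeof(double2_t) * polynomial_size / 2 * (glwe_dimension + 1);
}

int main() {
  // Example parameters: k = 2 (three polynomials per GLWE), N = 1024.
  const std::uint32_t k = 2, N = 1024;
  std::printf("scratch bytes per block: %d\n", memory_needed_per_block(k, N));
  // 3 * 1024 * 8 + 2 * (3 * 512 * 16) = 24576 + 49152 = 73728 bytes, which
  // exceeds the default 48 KB shared-memory carveout; this is why
  // scratch_cmux_tree raises the limit with cudaFuncSetAttribute or falls
  // back to a device-memory buffer when max_shared_memory is too small.
  return 0;
}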