diff --git a/include/bootstrap.h b/include/bootstrap.h index 5ff93bc1e..da28cde2b 100644 --- a/include/bootstrap.h +++ b/include/bootstrap.h @@ -113,6 +113,18 @@ void cuda_circuit_bootstrap_vertical_packing_64( uint32_t level_count_bsk, uint32_t base_log_bsk, uint32_t level_count_pksk, uint32_t base_log_pksk, uint32_t level_count_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs, uint32_t lut_number, uint32_t max_shared_memory); + +void cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index, void *lwe_array_out, + void *lwe_array_in, void *lut_vector, void *fourier_bsk, + void *ksk, void *cbs_fpksk, uint32_t glwe_dimension, + uint32_t lwe_dimension, uint32_t polynomial_size, + uint32_t base_log_bsk, uint32_t level_count_bsk, + uint32_t base_log_ksk, uint32_t level_count_ksk, + uint32_t base_log_pksk, uint32_t level_count_pksk, + uint32_t base_log_cbs, uint32_t level_count_cbs, + uint32_t number_of_bits_of_message_including_padding, + uint32_t number_of_bits_to_extract, + uint32_t number_of_inputs, uint32_t max_shared_memory); } #ifdef __CUDACC__ diff --git a/src/bit_extraction.cu b/src/bit_extraction.cu new file mode 100644 index 000000000..f7963d642 --- /dev/null +++ b/src/bit_extraction.cu @@ -0,0 +1,195 @@ +#include "bit_extraction.cuh" + +void cuda_extract_bits_32( + void *v_stream, uint32_t gpu_index, void *list_lwe_array_out, + void *lwe_array_in, void *lwe_array_in_buffer, + void *lwe_array_in_shifted_buffer, void *lwe_array_out_ks_buffer, + void *lwe_array_out_pbs_buffer, void *lut_pbs, void *lut_vector_indexes, + void *ksk, void *fourier_bsk, uint32_t number_of_bits, uint32_t delta_log, + uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, + uint32_t glwe_dimension, uint32_t base_log_bsk, uint32_t level_count_bsk, + uint32_t base_log_ksk, uint32_t level_count_ksk, uint32_t number_of_samples, + uint32_t max_shared_memory) { + assert(("Error (GPU extract bits): base log should be <= 32", + base_log_bsk <= 32)); + assert(("Error (GPU extract bits): 
glwe_dimension should be equal to 1", + glwe_dimension == 1)); + assert(("Error (GPU extract bits): lwe_dimension_in should be one of " + "512, 1024, 2048, 4096, 8192", + lwe_dimension_in == 512 || lwe_dimension_in == 1024 || + lwe_dimension_in == 2048 || lwe_dimension_in == 4096 || + lwe_dimension_in == 8192)); + // The number of samples should be lower than 4 time the number of streaming + // multiprocessors divided by ((k + 1) * l) (the factor 4 being related + // to the occupancy of 50%). The only supported value for k is 1, so + // k + 1 = 2 for now. + int number_of_sm = 0; + cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0); + assert(("Error (GPU extract bits): the number of input LWEs must be lower or " + "equal to the " + "number of streaming multiprocessors on the device divided by 8 * " + "level_count_bsk", + number_of_samples <= number_of_sm * 4. / 2. / level_count_bsk)); + + switch (lwe_dimension_in) { + case 512: + host_extract_bits>( + v_stream, gpu_index, (uint32_t *)list_lwe_array_out, + (uint32_t *)lwe_array_in, (uint32_t *)lwe_array_in_buffer, + (uint32_t *)lwe_array_in_shifted_buffer, + (uint32_t *)lwe_array_out_ks_buffer, + (uint32_t *)lwe_array_out_pbs_buffer, (uint32_t *)lut_pbs, + (uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk, + number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out, + base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk, + number_of_samples, max_shared_memory); + break; + case 1024: + host_extract_bits>( + v_stream, gpu_index, (uint32_t *)list_lwe_array_out, + (uint32_t *)lwe_array_in, (uint32_t *)lwe_array_in_buffer, + (uint32_t *)lwe_array_in_shifted_buffer, + (uint32_t *)lwe_array_out_ks_buffer, + (uint32_t *)lwe_array_out_pbs_buffer, (uint32_t *)lut_pbs, + (uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk, + number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out, + base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk, + 
number_of_samples, max_shared_memory); + break; + case 2048: + host_extract_bits>( + v_stream, gpu_index, (uint32_t *)list_lwe_array_out, + (uint32_t *)lwe_array_in, (uint32_t *)lwe_array_in_buffer, + (uint32_t *)lwe_array_in_shifted_buffer, + (uint32_t *)lwe_array_out_ks_buffer, + (uint32_t *)lwe_array_out_pbs_buffer, (uint32_t *)lut_pbs, + (uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk, + number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out, + base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk, + number_of_samples, max_shared_memory); + break; + case 4096: + host_extract_bits>( + v_stream, gpu_index, (uint32_t *)list_lwe_array_out, + (uint32_t *)lwe_array_in, (uint32_t *)lwe_array_in_buffer, + (uint32_t *)lwe_array_in_shifted_buffer, + (uint32_t *)lwe_array_out_ks_buffer, + (uint32_t *)lwe_array_out_pbs_buffer, (uint32_t *)lut_pbs, + (uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk, + number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out, + base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk, + number_of_samples, max_shared_memory); + break; + case 8192: + host_extract_bits>( + v_stream, gpu_index, (uint32_t *)list_lwe_array_out, + (uint32_t *)lwe_array_in, (uint32_t *)lwe_array_in_buffer, + (uint32_t *)lwe_array_in_shifted_buffer, + (uint32_t *)lwe_array_out_ks_buffer, + (uint32_t *)lwe_array_out_pbs_buffer, (uint32_t *)lut_pbs, + (uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk, + number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out, + base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk, + number_of_samples, max_shared_memory); + break; + default: + break; + } +} + +void cuda_extract_bits_64( + void *v_stream, uint32_t gpu_index, void *list_lwe_array_out, + void *lwe_array_in, void *lwe_array_in_buffer, + void *lwe_array_in_shifted_buffer, void *lwe_array_out_ks_buffer, + void *lwe_array_out_pbs_buffer, void *lut_pbs, void 
*lut_vector_indexes, + void *ksk, void *fourier_bsk, uint32_t number_of_bits, uint32_t delta_log, + uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, + uint32_t glwe_dimension, uint32_t base_log_bsk, uint32_t level_count_bsk, + uint32_t base_log_ksk, uint32_t level_count_ksk, uint32_t number_of_samples, + uint32_t max_shared_memory) { + assert(("Error (GPU extract bits): base log should be <= 64", + base_log_bsk <= 64)); + assert(("Error (GPU extract bits): glwe_dimension should be equal to 1", + glwe_dimension == 1)); + assert(("Error (GPU extract bits): lwe_dimension_in should be one of " + "512, 1024, 2048, 4096, 8192", + lwe_dimension_in == 512 || lwe_dimension_in == 1024 || + lwe_dimension_in == 2048 || lwe_dimension_in == 4096 || + lwe_dimension_in == 8192)); + // The number of samples should be lower than four time the number of + // streaming multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being + // related to the occupancy of 50%). The only supported value for k is 1, so + // k + 1 = 2 for now. + int number_of_sm = 0; + cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0); + assert(("Error (GPU extract bits): the number of input LWEs must be lower or " + "equal to the " + "number of streaming multiprocessors on the device divided by 8 * " + "level_count_bsk", + number_of_samples <= number_of_sm * 4. / 2. 
/ level_count_bsk)); + + switch (lwe_dimension_in) { + case 512: + host_extract_bits>( + v_stream, gpu_index, (uint64_t *)list_lwe_array_out, + (uint64_t *)lwe_array_in, (uint64_t *)lwe_array_in_buffer, + (uint64_t *)lwe_array_in_shifted_buffer, + (uint64_t *)lwe_array_out_ks_buffer, + (uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs, + (uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk, + number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out, + base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk, + number_of_samples, max_shared_memory); + break; + case 1024: + host_extract_bits>( + v_stream, gpu_index, (uint64_t *)list_lwe_array_out, + (uint64_t *)lwe_array_in, (uint64_t *)lwe_array_in_buffer, + (uint64_t *)lwe_array_in_shifted_buffer, + (uint64_t *)lwe_array_out_ks_buffer, + (uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs, + (uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk, + number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out, + base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk, + number_of_samples, max_shared_memory); + break; + case 2048: + host_extract_bits>( + v_stream, gpu_index, (uint64_t *)list_lwe_array_out, + (uint64_t *)lwe_array_in, (uint64_t *)lwe_array_in_buffer, + (uint64_t *)lwe_array_in_shifted_buffer, + (uint64_t *)lwe_array_out_ks_buffer, + (uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs, + (uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk, + number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out, + base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk, + number_of_samples, max_shared_memory); + break; + case 4096: + host_extract_bits>( + v_stream, gpu_index, (uint64_t *)list_lwe_array_out, + (uint64_t *)lwe_array_in, (uint64_t *)lwe_array_in_buffer, + (uint64_t *)lwe_array_in_shifted_buffer, + (uint64_t *)lwe_array_out_ks_buffer, + (uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs, + 
(uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk, + number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out, + base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk, + number_of_samples, max_shared_memory); + break; + case 8192: + host_extract_bits>( + v_stream, gpu_index, (uint64_t *)list_lwe_array_out, + (uint64_t *)lwe_array_in, (uint64_t *)lwe_array_in_buffer, + (uint64_t *)lwe_array_in_shifted_buffer, + (uint64_t *)lwe_array_out_ks_buffer, + (uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs, + (uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk, + number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out, + base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk, + number_of_samples, max_shared_memory); + break; + default: + break; + } +} diff --git a/src/bit_extraction.cuh b/src/bit_extraction.cuh new file mode 100644 index 000000000..37fbfc3a1 --- /dev/null +++ b/src/bit_extraction.cuh @@ -0,0 +1,184 @@ +#ifndef BIT_EXTRACT_H +#define BIT_EXTRACT_H + +#include "cooperative_groups.h" + +#include "../include/helper_cuda.h" +#include "bootstrap.h" +#include "bootstrap_low_latency.cuh" +#include "device.h" +#include "keyswitch.cuh" +#include "polynomial/parameters.cuh" +#include "utils/timer.cuh" + +// only works for big lwe for ks+bs case +// state_lwe_buffer is copied from big lwe input +// shifted_lwe_buffer is scalar multiplication of lwe input +// blockIdx.x refers to input ciphertext id +template +__global__ void copy_and_shift_lwe(Torus *dst_copy, Torus *dst_shift, + Torus *src, Torus value) { + int blockId = blockIdx.x; + int tid = threadIdx.x; + auto cur_dst_copy = &dst_copy[blockId * (params::degree + 1)]; + auto cur_dst_shift = &dst_shift[blockId * (params::degree + 1)]; + auto cur_src = &src[blockId * (params::degree + 1)]; + +#pragma unroll + for (int i = 0; i < params::opt; i++) { + cur_dst_copy[tid] = cur_src[tid]; + cur_dst_shift[tid] = cur_src[tid] * value; + tid += 
params::degree / params::opt; + } + + if (threadIdx.x == params::degree / params::opt - 1) { + cur_dst_copy[params::degree] = cur_src[params::degree]; + cur_dst_shift[params::degree] = cur_src[params::degree] * value; + } +} + +// only works for small lwe in ks+bs case +// function copies lwe when length is not a power of two +template +__global__ void copy_small_lwe(Torus *dst, Torus *src, uint32_t small_lwe_size, + uint32_t number_of_bits, uint32_t lwe_id) { + + size_t blockId = blockIdx.x; + size_t threads_per_block = blockDim.x; + size_t opt = small_lwe_size / threads_per_block; + size_t rem = small_lwe_size & (threads_per_block - 1); + + auto cur_lwe_list = &dst[blockId * small_lwe_size * number_of_bits]; + auto cur_dst = &cur_lwe_list[lwe_id * small_lwe_size]; + auto cur_src = &src[blockId * small_lwe_size]; + + size_t tid = threadIdx.x; + for (int i = 0; i < opt; i++) { + cur_dst[tid] = cur_src[tid]; + tid += threads_per_block; + } + + if (threadIdx.x < rem) + cur_dst[tid] = cur_src[tid]; +} + +// only used in extract bits for one ciphertext +// should be called with one block and one thread +// NOTE: check if putting this functionality in copy_small_lwe or +// fill_pbs_lut vector is faster +template +__global__ void add_to_body(Torus *lwe, size_t lwe_dimension, Torus value) { + lwe[blockIdx.x * (lwe_dimension + 1) + lwe_dimension] += value; +} + +// Add alpha where alpha = delta*2^{bit_idx-1} to end up with an encryption of 0 +// if the extracted bit was 0 and 1 in the other case +// +// Remove the extracted bit from the state LWE to get a 0 at the extracted bit +// location. 
+// +// Shift on padding bit for next iteration, that's why +// alpha= 1ll << (ciphertext_n_bits - delta_log - bit_idx - 2) is used +// instead of alpha= 1ll << (ciphertext_n_bits - delta_log - bit_idx - 1) +template +__global__ void add_sub_and_mul_lwe(Torus *shifted_lwe, Torus *state_lwe, + Torus *pbs_lwe_array_out, Torus add_value, + Torus mul_value) { + size_t tid = threadIdx.x; + size_t blockId = blockIdx.x; + auto cur_shifted_lwe = &shifted_lwe[blockId * (params::degree + 1)]; + auto cur_state_lwe = &state_lwe[blockId * (params::degree + 1)]; + auto cur_pbs_lwe_array_out = + &pbs_lwe_array_out[blockId * (params::degree + 1)]; +#pragma unroll + for (int i = 0; i < params::opt; i++) { + cur_shifted_lwe[tid] = cur_state_lwe[tid] -= cur_pbs_lwe_array_out[tid]; + cur_shifted_lwe[tid] *= mul_value; + tid += params::degree / params::opt; + } + + if (threadIdx.x == params::degree / params::opt - 1) { + cur_shifted_lwe[params::degree] = cur_state_lwe[params::degree] -= + (cur_pbs_lwe_array_out[params::degree] + add_value); + cur_shifted_lwe[params::degree] *= mul_value; + } +} + +// Fill lut(only body) for the current bit (equivalent to trivial encryption as +// mask is 0s) +// The LUT is filled with value +template +__global__ void fill_lut_body_for_current_bit(Torus *lut, Torus value) { + + Torus *cur_poly = &lut[blockIdx.x * 2 * params::degree + params::degree]; + size_t tid = threadIdx.x; +#pragma unroll + for (int i = 0; i < params::opt; i++) { + cur_poly[tid] = value; + tid += params::degree / params::opt; + } +} + +template +__host__ void host_extract_bits( + void *v_stream, uint32_t gpu_index, Torus *list_lwe_array_out, + Torus *lwe_array_in, Torus *lwe_array_in_buffer, + Torus *lwe_array_in_shifted_buffer, Torus *lwe_array_out_ks_buffer, + Torus *lwe_array_out_pbs_buffer, Torus *lut_pbs, + uint32_t *lut_vector_indexes, Torus *ksk, double2 *fourier_bsk, + uint32_t number_of_bits, uint32_t delta_log, uint32_t lwe_dimension_in, + uint32_t lwe_dimension_out, 
uint32_t base_log_bsk, uint32_t level_count_bsk, + uint32_t base_log_ksk, uint32_t level_count_ksk, uint32_t number_of_samples, + uint32_t max_shared_memory) { + + auto stream = static_cast(v_stream); + uint32_t ciphertext_n_bits = sizeof(Torus) * 8; + + int blocks = 1; + int threads = params::degree / params::opt; + + copy_and_shift_lwe<<>>( + lwe_array_in_buffer, lwe_array_in_shifted_buffer, lwe_array_in, + 1ll << (ciphertext_n_bits - delta_log - 1)); + checkCudaErrors(cudaGetLastError()); + + for (int bit_idx = 0; bit_idx < number_of_bits; bit_idx++) { + cuda_keyswitch_lwe_ciphertext_vector( + v_stream, gpu_index, lwe_array_out_ks_buffer, + lwe_array_in_shifted_buffer, ksk, lwe_dimension_in, lwe_dimension_out, + base_log_ksk, level_count_ksk, 1); + + copy_small_lwe<<<1, 256, 0, *stream>>>( + list_lwe_array_out, lwe_array_out_ks_buffer, lwe_dimension_out + 1, + number_of_bits, number_of_bits - bit_idx - 1); + checkCudaErrors(cudaGetLastError()); + + if (bit_idx == number_of_bits - 1) { + break; + } + + add_to_body<<<1, 1, 0, *stream>>>(lwe_array_out_ks_buffer, + lwe_dimension_out, + 1ll << (ciphertext_n_bits - 2)); + checkCudaErrors(cudaGetLastError()); + + fill_lut_body_for_current_bit + <<>>( + lut_pbs, 0ll - 1ll << (delta_log - 1 + bit_idx)); + checkCudaErrors(cudaGetLastError()); + + host_bootstrap_low_latency( + v_stream, gpu_index, lwe_array_out_pbs_buffer, lut_pbs, + lut_vector_indexes, lwe_array_out_ks_buffer, fourier_bsk, + lwe_dimension_out, lwe_dimension_in, base_log_bsk, level_count_bsk, + number_of_samples, 1, max_shared_memory); + + add_sub_and_mul_lwe<<<1, threads, 0, *stream>>>( + lwe_array_in_shifted_buffer, lwe_array_in_buffer, + lwe_array_out_pbs_buffer, 1ll << (delta_log - 1 + bit_idx), + 1ll << (ciphertext_n_bits - delta_log - bit_idx - 2)); + checkCudaErrors(cudaGetLastError()); + } +} + +#endif // BIT_EXTRACT_H diff --git a/src/bootstrap_wop.cu b/src/bootstrap_wop.cu deleted file mode 100644 index 9aa150405..000000000 --- 
a/src/bootstrap_wop.cu +++ /dev/null @@ -1,599 +0,0 @@ -#include "bootstrap_wop.cuh" - -void cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index, void *glwe_array_out, - void *ggsw_in, void *lut_vector, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t r, uint32_t tau, - uint32_t max_shared_memory) { - - assert(("Error (GPU Cmux tree): base log should be <= 32", base_log <= 32)); - assert(("Error (GPU Cmux tree): polynomial size should be one of 512, 1024, " - "2048, 4096, 8192", - polynomial_size == 512 || polynomial_size == 1024 || - polynomial_size == 2048 || polynomial_size == 4096 || - polynomial_size == 8192)); - // For larger k we will need to adjust the mask size - assert(("Error (GPU Cmux tree): glwe_dimension should be equal to 1", - glwe_dimension == 1)); - assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should " - "be >= 1 ", - r >= 1)); - - switch (polynomial_size) { - case 512: - host_cmux_tree>( - v_stream, gpu_index, (uint32_t *)glwe_array_out, (uint32_t *)ggsw_in, - (uint32_t *)lut_vector, glwe_dimension, polynomial_size, base_log, - level_count, r, tau, max_shared_memory); - break; - case 1024: - host_cmux_tree>( - v_stream, gpu_index, (uint32_t *)glwe_array_out, (uint32_t *)ggsw_in, - (uint32_t *)lut_vector, glwe_dimension, polynomial_size, base_log, - level_count, r, tau, max_shared_memory); - break; - case 2048: - host_cmux_tree>( - v_stream, gpu_index, (uint32_t *)glwe_array_out, (uint32_t *)ggsw_in, - (uint32_t *)lut_vector, glwe_dimension, polynomial_size, base_log, - level_count, r, tau, max_shared_memory); - break; - case 4096: - host_cmux_tree>( - v_stream, gpu_index, (uint32_t *)glwe_array_out, (uint32_t *)ggsw_in, - (uint32_t *)lut_vector, glwe_dimension, polynomial_size, base_log, - level_count, r, tau, max_shared_memory); - break; - case 8192: - host_cmux_tree>( - v_stream, gpu_index, (uint32_t *)glwe_array_out, (uint32_t *)ggsw_in, - (uint32_t *)lut_vector, 
glwe_dimension, polynomial_size, base_log, - level_count, r, tau, max_shared_memory); - break; - default: - break; - } -} - -void cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index, void *glwe_array_out, - void *ggsw_in, void *lut_vector, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t r, uint32_t tau, - uint32_t max_shared_memory) { - - assert(("Error (GPU Cmux tree): base log should be <= 64", base_log <= 64)); - assert(("Error (GPU Cmux tree): polynomial size should be one of 512, 1024, " - "2048, 4096, 8192", - polynomial_size == 512 || polynomial_size == 1024 || - polynomial_size == 2048 || polynomial_size == 4096 || - polynomial_size == 8192)); - // For larger k we will need to adjust the mask size - assert(("Error (GPU Cmux tree): glwe_dimension should be equal to 1", - glwe_dimension == 1)); - assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should " - "be >= 1 ", - r >= 1)); - - switch (polynomial_size) { - case 512: - host_cmux_tree>( - v_stream, gpu_index, (uint64_t *)glwe_array_out, (uint64_t *)ggsw_in, - (uint64_t *)lut_vector, glwe_dimension, polynomial_size, base_log, - level_count, r, tau, max_shared_memory); - break; - case 1024: - host_cmux_tree>( - v_stream, gpu_index, (uint64_t *)glwe_array_out, (uint64_t *)ggsw_in, - (uint64_t *)lut_vector, glwe_dimension, polynomial_size, base_log, - level_count, r, tau, max_shared_memory); - break; - case 2048: - host_cmux_tree>( - v_stream, gpu_index, (uint64_t *)glwe_array_out, (uint64_t *)ggsw_in, - (uint64_t *)lut_vector, glwe_dimension, polynomial_size, base_log, - level_count, r, tau, max_shared_memory); - break; - case 4096: - host_cmux_tree>( - v_stream, gpu_index, (uint64_t *)glwe_array_out, (uint64_t *)ggsw_in, - (uint64_t *)lut_vector, glwe_dimension, polynomial_size, base_log, - level_count, r, tau, max_shared_memory); - break; - case 8192: - host_cmux_tree>( - v_stream, gpu_index, (uint64_t *)glwe_array_out, (uint64_t 
*)ggsw_in, - (uint64_t *)lut_vector, glwe_dimension, polynomial_size, base_log, - level_count, r, tau, max_shared_memory); - break; - default: - break; - } -} - -void cuda_extract_bits_32( - void *v_stream, uint32_t gpu_index, void *list_lwe_array_out, - void *lwe_array_in, void *lwe_array_in_buffer, - void *lwe_array_in_shifted_buffer, void *lwe_array_out_ks_buffer, - void *lwe_array_out_pbs_buffer, void *lut_pbs, void *lut_vector_indexes, - void *ksk, void *fourier_bsk, uint32_t number_of_bits, uint32_t delta_log, - uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, - uint32_t glwe_dimension, uint32_t base_log_bsk, uint32_t level_count_bsk, - uint32_t base_log_ksk, uint32_t level_count_ksk, uint32_t number_of_samples, - uint32_t max_shared_memory) { - assert(("Error (GPU extract bits): base log should be <= 32", - base_log_bsk <= 32)); - assert(("Error (GPU extract bits): glwe_dimension should be equal to 1", - glwe_dimension == 1)); - assert(("Error (GPU extract bits): lwe_dimension_in should be one of " - "512, 1024, 2048, 4096, 8192", - lwe_dimension_in == 512 || lwe_dimension_in == 1024 || - lwe_dimension_in == 2048 || lwe_dimension_in == 4096 || - lwe_dimension_in == 8192)); - // The number of samples should be lower than 4 time the number of streaming - // multiprocessors divided by ((k + 1) * l) (the factor 4 being related - // to the occupancy of 50%). The only supported value for k is 1, so - // k + 1 = 2 for now. - int number_of_sm = 0; - cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0); - assert(("Error (GPU extract bits): the number of input LWEs must be lower or " - "equal to the " - "number of streaming multiprocessors on the device divided by 8 * " - "level_count_bsk", - number_of_samples <= number_of_sm * 4. / 2. 
/ level_count_bsk)); - - switch (lwe_dimension_in) { - case 512: - host_extract_bits>( - v_stream, gpu_index, (uint32_t *)list_lwe_array_out, - (uint32_t *)lwe_array_in, (uint32_t *)lwe_array_in_buffer, - (uint32_t *)lwe_array_in_shifted_buffer, - (uint32_t *)lwe_array_out_ks_buffer, - (uint32_t *)lwe_array_out_pbs_buffer, (uint32_t *)lut_pbs, - (uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk, - number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out, - base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk, - number_of_samples, max_shared_memory); - break; - case 1024: - host_extract_bits>( - v_stream, gpu_index, (uint32_t *)list_lwe_array_out, - (uint32_t *)lwe_array_in, (uint32_t *)lwe_array_in_buffer, - (uint32_t *)lwe_array_in_shifted_buffer, - (uint32_t *)lwe_array_out_ks_buffer, - (uint32_t *)lwe_array_out_pbs_buffer, (uint32_t *)lut_pbs, - (uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk, - number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out, - base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk, - number_of_samples, max_shared_memory); - break; - case 2048: - host_extract_bits>( - v_stream, gpu_index, (uint32_t *)list_lwe_array_out, - (uint32_t *)lwe_array_in, (uint32_t *)lwe_array_in_buffer, - (uint32_t *)lwe_array_in_shifted_buffer, - (uint32_t *)lwe_array_out_ks_buffer, - (uint32_t *)lwe_array_out_pbs_buffer, (uint32_t *)lut_pbs, - (uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk, - number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out, - base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk, - number_of_samples, max_shared_memory); - break; - case 4096: - host_extract_bits>( - v_stream, gpu_index, (uint32_t *)list_lwe_array_out, - (uint32_t *)lwe_array_in, (uint32_t *)lwe_array_in_buffer, - (uint32_t *)lwe_array_in_shifted_buffer, - (uint32_t *)lwe_array_out_ks_buffer, - (uint32_t *)lwe_array_out_pbs_buffer, (uint32_t *)lut_pbs, - 
(uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk, - number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out, - base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk, - number_of_samples, max_shared_memory); - break; - case 8192: - host_extract_bits>( - v_stream, gpu_index, (uint32_t *)list_lwe_array_out, - (uint32_t *)lwe_array_in, (uint32_t *)lwe_array_in_buffer, - (uint32_t *)lwe_array_in_shifted_buffer, - (uint32_t *)lwe_array_out_ks_buffer, - (uint32_t *)lwe_array_out_pbs_buffer, (uint32_t *)lut_pbs, - (uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk, - number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out, - base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk, - number_of_samples, max_shared_memory); - break; - default: - break; - } -} - -void cuda_extract_bits_64( - void *v_stream, uint32_t gpu_index, void *list_lwe_array_out, - void *lwe_array_in, void *lwe_array_in_buffer, - void *lwe_array_in_shifted_buffer, void *lwe_array_out_ks_buffer, - void *lwe_array_out_pbs_buffer, void *lut_pbs, void *lut_vector_indexes, - void *ksk, void *fourier_bsk, uint32_t number_of_bits, uint32_t delta_log, - uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, - uint32_t glwe_dimension, uint32_t base_log_bsk, uint32_t level_count_bsk, - uint32_t base_log_ksk, uint32_t level_count_ksk, uint32_t number_of_samples, - uint32_t max_shared_memory) { - assert(("Error (GPU extract bits): base log should be <= 64", - base_log_bsk <= 64)); - assert(("Error (GPU extract bits): glwe_dimension should be equal to 1", - glwe_dimension == 1)); - assert(("Error (GPU extract bits): lwe_dimension_in should be one of " - "512, 1024, 2048, 4096, 8192", - lwe_dimension_in == 512 || lwe_dimension_in == 1024 || - lwe_dimension_in == 2048 || lwe_dimension_in == 4096 || - lwe_dimension_in == 8192)); - // The number of samples should be lower than four time the number of - // streaming multiprocessors divided by (4 * (k + 
1) * l) (the factor 4 being - // related to the occupancy of 50%). The only supported value for k is 1, so - // k + 1 = 2 for now. - int number_of_sm = 0; - cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0); - assert(("Error (GPU extract bits): the number of input LWEs must be lower or " - "equal to the " - "number of streaming multiprocessors on the device divided by 8 * " - "level_count_bsk", - number_of_samples <= number_of_sm * 4. / 2. / level_count_bsk)); - - switch (lwe_dimension_in) { - case 512: - host_extract_bits>( - v_stream, gpu_index, (uint64_t *)list_lwe_array_out, - (uint64_t *)lwe_array_in, (uint64_t *)lwe_array_in_buffer, - (uint64_t *)lwe_array_in_shifted_buffer, - (uint64_t *)lwe_array_out_ks_buffer, - (uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs, - (uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk, - number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out, - base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk, - number_of_samples, max_shared_memory); - break; - case 1024: - host_extract_bits>( - v_stream, gpu_index, (uint64_t *)list_lwe_array_out, - (uint64_t *)lwe_array_in, (uint64_t *)lwe_array_in_buffer, - (uint64_t *)lwe_array_in_shifted_buffer, - (uint64_t *)lwe_array_out_ks_buffer, - (uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs, - (uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk, - number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out, - base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk, - number_of_samples, max_shared_memory); - break; - case 2048: - host_extract_bits>( - v_stream, gpu_index, (uint64_t *)list_lwe_array_out, - (uint64_t *)lwe_array_in, (uint64_t *)lwe_array_in_buffer, - (uint64_t *)lwe_array_in_shifted_buffer, - (uint64_t *)lwe_array_out_ks_buffer, - (uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs, - (uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk, - 
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out, - base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk, - number_of_samples, max_shared_memory); - break; - case 4096: - host_extract_bits>( - v_stream, gpu_index, (uint64_t *)list_lwe_array_out, - (uint64_t *)lwe_array_in, (uint64_t *)lwe_array_in_buffer, - (uint64_t *)lwe_array_in_shifted_buffer, - (uint64_t *)lwe_array_out_ks_buffer, - (uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs, - (uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk, - number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out, - base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk, - number_of_samples, max_shared_memory); - break; - case 8192: - host_extract_bits>( - v_stream, gpu_index, (uint64_t *)list_lwe_array_out, - (uint64_t *)lwe_array_in, (uint64_t *)lwe_array_in_buffer, - (uint64_t *)lwe_array_in_shifted_buffer, - (uint64_t *)lwe_array_out_ks_buffer, - (uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs, - (uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk, - number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out, - base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk, - number_of_samples, max_shared_memory); - break; - default: - break; - } -} - -void cuda_blind_rotate_and_sample_extraction_64( - void *v_stream, uint32_t gpu_index, void *lwe_out, void *ggsw_in, - void *lut_vector, uint32_t mbr_size, uint32_t tau, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t base_log, uint32_t l_gadget, - uint32_t max_shared_memory) { - - switch (polynomial_size) { - case 512: - host_blind_rotate_and_sample_extraction>( - v_stream, gpu_index, (uint64_t *)lwe_out, (uint64_t *)ggsw_in, - (uint64_t *)lut_vector, mbr_size, tau, glwe_dimension, polynomial_size, - base_log, l_gadget, max_shared_memory); - break; - case 1024: - host_blind_rotate_and_sample_extraction>( - v_stream, gpu_index, (uint64_t *)lwe_out, (uint64_t *)ggsw_in, - 
(uint64_t *)lut_vector, mbr_size, tau, glwe_dimension, polynomial_size, - base_log, l_gadget, max_shared_memory); - break; - case 2048: - host_blind_rotate_and_sample_extraction>( - v_stream, gpu_index, (uint64_t *)lwe_out, (uint64_t *)ggsw_in, - (uint64_t *)lut_vector, mbr_size, tau, glwe_dimension, polynomial_size, - base_log, l_gadget, max_shared_memory); - break; - case 4096: - host_blind_rotate_and_sample_extraction>( - v_stream, gpu_index, (uint64_t *)lwe_out, (uint64_t *)ggsw_in, - (uint64_t *)lut_vector, mbr_size, tau, glwe_dimension, polynomial_size, - base_log, l_gadget, max_shared_memory); - break; - case 8192: - host_blind_rotate_and_sample_extraction>( - v_stream, gpu_index, (uint64_t *)lwe_out, (uint64_t *)ggsw_in, - (uint64_t *)lut_vector, mbr_size, tau, glwe_dimension, polynomial_size, - base_log, l_gadget, max_shared_memory); - break; - } -} - -void cuda_circuit_bootstrap_32( - void *v_stream, uint32_t gpu_index, void *ggsw_out, void *lwe_array_in, - void *fourier_bsk, void *fp_ksk_array, void *lwe_array_in_shifted_buffer, - void *lut_vector, void *lut_vector_indexes, void *lwe_array_out_pbs_buffer, - void *lwe_array_in_fp_ks_buffer, uint32_t delta_log, - uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t lwe_dimension, - uint32_t level_bsk, uint32_t base_log_bsk, uint32_t level_pksk, - uint32_t base_log_pksk, uint32_t level_cbs, uint32_t base_log_cbs, - uint32_t number_of_samples, uint32_t max_shared_memory) { - assert(("Error (GPU circuit bootstrap): glwe_dimension should be equal to 1", - glwe_dimension == 1)); - assert(("Error (GPU circuit bootstrap): polynomial_size should be one of " - "512, 1024, 2048, 4096, 8192", - polynomial_size == 512 || polynomial_size == 1024 || - polynomial_size == 2048 || polynomial_size == 4096 || - polynomial_size == 8192)); - // The number of samples should be lower than the number of streaming - // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related - // to the occupancy of 50%). 
The only supported value for k is 1, so - // k + 1 = 2 for now. - int number_of_sm = 0; - cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0); - assert(("Error (GPU extract bits): the number of input LWEs must be lower or " - "equal to the " - "number of streaming multiprocessors on the device divided by 8 * " - "level_count_bsk", - number_of_samples <= number_of_sm / 4. / 2. / level_bsk)); - switch (polynomial_size) { - case 512: - host_circuit_bootstrap>( - v_stream, gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in, - (double2 *)fourier_bsk, (uint32_t *)fp_ksk_array, - (uint32_t *)lwe_array_in_shifted_buffer, (uint32_t *)lut_vector, - (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_out_pbs_buffer, - (uint32_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size, - glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk, - base_log_pksk, level_cbs, base_log_cbs, number_of_samples, - max_shared_memory); - break; - case 1024: - host_circuit_bootstrap>( - v_stream, gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in, - (double2 *)fourier_bsk, (uint32_t *)fp_ksk_array, - (uint32_t *)lwe_array_in_shifted_buffer, (uint32_t *)lut_vector, - (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_out_pbs_buffer, - (uint32_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size, - glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk, - base_log_pksk, level_cbs, base_log_cbs, number_of_samples, - max_shared_memory); - break; - case 2048: - host_circuit_bootstrap>( - v_stream, gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in, - (double2 *)fourier_bsk, (uint32_t *)fp_ksk_array, - (uint32_t *)lwe_array_in_shifted_buffer, (uint32_t *)lut_vector, - (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_out_pbs_buffer, - (uint32_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size, - glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk, - base_log_pksk, level_cbs, base_log_cbs, 
number_of_samples, - max_shared_memory); - break; - case 4096: - host_circuit_bootstrap>( - v_stream, gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in, - (double2 *)fourier_bsk, (uint32_t *)fp_ksk_array, - (uint32_t *)lwe_array_in_shifted_buffer, (uint32_t *)lut_vector, - (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_out_pbs_buffer, - (uint32_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size, - glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk, - base_log_pksk, level_cbs, base_log_cbs, number_of_samples, - max_shared_memory); - break; - case 8192: - host_circuit_bootstrap>( - v_stream, gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in, - (double2 *)fourier_bsk, (uint32_t *)fp_ksk_array, - (uint32_t *)lwe_array_in_shifted_buffer, (uint32_t *)lut_vector, - (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_out_pbs_buffer, - (uint32_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size, - glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk, - base_log_pksk, level_cbs, base_log_cbs, number_of_samples, - max_shared_memory); - break; - default: - break; - } -} - -void cuda_circuit_bootstrap_64( - void *v_stream, uint32_t gpu_index, void *ggsw_out, void *lwe_array_in, - void *fourier_bsk, void *fp_ksk_array, void *lwe_array_in_shifted_buffer, - void *lut_vector, void *lut_vector_indexes, void *lwe_array_out_pbs_buffer, - void *lwe_array_in_fp_ks_buffer, uint32_t delta_log, - uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t lwe_dimension, - uint32_t level_bsk, uint32_t base_log_bsk, uint32_t level_pksk, - uint32_t base_log_pksk, uint32_t level_cbs, uint32_t base_log_cbs, - uint32_t number_of_samples, uint32_t max_shared_memory) { - assert(("Error (GPU circuit bootstrap): glwe_dimension should be equal to 1", - glwe_dimension == 1)); - assert(("Error (GPU circuit bootstrap): polynomial_size should be one of " - "512, 1024, 2048, 4096, 8192", - polynomial_size == 512 || polynomial_size == 1024 
|| - polynomial_size == 2048 || polynomial_size == 4096 || - polynomial_size == 8192)); - // The number of samples should be lower than the number of streaming - // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related - // to the occupancy of 50%). The only supported value for k is 1, so - // k + 1 = 2 for now. - int number_of_sm = 0; - cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0); - assert(("Error (GPU extract bits): the number of input LWEs must be lower or " - "equal to the " - "number of streaming multiprocessors on the device divided by 8 * " - "level_count_bsk", - number_of_samples <= number_of_sm / 4. / 2. / level_bsk)); - // The number of samples should be lower than the number of streaming - switch (polynomial_size) { - case 512: - host_circuit_bootstrap>( - v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in, - (double2 *)fourier_bsk, (uint64_t *)fp_ksk_array, - (uint64_t *)lwe_array_in_shifted_buffer, (uint64_t *)lut_vector, - (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_out_pbs_buffer, - (uint64_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size, - glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk, - base_log_pksk, level_cbs, base_log_cbs, number_of_samples, - max_shared_memory); - break; - case 1024: - host_circuit_bootstrap>( - v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in, - (double2 *)fourier_bsk, (uint64_t *)fp_ksk_array, - (uint64_t *)lwe_array_in_shifted_buffer, (uint64_t *)lut_vector, - (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_out_pbs_buffer, - (uint64_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size, - glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk, - base_log_pksk, level_cbs, base_log_cbs, number_of_samples, - max_shared_memory); - break; - case 2048: - host_circuit_bootstrap>( - v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in, - (double2 *)fourier_bsk, (uint64_t 
*)fp_ksk_array, - (uint64_t *)lwe_array_in_shifted_buffer, (uint64_t *)lut_vector, - (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_out_pbs_buffer, - (uint64_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size, - glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk, - base_log_pksk, level_cbs, base_log_cbs, number_of_samples, - max_shared_memory); - break; - case 4096: - host_circuit_bootstrap>( - v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in, - (double2 *)fourier_bsk, (uint64_t *)fp_ksk_array, - (uint64_t *)lwe_array_in_shifted_buffer, (uint64_t *)lut_vector, - (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_out_pbs_buffer, - (uint64_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size, - glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk, - base_log_pksk, level_cbs, base_log_cbs, number_of_samples, - max_shared_memory); - break; - case 8192: - host_circuit_bootstrap>( - v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in, - (double2 *)fourier_bsk, (uint64_t *)fp_ksk_array, - (uint64_t *)lwe_array_in_shifted_buffer, (uint64_t *)lut_vector, - (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_out_pbs_buffer, - (uint64_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size, - glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk, - base_log_pksk, level_cbs, base_log_cbs, number_of_samples, - max_shared_memory); - break; - default: - break; - } -} - -void cuda_circuit_bootstrap_vertical_packing_64( - void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in, - void *fourier_bsk, void *cbs_fpksk, void *lut_vector, - uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t lwe_dimension, - uint32_t level_count_bsk, uint32_t base_log_bsk, uint32_t level_count_pksk, - uint32_t base_log_pksk, uint32_t level_count_cbs, uint32_t base_log_cbs, - uint32_t number_of_inputs, uint32_t lut_number, - uint32_t max_shared_memory) { - assert(("Error (GPU 
circuit bootstrap): glwe_dimension should be equal to 1", - glwe_dimension == 1)); - assert(("Error (GPU circuit bootstrap): polynomial_size should be one of " - "512, 1024, 2048, 4096, 8192", - polynomial_size == 512 || polynomial_size == 1024 || - polynomial_size == 2048 || polynomial_size == 4096 || - polynomial_size == 8192)); - // The number of inputs should be lower than the number of streaming - // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related - // to the occupancy of 50%). The only supported value for k is 1, so - // k + 1 = 2 for now. - int number_of_sm = 0; - cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0); - assert(("Error (GPU extract bits): the number of input LWEs must be lower or " - "equal to the " - "number of streaming multiprocessors on the device divided by 8 * " - "level_count_bsk", - number_of_inputs <= number_of_sm / 4. / 2. / level_count_bsk)); - switch (polynomial_size) { - case 512: - host_circuit_bootstrap_vertical_packing>( - v_stream, gpu_index, (uint64_t *)lwe_array_out, - (uint64_t *)lwe_array_in, (uint64_t *)lut_vector, - (double2 *)fourier_bsk, (uint64_t *)cbs_fpksk, glwe_dimension, - lwe_dimension, polynomial_size, base_log_bsk, level_count_bsk, - base_log_pksk, level_count_pksk, base_log_cbs, level_count_cbs, - number_of_inputs, lut_number, max_shared_memory); - break; - case 1024: - host_circuit_bootstrap_vertical_packing>( - v_stream, gpu_index, (uint64_t *)lwe_array_out, - (uint64_t *)lwe_array_in, (uint64_t *)lut_vector, - (double2 *)fourier_bsk, (uint64_t *)cbs_fpksk, glwe_dimension, - lwe_dimension, polynomial_size, base_log_bsk, level_count_bsk, - base_log_pksk, level_count_pksk, base_log_cbs, level_count_cbs, - number_of_inputs, lut_number, max_shared_memory); - break; - case 2048: - host_circuit_bootstrap_vertical_packing>( - v_stream, gpu_index, (uint64_t *)lwe_array_out, - (uint64_t *)lwe_array_in, (uint64_t *)lut_vector, - (double2 *)fourier_bsk, (uint64_t 
*)cbs_fpksk, glwe_dimension, - lwe_dimension, polynomial_size, base_log_bsk, level_count_bsk, - base_log_pksk, level_count_pksk, base_log_cbs, level_count_cbs, - number_of_inputs, lut_number, max_shared_memory); - break; - case 4096: - host_circuit_bootstrap_vertical_packing>( - v_stream, gpu_index, (uint64_t *)lwe_array_out, - (uint64_t *)lwe_array_in, (uint64_t *)lut_vector, - (double2 *)fourier_bsk, (uint64_t *)cbs_fpksk, glwe_dimension, - lwe_dimension, polynomial_size, base_log_bsk, level_count_bsk, - base_log_pksk, level_count_pksk, base_log_cbs, level_count_cbs, - number_of_inputs, lut_number, max_shared_memory); - break; - case 8192: - host_circuit_bootstrap_vertical_packing>( - v_stream, gpu_index, (uint64_t *)lwe_array_out, - (uint64_t *)lwe_array_in, (uint64_t *)lut_vector, - (double2 *)fourier_bsk, (uint64_t *)cbs_fpksk, glwe_dimension, - lwe_dimension, polynomial_size, base_log_bsk, level_count_bsk, - base_log_pksk, level_count_pksk, base_log_cbs, level_count_cbs, - number_of_inputs, lut_number, max_shared_memory); - break; - default: - break; - } -} diff --git a/src/circuit_bootstrap.cu b/src/circuit_bootstrap.cu new file mode 100644 index 000000000..d43d531cc --- /dev/null +++ b/src/circuit_bootstrap.cu @@ -0,0 +1,178 @@ +#include "circuit_bootstrap.cuh" + +void cuda_circuit_bootstrap_32( + void *v_stream, uint32_t gpu_index, void *ggsw_out, void *lwe_array_in, + void *fourier_bsk, void *fp_ksk_array, void *lwe_array_in_shifted_buffer, + void *lut_vector, void *lut_vector_indexes, void *lwe_array_out_pbs_buffer, + void *lwe_array_in_fp_ks_buffer, uint32_t delta_log, + uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t lwe_dimension, + uint32_t level_bsk, uint32_t base_log_bsk, uint32_t level_pksk, + uint32_t base_log_pksk, uint32_t level_cbs, uint32_t base_log_cbs, + uint32_t number_of_samples, uint32_t max_shared_memory) { + assert(("Error (GPU circuit bootstrap): glwe_dimension should be equal to 1", + glwe_dimension == 1)); + 
assert(("Error (GPU circuit bootstrap): polynomial_size should be one of " + "512, 1024, 2048, 4096, 8192", + polynomial_size == 512 || polynomial_size == 1024 || + polynomial_size == 2048 || polynomial_size == 4096 || + polynomial_size == 8192)); + // The number of samples should be lower than the number of streaming + // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related + // to the occupancy of 50%). The only supported value for k is 1, so + // k + 1 = 2 for now. + int number_of_sm = 0; + cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0); + assert(("Error (GPU circuit bootstrap): the number of input LWEs must be lower or " + "equal to the " + "number of streaming multiprocessors on the device divided by 8 * " + "level_bsk", + number_of_samples <= number_of_sm / 4. / 2. / level_bsk)); + switch (polynomial_size) { + case 512: + host_circuit_bootstrap>( + v_stream, gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in, + (double2 *)fourier_bsk, (uint32_t *)fp_ksk_array, + (uint32_t *)lwe_array_in_shifted_buffer, (uint32_t *)lut_vector, + (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_out_pbs_buffer, + (uint32_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size, + glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk, + base_log_pksk, level_cbs, base_log_cbs, number_of_samples, + max_shared_memory); + break; + case 1024: + host_circuit_bootstrap>( + v_stream, gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in, + (double2 *)fourier_bsk, (uint32_t *)fp_ksk_array, + (uint32_t *)lwe_array_in_shifted_buffer, (uint32_t *)lut_vector, + (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_out_pbs_buffer, + (uint32_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size, + glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk, + base_log_pksk, level_cbs, base_log_cbs, number_of_samples, + max_shared_memory); + break; + case 2048: + host_circuit_bootstrap>( + v_stream, 
gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in, + (double2 *)fourier_bsk, (uint32_t *)fp_ksk_array, + (uint32_t *)lwe_array_in_shifted_buffer, (uint32_t *)lut_vector, + (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_out_pbs_buffer, + (uint32_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size, + glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk, + base_log_pksk, level_cbs, base_log_cbs, number_of_samples, + max_shared_memory); + break; + case 4096: + host_circuit_bootstrap>( + v_stream, gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in, + (double2 *)fourier_bsk, (uint32_t *)fp_ksk_array, + (uint32_t *)lwe_array_in_shifted_buffer, (uint32_t *)lut_vector, + (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_out_pbs_buffer, + (uint32_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size, + glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk, + base_log_pksk, level_cbs, base_log_cbs, number_of_samples, + max_shared_memory); + break; + case 8192: + host_circuit_bootstrap>( + v_stream, gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in, + (double2 *)fourier_bsk, (uint32_t *)fp_ksk_array, + (uint32_t *)lwe_array_in_shifted_buffer, (uint32_t *)lut_vector, + (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_out_pbs_buffer, + (uint32_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size, + glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk, + base_log_pksk, level_cbs, base_log_cbs, number_of_samples, + max_shared_memory); + break; + default: + break; + } +} + +void cuda_circuit_bootstrap_64( + void *v_stream, uint32_t gpu_index, void *ggsw_out, void *lwe_array_in, + void *fourier_bsk, void *fp_ksk_array, void *lwe_array_in_shifted_buffer, + void *lut_vector, void *lut_vector_indexes, void *lwe_array_out_pbs_buffer, + void *lwe_array_in_fp_ks_buffer, uint32_t delta_log, + uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t lwe_dimension, + uint32_t level_bsk, uint32_t 
base_log_bsk, uint32_t level_pksk, + uint32_t base_log_pksk, uint32_t level_cbs, uint32_t base_log_cbs, + uint32_t number_of_samples, uint32_t max_shared_memory) { + assert(("Error (GPU circuit bootstrap): glwe_dimension should be equal to 1", + glwe_dimension == 1)); + assert(("Error (GPU circuit bootstrap): polynomial_size should be one of " + "512, 1024, 2048, 4096, 8192", + polynomial_size == 512 || polynomial_size == 1024 || + polynomial_size == 2048 || polynomial_size == 4096 || + polynomial_size == 8192)); + // The number of samples should be lower than the number of streaming + // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related + // to the occupancy of 50%). The only supported value for k is 1, so + // k + 1 = 2 for now. + int number_of_sm = 0; + cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0); + assert(("Error (GPU circuit bootstrap): the number of input LWEs must be lower or " + "equal to the " + "number of streaming multiprocessors on the device divided by 8 * " + "level_bsk", + number_of_samples <= number_of_sm / 4. / 2. 
/ level_bsk)); + // The number of samples should be lower than the number of streaming + switch (polynomial_size) { + case 512: + host_circuit_bootstrap>( + v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in, + (double2 *)fourier_bsk, (uint64_t *)fp_ksk_array, + (uint64_t *)lwe_array_in_shifted_buffer, (uint64_t *)lut_vector, + (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_out_pbs_buffer, + (uint64_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size, + glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk, + base_log_pksk, level_cbs, base_log_cbs, number_of_samples, + max_shared_memory); + break; + case 1024: + host_circuit_bootstrap>( + v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in, + (double2 *)fourier_bsk, (uint64_t *)fp_ksk_array, + (uint64_t *)lwe_array_in_shifted_buffer, (uint64_t *)lut_vector, + (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_out_pbs_buffer, + (uint64_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size, + glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk, + base_log_pksk, level_cbs, base_log_cbs, number_of_samples, + max_shared_memory); + break; + case 2048: + host_circuit_bootstrap>( + v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in, + (double2 *)fourier_bsk, (uint64_t *)fp_ksk_array, + (uint64_t *)lwe_array_in_shifted_buffer, (uint64_t *)lut_vector, + (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_out_pbs_buffer, + (uint64_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size, + glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk, + base_log_pksk, level_cbs, base_log_cbs, number_of_samples, + max_shared_memory); + break; + case 4096: + host_circuit_bootstrap>( + v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in, + (double2 *)fourier_bsk, (uint64_t *)fp_ksk_array, + (uint64_t *)lwe_array_in_shifted_buffer, (uint64_t *)lut_vector, + (uint32_t *)lut_vector_indexes, (uint64_t 
*)lwe_array_out_pbs_buffer, + (uint64_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size, + glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk, + base_log_pksk, level_cbs, base_log_cbs, number_of_samples, + max_shared_memory); + break; + case 8192: + host_circuit_bootstrap>( + v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in, + (double2 *)fourier_bsk, (uint64_t *)fp_ksk_array, + (uint64_t *)lwe_array_in_shifted_buffer, (uint64_t *)lut_vector, + (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_out_pbs_buffer, + (uint64_t *)lwe_array_in_fp_ks_buffer, delta_log, polynomial_size, + glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk, + base_log_pksk, level_cbs, base_log_cbs, number_of_samples, + max_shared_memory); + break; + default: + break; + } +} diff --git a/src/circuit_bootstrap.cuh b/src/circuit_bootstrap.cuh new file mode 100644 index 000000000..71419b3cc --- /dev/null +++ b/src/circuit_bootstrap.cuh @@ -0,0 +1,139 @@ +#ifndef CBS_H +#define CBS_H + +#include "../include/helper_cuda.h" +#include "bit_extraction.cuh" +#include "bootstrap.h" +#include "bootstrap_amortized.cuh" +#include "device.h" +#include "keyswitch.cuh" +#include "polynomial/parameters.cuh" +#include "utils/timer.cuh" + +// works for lwe with generic sizes +// shifted_lwe_buffer is scalar multiplication of lwe input +// blockIdx.x refers to input ciphertext id +template +__global__ void shift_lwe_cbs(Torus *dst_shift, Torus *src, Torus value, + size_t lwe_size) { + + size_t blockId = blockIdx.y * gridDim.x + blockIdx.x; + size_t threads_per_block = blockDim.x; + size_t opt = lwe_size / threads_per_block; + size_t rem = lwe_size & (threads_per_block - 1); + + auto cur_dst = &dst_shift[blockId * lwe_size]; + auto cur_src = &src[blockIdx.y * lwe_size]; + + size_t tid = threadIdx.x; + for (size_t i = 0; i < opt; i++) { + cur_dst[tid] = cur_src[tid] * value; + tid += threads_per_block; + } + + if (threadIdx.x < rem) + cur_dst[tid] = 
cur_src[tid] * value; +} + +// Fill lut (equivalent to trivial encryption as mask is 0s) +// The LUT is filled with -alpha in each coefficient where +// alpha = 2^{log(q) - 1 - base_log * level} +template +__global__ void fill_lut_body_for_cbs(Torus *lut, uint32_t ciphertext_n_bits, + uint32_t base_log_cbs) { + + Torus *cur_mask = &lut[blockIdx.x * 2 * params::degree]; + Torus *cur_poly = &lut[blockIdx.x * 2 * params::degree + params::degree]; + size_t tid = threadIdx.x; +#pragma unroll + for (int i = 0; i < params::opt; i++) { + cur_mask[tid] = 0; + cur_poly[tid] = + 0ll - + (1ll << (ciphertext_n_bits - 1 - base_log_cbs * (blockIdx.x + 1))); + tid += params::degree / params::opt; + } +} + +template +__global__ void copy_add_lwe_cbs(Torus *lwe_dst, Torus *lwe_src, + uint32_t ciphertext_n_bits, + uint32_t base_log_cbs, uint32_t level_cbs) { + size_t tid = threadIdx.x; + size_t dst_lwe_id = blockIdx.x; + size_t src_lwe_id = dst_lwe_id / 2; + size_t cur_cbs_level = src_lwe_id % level_cbs + 1; + + auto cur_src = &lwe_src[src_lwe_id * (params::degree + 1)]; + auto cur_dst = &lwe_dst[dst_lwe_id * (params::degree + 1)]; +#pragma unroll + for (int i = 0; i < params::opt; i++) { + cur_dst[tid] = cur_src[tid]; + tid += params::degree / params::opt; + } + Torus val = 1ll << (ciphertext_n_bits - 1 - base_log_cbs * cur_cbs_level); + if (threadIdx.x == 0) { + cur_dst[params::degree] = cur_src[params::degree] + val; + } +} + +template +__host__ void host_circuit_bootstrap( + void *v_stream, uint32_t gpu_index, Torus *ggsw_out, Torus *lwe_array_in, + double2 *fourier_bsk, Torus *fp_ksk_array, + Torus *lwe_array_in_shifted_buffer, Torus *lut_vector, + uint32_t *lut_vector_indexes, Torus *lwe_array_out_pbs_buffer, + Torus *lwe_array_in_fp_ks_buffer, uint32_t delta_log, + uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t lwe_dimension, + uint32_t level_bsk, uint32_t base_log_bsk, uint32_t level_pksk, + uint32_t base_log_pksk, uint32_t level_cbs, uint32_t base_log_cbs, + 
uint32_t number_of_samples, uint32_t max_shared_memory) { + auto stream = static_cast(v_stream); + + uint32_t ciphertext_n_bits = sizeof(Torus) * 8; + uint32_t lwe_size = lwe_dimension + 1; + int pbs_count = number_of_samples * level_cbs; + + dim3 blocks(level_cbs, number_of_samples, 1); + int threads = 256; + + // Shift message LSB on padding bit, at this point we expect to have messages + // with only 1 bit of information + shift_lwe_cbs<<>>( + lwe_array_in_shifted_buffer, lwe_array_in, + 1LL << (ciphertext_n_bits - delta_log - 1), lwe_size); + + // Add q/4 to center the error while computing a negacyclic LUT + add_to_body + <<>>(lwe_array_in_shifted_buffer, lwe_dimension, + 1ll << (ciphertext_n_bits - 2)); + // Fill lut (equivalent to trivial encryption as mask is 0s) + // The LUT is filled with -alpha in each coefficient where + // alpha = 2^{log(q) - 1 - base_log * level} + fill_lut_body_for_cbs + <<>>( + lut_vector, ciphertext_n_bits, base_log_cbs); + + // Applying a negacyclic LUT on a ciphertext with one bit of message in the + // MSB and no bit of padding + host_bootstrap_amortized( + v_stream, gpu_index, lwe_array_out_pbs_buffer, lut_vector, + lut_vector_indexes, lwe_array_in_shifted_buffer, fourier_bsk, + lwe_dimension, polynomial_size, base_log_bsk, level_bsk, pbs_count, + level_cbs, 0, max_shared_memory); + + dim3 copy_grid(pbs_count * (glwe_dimension + 1), 1, 1); + dim3 copy_block(params::degree / params::opt, 1, 1); + // Add q/4 to center the error while computing a negacyclic LUT + // copy pbs result (glwe_dimension + 1) times to be an input of fp-ks + copy_add_lwe_cbs<<>>( + lwe_array_in_fp_ks_buffer, lwe_array_out_pbs_buffer, ciphertext_n_bits, + base_log_cbs, level_cbs); + + cuda_fp_keyswitch_lwe_to_glwe( + v_stream, ggsw_out, lwe_array_in_fp_ks_buffer, fp_ksk_array, + polynomial_size, glwe_dimension, polynomial_size, base_log_pksk, + level_pksk, pbs_count * (glwe_dimension + 1), glwe_dimension + 1); +} + +#endif // CBS_H diff --git 
a/src/crypto/ggsw.cuh b/src/crypto/ggsw.cuh index d258b1e8e..504c4ac81 100644 --- a/src/crypto/ggsw.cuh +++ b/src/crypto/ggsw.cuh @@ -1,6 +1,9 @@ #ifndef CONCRETE_CORE_GGSW_CUH #define CONCRETE_CORE_GGSW_CUH +#include "device.h" +#include "polynomial/parameters.cuh" + template __global__ void device_batch_fft_ggsw_vector(double2 *dest, T *src, char *device_mem) { diff --git a/src/vertical_packing.cu b/src/vertical_packing.cu new file mode 100644 index 000000000..1f5499698 --- /dev/null +++ b/src/vertical_packing.cu @@ -0,0 +1,151 @@ +#include "vertical_packing.cuh" + +void cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index, void *glwe_array_out, + void *ggsw_in, void *lut_vector, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t base_log, + uint32_t level_count, uint32_t r, uint32_t tau, + uint32_t max_shared_memory) { + + assert(("Error (GPU Cmux tree): base log should be <= 32", base_log <= 32)); + assert(("Error (GPU Cmux tree): polynomial size should be one of 512, 1024, " + "2048, 4096, 8192", + polynomial_size == 512 || polynomial_size == 1024 || + polynomial_size == 2048 || polynomial_size == 4096 || + polynomial_size == 8192)); + // For larger k we will need to adjust the mask size + assert(("Error (GPU Cmux tree): glwe_dimension should be equal to 1", + glwe_dimension == 1)); + assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should " + "be >= 1 ", + r >= 1)); + + switch (polynomial_size) { + case 512: + host_cmux_tree>( + v_stream, gpu_index, (uint32_t *)glwe_array_out, (uint32_t *)ggsw_in, + (uint32_t *)lut_vector, glwe_dimension, polynomial_size, base_log, + level_count, r, tau, max_shared_memory); + break; + case 1024: + host_cmux_tree>( + v_stream, gpu_index, (uint32_t *)glwe_array_out, (uint32_t *)ggsw_in, + (uint32_t *)lut_vector, glwe_dimension, polynomial_size, base_log, + level_count, r, tau, max_shared_memory); + break; + case 2048: + host_cmux_tree>( + v_stream, gpu_index, (uint32_t *)glwe_array_out, (uint32_t 
*)ggsw_in, + (uint32_t *)lut_vector, glwe_dimension, polynomial_size, base_log, + level_count, r, tau, max_shared_memory); + break; + case 4096: + host_cmux_tree>( + v_stream, gpu_index, (uint32_t *)glwe_array_out, (uint32_t *)ggsw_in, + (uint32_t *)lut_vector, glwe_dimension, polynomial_size, base_log, + level_count, r, tau, max_shared_memory); + break; + case 8192: + host_cmux_tree>( + v_stream, gpu_index, (uint32_t *)glwe_array_out, (uint32_t *)ggsw_in, + (uint32_t *)lut_vector, glwe_dimension, polynomial_size, base_log, + level_count, r, tau, max_shared_memory); + break; + default: + break; + } +} + +void cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index, void *glwe_array_out, + void *ggsw_in, void *lut_vector, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t base_log, + uint32_t level_count, uint32_t r, uint32_t tau, + uint32_t max_shared_memory) { + + assert(("Error (GPU Cmux tree): base log should be <= 64", base_log <= 64)); + assert(("Error (GPU Cmux tree): polynomial size should be one of 512, 1024, " + "2048, 4096, 8192", + polynomial_size == 512 || polynomial_size == 1024 || + polynomial_size == 2048 || polynomial_size == 4096 || + polynomial_size == 8192)); + // For larger k we will need to adjust the mask size + assert(("Error (GPU Cmux tree): glwe_dimension should be equal to 1", + glwe_dimension == 1)); + assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should " + "be >= 1 ", + r >= 1)); + + switch (polynomial_size) { + case 512: + host_cmux_tree>( + v_stream, gpu_index, (uint64_t *)glwe_array_out, (uint64_t *)ggsw_in, + (uint64_t *)lut_vector, glwe_dimension, polynomial_size, base_log, + level_count, r, tau, max_shared_memory); + break; + case 1024: + host_cmux_tree>( + v_stream, gpu_index, (uint64_t *)glwe_array_out, (uint64_t *)ggsw_in, + (uint64_t *)lut_vector, glwe_dimension, polynomial_size, base_log, + level_count, r, tau, max_shared_memory); + break; + case 2048: + host_cmux_tree>( + v_stream, gpu_index, 
(uint64_t *)glwe_array_out, (uint64_t *)ggsw_in, + (uint64_t *)lut_vector, glwe_dimension, polynomial_size, base_log, + level_count, r, tau, max_shared_memory); + break; + case 4096: + host_cmux_tree>( + v_stream, gpu_index, (uint64_t *)glwe_array_out, (uint64_t *)ggsw_in, + (uint64_t *)lut_vector, glwe_dimension, polynomial_size, base_log, + level_count, r, tau, max_shared_memory); + break; + case 8192: + host_cmux_tree>( + v_stream, gpu_index, (uint64_t *)glwe_array_out, (uint64_t *)ggsw_in, + (uint64_t *)lut_vector, glwe_dimension, polynomial_size, base_log, + level_count, r, tau, max_shared_memory); + break; + default: + break; + } +} + +void cuda_blind_rotate_and_sample_extraction_64( + void *v_stream, uint32_t gpu_index, void *lwe_out, void *ggsw_in, + void *lut_vector, uint32_t mbr_size, uint32_t tau, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t base_log, uint32_t l_gadget, + uint32_t max_shared_memory) { + + switch (polynomial_size) { + case 512: + host_blind_rotate_and_sample_extraction>( + v_stream, gpu_index, (uint64_t *)lwe_out, (uint64_t *)ggsw_in, + (uint64_t *)lut_vector, mbr_size, tau, glwe_dimension, polynomial_size, + base_log, l_gadget, max_shared_memory); + break; + case 1024: + host_blind_rotate_and_sample_extraction>( + v_stream, gpu_index, (uint64_t *)lwe_out, (uint64_t *)ggsw_in, + (uint64_t *)lut_vector, mbr_size, tau, glwe_dimension, polynomial_size, + base_log, l_gadget, max_shared_memory); + break; + case 2048: + host_blind_rotate_and_sample_extraction>( + v_stream, gpu_index, (uint64_t *)lwe_out, (uint64_t *)ggsw_in, + (uint64_t *)lut_vector, mbr_size, tau, glwe_dimension, polynomial_size, + base_log, l_gadget, max_shared_memory); + break; + case 4096: + host_blind_rotate_and_sample_extraction>( + v_stream, gpu_index, (uint64_t *)lwe_out, (uint64_t *)ggsw_in, + (uint64_t *)lut_vector, mbr_size, tau, glwe_dimension, polynomial_size, + base_log, l_gadget, max_shared_memory); + break; + case 8192: + 
host_blind_rotate_and_sample_extraction>( + v_stream, gpu_index, (uint64_t *)lwe_out, (uint64_t *)ggsw_in, + (uint64_t *)lut_vector, mbr_size, tau, glwe_dimension, polynomial_size, + base_log, l_gadget, max_shared_memory); + break; + } +} diff --git a/src/bootstrap_wop.cuh b/src/vertical_packing.cuh similarity index 54% rename from src/bootstrap_wop.cuh rename to src/vertical_packing.cuh index c93f037ba..4607698e9 100644 --- a/src/bootstrap_wop.cuh +++ b/src/vertical_packing.cuh @@ -1,25 +1,20 @@ -#ifndef WOP_PBS_H -#define WOP_PBS_H - -#include "cooperative_groups.h" +#ifndef VERTICAL_PACKING_H +#define VERTICAL_PACKING_H #include "../include/helper_cuda.h" #include "bootstrap.h" -#include "bootstrap_amortized.cuh" -#include "bootstrap_low_latency.cuh" #include "complex/operations.cuh" +#include "crypto/gadget.cuh" #include "crypto/ggsw.cuh" #include "crypto/torus.cuh" #include "device.h" #include "fft/bnsmfft.cuh" #include "fft/smfft.cuh" #include "fft/twiddles.cuh" -#include "keyswitch.cuh" #include "polynomial/functions.cuh" #include "polynomial/parameters.cuh" #include "polynomial/polynomial.cuh" #include "polynomial/polynomial_math.cuh" -#include "utils/kernel_dimensions.cuh" #include "utils/memory.cuh" #include "utils/timer.cuh" @@ -383,243 +378,6 @@ void host_cmux_tree(void *v_stream, uint32_t gpu_index, Torus *glwe_array_out, cuda_drop_async(d_mem, stream, gpu_index); } -// only works for big lwe for ks+bs case -// state_lwe_buffer is copied from big lwe input -// shifted_lwe_buffer is scalar multiplication of lwe input -// blockIdx.x refers to input ciphertext id -template -__global__ void copy_and_shift_lwe(Torus *dst_copy, Torus *dst_shift, - Torus *src, Torus value) { - int blockId = blockIdx.x; - int tid = threadIdx.x; - auto cur_dst_copy = &dst_copy[blockId * (params::degree + 1)]; - auto cur_dst_shift = &dst_shift[blockId * (params::degree + 1)]; - auto cur_src = &src[blockId * (params::degree + 1)]; - -#pragma unroll - for (int i = 0; i < 
params::opt; i++) { - cur_dst_copy[tid] = cur_src[tid]; - cur_dst_shift[tid] = cur_src[tid] * value; - tid += params::degree / params::opt; - } - - if (threadIdx.x == params::degree / params::opt - 1) { - cur_dst_copy[params::degree] = cur_src[params::degree]; - cur_dst_shift[params::degree] = cur_src[params::degree] * value; - } -} - -// works for lwe with generic sizes -// shifted_lwe_buffer is scalar multiplication of lwe input -// blockIdx.x refers to input ciphertext id -template -__global__ void shift_lwe_cbs(Torus *dst_shift, Torus *src, Torus value, - size_t lwe_size) { - - size_t blockId = blockIdx.y * gridDim.x + blockIdx.x; - size_t threads_per_block = blockDim.x; - size_t opt = lwe_size / threads_per_block; - size_t rem = lwe_size & (threads_per_block - 1); - - auto cur_dst = &dst_shift[blockId * lwe_size]; - auto cur_src = &src[blockIdx.y * lwe_size]; - - size_t tid = threadIdx.x; - for (size_t i = 0; i < opt; i++) { - cur_dst[tid] = cur_src[tid] * value; - tid += threads_per_block; - } - - if (threadIdx.x < rem) - cur_dst[tid] = cur_src[tid] * value; -} - -// only works for small lwe in ks+bs case -// function copies lwe when length is not a power of two -template -__global__ void copy_small_lwe(Torus *dst, Torus *src, uint32_t small_lwe_size, - uint32_t number_of_bits, uint32_t lwe_id) { - - size_t blockId = blockIdx.x; - size_t threads_per_block = blockDim.x; - size_t opt = small_lwe_size / threads_per_block; - size_t rem = small_lwe_size & (threads_per_block - 1); - - auto cur_lwe_list = &dst[blockId * small_lwe_size * number_of_bits]; - auto cur_dst = &cur_lwe_list[lwe_id * small_lwe_size]; - auto cur_src = &src[blockId * small_lwe_size]; - - size_t tid = threadIdx.x; - for (int i = 0; i < opt; i++) { - cur_dst[tid] = cur_src[tid]; - tid += threads_per_block; - } - - if (threadIdx.x < rem) - cur_dst[tid] = cur_src[tid]; -} - -// only used in extract bits for one ciphertext -// should be called with one block and one thread -// NOTE: check if 
putting this functionality in copy_small_lwe or -// fill_pbs_lut vector is faster -template -__global__ void add_to_body(Torus *lwe, size_t lwe_dimension, Torus value) { - lwe[blockIdx.x * (lwe_dimension + 1) + lwe_dimension] += value; -} - -// Fill lut(only body) for the current bit (equivalent to trivial encryption as -// mask is 0s) -// The LUT is filled with value -template -__global__ void fill_lut_body_for_current_bit(Torus *lut, Torus value) { - - Torus *cur_poly = &lut[blockIdx.x * 2 * params::degree + params::degree]; - size_t tid = threadIdx.x; -#pragma unroll - for (int i = 0; i < params::opt; i++) { - cur_poly[tid] = value; - tid += params::degree / params::opt; - } -} - -// Fill lut (equivalent to trivial encryption as mask is 0s) -// The LUT is filled with -alpha in each coefficient where -// alpha = 2^{log(q) - 1 - base_log * level} -template -__global__ void fill_lut_body_for_cbs(Torus *lut, uint32_t ciphertext_n_bits, - uint32_t base_log_cbs) { - - Torus *cur_mask = &lut[blockIdx.x * 2 * params::degree]; - Torus *cur_poly = &lut[blockIdx.x * 2 * params::degree + params::degree]; - size_t tid = threadIdx.x; -#pragma unroll - for (int i = 0; i < params::opt; i++) { - cur_mask[tid] = 0; - cur_poly[tid] = - 0ll - - (1ll << (ciphertext_n_bits - 1 - base_log_cbs * (blockIdx.x + 1))); - tid += params::degree / params::opt; - } -} - -template -__global__ void copy_add_lwe_cbs(Torus *lwe_dst, Torus *lwe_src, - uint32_t ciphertext_n_bits, - uint32_t base_log_cbs, uint32_t level_cbs) { - size_t tid = threadIdx.x; - size_t dst_lwe_id = blockIdx.x; - size_t src_lwe_id = dst_lwe_id / 2; - size_t cur_cbs_level = src_lwe_id % level_cbs + 1; - - auto cur_src = &lwe_src[src_lwe_id * (params::degree + 1)]; - auto cur_dst = &lwe_dst[dst_lwe_id * (params::degree + 1)]; -#pragma unroll - for (int i = 0; i < params::opt; i++) { - cur_dst[tid] = cur_src[tid]; - tid += params::degree / params::opt; - } - Torus val = 1ll << (ciphertext_n_bits - 1 - base_log_cbs * 
cur_cbs_level); - if (threadIdx.x == 0) { - cur_dst[params::degree] = cur_src[params::degree] + val; - } -} - -// Add alpha where alpha = delta*2^{bit_idx-1} to end up with an encryption of 0 -// if the extracted bit was 0 and 1 in the other case -// -// Remove the extracted bit from the state LWE to get a 0 at the extracted bit -// location. -// -// Shift on padding bit for next iteration, that's why -// alpha= 1ll << (ciphertext_n_bits - delta_log - bit_idx - 2) is used -// instead of alpha= 1ll << (ciphertext_n_bits - delta_log - bit_idx - 1) -template -__global__ void add_sub_and_mul_lwe(Torus *shifted_lwe, Torus *state_lwe, - Torus *pbs_lwe_array_out, Torus add_value, - Torus mul_value) { - size_t tid = threadIdx.x; - size_t blockId = blockIdx.x; - auto cur_shifted_lwe = &shifted_lwe[blockId * (params::degree + 1)]; - auto cur_state_lwe = &state_lwe[blockId * (params::degree + 1)]; - auto cur_pbs_lwe_array_out = - &pbs_lwe_array_out[blockId * (params::degree + 1)]; -#pragma unroll - for (int i = 0; i < params::opt; i++) { - cur_shifted_lwe[tid] = cur_state_lwe[tid] -= cur_pbs_lwe_array_out[tid]; - cur_shifted_lwe[tid] *= mul_value; - tid += params::degree / params::opt; - } - - if (threadIdx.x == params::degree / params::opt - 1) { - cur_shifted_lwe[params::degree] = cur_state_lwe[params::degree] -= - (cur_pbs_lwe_array_out[params::degree] + add_value); - cur_shifted_lwe[params::degree] *= mul_value; - } -} - -template -__host__ void host_extract_bits( - void *v_stream, uint32_t gpu_index, Torus *list_lwe_array_out, - Torus *lwe_array_in, Torus *lwe_array_in_buffer, - Torus *lwe_array_in_shifted_buffer, Torus *lwe_array_out_ks_buffer, - Torus *lwe_array_out_pbs_buffer, Torus *lut_pbs, - uint32_t *lut_vector_indexes, Torus *ksk, double2 *fourier_bsk, - uint32_t number_of_bits, uint32_t delta_log, uint32_t lwe_dimension_in, - uint32_t lwe_dimension_out, uint32_t base_log_bsk, uint32_t level_count_bsk, - uint32_t base_log_ksk, uint32_t level_count_ksk, uint32_t 
number_of_samples, - uint32_t max_shared_memory) { - - auto stream = static_cast(v_stream); - uint32_t ciphertext_n_bits = sizeof(Torus) * 8; - - int blocks = 1; - int threads = params::degree / params::opt; - - copy_and_shift_lwe<<>>( - lwe_array_in_buffer, lwe_array_in_shifted_buffer, lwe_array_in, - 1ll << (ciphertext_n_bits - delta_log - 1)); - checkCudaErrors(cudaGetLastError()); - - for (int bit_idx = 0; bit_idx < number_of_bits; bit_idx++) { - cuda_keyswitch_lwe_ciphertext_vector( - v_stream, gpu_index, lwe_array_out_ks_buffer, - lwe_array_in_shifted_buffer, ksk, lwe_dimension_in, lwe_dimension_out, - base_log_ksk, level_count_ksk, 1); - - copy_small_lwe<<<1, 256, 0, *stream>>>( - list_lwe_array_out, lwe_array_out_ks_buffer, lwe_dimension_out + 1, - number_of_bits, number_of_bits - bit_idx - 1); - checkCudaErrors(cudaGetLastError()); - - if (bit_idx == number_of_bits - 1) { - break; - } - - add_to_body<<<1, 1, 0, *stream>>>(lwe_array_out_ks_buffer, - lwe_dimension_out, - 1ll << (ciphertext_n_bits - 2)); - checkCudaErrors(cudaGetLastError()); - - fill_lut_body_for_current_bit - <<>>( - lut_pbs, 0ll - 1ll << (delta_log - 1 + bit_idx)); - checkCudaErrors(cudaGetLastError()); - - host_bootstrap_low_latency( - v_stream, gpu_index, lwe_array_out_pbs_buffer, lut_pbs, - lut_vector_indexes, lwe_array_out_ks_buffer, fourier_bsk, - lwe_dimension_out, lwe_dimension_in, base_log_bsk, level_count_bsk, - number_of_samples, 1, max_shared_memory); - - add_sub_and_mul_lwe<<<1, threads, 0, *stream>>>( - lwe_array_in_shifted_buffer, lwe_array_in_buffer, - lwe_array_out_pbs_buffer, 1ll << (delta_log - 1 + bit_idx), - 1ll << (ciphertext_n_bits - delta_log - bit_idx - 2)); - checkCudaErrors(cudaGetLastError()); - } -} - /* * Receives "tau" GLWE ciphertexts as LUTs and "mbr_size" GGSWs. Each block * computes the blind rotation loop + sample extraction for a single LUT. 
@@ -780,180 +538,4 @@ void host_blind_rotate_and_sample_extraction( if (max_shared_memory < memory_needed_per_block) cuda_drop_async(d_mem, stream, gpu_index); } - -template -__host__ void host_circuit_bootstrap( - void *v_stream, uint32_t gpu_index, Torus *ggsw_out, Torus *lwe_array_in, - double2 *fourier_bsk, Torus *fp_ksk_array, - Torus *lwe_array_in_shifted_buffer, Torus *lut_vector, - uint32_t *lut_vector_indexes, Torus *lwe_array_out_pbs_buffer, - Torus *lwe_array_in_fp_ks_buffer, uint32_t delta_log, - uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t lwe_dimension, - uint32_t level_bsk, uint32_t base_log_bsk, uint32_t level_pksk, - uint32_t base_log_pksk, uint32_t level_cbs, uint32_t base_log_cbs, - uint32_t number_of_samples, uint32_t max_shared_memory) { - auto stream = static_cast(v_stream); - - uint32_t ciphertext_n_bits = sizeof(Torus) * 8; - uint32_t lwe_size = lwe_dimension + 1; - int pbs_count = number_of_samples * level_cbs; - - dim3 blocks(level_cbs, number_of_samples, 1); - int threads = 256; - - // Shift message LSB on padding bit, at this point we expect to have messages - // with only 1 bit of information - shift_lwe_cbs<<>>( - lwe_array_in_shifted_buffer, lwe_array_in, - 1LL << (ciphertext_n_bits - delta_log - 1), lwe_size); - - // Add q/4 to center the error while computing a negacyclic LUT - add_to_body - <<>>(lwe_array_in_shifted_buffer, lwe_dimension, - 1ll << (ciphertext_n_bits - 2)); - // Fill lut (equivalent to trivial encryption as mask is 0s) - // The LUT is filled with -alpha in each coefficient where - // alpha = 2^{log(q) - 1 - base_log * level} - fill_lut_body_for_cbs - <<>>( - lut_vector, ciphertext_n_bits, base_log_cbs); - - // Applying a negacyclic LUT on a ciphertext with one bit of message in the - // MSB and no bit of padding - host_bootstrap_amortized( - v_stream, gpu_index, lwe_array_out_pbs_buffer, lut_vector, - lut_vector_indexes, lwe_array_in_shifted_buffer, fourier_bsk, - lwe_dimension, polynomial_size, 
base_log_bsk, level_bsk, pbs_count, - level_cbs, 0, max_shared_memory); - - dim3 copy_grid(pbs_count * (glwe_dimension + 1), 1, 1); - dim3 copy_block(params::degree / params::opt, 1, 1); - // Add q/4 to center the error while computing a negacyclic LUT - // copy pbs result (glwe_dimension + 1) times to be an input of fp-ks - copy_add_lwe_cbs<<>>( - lwe_array_in_fp_ks_buffer, lwe_array_out_pbs_buffer, ciphertext_n_bits, - base_log_cbs, level_cbs); - - cuda_fp_keyswitch_lwe_to_glwe( - v_stream, ggsw_out, lwe_array_in_fp_ks_buffer, fp_ksk_array, - polynomial_size, glwe_dimension, polynomial_size, base_log_pksk, - level_pksk, pbs_count * (glwe_dimension + 1), glwe_dimension + 1); -} - -// number_of_inputs is the total number of LWE ciphertexts passed to CBS + VP, -// i.e. tau * p where tau is the number of LUTs (the original number of LWEs -// before bit extraction) and p is the number of extracted bits -template -__host__ void host_circuit_bootstrap_vertical_packing( - void *v_stream, uint32_t gpu_index, Torus *lwe_array_out, - Torus *lwe_array_in, Torus *lut_vector, double2 *fourier_bsk, - Torus *cbs_fpksk, uint32_t glwe_dimension, uint32_t lwe_dimension, - uint32_t polynomial_size, uint32_t base_log_bsk, uint32_t level_count_bsk, - uint32_t base_log_pksk, uint32_t level_count_pksk, uint32_t base_log_cbs, - uint32_t level_count_cbs, uint32_t number_of_inputs, uint32_t lut_number, - uint32_t max_shared_memory) { - - auto stream = static_cast(v_stream); - - // allocate and initialize device pointers for circuit bootstrap - // output ggsw array for cbs - int ggsw_size = level_count_cbs * (glwe_dimension + 1) * - (glwe_dimension + 1) * polynomial_size; - Torus *ggsw_out = (Torus *)cuda_malloc_async( - number_of_inputs * ggsw_size * sizeof(Torus), stream, gpu_index); - // input lwe array for fp-ks - Torus *lwe_array_in_fp_ks_buffer = (Torus *)cuda_malloc_async( - number_of_inputs * level_count_cbs * (glwe_dimension + 1) * - (polynomial_size + 1) * sizeof(Torus), - stream, 
gpu_index); - // buffer for pbs output - Torus *lwe_array_out_pbs_buffer = - (Torus *)cuda_malloc_async(number_of_inputs * level_count_cbs * - (polynomial_size + 1) * sizeof(Torus), - stream, gpu_index); - // vector for shifted lwe input - Torus *lwe_array_in_shifted_buffer = (Torus *)cuda_malloc_async( - number_of_inputs * level_count_cbs * (lwe_dimension + 1) * sizeof(Torus), - stream, gpu_index); - // lut vector buffer for cbs - Torus *lut_vector_cbs = (Torus *)cuda_malloc_async( - level_count_cbs * (glwe_dimension + 1) * polynomial_size * sizeof(Torus), - stream, gpu_index); - // indexes of lut vectors for cbs - uint32_t *h_lut_vector_indexes = - (uint32_t *)malloc(number_of_inputs * level_count_cbs * sizeof(uint32_t)); - for (uint index = 0; index < level_count_cbs * number_of_inputs; index++) { - h_lut_vector_indexes[index] = index % level_count_cbs; - } - uint32_t *lut_vector_indexes = (uint32_t *)cuda_malloc_async( - number_of_inputs * level_count_cbs * sizeof(uint32_t), stream, gpu_index); - cuda_memcpy_async_to_gpu( - lut_vector_indexes, h_lut_vector_indexes, - number_of_inputs * level_count_cbs * sizeof(uint32_t), stream, gpu_index); - checkCudaErrors(cudaGetLastError()); - - uint32_t bits = sizeof(Torus) * 8; - uint32_t delta_log = (bits - 1); - - host_circuit_bootstrap( - v_stream, gpu_index, ggsw_out, lwe_array_in, fourier_bsk, cbs_fpksk, - lwe_array_in_shifted_buffer, lut_vector_cbs, lut_vector_indexes, - lwe_array_out_pbs_buffer, lwe_array_in_fp_ks_buffer, delta_log, - polynomial_size, glwe_dimension, lwe_dimension, level_count_bsk, - base_log_bsk, level_count_pksk, base_log_pksk, level_count_cbs, - base_log_cbs, number_of_inputs, max_shared_memory); - checkCudaErrors(cudaGetLastError()); - - // Free memory - cuda_drop_async(lwe_array_in_fp_ks_buffer, stream, gpu_index); - cuda_drop_async(lwe_array_in_shifted_buffer, stream, gpu_index); - cuda_drop_async(lwe_array_out_pbs_buffer, stream, gpu_index); - cuda_drop_async(lut_vector_cbs, stream, 
gpu_index); - cuda_drop_async(lut_vector_indexes, stream, gpu_index); - free(h_lut_vector_indexes); - - // number_of_inputs = tau * p is the total number of GGSWs - if (number_of_inputs > params::log2_degree) { - // split the vec of GGSW in two, the msb GGSW is for the CMux tree and the - // lsb GGSW is for the last blind rotation. - uint32_t r = number_of_inputs - params::log2_degree; - Torus *br_ggsw = (Torus *)ggsw_out + - (ptrdiff_t)(r * level_count_cbs * (glwe_dimension + 1) * - (glwe_dimension + 1) * polynomial_size); - Torus *glwe_array_out = (Torus *)cuda_malloc_async( - lut_number * (glwe_dimension + 1) * polynomial_size * sizeof(Torus), - stream, gpu_index); - // CMUX Tree - // r = tau * p - log2(N) - host_cmux_tree( - v_stream, gpu_index, glwe_array_out, ggsw_out, lut_vector, - glwe_dimension, polynomial_size, base_log_cbs, level_count_cbs, r, - lut_number, max_shared_memory); - checkCudaErrors(cudaGetLastError()); - cuda_drop_async(glwe_array_out, stream, gpu_index); - - // Blind rotation + sample extraction - // mbr = tau * p - r = log2(N) - host_blind_rotate_and_sample_extraction( - v_stream, gpu_index, lwe_array_out, br_ggsw, glwe_array_out, - number_of_inputs - r, lut_number, glwe_dimension, polynomial_size, - base_log_cbs, level_count_cbs, max_shared_memory); - } else { - // we need to expand the lut to fill the masks with zeros - Torus *lut_vector_glwe = (Torus *)cuda_malloc_async( - lut_number * (glwe_dimension + 1) * polynomial_size * sizeof(Torus), - stream, gpu_index); - add_padding_to_lut_async(lut_vector_glwe, lut_vector, - glwe_dimension, lut_number, stream); - checkCudaErrors(cudaGetLastError()); - - // Blind rotation + sample extraction - host_blind_rotate_and_sample_extraction( - v_stream, gpu_index, lwe_array_out, ggsw_out, lut_vector_glwe, - number_of_inputs, lut_number, glwe_dimension, polynomial_size, - base_log_cbs, level_count_cbs, max_shared_memory); - } - cuda_drop_async(ggsw_out, stream, gpu_index); -} - -#endif // WOP_PBS_H 
+#endif // VERTICAL_PACKING_H diff --git a/src/wop_bootstrap.cu b/src/wop_bootstrap.cu new file mode 100644 index 000000000..57fcb15c7 --- /dev/null +++ b/src/wop_bootstrap.cu @@ -0,0 +1,168 @@ +#include "wop_bootstrap.cuh" + +void cuda_circuit_bootstrap_vertical_packing_64( + void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in, + void *fourier_bsk, void *cbs_fpksk, void *lut_vector, + uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t lwe_dimension, + uint32_t level_count_bsk, uint32_t base_log_bsk, uint32_t level_count_pksk, + uint32_t base_log_pksk, uint32_t level_count_cbs, uint32_t base_log_cbs, + uint32_t number_of_inputs, uint32_t lut_number, + uint32_t max_shared_memory) { + assert(("Error (GPU circuit bootstrap): glwe_dimension should be equal to 1", + glwe_dimension == 1)); + assert(("Error (GPU circuit bootstrap): polynomial_size should be one of " + "512, 1024, 2048, 4096, 8192", + polynomial_size == 512 || polynomial_size == 1024 || + polynomial_size == 2048 || polynomial_size == 4096 || + polynomial_size == 8192)); + // The number of inputs should be lower than the number of streaming + // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related + // to the occupancy of 50%). The only supported value for k is 1, so + // k + 1 = 2 for now. + int number_of_sm = 0; + cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0); + assert(("Error (GPU circuit bootstrap): the number of input LWEs must be lower or " + "equal to the " + "number of streaming multiprocessors on the device divided by 8 * " + "level_count_bsk", + number_of_inputs <= number_of_sm / 4. / 2. 
/ level_count_bsk)); + switch (polynomial_size) { + case 512: + host_circuit_bootstrap_vertical_packing>( + v_stream, gpu_index, (uint64_t *)lwe_array_out, + (uint64_t *)lwe_array_in, (uint64_t *)lut_vector, + (double2 *)fourier_bsk, (uint64_t *)cbs_fpksk, glwe_dimension, + lwe_dimension, polynomial_size, base_log_bsk, level_count_bsk, + base_log_pksk, level_count_pksk, base_log_cbs, level_count_cbs, + number_of_inputs, lut_number, max_shared_memory); + break; + case 1024: + host_circuit_bootstrap_vertical_packing>( + v_stream, gpu_index, (uint64_t *)lwe_array_out, + (uint64_t *)lwe_array_in, (uint64_t *)lut_vector, + (double2 *)fourier_bsk, (uint64_t *)cbs_fpksk, glwe_dimension, + lwe_dimension, polynomial_size, base_log_bsk, level_count_bsk, + base_log_pksk, level_count_pksk, base_log_cbs, level_count_cbs, + number_of_inputs, lut_number, max_shared_memory); + break; + case 2048: + host_circuit_bootstrap_vertical_packing>( + v_stream, gpu_index, (uint64_t *)lwe_array_out, + (uint64_t *)lwe_array_in, (uint64_t *)lut_vector, + (double2 *)fourier_bsk, (uint64_t *)cbs_fpksk, glwe_dimension, + lwe_dimension, polynomial_size, base_log_bsk, level_count_bsk, + base_log_pksk, level_count_pksk, base_log_cbs, level_count_cbs, + number_of_inputs, lut_number, max_shared_memory); + break; + case 4096: + host_circuit_bootstrap_vertical_packing>( + v_stream, gpu_index, (uint64_t *)lwe_array_out, + (uint64_t *)lwe_array_in, (uint64_t *)lut_vector, + (double2 *)fourier_bsk, (uint64_t *)cbs_fpksk, glwe_dimension, + lwe_dimension, polynomial_size, base_log_bsk, level_count_bsk, + base_log_pksk, level_count_pksk, base_log_cbs, level_count_cbs, + number_of_inputs, lut_number, max_shared_memory); + break; + case 8192: + host_circuit_bootstrap_vertical_packing>( + v_stream, gpu_index, (uint64_t *)lwe_array_out, + (uint64_t *)lwe_array_in, (uint64_t *)lut_vector, + (double2 *)fourier_bsk, (uint64_t *)cbs_fpksk, glwe_dimension, + lwe_dimension, polynomial_size, base_log_bsk, 
level_count_bsk, + base_log_pksk, level_count_pksk, base_log_cbs, level_count_cbs, + number_of_inputs, lut_number, max_shared_memory); + break; + default: + break; + } +} + +void cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index, void *lwe_array_out, + void *lwe_array_in, void *lut_vector, void *fourier_bsk, + void *ksk, void *cbs_fpksk, uint32_t glwe_dimension, + uint32_t lwe_dimension, uint32_t polynomial_size, + uint32_t base_log_bsk, uint32_t level_count_bsk, + uint32_t base_log_ksk, uint32_t level_count_ksk, + uint32_t base_log_pksk, uint32_t level_count_pksk, + uint32_t base_log_cbs, uint32_t level_count_cbs, + uint32_t number_of_bits_of_message_including_padding, + uint32_t number_of_bits_to_extract, + uint32_t number_of_inputs, uint32_t max_shared_memory) { + assert(("Error (GPU WOP PBS): glwe_dimension should be equal to 1", + glwe_dimension == 1)); + assert(("Error (GPU WOP PBS): polynomial_size should be one of " + "512, 1024, 2048, 4096, 8192", + polynomial_size == 512 || polynomial_size == 1024 || + polynomial_size == 2048 || polynomial_size == 4096 || + polynomial_size == 8192)); + // The number of inputs should be lower than the number of streaming + // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related + // to the occupancy of 50%). The only supported value for k is 1, so + // k + 1 = 2 for now. + int number_of_sm = 0; + cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0); + assert(("Error (GPU WOP PBS): the number of input LWEs must be lower or " + "equal to the " + "number of streaming multiprocessors on the device divided by 8 * " + "level_count_bsk", + number_of_inputs <= number_of_sm / 4. / 2. 
/ level_count_bsk)); + switch (polynomial_size) { + case 512: + host_wop_pbs>( + v_stream, gpu_index, (uint64_t *)lwe_array_out, + (uint64_t *)lwe_array_in, (uint64_t *)lut_vector, + (double2 *)fourier_bsk, (uint64_t *)ksk, (uint64_t *)cbs_fpksk, + glwe_dimension, lwe_dimension, polynomial_size, base_log_bsk, + level_count_bsk, base_log_ksk, level_count_ksk, base_log_pksk, + level_count_pksk, base_log_cbs, level_count_cbs, + number_of_bits_of_message_including_padding, number_of_bits_to_extract, + number_of_inputs, max_shared_memory); + break; + case 1024: + host_wop_pbs>( + v_stream, gpu_index, (uint64_t *)lwe_array_out, + (uint64_t *)lwe_array_in, (uint64_t *)lut_vector, + (double2 *)fourier_bsk, (uint64_t *)ksk, (uint64_t *)cbs_fpksk, + glwe_dimension, lwe_dimension, polynomial_size, base_log_bsk, + level_count_bsk, base_log_ksk, level_count_ksk, base_log_pksk, + level_count_pksk, base_log_cbs, level_count_cbs, + number_of_bits_of_message_including_padding, number_of_bits_to_extract, + number_of_inputs, max_shared_memory); + break; + case 2048: + host_wop_pbs>( + v_stream, gpu_index, (uint64_t *)lwe_array_out, + (uint64_t *)lwe_array_in, (uint64_t *)lut_vector, + (double2 *)fourier_bsk, (uint64_t *)ksk, (uint64_t *)cbs_fpksk, + glwe_dimension, lwe_dimension, polynomial_size, base_log_bsk, + level_count_bsk, base_log_ksk, level_count_ksk, base_log_pksk, + level_count_pksk, base_log_cbs, level_count_cbs, + number_of_bits_of_message_including_padding, number_of_bits_to_extract, + number_of_inputs, max_shared_memory); + break; + case 4096: + host_wop_pbs>( + v_stream, gpu_index, (uint64_t *)lwe_array_out, + (uint64_t *)lwe_array_in, (uint64_t *)lut_vector, + (double2 *)fourier_bsk, (uint64_t *)ksk, (uint64_t *)cbs_fpksk, + glwe_dimension, lwe_dimension, polynomial_size, base_log_bsk, + level_count_bsk, base_log_ksk, level_count_ksk, base_log_pksk, + level_count_pksk, base_log_cbs, level_count_cbs, + number_of_bits_of_message_including_padding, 
number_of_bits_to_extract, + number_of_inputs, max_shared_memory); + break; + case 8192: + host_wop_pbs>( + v_stream, gpu_index, (uint64_t *)lwe_array_out, + (uint64_t *)lwe_array_in, (uint64_t *)lut_vector, + (double2 *)fourier_bsk, (uint64_t *)ksk, (uint64_t *)cbs_fpksk, + glwe_dimension, lwe_dimension, polynomial_size, base_log_bsk, + level_count_bsk, base_log_ksk, level_count_ksk, base_log_pksk, + level_count_pksk, base_log_cbs, level_count_cbs, + number_of_bits_of_message_including_padding, number_of_bits_to_extract, + number_of_inputs, max_shared_memory); + break; + default: + break; + } +} diff --git a/src/wop_bootstrap.cuh b/src/wop_bootstrap.cuh new file mode 100644 index 000000000..fcde83007 --- /dev/null +++ b/src/wop_bootstrap.cuh @@ -0,0 +1,212 @@ +#ifndef WOP_PBS_H +#define WOP_PBS_H + +#include "cooperative_groups.h" + +#include "../include/helper_cuda.h" +#include "bit_extraction.cuh" +#include "bootstrap.h" +#include "circuit_bootstrap.cuh" +#include "utils/kernel_dimensions.cuh" +#include "utils/memory.cuh" +#include "utils/timer.cuh" +#include "vertical_packing.cuh" + +template +__global__ void device_build_lut(Torus *lut_out, Torus *lut_in, + uint32_t glwe_dimension, uint32_t lut_number) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < glwe_dimension * params::degree * lut_number) { + int lut_index = index / (glwe_dimension * params::degree); + for (int j = 0; j < glwe_dimension; j++) { + lut_out[index + lut_index * (glwe_dimension + 1) * params::degree + + j * params::degree] = 0; + } + lut_out[index + lut_index * (glwe_dimension + 1) * params::degree + + glwe_dimension * params::degree] = lut_in[index]; + } +} + +// number_of_inputs is the total number of LWE ciphertexts passed to CBS + VP, +// i.e. 
tau * p where tau is the number of LUTs (the original number of LWEs +// before bit extraction) and p is the number of extracted bits +template +__host__ void host_circuit_bootstrap_vertical_packing( + void *v_stream, uint32_t gpu_index, Torus *lwe_array_out, + Torus *lwe_array_in, Torus *lut_vector, double2 *fourier_bsk, + Torus *cbs_fpksk, uint32_t glwe_dimension, uint32_t lwe_dimension, + uint32_t polynomial_size, uint32_t base_log_bsk, uint32_t level_count_bsk, + uint32_t base_log_pksk, uint32_t level_count_pksk, uint32_t base_log_cbs, + uint32_t level_count_cbs, uint32_t number_of_inputs, uint32_t lut_number, + uint32_t max_shared_memory) { + + auto stream = static_cast(v_stream); + + // allocate and initialize device pointers for circuit bootstrap + // output ggsw array for cbs + int ggsw_size = level_count_cbs * (glwe_dimension + 1) * + (glwe_dimension + 1) * polynomial_size; + Torus *ggsw_out = (Torus *)cuda_malloc_async( + number_of_inputs * ggsw_size * sizeof(Torus), stream, gpu_index); + // input lwe array for fp-ks + Torus *lwe_array_in_fp_ks_buffer = (Torus *)cuda_malloc_async( + number_of_inputs * level_count_cbs * (glwe_dimension + 1) * + (polynomial_size + 1) * sizeof(Torus), + stream, gpu_index); + // buffer for pbs output + Torus *lwe_array_out_pbs_buffer = + (Torus *)cuda_malloc_async(number_of_inputs * level_count_cbs * + (polynomial_size + 1) * sizeof(Torus), + stream, gpu_index); + // vector for shifted lwe input + Torus *lwe_array_in_shifted_buffer = (Torus *)cuda_malloc_async( + number_of_inputs * level_count_cbs * (lwe_dimension + 1) * sizeof(Torus), + stream, gpu_index); + // lut vector buffer for cbs + Torus *lut_vector_cbs = (Torus *)cuda_malloc_async( + level_count_cbs * (glwe_dimension + 1) * polynomial_size * sizeof(Torus), + stream, gpu_index); + // indexes of lut vectors for cbs + uint32_t *h_lut_vector_indexes = + (uint32_t *)malloc(number_of_inputs * level_count_cbs * sizeof(uint32_t)); + for (uint index = 0; index < 
level_count_cbs * number_of_inputs; index++) { + h_lut_vector_indexes[index] = index % level_count_cbs; + } + uint32_t *lut_vector_indexes = (uint32_t *)cuda_malloc_async( + number_of_inputs * level_count_cbs * sizeof(uint32_t), stream, gpu_index); + cuda_memcpy_async_to_gpu( + lut_vector_indexes, h_lut_vector_indexes, + number_of_inputs * level_count_cbs * sizeof(uint32_t), stream, gpu_index); + checkCudaErrors(cudaGetLastError()); + + uint32_t bits = sizeof(Torus) * 8; + uint32_t delta_log = (bits - 1); + + host_circuit_bootstrap( + v_stream, gpu_index, ggsw_out, lwe_array_in, fourier_bsk, cbs_fpksk, + lwe_array_in_shifted_buffer, lut_vector_cbs, lut_vector_indexes, + lwe_array_out_pbs_buffer, lwe_array_in_fp_ks_buffer, delta_log, + polynomial_size, glwe_dimension, lwe_dimension, level_count_bsk, + base_log_bsk, level_count_pksk, base_log_pksk, level_count_cbs, + base_log_cbs, number_of_inputs, max_shared_memory); + checkCudaErrors(cudaGetLastError()); + + // Free memory + cuda_drop_async(lwe_array_in_fp_ks_buffer, stream, gpu_index); + cuda_drop_async(lwe_array_in_shifted_buffer, stream, gpu_index); + cuda_drop_async(lwe_array_out_pbs_buffer, stream, gpu_index); + cuda_drop_async(lut_vector_cbs, stream, gpu_index); + cuda_drop_async(lut_vector_indexes, stream, gpu_index); + free(h_lut_vector_indexes); + + // number_of_inputs = tau * p is the total number of GGSWs + if (number_of_inputs > params::log2_degree) { + // split the vec of GGSW in two, the msb GGSW is for the CMux tree and the + // lsb GGSW is for the last blind rotation. 
+ uint32_t r = number_of_inputs - params::log2_degree; + Torus *br_ggsw = (Torus *)ggsw_out + + (ptrdiff_t)(r * level_count_cbs * (glwe_dimension + 1) * + (glwe_dimension + 1) * polynomial_size); + Torus *glwe_array_out = (Torus *)cuda_malloc_async( + lut_number * (glwe_dimension + 1) * polynomial_size * sizeof(Torus), + stream, gpu_index); + // CMUX Tree + // r = tau * p - log2(N) + host_cmux_tree( + v_stream, gpu_index, glwe_array_out, ggsw_out, lut_vector, + glwe_dimension, polynomial_size, base_log_cbs, level_count_cbs, r, + lut_number, max_shared_memory); + checkCudaErrors(cudaGetLastError()); + + // Blind rotation + sample extraction + // mbr = tau * p - r = log2(N) + host_blind_rotate_and_sample_extraction( + v_stream, gpu_index, lwe_array_out, br_ggsw, glwe_array_out, + number_of_inputs - r, lut_number, glwe_dimension, polynomial_size, + base_log_cbs, level_count_cbs, max_shared_memory); + cuda_drop_async(glwe_array_out, stream, gpu_index); + } else { + // we need to expand the lut to fill the masks with zeros + Torus *lut_vector_glwe = (Torus *)cuda_malloc_async( + lut_number * (glwe_dimension + 1) * polynomial_size * sizeof(Torus), + stream, gpu_index); + add_padding_to_lut_async(lut_vector_glwe, lut_vector, + glwe_dimension, lut_number, stream); + checkCudaErrors(cudaGetLastError()); + + // Blind rotation + sample extraction + host_blind_rotate_and_sample_extraction( + v_stream, gpu_index, lwe_array_out, ggsw_out, lut_vector_glwe, + number_of_inputs, lut_number, glwe_dimension, polynomial_size, + base_log_cbs, level_count_cbs, max_shared_memory); + } + cuda_drop_async(ggsw_out, stream, gpu_index); +} + +template +__host__ void host_wop_pbs( + void *v_stream, uint32_t gpu_index, Torus *lwe_array_out, + Torus *lwe_array_in, Torus *lut_vector, double2 *fourier_bsk, Torus *ksk, + Torus *cbs_fpksk, uint32_t glwe_dimension, uint32_t lwe_dimension, + uint32_t polynomial_size, uint32_t base_log_bsk, uint32_t level_count_bsk, + uint32_t base_log_ksk, uint32_t 
level_count_ksk, uint32_t base_log_pksk, + uint32_t level_count_pksk, uint32_t base_log_cbs, uint32_t level_count_cbs, + uint32_t number_of_bits_of_message_including_padding, + uint32_t number_of_bits_to_extract, uint32_t number_of_inputs, + uint32_t max_shared_memory) { + + auto stream = static_cast(v_stream); + + // let mut h_lut_vector_indexes = vec![0 as u32; 1]; + // indexes of lut vectors for bit extract + uint32_t *h_lut_vector_indexes = (uint32_t *)malloc(sizeof(uint32_t)); + h_lut_vector_indexes[0] = 0; + uint32_t *lut_vector_indexes = + (uint32_t *)cuda_malloc_async(sizeof(uint32_t), stream, gpu_index); + cuda_memcpy_async_to_gpu(lut_vector_indexes, h_lut_vector_indexes, + sizeof(uint32_t), stream, gpu_index); + checkCudaErrors(cudaGetLastError()); + Torus *lut_pbs = (Torus *)cuda_malloc_async( + (2 * polynomial_size) * sizeof(Torus), stream, gpu_index); + Torus *lwe_array_in_buffer = (Torus *)cuda_malloc_async( + (polynomial_size + 1) * sizeof(Torus), stream, gpu_index); + Torus *lwe_array_in_shifted_buffer = (Torus *)cuda_malloc_async( + (polynomial_size + 1) * sizeof(Torus), stream, gpu_index); + Torus *lwe_array_out_ks_buffer = (Torus *)cuda_malloc_async( + (lwe_dimension + 1) * sizeof(Torus), stream, gpu_index); + Torus *lwe_array_out_pbs_buffer = (Torus *)cuda_malloc_async( + (polynomial_size + 1) * sizeof(Torus), stream, gpu_index); + Torus *lwe_array_out_bit_extract = (Torus *)cuda_malloc_async( + (lwe_dimension + 1) * (number_of_bits_of_message_including_padding) * + sizeof(Torus), + stream, gpu_index); + uint32_t ciphertext_n_bits = sizeof(Torus) * 8; + uint32_t delta_log = + ciphertext_n_bits - number_of_bits_of_message_including_padding; + host_extract_bits( + v_stream, gpu_index, lwe_array_out_bit_extract, lwe_array_in, + lwe_array_in_buffer, lwe_array_in_shifted_buffer, lwe_array_out_ks_buffer, + lwe_array_out_pbs_buffer, lut_pbs, lut_vector_indexes, ksk, fourier_bsk, + number_of_bits_to_extract, delta_log, polynomial_size, lwe_dimension, + 
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk, + number_of_inputs, max_shared_memory); + checkCudaErrors(cudaGetLastError()); + cuda_drop_async(lut_pbs, stream, gpu_index); + cuda_drop_async(lut_vector_indexes, stream, gpu_index); + cuda_drop_async(lwe_array_in_buffer, stream, gpu_index); + cuda_drop_async(lwe_array_in_shifted_buffer, stream, gpu_index); + cuda_drop_async(lwe_array_out_ks_buffer, stream, gpu_index); + cuda_drop_async(lwe_array_out_pbs_buffer, stream, gpu_index); + + host_circuit_bootstrap_vertical_packing( + v_stream, gpu_index, lwe_array_out, lwe_array_out_bit_extract, lut_vector, + fourier_bsk, cbs_fpksk, glwe_dimension, lwe_dimension, polynomial_size, + base_log_bsk, level_count_bsk, base_log_pksk, level_count_pksk, + base_log_cbs, level_count_cbs, + number_of_inputs * number_of_bits_to_extract, number_of_inputs, + max_shared_memory); + + checkCudaErrors(cudaGetLastError()); + cuda_drop_async(lwe_array_out_bit_extract, stream, gpu_index); +} +#endif // WOP_PBS_H