diff --git a/include/bootstrap.h b/include/bootstrap.h index 5b1599509..f56169036 100644 --- a/include/bootstrap.h +++ b/include/bootstrap.h @@ -105,25 +105,66 @@ void cuda_circuit_bootstrap_64( uint32_t base_log_pksk, uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_samples, uint32_t max_shared_memory); +void scratch_cuda_circuit_bootstrap_vertical_packing_32( + void *v_stream, uint32_t gpu_index, void **cbs_vp_buffer, + uint32_t *cbs_delta_log, uint32_t glwe_dimension, uint32_t lwe_dimension, + uint32_t polynomial_size, uint32_t level_count_cbs, + uint32_t number_of_inputs, uint32_t tau, bool allocate_gpu_memory); + +void scratch_cuda_circuit_bootstrap_vertical_packing_64( + void *v_stream, uint32_t gpu_index, void **cbs_vp_buffer, + uint32_t *cbs_delta_log, uint32_t glwe_dimension, uint32_t lwe_dimension, + uint32_t polynomial_size, uint32_t level_count_cbs, + uint32_t number_of_inputs, uint32_t tau, bool allocate_gpu_memory); + +void scratch_cuda_wop_pbs_32( + void *v_stream, uint32_t gpu_index, void **wop_pbs_buffer, + uint32_t *delta_log, uint32_t *cbs_delta_log, uint32_t glwe_dimension, + uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t level_count_cbs, + uint32_t number_of_bits_of_message_including_padding, + uint32_t number_of_bits_to_extract, uint32_t number_of_inputs); + +void scratch_cuda_wop_pbs_64( + void *v_stream, uint32_t gpu_index, void **wop_pbs_buffer, + uint32_t *delta_log, uint32_t *cbs_delta_log, uint32_t glwe_dimension, + uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t level_count_cbs, + uint32_t number_of_bits_of_message_including_padding, + uint32_t number_of_bits_to_extract, uint32_t number_of_inputs); + void cuda_circuit_bootstrap_vertical_packing_64( void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in, - void *fourier_bsk, void *cbs_fpksk, void *lut_vector, - uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t lwe_dimension, - uint32_t level_count_bsk, uint32_t base_log_bsk, uint32_t level_count_pksk, - uint32_t base_log_pksk, uint32_t level_count_cbs, uint32_t base_log_cbs, - uint32_t number_of_inputs, uint32_t lut_number, uint32_t max_shared_memory); + void *fourier_bsk, void *cbs_fpksk, void *lut_vector, void *cbs_vp_buffer, + uint32_t cbs_delta_log, uint32_t polynomial_size, uint32_t glwe_dimension, + uint32_t lwe_dimension, uint32_t level_count_bsk, uint32_t base_log_bsk, + uint32_t level_count_pksk, uint32_t base_log_pksk, uint32_t level_count_cbs, + uint32_t base_log_cbs, uint32_t number_of_inputs, uint32_t lut_number, + uint32_t max_shared_memory); void cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in, void *lut_vector, void *fourier_bsk, - void *ksk, void *cbs_fpksk, uint32_t glwe_dimension, + void *ksk, void *cbs_fpksk, void *wop_pbs_buffer, + uint32_t cbs_delta_log, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log_bsk, uint32_t level_count_bsk, uint32_t base_log_ksk, uint32_t level_count_ksk, uint32_t base_log_pksk, uint32_t level_count_pksk, uint32_t base_log_cbs, uint32_t level_count_cbs, uint32_t number_of_bits_of_message_including_padding, - uint32_t number_of_bits_to_extract, + uint32_t number_of_bits_to_extract, uint32_t delta_log, uint32_t number_of_inputs, uint32_t max_shared_memory); + +void cleanup_cuda_wop_pbs_32(void *v_stream, uint32_t gpu_index, + void **wop_pbs_buffer); +void cleanup_cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index, + void **wop_pbs_buffer); + +void cleanup_cuda_circuit_bootstrap_vertical_packing_32(void *v_stream, + uint32_t gpu_index, + void **cbs_vp_buffer); + +void cleanup_cuda_circuit_bootstrap_vertical_packing_64(void *v_stream, + uint32_t gpu_index, + void **cbs_vp_buffer); } #ifdef __CUDACC__ diff --git a/src/bit_extraction.cuh b/src/bit_extraction.cuh index 3ea5b3216..5a91f35c9 100644 --- a/src/bit_extraction.cuh +++ b/src/bit_extraction.cuh @@ -137,12 +137,12 @@ __host__ void host_extract_bits( void *v_stream, uint32_t gpu_index, Torus *list_lwe_array_out, Torus *lwe_array_in, Torus *lwe_array_in_buffer, Torus *lwe_array_in_shifted_buffer, Torus *lwe_array_out_ks_buffer, - Torus *lwe_array_out_pbs_buffer, Torus *lut_pbs, - Torus *lut_vector_indexes, Torus *ksk, double2 *fourier_bsk, - uint32_t number_of_bits, uint32_t delta_log, uint32_t lwe_dimension_in, - uint32_t lwe_dimension_out, uint32_t glwe_dimension, uint32_t base_log_bsk, - uint32_t level_count_bsk, uint32_t base_log_ksk, uint32_t level_count_ksk, - uint32_t number_of_samples, uint32_t max_shared_memory) { + Torus *lwe_array_out_pbs_buffer, Torus *lut_pbs, Torus *lut_vector_indexes, + Torus *ksk, double2 *fourier_bsk, uint32_t number_of_bits, + uint32_t delta_log, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, + uint32_t glwe_dimension, uint32_t base_log_bsk, uint32_t level_count_bsk, + uint32_t base_log_ksk, uint32_t level_count_ksk, uint32_t number_of_samples, + uint32_t max_shared_memory) { cudaSetDevice(gpu_index); auto stream = static_cast(v_stream); diff --git a/src/bootstrap_low_latency.cuh b/src/bootstrap_low_latency.cuh index 5b0043944..dc9dfd7e5 100644 --- a/src/bootstrap_low_latency.cuh +++ b/src/bootstrap_low_latency.cuh @@ -248,9 +248,9 @@ __global__ void device_bootstrap_low_latency( template __host__ void host_bootstrap_low_latency( void *v_stream, uint32_t gpu_index, Torus *lwe_array_out, Torus *lut_vector, - Torus *lut_vector_indexes, Torus *lwe_array_in, - double2 *bootstrapping_key, uint32_t glwe_dimension, uint32_t lwe_dimension, - uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, + Torus *lut_vector_indexes, Torus *lwe_array_in, double2 *bootstrapping_key, + uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, + uint32_t base_log, uint32_t level_count, uint32_t input_lwe_ciphertext_count, uint32_t num_lut_vectors, uint32_t max_shared_memory) { diff --git a/src/wop_bootstrap.cu b/src/wop_bootstrap.cu index 4a71beadb..f9afc6fed 100644 --- a/src/wop_bootstrap.cu +++ b/src/wop_bootstrap.cu @@ -1,5 +1,79 @@ #include "wop_bootstrap.cuh" +/* + * This scratch function allocates the necessary amount of data on the GPU for + * the circuit bootstrap and vertical packing on 32 bits inputs, into + * `cbs_vp_buffer`. It also fills the value of delta_log to be used in the + * circuit bootstrap. + */ +void scratch_cuda_circuit_bootstrap_vertical_packing_32( + void *v_stream, uint32_t gpu_index, void **cbs_vp_buffer, + uint32_t *cbs_delta_log, uint32_t glwe_dimension, uint32_t lwe_dimension, + uint32_t polynomial_size, uint32_t level_count_cbs, + uint32_t number_of_inputs, uint32_t tau, bool allocate_gpu_memory) { + + scratch_circuit_bootstrap_vertical_packing( + v_stream, gpu_index, (uint32_t **)cbs_vp_buffer, cbs_delta_log, + glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs, + number_of_inputs, tau, allocate_gpu_memory); +} + +/* + * This scratch function allocates the necessary amount of data on the GPU for + * the circuit bootstrap and vertical packing on 64 bits inputs, into + * `cbs_vp_buffer`. It also fills the value of delta_log to be used in the + * circuit bootstrap. + */ +void scratch_cuda_circuit_bootstrap_vertical_packing_64( + void *v_stream, uint32_t gpu_index, void **cbs_vp_buffer, + uint32_t *cbs_delta_log, uint32_t glwe_dimension, uint32_t lwe_dimension, + uint32_t polynomial_size, uint32_t level_count_cbs, + uint32_t number_of_inputs, uint32_t tau, bool allocate_gpu_memory) { + + scratch_circuit_bootstrap_vertical_packing( + v_stream, gpu_index, (uint64_t **)cbs_vp_buffer, cbs_delta_log, + glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs, + number_of_inputs, tau, allocate_gpu_memory); +} + +/* + * This scratch function allocates the necessary amount of data on the GPU for + * the wop PBS on 32 bits inputs, into `wop_pbs_buffer`. It also fills the value + * of delta_log and cbs_delta_log to be used in the bit extract and circuit + * bootstrap. + */ +void scratch_cuda_wop_pbs_32( + void *v_stream, uint32_t gpu_index, void **wop_pbs_buffer, + uint32_t *delta_log, uint32_t *cbs_delta_log, uint32_t glwe_dimension, + uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t level_count_cbs, + uint32_t number_of_bits_of_message_including_padding, + uint32_t number_of_bits_to_extract, uint32_t number_of_inputs) { + scratch_wop_pbs(v_stream, gpu_index, (uint32_t **)wop_pbs_buffer, + delta_log, cbs_delta_log, glwe_dimension, + lwe_dimension, polynomial_size, level_count_cbs, + number_of_bits_of_message_including_padding, + number_of_bits_to_extract, number_of_inputs); +} + +/* + * This scratch function allocates the necessary amount of data on the GPU for + * the wop PBS on 64 bits inputs, into `wop_pbs_buffer`. It also fills the value + * of delta_log and cbs_delta_log to be used in the bit extract and circuit + * bootstrap. + */ +void scratch_cuda_wop_pbs_64( + void *v_stream, uint32_t gpu_index, void **wop_pbs_buffer, + uint32_t *delta_log, uint32_t *cbs_delta_log, uint32_t glwe_dimension, + uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t level_count_cbs, + uint32_t number_of_bits_of_message_including_padding, + uint32_t number_of_bits_to_extract, uint32_t number_of_inputs) { + scratch_wop_pbs(v_stream, gpu_index, (uint64_t **)wop_pbs_buffer, + delta_log, cbs_delta_log, glwe_dimension, + lwe_dimension, polynomial_size, level_count_cbs, + number_of_bits_of_message_including_padding, + number_of_bits_to_extract, number_of_inputs); +} + /* * Entry point for cuda circuit bootstrap + vertical packing for batches of * input 64 bit LWE ciphertexts. @@ -12,6 +86,7 @@ * compressed complex key. * - 'cbs_fpksk' list of private functional packing keyswitch keys * - 'lut_vector' list of test vectors + * - 'cbs_vp_buffer' a pre-allocated array to store intermediate results * - 'polynomial_size' size of the test polynomial, supported sizes: * {512, 1024, 2048, 4096, 8192} * - 'glwe_dimension' supported dimensions: {1} @@ -29,11 +104,11 @@ */ void cuda_circuit_bootstrap_vertical_packing_64( void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in, - void *fourier_bsk, void *cbs_fpksk, void *lut_vector, - uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t lwe_dimension, - uint32_t level_count_bsk, uint32_t base_log_bsk, uint32_t level_count_pksk, - uint32_t base_log_pksk, uint32_t level_count_cbs, uint32_t base_log_cbs, - uint32_t number_of_inputs, uint32_t lut_number, + void *fourier_bsk, void *cbs_fpksk, void *lut_vector, void *cbs_vp_buffer, + uint32_t cbs_delta_log, uint32_t polynomial_size, uint32_t glwe_dimension, + uint32_t lwe_dimension, uint32_t level_count_bsk, uint32_t base_log_bsk, + uint32_t level_count_pksk, uint32_t base_log_pksk, uint32_t level_count_cbs, + uint32_t base_log_cbs, uint32_t number_of_inputs, uint32_t lut_number, uint32_t max_shared_memory) { assert(("Error (GPU circuit bootstrap): polynomial_size should be one of " "512, 1024, 2048, 4096, 8192", @@ -56,46 +131,51 @@ void cuda_circuit_bootstrap_vertical_packing_64( host_circuit_bootstrap_vertical_packing>( v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lwe_array_in, (uint64_t *)lut_vector, - (double2 *)fourier_bsk, (uint64_t *)cbs_fpksk, glwe_dimension, - lwe_dimension, polynomial_size, base_log_bsk, level_count_bsk, - base_log_pksk, level_count_pksk, base_log_cbs, level_count_cbs, - number_of_inputs, lut_number, max_shared_memory); + (double2 *)fourier_bsk, (uint64_t *)cbs_fpksk, + (uint64_t *)cbs_vp_buffer, cbs_delta_log, glwe_dimension, lwe_dimension, + polynomial_size, base_log_bsk, level_count_bsk, base_log_pksk, + level_count_pksk, base_log_cbs, level_count_cbs, number_of_inputs, + lut_number, max_shared_memory); break; case 1024: host_circuit_bootstrap_vertical_packing>( v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lwe_array_in, (uint64_t *)lut_vector, - (double2 *)fourier_bsk, (uint64_t *)cbs_fpksk, glwe_dimension, - lwe_dimension, polynomial_size, base_log_bsk, level_count_bsk, - base_log_pksk, level_count_pksk, base_log_cbs, level_count_cbs, - number_of_inputs, lut_number, max_shared_memory); + (double2 *)fourier_bsk, (uint64_t *)cbs_fpksk, + (uint64_t *)cbs_vp_buffer, cbs_delta_log, glwe_dimension, lwe_dimension, + polynomial_size, base_log_bsk, level_count_bsk, base_log_pksk, + level_count_pksk, base_log_cbs, level_count_cbs, number_of_inputs, + lut_number, max_shared_memory); break; case 2048: host_circuit_bootstrap_vertical_packing>( v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lwe_array_in, (uint64_t *)lut_vector, - (double2 *)fourier_bsk, (uint64_t *)cbs_fpksk, glwe_dimension, - lwe_dimension, polynomial_size, base_log_bsk, level_count_bsk, - base_log_pksk, level_count_pksk, base_log_cbs, level_count_cbs, - number_of_inputs, lut_number, max_shared_memory); + (double2 *)fourier_bsk, (uint64_t *)cbs_fpksk, + (uint64_t *)cbs_vp_buffer, cbs_delta_log, glwe_dimension, lwe_dimension, + polynomial_size, base_log_bsk, level_count_bsk, base_log_pksk, + level_count_pksk, base_log_cbs, level_count_cbs, number_of_inputs, + lut_number, max_shared_memory); break; case 4096: host_circuit_bootstrap_vertical_packing>( v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lwe_array_in, (uint64_t *)lut_vector, - (double2 *)fourier_bsk, (uint64_t *)cbs_fpksk, glwe_dimension, - lwe_dimension, polynomial_size, base_log_bsk, level_count_bsk, - base_log_pksk, level_count_pksk, base_log_cbs, level_count_cbs, - number_of_inputs, lut_number, max_shared_memory); + (double2 *)fourier_bsk, (uint64_t *)cbs_fpksk, + (uint64_t *)cbs_vp_buffer, cbs_delta_log, glwe_dimension, lwe_dimension, + polynomial_size, base_log_bsk, level_count_bsk, base_log_pksk, + level_count_pksk, base_log_cbs, level_count_cbs, number_of_inputs, + lut_number, max_shared_memory); break; case 8192: host_circuit_bootstrap_vertical_packing>( v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lwe_array_in, (uint64_t *)lut_vector, - (double2 *)fourier_bsk, (uint64_t *)cbs_fpksk, glwe_dimension, - lwe_dimension, polynomial_size, base_log_bsk, level_count_bsk, - base_log_pksk, level_count_pksk, base_log_cbs, level_count_cbs, - number_of_inputs, lut_number, max_shared_memory); + (double2 *)fourier_bsk, (uint64_t *)cbs_fpksk, + (uint64_t *)cbs_vp_buffer, cbs_delta_log, glwe_dimension, lwe_dimension, + polynomial_size, base_log_bsk, level_count_bsk, base_log_pksk, + level_count_pksk, base_log_cbs, level_count_cbs, number_of_inputs, + lut_number, max_shared_memory); break; default: break; @@ -115,6 +195,7 @@ void cuda_circuit_bootstrap_vertical_packing_64( * compressed complex key. * - 'ksk' keyswitch key to use inside extract bits block * - 'cbs_fpksk' list of fp-keyswitch keys + * - 'wop_pbs_buffer' a pre-allocated array to store intermediate results * - 'glwe_dimension' supported dimensions: {1} * - 'lwe_dimension' dimension of input lwe ciphertexts * - 'polynomial_size' size of the test polynomial, supported sizes: @@ -138,14 +219,15 @@ void cuda_circuit_bootstrap_vertical_packing_64( */ void cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in, void *lut_vector, void *fourier_bsk, - void *ksk, void *cbs_fpksk, uint32_t glwe_dimension, + void *ksk, void *cbs_fpksk, void *wop_pbs_buffer, + uint32_t cbs_delta_log, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log_bsk, uint32_t level_count_bsk, uint32_t base_log_ksk, uint32_t level_count_ksk, uint32_t base_log_pksk, uint32_t level_count_pksk, uint32_t base_log_cbs, uint32_t level_count_cbs, uint32_t number_of_bits_of_message_including_padding, - uint32_t number_of_bits_to_extract, + uint32_t number_of_bits_to_extract, uint32_t delta_log, uint32_t number_of_inputs, uint32_t max_shared_memory) { assert(("Error (GPU WOP PBS): polynomial_size should be one of " "512, 1024, 2048, 4096, 8192", @@ -169,57 +251,100 @@ void cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index, void *lwe_array_out, v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lwe_array_in, (uint64_t *)lut_vector, (double2 *)fourier_bsk, (uint64_t *)ksk, (uint64_t *)cbs_fpksk, - glwe_dimension, lwe_dimension, polynomial_size, base_log_bsk, - level_count_bsk, base_log_ksk, level_count_ksk, base_log_pksk, - level_count_pksk, base_log_cbs, level_count_cbs, + (uint64_t *)wop_pbs_buffer, cbs_delta_log, glwe_dimension, + lwe_dimension, polynomial_size, base_log_bsk, level_count_bsk, + base_log_ksk, level_count_ksk, base_log_pksk, level_count_pksk, + base_log_cbs, level_count_cbs, number_of_bits_of_message_including_padding, number_of_bits_to_extract, - number_of_inputs, max_shared_memory); + delta_log, number_of_inputs, max_shared_memory); break; case 1024: host_wop_pbs>( v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lwe_array_in, (uint64_t *)lut_vector, (double2 *)fourier_bsk, (uint64_t *)ksk, (uint64_t *)cbs_fpksk, - glwe_dimension, lwe_dimension, polynomial_size, base_log_bsk, - level_count_bsk, base_log_ksk, level_count_ksk, base_log_pksk, - level_count_pksk, base_log_cbs, level_count_cbs, + (uint64_t *)wop_pbs_buffer, cbs_delta_log, glwe_dimension, + lwe_dimension, polynomial_size, base_log_bsk, level_count_bsk, + base_log_ksk, level_count_ksk, base_log_pksk, level_count_pksk, + base_log_cbs, level_count_cbs, number_of_bits_of_message_including_padding, number_of_bits_to_extract, - number_of_inputs, max_shared_memory); + delta_log, number_of_inputs, max_shared_memory); break; case 2048: host_wop_pbs>( v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lwe_array_in, (uint64_t *)lut_vector, (double2 *)fourier_bsk, (uint64_t *)ksk, (uint64_t *)cbs_fpksk, - glwe_dimension, lwe_dimension, polynomial_size, base_log_bsk, - level_count_bsk, base_log_ksk, level_count_ksk, base_log_pksk, - level_count_pksk, base_log_cbs, level_count_cbs, + (uint64_t *)wop_pbs_buffer, cbs_delta_log, glwe_dimension, + lwe_dimension, polynomial_size, base_log_bsk, level_count_bsk, + base_log_ksk, level_count_ksk, base_log_pksk, level_count_pksk, + base_log_cbs, level_count_cbs, number_of_bits_of_message_including_padding, number_of_bits_to_extract, - number_of_inputs, max_shared_memory); + delta_log, number_of_inputs, max_shared_memory); break; case 4096: host_wop_pbs>( v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lwe_array_in, (uint64_t *)lut_vector, (double2 *)fourier_bsk, (uint64_t *)ksk, (uint64_t *)cbs_fpksk, - glwe_dimension, lwe_dimension, polynomial_size, base_log_bsk, - level_count_bsk, base_log_ksk, level_count_ksk, base_log_pksk, - level_count_pksk, base_log_cbs, level_count_cbs, + (uint64_t *)wop_pbs_buffer, cbs_delta_log, glwe_dimension, + lwe_dimension, polynomial_size, base_log_bsk, level_count_bsk, + base_log_ksk, level_count_ksk, base_log_pksk, level_count_pksk, + base_log_cbs, level_count_cbs, number_of_bits_of_message_including_padding, number_of_bits_to_extract, - number_of_inputs, max_shared_memory); + delta_log, number_of_inputs, max_shared_memory); break; case 8192: host_wop_pbs>( v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lwe_array_in, (uint64_t *)lut_vector, (double2 *)fourier_bsk, (uint64_t *)ksk, (uint64_t *)cbs_fpksk, - glwe_dimension, lwe_dimension, polynomial_size, base_log_bsk, - level_count_bsk, base_log_ksk, level_count_ksk, base_log_pksk, - level_count_pksk, base_log_cbs, level_count_cbs, + (uint64_t *)wop_pbs_buffer, cbs_delta_log, glwe_dimension, + lwe_dimension, polynomial_size, base_log_bsk, level_count_bsk, + base_log_ksk, level_count_ksk, base_log_pksk, level_count_pksk, + base_log_cbs, level_count_cbs, number_of_bits_of_message_including_padding, number_of_bits_to_extract, - number_of_inputs, max_shared_memory); + delta_log, number_of_inputs, max_shared_memory); break; default: break; } } + +/* + * This cleanup function frees the data for the wop PBS on GPU in wop_pbs_buffer + * for 32 bits inputs. + */ +void cleanup_cuda_wop_pbs_32(void *v_stream, uint32_t gpu_index, + void **wop_pbs_buffer) { + cleanup_wop_pbs(v_stream, gpu_index, (uint32_t **)wop_pbs_buffer); +} +/* + * This cleanup function frees the data for the wop PBS on GPU in wop_pbs_buffer + * for 64 bits inputs. + */ +void cleanup_cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index, + void **wop_pbs_buffer) { + cleanup_wop_pbs(v_stream, gpu_index, (uint64_t **)wop_pbs_buffer); +} + +/* + * This cleanup function frees the data for the circuit bootstrap and vertical + * packing on GPU in cbs_vp_buffer for 32 bits inputs. + */ +void cleanup_cuda_circuit_bootstrap_vertical_packing_32(void *v_stream, + uint32_t gpu_index, + void **cbs_vp_buffer) { + cleanup_circuit_bootstrap_vertical_packing( + v_stream, gpu_index, (uint32_t **)cbs_vp_buffer); +} +/* + * This cleanup function frees the data for the circuit bootstrap and vertical + * packing on GPU in cbs_vp_buffer for 64 bits inputs. + */ +void cleanup_cuda_circuit_bootstrap_vertical_packing_64(void *v_stream, + uint32_t gpu_index, + void **cbs_vp_buffer) { + cleanup_circuit_bootstrap_vertical_packing( + v_stream, gpu_index, (uint64_t **)cbs_vp_buffer); +} diff --git a/src/wop_bootstrap.cuh b/src/wop_bootstrap.cuh index 3bf7834c6..ba1e6ddad 100644 --- a/src/wop_bootstrap.cuh +++ b/src/wop_bootstrap.cuh @@ -26,6 +26,88 @@ __global__ void device_build_lut(Torus *lut_out, Torus *lut_in, } } +template +__host__ __device__ int +get_buffer_size_cbs_vp(uint32_t glwe_dimension, uint32_t lwe_dimension, + uint32_t polynomial_size, uint32_t level_count_cbs, + uint32_t number_of_inputs, uint32_t tau) { + + int ggsw_size = level_count_cbs * (glwe_dimension + 1) * + (glwe_dimension + 1) * polynomial_size; + return number_of_inputs * level_count_cbs * + sizeof(Torus) // lut_vector_indexes + + number_of_inputs * ggsw_size * sizeof(Torus) // ggsw_out + + + number_of_inputs * level_count_cbs * (glwe_dimension + 1) * + (polynomial_size + 1) * sizeof(Torus) // lwe_array_in_fp_ks_buffer + + number_of_inputs * level_count_cbs * (polynomial_size + 1) * + sizeof(Torus) // lwe_array_out_pbs_buffer + + number_of_inputs * level_count_cbs * (lwe_dimension + 1) * + sizeof(Torus) // lwe_array_in_shifted_buffer + + level_count_cbs * (glwe_dimension + 1) * polynomial_size * + sizeof(Torus) // lut_vector_cbs + + tau * (glwe_dimension + 1) * polynomial_size * + sizeof(Torus); // glwe_array_out +} + +template +__host__ void scratch_circuit_bootstrap_vertical_packing( + void *v_stream, uint32_t gpu_index, Torus **cbs_vp_buffer, + uint32_t *cbs_delta_log, uint32_t glwe_dimension, uint32_t lwe_dimension, + uint32_t polynomial_size, uint32_t level_count_cbs, + uint32_t number_of_inputs, uint32_t tau, bool allocate_gpu_memory) { + + cudaSetDevice(gpu_index); + auto stream = static_cast(v_stream); + + // Allocate lut vector indexes on the CPU first to avoid blocking the stream + Torus *h_lut_vector_indexes = + (Torus *)malloc(number_of_inputs * level_count_cbs * sizeof(Torus)); + // allocate and initialize device pointers for circuit bootstrap and vertical + // packing + if (allocate_gpu_memory) { + int buffer_size = get_buffer_size_cbs_vp( + glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs, + number_of_inputs, tau); + *cbs_vp_buffer = (Torus *)cuda_malloc_async(buffer_size, stream, gpu_index); + } + // indexes of lut vectors for cbs + for (uint index = 0; index < level_count_cbs * number_of_inputs; index++) { + h_lut_vector_indexes[index] = index % level_count_cbs; + } + // lut_vector_indexes is the first buffer in the cbs_vp_buffer + cuda_memcpy_async_to_gpu(*cbs_vp_buffer, h_lut_vector_indexes, + number_of_inputs * level_count_cbs * sizeof(Torus), + stream, gpu_index); + check_cuda_error(cudaStreamSynchronize(*stream)); + free(h_lut_vector_indexes); + check_cuda_error(cudaGetLastError()); + + uint32_t bits = sizeof(Torus) * 8; + *cbs_delta_log = (bits - 1); +} + +/* + * Cleanup functions free the necessary data on the GPU and on the CPU. + * Data that lives on the CPU is prefixed with `h_`. This cleanup function thus + * frees the data for the circuit bootstrap and vertical packing on GPU: + * - ggsw_out + * - lwe_array_in_fp_ks_buffer + * - lwe_array_out_pbs_buffer + * - lwe_array_in_shifted buffer + * - lut_vector_cbs + * - lut_vector_indexes + */ +template +__host__ void +cleanup_circuit_bootstrap_vertical_packing(void *v_stream, uint32_t gpu_index, + Torus **cbs_vp_buffer) { + + auto stream = static_cast(v_stream); + // Free memory + cuda_drop_async(*cbs_vp_buffer, stream, gpu_index); +} + // number_of_inputs is the total number of LWE ciphertexts passed to CBS + VP, // i.e. tau * p where tau is the number of LUTs (the original number of LWEs // before bit extraction) and p is the number of extracted bits @@ -33,79 +115,46 @@ template __host__ void host_circuit_bootstrap_vertical_packing( void *v_stream, uint32_t gpu_index, Torus *lwe_array_out, Torus *lwe_array_in, Torus *lut_vector, double2 *fourier_bsk, - Torus *cbs_fpksk, uint32_t glwe_dimension, uint32_t lwe_dimension, - uint32_t polynomial_size, uint32_t base_log_bsk, uint32_t level_count_bsk, - uint32_t base_log_pksk, uint32_t level_count_pksk, uint32_t base_log_cbs, - uint32_t level_count_cbs, uint32_t number_of_inputs, uint32_t tau, - uint32_t max_shared_memory) { + Torus *cbs_fpksk, Torus *cbs_vp_buffer, uint32_t cbs_delta_log, + uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, + uint32_t base_log_bsk, uint32_t level_count_bsk, uint32_t base_log_pksk, + uint32_t level_count_pksk, uint32_t base_log_cbs, uint32_t level_count_cbs, + uint32_t number_of_inputs, uint32_t tau, uint32_t max_shared_memory) { - cudaSetDevice(gpu_index); - auto stream = static_cast(v_stream); - - // allocate and initialize device pointers for circuit bootstrap - // output ggsw array for cbs int ggsw_size = level_count_cbs * (glwe_dimension + 1) * (glwe_dimension + 1) * polynomial_size; - Torus *ggsw_out = (Torus *)cuda_malloc_async( - number_of_inputs * ggsw_size * sizeof(Torus), stream, gpu_index); - // input lwe array for fp-ks - Torus *lwe_array_in_fp_ks_buffer = (Torus *)cuda_malloc_async( - number_of_inputs * level_count_cbs * (glwe_dimension + 1) * - (polynomial_size + 1) * sizeof(Torus), - stream, gpu_index); - // buffer for pbs output + Torus *lut_vector_indexes = (Torus *)cbs_vp_buffer; + Torus *ggsw_out = (Torus *)lut_vector_indexes + + (ptrdiff_t)(number_of_inputs * level_count_cbs); + Torus *lwe_array_in_fp_ks_buffer = + (Torus *)ggsw_out + (ptrdiff_t)(number_of_inputs * ggsw_size); Torus *lwe_array_out_pbs_buffer = - (Torus *)cuda_malloc_async(number_of_inputs * level_count_cbs * - (polynomial_size + 1) * sizeof(Torus), - stream, gpu_index); - // vector for shifted lwe input - Torus *lwe_array_in_shifted_buffer = (Torus *)cuda_malloc_async( - number_of_inputs * level_count_cbs * (lwe_dimension + 1) * sizeof(Torus), - stream, gpu_index); - // lut vector buffer for cbs - Torus *lut_vector_cbs = (Torus *)cuda_malloc_async( - level_count_cbs * (glwe_dimension + 1) * polynomial_size * sizeof(Torus), - stream, gpu_index); - // indexes of lut vectors for cbs - Torus *h_lut_vector_indexes = - (Torus *)malloc(number_of_inputs * level_count_cbs * sizeof(Torus)); - for (uint index = 0; index < level_count_cbs * number_of_inputs; index++) { - h_lut_vector_indexes[index] = index % level_count_cbs; - } - Torus *lut_vector_indexes = (Torus *)cuda_malloc_async( - number_of_inputs * level_count_cbs * sizeof(Torus), stream, gpu_index); - cuda_memcpy_async_to_gpu( - lut_vector_indexes, h_lut_vector_indexes, - number_of_inputs * level_count_cbs * sizeof(Torus), stream, gpu_index); - check_cuda_error(cudaGetLastError()); - - uint32_t bits = sizeof(Torus) * 8; - uint32_t delta_log = (bits - 1); + (Torus *)lwe_array_in_fp_ks_buffer + + (ptrdiff_t)(number_of_inputs * level_count_cbs * (glwe_dimension + 1) * + (polynomial_size + 1)); + Torus *lwe_array_in_shifted_buffer = + (Torus *)lwe_array_out_pbs_buffer + + (ptrdiff_t)(number_of_inputs * level_count_cbs * (polynomial_size + 1)); + Torus *lut_vector_cbs = + (Torus *)lwe_array_in_shifted_buffer + + (ptrdiff_t)(number_of_inputs * level_count_cbs * (lwe_dimension + 1)); + Torus *glwe_array_out = + (Torus *)lut_vector_cbs + + (ptrdiff_t)(level_count_cbs * (glwe_dimension + 1) * polynomial_size); host_circuit_bootstrap( v_stream, gpu_index, ggsw_out, lwe_array_in, fourier_bsk, cbs_fpksk, lwe_array_in_shifted_buffer, lut_vector_cbs, lut_vector_indexes, - lwe_array_out_pbs_buffer, lwe_array_in_fp_ks_buffer, delta_log, + lwe_array_out_pbs_buffer, lwe_array_in_fp_ks_buffer, cbs_delta_log, polynomial_size, glwe_dimension, lwe_dimension, level_count_bsk, base_log_bsk, level_count_pksk, base_log_pksk, level_count_cbs, base_log_cbs, number_of_inputs, max_shared_memory); check_cuda_error(cudaGetLastError()); - // Free memory - cuda_drop_async(lwe_array_in_fp_ks_buffer, stream, gpu_index); - cuda_drop_async(lwe_array_in_shifted_buffer, stream, gpu_index); - cuda_drop_async(lwe_array_out_pbs_buffer, stream, gpu_index); - cuda_drop_async(lut_vector_cbs, stream, gpu_index); - cuda_drop_async(lut_vector_indexes, stream, gpu_index); - free(h_lut_vector_indexes); - // number_of_inputs = tau * p is the total number of GGSWs // split the vec of GGSW in two, the msb GGSW is for the CMux tree and the // lsb GGSW is for the last blind rotation. uint32_t r = number_of_inputs - params::log2_degree; - Torus *glwe_array_out = (Torus *)cuda_malloc_async( - tau * (glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream, - gpu_index); // CMUX Tree // r = tau * p - log2(N) host_cmux_tree( @@ -123,52 +172,109 @@ __host__ void host_circuit_bootstrap_vertical_packing( v_stream, gpu_index, lwe_array_out, br_ggsw, glwe_array_out, number_of_inputs - r, tau, glwe_dimension, polynomial_size, base_log_cbs, level_count_cbs, max_shared_memory); +} - cuda_drop_async(glwe_array_out, stream, gpu_index); - cuda_drop_async(ggsw_out, stream, gpu_index); +template +__host__ __device__ int +get_buffer_size_wop_pbs(uint32_t glwe_dimension, uint32_t lwe_dimension, + uint32_t polynomial_size, uint32_t level_count_cbs, + uint32_t number_of_bits_of_message_including_padding, + uint32_t number_of_bits_to_extract, + uint32_t number_of_inputs) { + + return sizeof(Torus) // lut_vector_indexes + + ((glwe_dimension + 1) * polynomial_size) * sizeof(Torus) // lut_pbs + + (polynomial_size + 1) * sizeof(Torus) // lwe_array_in_buffer + + (polynomial_size + 1) * sizeof(Torus) // lwe_array_in_shifted_buffer + + (lwe_dimension + 1) * sizeof(Torus) // lwe_array_out_ks_buffer + + (polynomial_size + 1) * sizeof(Torus) // lwe_array_out_pbs_buffer + + (lwe_dimension + 1) * // lwe_array_out_bit_extract + (number_of_bits_of_message_including_padding) * sizeof(Torus); +} + +template +__host__ void +scratch_wop_pbs(void *v_stream, uint32_t gpu_index, Torus **wop_pbs_buffer, + uint32_t *delta_log, uint32_t *cbs_delta_log, + uint32_t glwe_dimension, uint32_t lwe_dimension, + uint32_t polynomial_size, uint32_t level_count_cbs, + uint32_t number_of_bits_of_message_including_padding, + uint32_t number_of_bits_to_extract, uint32_t number_of_inputs) { + + cudaSetDevice(gpu_index); + auto stream = static_cast(v_stream); + + int wop_pbs_buffer_size = get_buffer_size_wop_pbs( + glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs, + number_of_bits_of_message_including_padding, number_of_bits_to_extract, + number_of_inputs); + int buffer_size = + get_buffer_size_cbs_vp( + glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs, + number_of_inputs * number_of_bits_to_extract, number_of_inputs) + + wop_pbs_buffer_size; + + *wop_pbs_buffer = (Torus *)cuda_malloc_async(buffer_size, stream, gpu_index); + + // indexes of lut vectors for bit extract + Torus h_lut_vector_indexes = 0; + // lut_vector_indexes is the first array in the wop_pbs buffer + cuda_memcpy_async_to_gpu(*wop_pbs_buffer, &h_lut_vector_indexes, + sizeof(Torus), stream, gpu_index); + check_cuda_error(cudaGetLastError()); + uint32_t ciphertext_total_bits_count = sizeof(Torus) * 8; + *delta_log = + ciphertext_total_bits_count - number_of_bits_of_message_including_padding; + Torus *cbs_vp_buffer = + *wop_pbs_buffer + + (ptrdiff_t)( + 1 + ((glwe_dimension + 1) * polynomial_size) + (polynomial_size + 1) + + (polynomial_size + 1) + (lwe_dimension + 1) + (polynomial_size + 1) + + (lwe_dimension + 1) * (number_of_bits_of_message_including_padding)); + scratch_circuit_bootstrap_vertical_packing( + v_stream, gpu_index, &cbs_vp_buffer, cbs_delta_log, glwe_dimension, + lwe_dimension, polynomial_size, level_count_cbs, + number_of_inputs * number_of_bits_to_extract, number_of_inputs, false); +} + +/* + * Cleanup functions free the necessary data on the GPU and on the CPU. + * Data that lives on the CPU is prefixed with `h_`. This cleanup function thus + * frees the data for the wop PBS on GPU in wop_pbs_buffer + */ +template +__host__ void cleanup_wop_pbs(void *v_stream, uint32_t gpu_index, + Torus **wop_pbs_buffer) { + auto stream = static_cast(v_stream); + cuda_drop_async(*wop_pbs_buffer, stream, gpu_index); } template __host__ void host_wop_pbs( void *v_stream, uint32_t gpu_index, Torus *lwe_array_out, Torus *lwe_array_in, Torus *lut_vector, double2 *fourier_bsk, Torus *ksk, - Torus *cbs_fpksk, uint32_t glwe_dimension, uint32_t lwe_dimension, - uint32_t polynomial_size, uint32_t base_log_bsk, uint32_t level_count_bsk, - uint32_t base_log_ksk, uint32_t level_count_ksk, uint32_t base_log_pksk, - uint32_t level_count_pksk, uint32_t base_log_cbs, uint32_t level_count_cbs, + Torus *cbs_fpksk, Torus *wop_pbs_buffer, uint32_t cbs_delta_log, + uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, + uint32_t base_log_bsk, uint32_t level_count_bsk, uint32_t base_log_ksk, + uint32_t level_count_ksk, uint32_t base_log_pksk, uint32_t level_count_pksk, + uint32_t base_log_cbs, uint32_t level_count_cbs, uint32_t number_of_bits_of_message_including_padding, - uint32_t number_of_bits_to_extract, uint32_t number_of_inputs, - uint32_t max_shared_memory) { + uint32_t number_of_bits_to_extract, uint32_t delta_log, + uint32_t number_of_inputs, uint32_t max_shared_memory) { - cudaSetDevice(gpu_index); - auto stream = static_cast(v_stream); - - // let mut h_lut_vector_indexes = vec![0 as u32; 1]; - // indexes of lut vectors for bit extract - Torus *h_lut_vector_indexes = (Torus *)malloc(sizeof(Torus)); - h_lut_vector_indexes[0] = 0; - Torus *lut_vector_indexes = - (Torus *)cuda_malloc_async(sizeof(Torus), stream, gpu_index); - cuda_memcpy_async_to_gpu(lut_vector_indexes, h_lut_vector_indexes, - sizeof(Torus), stream, gpu_index); - check_cuda_error(cudaGetLastError()); - Torus *lut_pbs = (Torus *)cuda_malloc_async( - (2 * polynomial_size) * sizeof(Torus), stream, gpu_index); - Torus *lwe_array_in_buffer = (Torus *)cuda_malloc_async( - (polynomial_size + 1) * sizeof(Torus), stream, gpu_index); - Torus *lwe_array_in_shifted_buffer = (Torus *)cuda_malloc_async( - (polynomial_size + 1) * sizeof(Torus), stream, gpu_index); - Torus *lwe_array_out_ks_buffer = (Torus *)cuda_malloc_async( - (lwe_dimension + 1) * sizeof(Torus), stream, gpu_index); - Torus *lwe_array_out_pbs_buffer = (Torus *)cuda_malloc_async( - (polynomial_size + 1) * sizeof(Torus), stream, gpu_index); - Torus *lwe_array_out_bit_extract = (Torus *)cuda_malloc_async( - (lwe_dimension + 1) * (number_of_bits_of_message_including_padding) * - sizeof(Torus), - stream, gpu_index); - uint32_t ciphertext_n_bits = sizeof(Torus) * 8; - uint32_t delta_log = - ciphertext_n_bits - number_of_bits_of_message_including_padding; + // lut_vector_indexes is the first array in the wop_pbs buffer + Torus *lut_vector_indexes = (Torus *)wop_pbs_buffer; + Torus *lut_pbs = (Torus *)lut_vector_indexes + (ptrdiff_t)(1); + Torus *lwe_array_in_buffer = + (Torus *)lut_pbs + (ptrdiff_t)((glwe_dimension + 1) * polynomial_size); + Torus *lwe_array_in_shifted_buffer = + (Torus *)lwe_array_in_buffer + (ptrdiff_t)(polynomial_size + 1); + Torus *lwe_array_out_ks_buffer = + (Torus *)lwe_array_in_shifted_buffer + (ptrdiff_t)(polynomial_size + 1); + Torus *lwe_array_out_pbs_buffer = + (Torus *)lwe_array_out_ks_buffer + (ptrdiff_t)(lwe_dimension + 1); + Torus *lwe_array_out_bit_extract = + (Torus *)lwe_array_out_pbs_buffer + (ptrdiff_t)(polynomial_size + 1); host_extract_bits( v_stream, gpu_index, lwe_array_out_bit_extract, lwe_array_in, lwe_array_in_buffer, lwe_array_in_shifted_buffer, lwe_array_out_ks_buffer, @@ -177,22 +283,20 @@ __host__ void host_wop_pbs( glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk, number_of_inputs, max_shared_memory); check_cuda_error(cudaGetLastError()); - cuda_drop_async(lut_pbs, stream, gpu_index); - cuda_drop_async(lut_vector_indexes, stream, gpu_index); - cuda_drop_async(lwe_array_in_buffer, stream, gpu_index); - cuda_drop_async(lwe_array_in_shifted_buffer, stream, gpu_index); - cuda_drop_async(lwe_array_out_ks_buffer, stream, gpu_index); - cuda_drop_async(lwe_array_out_pbs_buffer, stream, gpu_index); + Torus *cbs_vp_buffer = + (Torus *)wop_pbs_buffer + + (ptrdiff_t)( + 1 + ((glwe_dimension + 1) * polynomial_size) + (polynomial_size + 1) + + (polynomial_size + 1) + (lwe_dimension + 1) + (polynomial_size + 1) + + (lwe_dimension + 1) * number_of_bits_of_message_including_padding); host_circuit_bootstrap_vertical_packing( v_stream, gpu_index, lwe_array_out, lwe_array_out_bit_extract, lut_vector, - fourier_bsk, cbs_fpksk, glwe_dimension, lwe_dimension, polynomial_size, - base_log_bsk, level_count_bsk, base_log_pksk, level_count_pksk, - base_log_cbs, level_count_cbs, + fourier_bsk, cbs_fpksk, cbs_vp_buffer, cbs_delta_log, glwe_dimension, + lwe_dimension, polynomial_size, base_log_bsk, level_count_bsk, + base_log_pksk, level_count_pksk, base_log_cbs, level_count_cbs, number_of_inputs * number_of_bits_to_extract, number_of_inputs, max_shared_memory); - check_cuda_error(cudaGetLastError()); - cuda_drop_async(lwe_array_out_bit_extract, stream, gpu_index); } #endif // WOP_PBS_H