refactor(cuda): create scratch function and cleanup for wop pbs

Author: Agnes Leroy
Date: 2023-01-24 12:11:33 +01:00
Committed by: Agnès Leroy
Parent: e9243bce6f
Commit: 730274f156
5 changed files with 437 additions and 167 deletions

@@ -105,25 +105,66 @@ void cuda_circuit_bootstrap_64(
uint32_t base_log_pksk, uint32_t level_cbs, uint32_t base_log_cbs,
uint32_t number_of_samples, uint32_t max_shared_memory);
void scratch_cuda_circuit_bootstrap_vertical_packing_32(
void *v_stream, uint32_t gpu_index, void **cbs_vp_buffer,
uint32_t *cbs_delta_log, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t level_count_cbs,
uint32_t number_of_inputs, uint32_t tau, bool allocate_gpu_memory);
void scratch_cuda_circuit_bootstrap_vertical_packing_64(
void *v_stream, uint32_t gpu_index, void **cbs_vp_buffer,
uint32_t *cbs_delta_log, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t level_count_cbs,
uint32_t number_of_inputs, uint32_t tau, bool allocate_gpu_memory);
void scratch_cuda_wop_pbs_32(
void *v_stream, uint32_t gpu_index, void **wop_pbs_buffer,
uint32_t *delta_log, uint32_t *cbs_delta_log, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t level_count_cbs,
uint32_t number_of_bits_of_message_including_padding,
uint32_t number_of_bits_to_extract, uint32_t number_of_inputs);
void scratch_cuda_wop_pbs_64(
void *v_stream, uint32_t gpu_index, void **wop_pbs_buffer,
uint32_t *delta_log, uint32_t *cbs_delta_log, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t level_count_cbs,
uint32_t number_of_bits_of_message_including_padding,
uint32_t number_of_bits_to_extract, uint32_t number_of_inputs);
void cuda_circuit_bootstrap_vertical_packing_64(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void *fourier_bsk, void *cbs_fpksk, void *lut_vector,
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t level_count_bsk, uint32_t base_log_bsk, uint32_t level_count_pksk,
uint32_t base_log_pksk, uint32_t level_count_cbs, uint32_t base_log_cbs,
uint32_t number_of_inputs, uint32_t lut_number, uint32_t max_shared_memory);
void *fourier_bsk, void *cbs_fpksk, void *lut_vector, void *cbs_vp_buffer,
uint32_t cbs_delta_log, uint32_t polynomial_size, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t level_count_bsk, uint32_t base_log_bsk,
uint32_t level_count_pksk, uint32_t base_log_pksk, uint32_t level_count_cbs,
uint32_t base_log_cbs, uint32_t number_of_inputs, uint32_t lut_number,
uint32_t max_shared_memory);
void cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_array_in, void *lut_vector, void *fourier_bsk,
void *ksk, void *cbs_fpksk, uint32_t glwe_dimension,
void *ksk, void *cbs_fpksk, void *wop_pbs_buffer,
uint32_t cbs_delta_log, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log_bsk, uint32_t level_count_bsk,
uint32_t base_log_ksk, uint32_t level_count_ksk,
uint32_t base_log_pksk, uint32_t level_count_pksk,
uint32_t base_log_cbs, uint32_t level_count_cbs,
uint32_t number_of_bits_of_message_including_padding,
uint32_t number_of_bits_to_extract,
uint32_t number_of_bits_to_extract, uint32_t delta_log,
uint32_t number_of_inputs, uint32_t max_shared_memory);
void cleanup_cuda_wop_pbs_32(void *v_stream, uint32_t gpu_index,
void **wop_pbs_buffer);
void cleanup_cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index,
void **wop_pbs_buffer);
void cleanup_cuda_circuit_bootstrap_vertical_packing_32(void *v_stream,
uint32_t gpu_index,
void **cbs_vp_buffer);
void cleanup_cuda_circuit_bootstrap_vertical_packing_64(void *v_stream,
uint32_t gpu_index,
void **cbs_vp_buffer);
}
#ifdef __CUDACC__

@@ -137,12 +137,12 @@ __host__ void host_extract_bits(
void *v_stream, uint32_t gpu_index, Torus *list_lwe_array_out,
Torus *lwe_array_in, Torus *lwe_array_in_buffer,
Torus *lwe_array_in_shifted_buffer, Torus *lwe_array_out_ks_buffer,
Torus *lwe_array_out_pbs_buffer, Torus *lut_pbs,
Torus *lut_vector_indexes, Torus *ksk, double2 *fourier_bsk,
uint32_t number_of_bits, uint32_t delta_log, uint32_t lwe_dimension_in,
uint32_t lwe_dimension_out, uint32_t glwe_dimension, uint32_t base_log_bsk,
uint32_t level_count_bsk, uint32_t base_log_ksk, uint32_t level_count_ksk,
uint32_t number_of_samples, uint32_t max_shared_memory) {
Torus *lwe_array_out_pbs_buffer, Torus *lut_pbs, Torus *lut_vector_indexes,
Torus *ksk, double2 *fourier_bsk, uint32_t number_of_bits,
uint32_t delta_log, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t glwe_dimension, uint32_t base_log_bsk, uint32_t level_count_bsk,
uint32_t base_log_ksk, uint32_t level_count_ksk, uint32_t number_of_samples,
uint32_t max_shared_memory) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);

@@ -248,9 +248,9 @@ __global__ void device_bootstrap_low_latency(
template <typename Torus, class params>
__host__ void host_bootstrap_low_latency(
void *v_stream, uint32_t gpu_index, Torus *lwe_array_out, Torus *lut_vector,
Torus *lut_vector_indexes, Torus *lwe_array_in,
double2 *bootstrapping_key, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
Torus *lut_vector_indexes, Torus *lwe_array_in, double2 *bootstrapping_key,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t num_lut_vectors,
uint32_t max_shared_memory) {

@@ -1,5 +1,79 @@
#include "wop_bootstrap.cuh"
/*
* This scratch function allocates the necessary amount of memory on the GPU
* for the circuit bootstrap and vertical packing on 32-bit inputs, into
* `cbs_vp_buffer`. It also sets the value of `cbs_delta_log` to be used in
* the circuit bootstrap.
*/
void scratch_cuda_circuit_bootstrap_vertical_packing_32(
void *v_stream, uint32_t gpu_index, void **cbs_vp_buffer,
uint32_t *cbs_delta_log, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t level_count_cbs,
uint32_t number_of_inputs, uint32_t tau, bool allocate_gpu_memory) {
scratch_circuit_bootstrap_vertical_packing<uint32_t>(
v_stream, gpu_index, (uint32_t **)cbs_vp_buffer, cbs_delta_log,
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
number_of_inputs, tau, allocate_gpu_memory);
}
/*
* This scratch function allocates the necessary amount of memory on the GPU
* for the circuit bootstrap and vertical packing on 64-bit inputs, into
* `cbs_vp_buffer`. It also sets the value of `cbs_delta_log` to be used in
* the circuit bootstrap.
*/
void scratch_cuda_circuit_bootstrap_vertical_packing_64(
void *v_stream, uint32_t gpu_index, void **cbs_vp_buffer,
uint32_t *cbs_delta_log, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t level_count_cbs,
uint32_t number_of_inputs, uint32_t tau, bool allocate_gpu_memory) {
scratch_circuit_bootstrap_vertical_packing<uint64_t>(
v_stream, gpu_index, (uint64_t **)cbs_vp_buffer, cbs_delta_log,
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
number_of_inputs, tau, allocate_gpu_memory);
}
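For context (not part of this diff), the scratch functions above are meant to be paired with the corresponding entry point and cleanup function declared in the public header. Below is a minimal, hypothetical call sequence for the 64-bit circuit bootstrap + vertical packing; the helper name run_cbs_vp_64_example, the device pointers d_lwe_out, d_lwe_in, d_bsk, d_cbs_fpksk and d_lut_vector, and every numeric parameter are placeholders, with keys and ciphertexts assumed to have been generated and copied to the GPU elsewhere, and the header declarations assumed to be in scope.

// Hypothetical driver (illustration only). The device pointers are assumed to
// be valid, already-populated GPU buffers; the parameter values are
// placeholders, not a recommended parameter set.
#include <cstdint>
#include <cuda_runtime.h>

void run_cbs_vp_64_example(void *d_lwe_out, void *d_lwe_in, void *d_bsk,
                           void *d_cbs_fpksk, void *d_lut_vector) {
  uint32_t gpu_index = 0;
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  int max_shared_memory = 0;
  cudaDeviceGetAttribute(&max_shared_memory,
                         cudaDevAttrMaxSharedMemoryPerBlock, (int)gpu_index);

  // number_of_inputs = tau * p: here tau = 1 LUT and p = 10 extracted bits.
  uint32_t tau = 1, number_of_inputs = 10;

  // Scratch: allocates cbs_vp_buffer on the GPU and returns cbs_delta_log.
  void *cbs_vp_buffer = nullptr;
  uint32_t cbs_delta_log = 0;
  scratch_cuda_circuit_bootstrap_vertical_packing_64(
      &stream, gpu_index, &cbs_vp_buffer, &cbs_delta_log,
      /*glwe_dimension=*/1, /*lwe_dimension=*/512, /*polynomial_size=*/1024,
      /*level_count_cbs=*/2, number_of_inputs, tau,
      /*allocate_gpu_memory=*/true);

  // Compute: circuit bootstrap of each input LWE into a GGSW, then vertical
  // packing against the LUTs, reusing the pre-allocated cbs_vp_buffer.
  cuda_circuit_bootstrap_vertical_packing_64(
      &stream, gpu_index, d_lwe_out, d_lwe_in, d_bsk, d_cbs_fpksk,
      d_lut_vector, cbs_vp_buffer, cbs_delta_log, /*polynomial_size=*/1024,
      /*glwe_dimension=*/1, /*lwe_dimension=*/512, /*level_count_bsk=*/2,
      /*base_log_bsk=*/15, /*level_count_pksk=*/2, /*base_log_pksk=*/15,
      /*level_count_cbs=*/2, /*base_log_cbs=*/10, number_of_inputs,
      /*lut_number=*/tau, (uint32_t)max_shared_memory);

  // Cleanup: releases cbs_vp_buffer on the same stream.
  cleanup_cuda_circuit_bootstrap_vertical_packing_64(&stream, gpu_index,
                                                     &cbs_vp_buffer);
  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);
}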
/*
* This scratch function allocates the necessary amount of memory on the GPU
* for the wop PBS on 32-bit inputs, into `wop_pbs_buffer`. It also sets the
* values of `delta_log` and `cbs_delta_log` to be used in the bit extraction
* and circuit bootstrap.
*/
void scratch_cuda_wop_pbs_32(
void *v_stream, uint32_t gpu_index, void **wop_pbs_buffer,
uint32_t *delta_log, uint32_t *cbs_delta_log, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t level_count_cbs,
uint32_t number_of_bits_of_message_including_padding,
uint32_t number_of_bits_to_extract, uint32_t number_of_inputs) {
scratch_wop_pbs<uint32_t>(v_stream, gpu_index, (uint32_t **)wop_pbs_buffer,
delta_log, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, level_count_cbs,
number_of_bits_of_message_including_padding,
number_of_bits_to_extract, number_of_inputs);
}
/*
* This scratch function allocates the necessary amount of memory on the GPU
* for the wop PBS on 64-bit inputs, into `wop_pbs_buffer`. It also sets the
* values of `delta_log` and `cbs_delta_log` to be used in the bit extraction
* and circuit bootstrap.
*/
void scratch_cuda_wop_pbs_64(
void *v_stream, uint32_t gpu_index, void **wop_pbs_buffer,
uint32_t *delta_log, uint32_t *cbs_delta_log, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t level_count_cbs,
uint32_t number_of_bits_of_message_including_padding,
uint32_t number_of_bits_to_extract, uint32_t number_of_inputs) {
scratch_wop_pbs<uint64_t>(v_stream, gpu_index, (uint64_t **)wop_pbs_buffer,
delta_log, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, level_count_cbs,
number_of_bits_of_message_including_padding,
number_of_bits_to_extract, number_of_inputs);
}
/*
* Entry point for the CUDA circuit bootstrap + vertical packing on batches of
* 64-bit input LWE ciphertexts.
@@ -12,6 +86,7 @@
* compressed complex key.
* - 'cbs_fpksk' list of private functional packing keyswitch keys
* - 'lut_vector' list of test vectors
* - 'cbs_vp_buffer' a pre-allocated array to store intermediate results
* - 'polynomial_size' size of the test polynomial, supported sizes:
* {512, 1024, 2048, 4096, 8192}
* - 'glwe_dimension' supported dimensions: {1}
@@ -29,11 +104,11 @@
*/
void cuda_circuit_bootstrap_vertical_packing_64(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void *fourier_bsk, void *cbs_fpksk, void *lut_vector,
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t level_count_bsk, uint32_t base_log_bsk, uint32_t level_count_pksk,
uint32_t base_log_pksk, uint32_t level_count_cbs, uint32_t base_log_cbs,
uint32_t number_of_inputs, uint32_t lut_number,
void *fourier_bsk, void *cbs_fpksk, void *lut_vector, void *cbs_vp_buffer,
uint32_t cbs_delta_log, uint32_t polynomial_size, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t level_count_bsk, uint32_t base_log_bsk,
uint32_t level_count_pksk, uint32_t base_log_pksk, uint32_t level_count_cbs,
uint32_t base_log_cbs, uint32_t number_of_inputs, uint32_t lut_number,
uint32_t max_shared_memory) {
assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
"512, 1024, 2048, 4096, 8192",
@@ -56,46 +131,51 @@ void cuda_circuit_bootstrap_vertical_packing_64(
host_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
(double2 *)fourier_bsk, (uint64_t *)cbs_fpksk, glwe_dimension,
lwe_dimension, polynomial_size, base_log_bsk, level_count_bsk,
base_log_pksk, level_count_pksk, base_log_cbs, level_count_cbs,
number_of_inputs, lut_number, max_shared_memory);
(double2 *)fourier_bsk, (uint64_t *)cbs_fpksk,
(uint64_t *)cbs_vp_buffer, cbs_delta_log, glwe_dimension, lwe_dimension,
polynomial_size, base_log_bsk, level_count_bsk, base_log_pksk,
level_count_pksk, base_log_cbs, level_count_cbs, number_of_inputs,
lut_number, max_shared_memory);
break;
case 1024:
host_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<1024>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
(double2 *)fourier_bsk, (uint64_t *)cbs_fpksk, glwe_dimension,
lwe_dimension, polynomial_size, base_log_bsk, level_count_bsk,
base_log_pksk, level_count_pksk, base_log_cbs, level_count_cbs,
number_of_inputs, lut_number, max_shared_memory);
(double2 *)fourier_bsk, (uint64_t *)cbs_fpksk,
(uint64_t *)cbs_vp_buffer, cbs_delta_log, glwe_dimension, lwe_dimension,
polynomial_size, base_log_bsk, level_count_bsk, base_log_pksk,
level_count_pksk, base_log_cbs, level_count_cbs, number_of_inputs,
lut_number, max_shared_memory);
break;
case 2048:
host_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<2048>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
(double2 *)fourier_bsk, (uint64_t *)cbs_fpksk, glwe_dimension,
lwe_dimension, polynomial_size, base_log_bsk, level_count_bsk,
base_log_pksk, level_count_pksk, base_log_cbs, level_count_cbs,
number_of_inputs, lut_number, max_shared_memory);
(double2 *)fourier_bsk, (uint64_t *)cbs_fpksk,
(uint64_t *)cbs_vp_buffer, cbs_delta_log, glwe_dimension, lwe_dimension,
polynomial_size, base_log_bsk, level_count_bsk, base_log_pksk,
level_count_pksk, base_log_cbs, level_count_cbs, number_of_inputs,
lut_number, max_shared_memory);
break;
case 4096:
host_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<4096>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
(double2 *)fourier_bsk, (uint64_t *)cbs_fpksk, glwe_dimension,
lwe_dimension, polynomial_size, base_log_bsk, level_count_bsk,
base_log_pksk, level_count_pksk, base_log_cbs, level_count_cbs,
number_of_inputs, lut_number, max_shared_memory);
(double2 *)fourier_bsk, (uint64_t *)cbs_fpksk,
(uint64_t *)cbs_vp_buffer, cbs_delta_log, glwe_dimension, lwe_dimension,
polynomial_size, base_log_bsk, level_count_bsk, base_log_pksk,
level_count_pksk, base_log_cbs, level_count_cbs, number_of_inputs,
lut_number, max_shared_memory);
break;
case 8192:
host_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<8192>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
(double2 *)fourier_bsk, (uint64_t *)cbs_fpksk, glwe_dimension,
lwe_dimension, polynomial_size, base_log_bsk, level_count_bsk,
base_log_pksk, level_count_pksk, base_log_cbs, level_count_cbs,
number_of_inputs, lut_number, max_shared_memory);
(double2 *)fourier_bsk, (uint64_t *)cbs_fpksk,
(uint64_t *)cbs_vp_buffer, cbs_delta_log, glwe_dimension, lwe_dimension,
polynomial_size, base_log_bsk, level_count_bsk, base_log_pksk,
level_count_pksk, base_log_cbs, level_count_cbs, number_of_inputs,
lut_number, max_shared_memory);
break;
default:
break;
@@ -115,6 +195,7 @@ void cuda_circuit_bootstrap_vertical_packing_64(
* compressed complex key.
* - 'ksk' keyswitch key to use inside extract bits block
* - 'cbs_fpksk' list of fp-keyswitch keys
* - 'wop_pbs_buffer' a pre-allocated array to store intermediate results
* - 'glwe_dimension' supported dimensions: {1}
* - 'lwe_dimension' dimension of input lwe ciphertexts
* - 'polynomial_size' size of the test polynomial, supported sizes:
@@ -138,14 +219,15 @@ void cuda_circuit_bootstrap_vertical_packing_64(
*/
void cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_array_in, void *lut_vector, void *fourier_bsk,
void *ksk, void *cbs_fpksk, uint32_t glwe_dimension,
void *ksk, void *cbs_fpksk, void *wop_pbs_buffer,
uint32_t cbs_delta_log, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log_bsk, uint32_t level_count_bsk,
uint32_t base_log_ksk, uint32_t level_count_ksk,
uint32_t base_log_pksk, uint32_t level_count_pksk,
uint32_t base_log_cbs, uint32_t level_count_cbs,
uint32_t number_of_bits_of_message_including_padding,
uint32_t number_of_bits_to_extract,
uint32_t number_of_bits_to_extract, uint32_t delta_log,
uint32_t number_of_inputs, uint32_t max_shared_memory) {
assert(("Error (GPU WOP PBS): polynomial_size should be one of "
"512, 1024, 2048, 4096, 8192",
@@ -169,57 +251,100 @@ void cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index, void *lwe_array_out,
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
(double2 *)fourier_bsk, (uint64_t *)ksk, (uint64_t *)cbs_fpksk,
glwe_dimension, lwe_dimension, polynomial_size, base_log_bsk,
level_count_bsk, base_log_ksk, level_count_ksk, base_log_pksk,
level_count_pksk, base_log_cbs, level_count_cbs,
(uint64_t *)wop_pbs_buffer, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, base_log_bsk, level_count_bsk,
base_log_ksk, level_count_ksk, base_log_pksk, level_count_pksk,
base_log_cbs, level_count_cbs,
number_of_bits_of_message_including_padding, number_of_bits_to_extract,
number_of_inputs, max_shared_memory);
delta_log, number_of_inputs, max_shared_memory);
break;
case 1024:
host_wop_pbs<uint64_t, int64_t, Degree<1024>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
(double2 *)fourier_bsk, (uint64_t *)ksk, (uint64_t *)cbs_fpksk,
glwe_dimension, lwe_dimension, polynomial_size, base_log_bsk,
level_count_bsk, base_log_ksk, level_count_ksk, base_log_pksk,
level_count_pksk, base_log_cbs, level_count_cbs,
(uint64_t *)wop_pbs_buffer, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, base_log_bsk, level_count_bsk,
base_log_ksk, level_count_ksk, base_log_pksk, level_count_pksk,
base_log_cbs, level_count_cbs,
number_of_bits_of_message_including_padding, number_of_bits_to_extract,
number_of_inputs, max_shared_memory);
delta_log, number_of_inputs, max_shared_memory);
break;
case 2048:
host_wop_pbs<uint64_t, int64_t, Degree<2048>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
(double2 *)fourier_bsk, (uint64_t *)ksk, (uint64_t *)cbs_fpksk,
glwe_dimension, lwe_dimension, polynomial_size, base_log_bsk,
level_count_bsk, base_log_ksk, level_count_ksk, base_log_pksk,
level_count_pksk, base_log_cbs, level_count_cbs,
(uint64_t *)wop_pbs_buffer, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, base_log_bsk, level_count_bsk,
base_log_ksk, level_count_ksk, base_log_pksk, level_count_pksk,
base_log_cbs, level_count_cbs,
number_of_bits_of_message_including_padding, number_of_bits_to_extract,
number_of_inputs, max_shared_memory);
delta_log, number_of_inputs, max_shared_memory);
break;
case 4096:
host_wop_pbs<uint64_t, int64_t, Degree<4096>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
(double2 *)fourier_bsk, (uint64_t *)ksk, (uint64_t *)cbs_fpksk,
glwe_dimension, lwe_dimension, polynomial_size, base_log_bsk,
level_count_bsk, base_log_ksk, level_count_ksk, base_log_pksk,
level_count_pksk, base_log_cbs, level_count_cbs,
(uint64_t *)wop_pbs_buffer, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, base_log_bsk, level_count_bsk,
base_log_ksk, level_count_ksk, base_log_pksk, level_count_pksk,
base_log_cbs, level_count_cbs,
number_of_bits_of_message_including_padding, number_of_bits_to_extract,
number_of_inputs, max_shared_memory);
delta_log, number_of_inputs, max_shared_memory);
break;
case 8192:
host_wop_pbs<uint64_t, int64_t, Degree<8192>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
(double2 *)fourier_bsk, (uint64_t *)ksk, (uint64_t *)cbs_fpksk,
glwe_dimension, lwe_dimension, polynomial_size, base_log_bsk,
level_count_bsk, base_log_ksk, level_count_ksk, base_log_pksk,
level_count_pksk, base_log_cbs, level_count_cbs,
(uint64_t *)wop_pbs_buffer, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, base_log_bsk, level_count_bsk,
base_log_ksk, level_count_ksk, base_log_pksk, level_count_pksk,
base_log_cbs, level_count_cbs,
number_of_bits_of_message_including_padding, number_of_bits_to_extract,
number_of_inputs, max_shared_memory);
delta_log, number_of_inputs, max_shared_memory);
break;
default:
break;
}
}
/*
* This cleanup function frees the wop PBS data on the GPU, stored in
* wop_pbs_buffer, for 32-bit inputs.
*/
void cleanup_cuda_wop_pbs_32(void *v_stream, uint32_t gpu_index,
void **wop_pbs_buffer) {
cleanup_wop_pbs<uint32_t>(v_stream, gpu_index, (uint32_t **)wop_pbs_buffer);
}
/*
* This cleanup function frees the wop PBS data on the GPU, stored in
* wop_pbs_buffer, for 64-bit inputs.
*/
void cleanup_cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index,
void **wop_pbs_buffer) {
cleanup_wop_pbs<uint64_t>(v_stream, gpu_index, (uint64_t **)wop_pbs_buffer);
}
/*
* This cleanup function frees the circuit bootstrap and vertical packing data
* on the GPU, stored in cbs_vp_buffer, for 32-bit inputs.
*/
void cleanup_cuda_circuit_bootstrap_vertical_packing_32(void *v_stream,
uint32_t gpu_index,
void **cbs_vp_buffer) {
cleanup_circuit_bootstrap_vertical_packing<uint32_t>(
v_stream, gpu_index, (uint32_t **)cbs_vp_buffer);
}
/*
* This cleanup function frees the circuit bootstrap and vertical packing data
* on the GPU, stored in cbs_vp_buffer, for 64-bit inputs.
*/
void cleanup_cuda_circuit_bootstrap_vertical_packing_64(void *v_stream,
uint32_t gpu_index,
void **cbs_vp_buffer) {
cleanup_circuit_bootstrap_vertical_packing<uint64_t>(
v_stream, gpu_index, (uint64_t **)cbs_vp_buffer);
}
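For reference (an illustration, not part of this commit), the scratch/compute/cleanup triple introduced for the wop PBS would typically be driven as sketched below; run_wop_pbs_64_example and the d_* device pointers are hypothetical names, the numeric parameter values are placeholders, and the declarations from the header above are assumed to be in scope.

// Hypothetical driver (illustration only). The device pointers are assumed to
// be valid, already-populated GPU buffers; the parameter values are
// placeholders, not a recommended parameter set.
#include <cstdint>
#include <cuda_runtime.h>

void run_wop_pbs_64_example(void *d_lwe_out, void *d_lwe_in, void *d_lut_vector,
                            void *d_bsk, void *d_ksk, void *d_cbs_fpksk) {
  uint32_t gpu_index = 0;
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  int max_shared_memory = 0;
  cudaDeviceGetAttribute(&max_shared_memory,
                         cudaDevAttrMaxSharedMemoryPerBlock, (int)gpu_index);

  // Scratch: allocates wop_pbs_buffer on the GPU and returns delta_log /
  // cbs_delta_log for the bit extraction and circuit bootstrap steps.
  void *wop_pbs_buffer = nullptr;
  uint32_t delta_log = 0, cbs_delta_log = 0;
  scratch_cuda_wop_pbs_64(&stream, gpu_index, &wop_pbs_buffer, &delta_log,
                          &cbs_delta_log, /*glwe_dimension=*/1,
                          /*lwe_dimension=*/512, /*polynomial_size=*/1024,
                          /*level_count_cbs=*/2,
                          /*number_of_bits_of_message_including_padding=*/5,
                          /*number_of_bits_to_extract=*/5,
                          /*number_of_inputs=*/1);

  // Compute: bit extraction followed by circuit bootstrap + vertical packing,
  // all working inside the pre-allocated wop_pbs_buffer.
  cuda_wop_pbs_64(&stream, gpu_index, d_lwe_out, d_lwe_in, d_lut_vector, d_bsk,
                  d_ksk, d_cbs_fpksk, wop_pbs_buffer, cbs_delta_log,
                  /*glwe_dimension=*/1, /*lwe_dimension=*/512,
                  /*polynomial_size=*/1024, /*base_log_bsk=*/15,
                  /*level_count_bsk=*/2, /*base_log_ksk=*/4,
                  /*level_count_ksk=*/5, /*base_log_pksk=*/15,
                  /*level_count_pksk=*/2, /*base_log_cbs=*/10,
                  /*level_count_cbs=*/2,
                  /*number_of_bits_of_message_including_padding=*/5,
                  /*number_of_bits_to_extract=*/5, delta_log,
                  /*number_of_inputs=*/1, (uint32_t)max_shared_memory);

  // Cleanup: releases wop_pbs_buffer on the same stream.
  cleanup_cuda_wop_pbs_64(&stream, gpu_index, &wop_pbs_buffer);
  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);
}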

@@ -26,6 +26,88 @@ __global__ void device_build_lut(Torus *lut_out, Torus *lut_in,
}
}
template <typename Torus>
__host__ __device__ int
get_buffer_size_cbs_vp(uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t level_count_cbs,
uint32_t number_of_inputs, uint32_t tau) {
int ggsw_size = level_count_cbs * (glwe_dimension + 1) *
(glwe_dimension + 1) * polynomial_size;
return number_of_inputs * level_count_cbs *
sizeof(Torus) // lut_vector_indexes
+ number_of_inputs * ggsw_size * sizeof(Torus) // ggsw_out
+
number_of_inputs * level_count_cbs * (glwe_dimension + 1) *
(polynomial_size + 1) * sizeof(Torus) // lwe_array_in_fp_ks_buffer
+ number_of_inputs * level_count_cbs * (polynomial_size + 1) *
sizeof(Torus) // lwe_array_out_pbs_buffer
+ number_of_inputs * level_count_cbs * (lwe_dimension + 1) *
sizeof(Torus) // lwe_array_in_shifted_buffer
+ level_count_cbs * (glwe_dimension + 1) * polynomial_size *
sizeof(Torus) // lut_vector_cbs
+ tau * (glwe_dimension + 1) * polynomial_size *
sizeof(Torus); // glwe_array_out
}
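To get a feel for the size of this allocation (an illustration only, not part of the change), the formula above can be evaluated on the host for one placeholder 64-bit parameter set:

// Standalone sketch mirroring get_buffer_size_cbs_vp<uint64_t> for
// illustrative parameters: glwe_dimension = 1, lwe_dimension = 512,
// polynomial_size = 1024, level_count_cbs = 2, number_of_inputs = 10, tau = 1.
#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t k = 1, n = 512, N = 1024, l_cbs = 2, inputs = 10, tau = 1;
  const uint64_t t = sizeof(uint64_t); // bytes per Torus element
  const uint64_t ggsw_size = l_cbs * (k + 1) * (k + 1) * N;
  const uint64_t bytes =
      inputs * l_cbs * t                       // lut_vector_indexes
      + inputs * ggsw_size * t                 // ggsw_out
      + inputs * l_cbs * (k + 1) * (N + 1) * t // lwe_array_in_fp_ks_buffer
      + inputs * l_cbs * (N + 1) * t           // lwe_array_out_pbs_buffer
      + inputs * l_cbs * (n + 1) * t           // lwe_array_in_shifted_buffer
      + l_cbs * (k + 1) * N * t                // lut_vector_cbs
      + tau * (k + 1) * N * t;                 // glwe_array_out
  // Prints 1278752, i.e. roughly 1.3 MB for this parameter set.
  printf("cbs_vp_buffer size: %llu bytes\n", (unsigned long long)bytes);
  return 0;
}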
template <typename Torus>
__host__ void scratch_circuit_bootstrap_vertical_packing(
void *v_stream, uint32_t gpu_index, Torus **cbs_vp_buffer,
uint32_t *cbs_delta_log, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t level_count_cbs,
uint32_t number_of_inputs, uint32_t tau, bool allocate_gpu_memory) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
// Allocate lut vector indexes on the CPU first to avoid blocking the stream
Torus *h_lut_vector_indexes =
(Torus *)malloc(number_of_inputs * level_count_cbs * sizeof(Torus));
// allocate and initialize device pointers for circuit bootstrap and vertical
// packing
if (allocate_gpu_memory) {
int buffer_size = get_buffer_size_cbs_vp<Torus>(
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
number_of_inputs, tau);
*cbs_vp_buffer = (Torus *)cuda_malloc_async(buffer_size, stream, gpu_index);
}
// indexes of lut vectors for cbs
for (uint index = 0; index < level_count_cbs * number_of_inputs; index++) {
h_lut_vector_indexes[index] = index % level_count_cbs;
}
// lut_vector_indexes is the first buffer in the cbs_vp_buffer
cuda_memcpy_async_to_gpu(*cbs_vp_buffer, h_lut_vector_indexes,
number_of_inputs * level_count_cbs * sizeof(Torus),
stream, gpu_index);
check_cuda_error(cudaStreamSynchronize(*stream));
free(h_lut_vector_indexes);
check_cuda_error(cudaGetLastError());
uint32_t bits = sizeof(Torus) * 8;
*cbs_delta_log = (bits - 1);
}
/*
* Cleanup functions free the data used on the GPU and on the CPU (data that
* lives on the CPU is prefixed with `h_`). This cleanup function frees the
* GPU data used for the circuit bootstrap and vertical packing:
* - ggsw_out
* - lwe_array_in_fp_ks_buffer
* - lwe_array_out_pbs_buffer
* - lwe_array_in_shifted_buffer
* - lut_vector_cbs
* - lut_vector_indexes
*/
template <typename Torus>
__host__ void
cleanup_circuit_bootstrap_vertical_packing(void *v_stream, uint32_t gpu_index,
Torus **cbs_vp_buffer) {
auto stream = static_cast<cudaStream_t *>(v_stream);
// Free memory
cuda_drop_async(*cbs_vp_buffer, stream, gpu_index);
}
// number_of_inputs is the total number of LWE ciphertexts passed to CBS + VP,
// i.e. tau * p where tau is the number of LUTs (the original number of LWEs
// before bit extraction) and p is the number of extracted bits
@@ -33,79 +115,46 @@ template <typename Torus, typename STorus, class params>
__host__ void host_circuit_bootstrap_vertical_packing(
void *v_stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_array_in, Torus *lut_vector, double2 *fourier_bsk,
Torus *cbs_fpksk, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t base_log_bsk, uint32_t level_count_bsk,
uint32_t base_log_pksk, uint32_t level_count_pksk, uint32_t base_log_cbs,
uint32_t level_count_cbs, uint32_t number_of_inputs, uint32_t tau,
uint32_t max_shared_memory) {
Torus *cbs_fpksk, Torus *cbs_vp_buffer, uint32_t cbs_delta_log,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log_bsk, uint32_t level_count_bsk, uint32_t base_log_pksk,
uint32_t level_count_pksk, uint32_t base_log_cbs, uint32_t level_count_cbs,
uint32_t number_of_inputs, uint32_t tau, uint32_t max_shared_memory) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
// allocate and initialize device pointers for circuit bootstrap
// output ggsw array for cbs
int ggsw_size = level_count_cbs * (glwe_dimension + 1) *
(glwe_dimension + 1) * polynomial_size;
Torus *ggsw_out = (Torus *)cuda_malloc_async(
number_of_inputs * ggsw_size * sizeof(Torus), stream, gpu_index);
// input lwe array for fp-ks
Torus *lwe_array_in_fp_ks_buffer = (Torus *)cuda_malloc_async(
number_of_inputs * level_count_cbs * (glwe_dimension + 1) *
(polynomial_size + 1) * sizeof(Torus),
stream, gpu_index);
// buffer for pbs output
Torus *lut_vector_indexes = (Torus *)cbs_vp_buffer;
Torus *ggsw_out = (Torus *)lut_vector_indexes +
(ptrdiff_t)(number_of_inputs * level_count_cbs);
Torus *lwe_array_in_fp_ks_buffer =
(Torus *)ggsw_out + (ptrdiff_t)(number_of_inputs * ggsw_size);
Torus *lwe_array_out_pbs_buffer =
(Torus *)cuda_malloc_async(number_of_inputs * level_count_cbs *
(polynomial_size + 1) * sizeof(Torus),
stream, gpu_index);
// vector for shifted lwe input
Torus *lwe_array_in_shifted_buffer = (Torus *)cuda_malloc_async(
number_of_inputs * level_count_cbs * (lwe_dimension + 1) * sizeof(Torus),
stream, gpu_index);
// lut vector buffer for cbs
Torus *lut_vector_cbs = (Torus *)cuda_malloc_async(
level_count_cbs * (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
stream, gpu_index);
// indexes of lut vectors for cbs
Torus *h_lut_vector_indexes =
(Torus *)malloc(number_of_inputs * level_count_cbs * sizeof(Torus));
for (uint index = 0; index < level_count_cbs * number_of_inputs; index++) {
h_lut_vector_indexes[index] = index % level_count_cbs;
}
Torus *lut_vector_indexes = (Torus *)cuda_malloc_async(
number_of_inputs * level_count_cbs * sizeof(Torus), stream, gpu_index);
cuda_memcpy_async_to_gpu(
lut_vector_indexes, h_lut_vector_indexes,
number_of_inputs * level_count_cbs * sizeof(Torus), stream, gpu_index);
check_cuda_error(cudaGetLastError());
uint32_t bits = sizeof(Torus) * 8;
uint32_t delta_log = (bits - 1);
(Torus *)lwe_array_in_fp_ks_buffer +
(ptrdiff_t)(number_of_inputs * level_count_cbs * (glwe_dimension + 1) *
(polynomial_size + 1));
Torus *lwe_array_in_shifted_buffer =
(Torus *)lwe_array_out_pbs_buffer +
(ptrdiff_t)(number_of_inputs * level_count_cbs * (polynomial_size + 1));
Torus *lut_vector_cbs =
(Torus *)lwe_array_in_shifted_buffer +
(ptrdiff_t)(number_of_inputs * level_count_cbs * (lwe_dimension + 1));
Torus *glwe_array_out =
(Torus *)lut_vector_cbs +
(ptrdiff_t)(level_count_cbs * (glwe_dimension + 1) * polynomial_size);
host_circuit_bootstrap<Torus, params>(
v_stream, gpu_index, ggsw_out, lwe_array_in, fourier_bsk, cbs_fpksk,
lwe_array_in_shifted_buffer, lut_vector_cbs, lut_vector_indexes,
lwe_array_out_pbs_buffer, lwe_array_in_fp_ks_buffer, delta_log,
lwe_array_out_pbs_buffer, lwe_array_in_fp_ks_buffer, cbs_delta_log,
polynomial_size, glwe_dimension, lwe_dimension, level_count_bsk,
base_log_bsk, level_count_pksk, base_log_pksk, level_count_cbs,
base_log_cbs, number_of_inputs, max_shared_memory);
check_cuda_error(cudaGetLastError());
// Free memory
cuda_drop_async(lwe_array_in_fp_ks_buffer, stream, gpu_index);
cuda_drop_async(lwe_array_in_shifted_buffer, stream, gpu_index);
cuda_drop_async(lwe_array_out_pbs_buffer, stream, gpu_index);
cuda_drop_async(lut_vector_cbs, stream, gpu_index);
cuda_drop_async(lut_vector_indexes, stream, gpu_index);
free(h_lut_vector_indexes);
// number_of_inputs = tau * p is the total number of GGSWs
// split the vec of GGSW in two, the msb GGSW is for the CMux tree and the
// lsb GGSW is for the last blind rotation.
uint32_t r = number_of_inputs - params::log2_degree;
Torus *glwe_array_out = (Torus *)cuda_malloc_async(
tau * (glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream,
gpu_index);
// CMUX Tree
// r = tau * p - log2(N)
host_cmux_tree<Torus, STorus, params>(
@@ -123,52 +172,109 @@ __host__ void host_circuit_bootstrap_vertical_packing(
v_stream, gpu_index, lwe_array_out, br_ggsw, glwe_array_out,
number_of_inputs - r, tau, glwe_dimension, polynomial_size, base_log_cbs,
level_count_cbs, max_shared_memory);
}
cuda_drop_async(glwe_array_out, stream, gpu_index);
cuda_drop_async(ggsw_out, stream, gpu_index);
template <typename Torus>
__host__ __device__ int
get_buffer_size_wop_pbs(uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t level_count_cbs,
uint32_t number_of_bits_of_message_including_padding,
uint32_t number_of_bits_to_extract,
uint32_t number_of_inputs) {
return sizeof(Torus) // lut_vector_indexes
+ ((glwe_dimension + 1) * polynomial_size) * sizeof(Torus) // lut_pbs
+ (polynomial_size + 1) * sizeof(Torus) // lwe_array_in_buffer
+ (polynomial_size + 1) * sizeof(Torus) // lwe_array_in_shifted_buffer
+ (lwe_dimension + 1) * sizeof(Torus) // lwe_array_out_ks_buffer
+ (polynomial_size + 1) * sizeof(Torus) // lwe_array_out_pbs_buffer
+ (lwe_dimension + 1) * // lwe_array_out_bit_extract
(number_of_bits_of_message_including_padding) * sizeof(Torus);
}
template <typename Torus>
__host__ void
scratch_wop_pbs(void *v_stream, uint32_t gpu_index, Torus **wop_pbs_buffer,
uint32_t *delta_log, uint32_t *cbs_delta_log,
uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t level_count_cbs,
uint32_t number_of_bits_of_message_including_padding,
uint32_t number_of_bits_to_extract, uint32_t number_of_inputs) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
int wop_pbs_buffer_size = get_buffer_size_wop_pbs<Torus>(
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
number_of_bits_of_message_including_padding, number_of_bits_to_extract,
number_of_inputs);
int buffer_size =
get_buffer_size_cbs_vp<Torus>(
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
number_of_inputs * number_of_bits_to_extract, number_of_inputs) +
wop_pbs_buffer_size;
*wop_pbs_buffer = (Torus *)cuda_malloc_async(buffer_size, stream, gpu_index);
// indexes of lut vectors for bit extract
Torus h_lut_vector_indexes = 0;
// lut_vector_indexes is the first array in the wop_pbs buffer
cuda_memcpy_async_to_gpu(*wop_pbs_buffer, &h_lut_vector_indexes,
sizeof(Torus), stream, gpu_index);
check_cuda_error(cudaGetLastError());
uint32_t ciphertext_total_bits_count = sizeof(Torus) * 8;
*delta_log =
ciphertext_total_bits_count - number_of_bits_of_message_including_padding;
Torus *cbs_vp_buffer =
*wop_pbs_buffer +
(ptrdiff_t)(
1 + ((glwe_dimension + 1) * polynomial_size) + (polynomial_size + 1) +
(polynomial_size + 1) + (lwe_dimension + 1) + (polynomial_size + 1) +
(lwe_dimension + 1) * (number_of_bits_of_message_including_padding));
scratch_circuit_bootstrap_vertical_packing<Torus>(
v_stream, gpu_index, &cbs_vp_buffer, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, level_count_cbs,
number_of_inputs * number_of_bits_to_extract, number_of_inputs, false);
}
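The element offset computed just above to locate the CBS + VP sub-buffer inside wop_pbs_buffer follows directly from the per-buffer sizes listed in get_buffer_size_wop_pbs. The standalone helper below is hypothetical (cbs_vp_offset_in_elements is not part of the library) and only mirrors that arithmetic:

#include <cstdint>

// Hypothetical helper (not in the library) mirroring the offset arithmetic in
// scratch_wop_pbs above (and in host_wop_pbs below): one lut_vector_indexes
// element, then lut_pbs, then the bit-extraction working buffers, after which
// the CBS + VP sub-buffer starts. Offsets are in Torus elements, not bytes.
constexpr uint64_t cbs_vp_offset_in_elements(uint64_t glwe_dimension,
                                             uint64_t lwe_dimension,
                                             uint64_t polynomial_size,
                                             uint64_t bits_incl_padding) {
  return 1                                          // lut_vector_indexes
         + (glwe_dimension + 1) * polynomial_size   // lut_pbs
         + (polynomial_size + 1)                    // lwe_array_in_buffer
         + (polynomial_size + 1)                    // lwe_array_in_shifted_buffer
         + (lwe_dimension + 1)                      // lwe_array_out_ks_buffer
         + (polynomial_size + 1)                    // lwe_array_out_pbs_buffer
         + (lwe_dimension + 1) * bits_incl_padding; // lwe_array_out_bit_extract
}

// Example with placeholder parameters (glwe_dimension = 1, lwe_dimension =
// 512, polynomial_size = 1024, 5 message bits including padding).
static_assert(cbs_vp_offset_in_elements(1, 512, 1024, 5) ==
                  1 + 2 * 1024 + 1025 + 1025 + 513 + 1025 + 513 * 5,
              "buffer layout sketch");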
/*
* Cleanup functions free the data used on the GPU and on the CPU (data that
* lives on the CPU is prefixed with `h_`). This cleanup function frees the
* GPU data used for the wop PBS, stored in wop_pbs_buffer.
*/
template <typename Torus>
__host__ void cleanup_wop_pbs(void *v_stream, uint32_t gpu_index,
Torus **wop_pbs_buffer) {
auto stream = static_cast<cudaStream_t *>(v_stream);
cuda_drop_async(*wop_pbs_buffer, stream, gpu_index);
}
template <typename Torus, typename STorus, class params>
__host__ void host_wop_pbs(
void *v_stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_array_in, Torus *lut_vector, double2 *fourier_bsk, Torus *ksk,
Torus *cbs_fpksk, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t base_log_bsk, uint32_t level_count_bsk,
uint32_t base_log_ksk, uint32_t level_count_ksk, uint32_t base_log_pksk,
uint32_t level_count_pksk, uint32_t base_log_cbs, uint32_t level_count_cbs,
Torus *cbs_fpksk, Torus *wop_pbs_buffer, uint32_t cbs_delta_log,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log_bsk, uint32_t level_count_bsk, uint32_t base_log_ksk,
uint32_t level_count_ksk, uint32_t base_log_pksk, uint32_t level_count_pksk,
uint32_t base_log_cbs, uint32_t level_count_cbs,
uint32_t number_of_bits_of_message_including_padding,
uint32_t number_of_bits_to_extract, uint32_t number_of_inputs,
uint32_t max_shared_memory) {
uint32_t number_of_bits_to_extract, uint32_t delta_log,
uint32_t number_of_inputs, uint32_t max_shared_memory) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
// let mut h_lut_vector_indexes = vec![0 as u32; 1];
// indexes of lut vectors for bit extract
Torus *h_lut_vector_indexes = (Torus *)malloc(sizeof(Torus));
h_lut_vector_indexes[0] = 0;
Torus *lut_vector_indexes =
(Torus *)cuda_malloc_async(sizeof(Torus), stream, gpu_index);
cuda_memcpy_async_to_gpu(lut_vector_indexes, h_lut_vector_indexes,
sizeof(Torus), stream, gpu_index);
check_cuda_error(cudaGetLastError());
Torus *lut_pbs = (Torus *)cuda_malloc_async(
(2 * polynomial_size) * sizeof(Torus), stream, gpu_index);
Torus *lwe_array_in_buffer = (Torus *)cuda_malloc_async(
(polynomial_size + 1) * sizeof(Torus), stream, gpu_index);
Torus *lwe_array_in_shifted_buffer = (Torus *)cuda_malloc_async(
(polynomial_size + 1) * sizeof(Torus), stream, gpu_index);
Torus *lwe_array_out_ks_buffer = (Torus *)cuda_malloc_async(
(lwe_dimension + 1) * sizeof(Torus), stream, gpu_index);
Torus *lwe_array_out_pbs_buffer = (Torus *)cuda_malloc_async(
(polynomial_size + 1) * sizeof(Torus), stream, gpu_index);
Torus *lwe_array_out_bit_extract = (Torus *)cuda_malloc_async(
(lwe_dimension + 1) * (number_of_bits_of_message_including_padding) *
sizeof(Torus),
stream, gpu_index);
uint32_t ciphertext_n_bits = sizeof(Torus) * 8;
uint32_t delta_log =
ciphertext_n_bits - number_of_bits_of_message_including_padding;
// lut_vector_indexes is the first array in the wop_pbs buffer
Torus *lut_vector_indexes = (Torus *)wop_pbs_buffer;
Torus *lut_pbs = (Torus *)lut_vector_indexes + (ptrdiff_t)(1);
Torus *lwe_array_in_buffer =
(Torus *)lut_pbs + (ptrdiff_t)((glwe_dimension + 1) * polynomial_size);
Torus *lwe_array_in_shifted_buffer =
(Torus *)lwe_array_in_buffer + (ptrdiff_t)(polynomial_size + 1);
Torus *lwe_array_out_ks_buffer =
(Torus *)lwe_array_in_shifted_buffer + (ptrdiff_t)(polynomial_size + 1);
Torus *lwe_array_out_pbs_buffer =
(Torus *)lwe_array_out_ks_buffer + (ptrdiff_t)(lwe_dimension + 1);
Torus *lwe_array_out_bit_extract =
(Torus *)lwe_array_out_pbs_buffer + (ptrdiff_t)(polynomial_size + 1);
host_extract_bits<Torus, params>(
v_stream, gpu_index, lwe_array_out_bit_extract, lwe_array_in,
lwe_array_in_buffer, lwe_array_in_shifted_buffer, lwe_array_out_ks_buffer,
@@ -177,22 +283,20 @@ __host__ void host_wop_pbs(
glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
level_count_ksk, number_of_inputs, max_shared_memory);
check_cuda_error(cudaGetLastError());
cuda_drop_async(lut_pbs, stream, gpu_index);
cuda_drop_async(lut_vector_indexes, stream, gpu_index);
cuda_drop_async(lwe_array_in_buffer, stream, gpu_index);
cuda_drop_async(lwe_array_in_shifted_buffer, stream, gpu_index);
cuda_drop_async(lwe_array_out_ks_buffer, stream, gpu_index);
cuda_drop_async(lwe_array_out_pbs_buffer, stream, gpu_index);
Torus *cbs_vp_buffer =
(Torus *)wop_pbs_buffer +
(ptrdiff_t)(
1 + ((glwe_dimension + 1) * polynomial_size) + (polynomial_size + 1) +
(polynomial_size + 1) + (lwe_dimension + 1) + (polynomial_size + 1) +
(lwe_dimension + 1) * number_of_bits_of_message_including_padding);
host_circuit_bootstrap_vertical_packing<Torus, STorus, params>(
v_stream, gpu_index, lwe_array_out, lwe_array_out_bit_extract, lut_vector,
fourier_bsk, cbs_fpksk, glwe_dimension, lwe_dimension, polynomial_size,
base_log_bsk, level_count_bsk, base_log_pksk, level_count_pksk,
base_log_cbs, level_count_cbs,
fourier_bsk, cbs_fpksk, cbs_vp_buffer, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, base_log_bsk, level_count_bsk,
base_log_pksk, level_count_pksk, base_log_cbs, level_count_cbs,
number_of_inputs * number_of_bits_to_extract, number_of_inputs,
max_shared_memory);
check_cuda_error(cudaGetLastError());
cuda_drop_async(lwe_array_out_bit_extract, stream, gpu_index);
}
#endif // WOP_PBS_H