From bc90576454e855e93f8d5cd0a11ae899efdc3e8d Mon Sep 17 00:00:00 2001 From: Beka Barbakadze Date: Wed, 11 Jan 2023 16:14:33 +0400 Subject: [PATCH] docs(cuda): add Rust doc for all concrete-cuda entry points --- include/bootstrap.h | 1 + src/addition.cu | 63 ++++++++++++++++ src/bit_extraction.cu | 48 ++++++++++++ src/bit_extraction.cuh | 65 +++++++++++------ src/bootstrap_amortized.cu | 122 +++++++++++++++++-------------- src/bootstrap_low_latency.cu | 137 +++++++++++++++++++++-------------- src/circuit_bootstrap.cu | 40 ++++++++++ src/circuit_bootstrap.cuh | 37 ++++++++-- src/keyswitch.cu | 49 +++++++++---- src/multiplication.cu | 32 ++++++++ src/negation.cu | 27 +++++++ src/vertical_packing.cu | 54 ++++++++++++++ src/wop_bootstrap.cu | 61 ++++++++++++++++ 13 files changed, 585 insertions(+), 151 deletions(-) diff --git a/include/bootstrap.h b/include/bootstrap.h index da28cde2b..3ba371236 100644 --- a/include/bootstrap.h +++ b/include/bootstrap.h @@ -86,6 +86,7 @@ void cuda_extract_bits_64( uint32_t glwe_dimension, uint32_t base_log_bsk, uint32_t level_count_bsk, uint32_t base_log_ksk, uint32_t level_count_ksk, uint32_t number_of_samples, uint32_t max_shared_memory); + void cuda_circuit_bootstrap_32( void *v_stream, uint32_t gpu_index, void *ggsw_out, void *lwe_array_in, void *fourier_bsk, void *fp_ksk_array, void *lwe_array_in_shifted_buffer, diff --git a/src/addition.cu b/src/addition.cu index fcaa669f0..c998c5de2 100644 --- a/src/addition.cu +++ b/src/addition.cu @@ -1,5 +1,9 @@ #include "addition.cuh" +/* + * Perform the addition of two u32 input LWE ciphertext vectors. + * See the equivalent operation on u64 ciphertexts for more details. + */ void cuda_add_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in_1, @@ -12,6 +16,33 @@ void cuda_add_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index, static_cast(lwe_array_in_2), input_lwe_dimension, input_lwe_ciphertext_count); } + +/* + * Perform the addition of two u64 input LWE ciphertext vectors. + * - `v_stream` is a void pointer to the Cuda stream to be used in the kernel + * launch + * - `gpu_index` is the index of the GPU to be used in the kernel launch + * - `lwe_array_out` is an array of size + * `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have + * been allocated on the GPU before calling this function, and that will hold + * the result of the computation. + * - `lwe_array_in_1` is the first LWE ciphertext vector used as input, it + * should have been allocated and initialized before calling this function. It + * has the same size as the output array. + * - `lwe_array_in_2` is the second LWE ciphertext vector used as input, it + * should have been allocated and initialized before calling this function. It + * has the same size as the output array. + * - `input_lwe_dimension` is the number of mask elements in the two input and + * in the output ciphertext vectors + * - `input_lwe_ciphertext_count` is the number of ciphertexts contained in each + * input LWE ciphertext vector, as well as in the output. + * + * Each element (mask element or body) of the input LWE ciphertext vector 1 is + * added to the corresponding element in the input LWE ciphertext 2. The result + * is stored in the output LWE ciphertext vector. The two input LWE ciphertext + * vectors are left unchanged. This function is a wrapper to a device function + * that performs the operation on the GPU. 
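+ *
+ * A minimal usage sketch (illustrative only: the stream setup and the device
+ * buffer names d_out, d_in_1 and d_in_2 are assumptions, not part of this API;
+ * the buffers are expected to be allocated on the GPU and filled beforehand):
+ *
+ *   cudaStream_t stream;
+ *   cudaStreamCreate(&stream);
+ *   cuda_add_lwe_ciphertext_vector_64(&stream, 0, d_out, d_in_1, d_in_2,
+ *                                     input_lwe_dimension,
+ *                                     input_lwe_ciphertext_count);
+ *   cudaStreamSynchronize(stream);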
+ */ void cuda_add_lwe_ciphertext_vector_64(void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in_1, @@ -24,6 +55,10 @@ void cuda_add_lwe_ciphertext_vector_64(void *v_stream, uint32_t gpu_index, static_cast(lwe_array_in_2), input_lwe_dimension, input_lwe_ciphertext_count); } +/* + * Perform the addition of a u32 input LWE ciphertext vector with a u32 + * plaintext vector. See the equivalent operation on u64 data for more details. + */ void cuda_add_lwe_ciphertext_vector_plaintext_vector_32( void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in, void *plaintext_array_in, uint32_t input_lwe_dimension, @@ -35,6 +70,34 @@ void cuda_add_lwe_ciphertext_vector_plaintext_vector_32( static_cast(plaintext_array_in), input_lwe_dimension, input_lwe_ciphertext_count); } +/* + * Perform the addition of a u64 input LWE ciphertext vector with a u64 input + * plaintext vector. + * - `v_stream` is a void pointer to the Cuda stream to be used in the kernel + * launch + * - `gpu_index` is the index of the GPU to be used in the kernel launch + * - `lwe_array_out` is an array of size + * `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have + * been allocated on the GPU before calling this function, and that will hold + * the result of the computation. + * - `lwe_array_in` is the LWE ciphertext vector used as input, it should have + * been allocated and initialized before calling this function. It has the same + * size as the output array. + * - `plaintext_array_in` is the plaintext vector used as input, it should have + * been allocated and initialized before calling this function. It should be of + * size `input_lwe_ciphertext_count`. + * - `input_lwe_dimension` is the number of mask elements in the input and + * output LWE ciphertext vectors + * - `input_lwe_ciphertext_count` is the number of ciphertexts contained in the + * input LWE ciphertext vector, as well as in the output. It is also the number + * of plaintexts in the input plaintext vector. + * + * Each plaintext of the input plaintext vector is added to the body of the + * corresponding LWE ciphertext in the LWE ciphertext vector. The result of the + * operation is stored in the output LWE ciphertext vector. The two input + * vectors are unchanged. This function is a wrapper to a device function that + * performs the operation on the GPU. + */ void cuda_add_lwe_ciphertext_vector_plaintext_vector_64( void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in, void *plaintext_array_in, uint32_t input_lwe_dimension, diff --git a/src/bit_extraction.cu b/src/bit_extraction.cu index e02a254dc..5d9f29065 100644 --- a/src/bit_extraction.cu +++ b/src/bit_extraction.cu @@ -1,5 +1,8 @@ #include "bit_extraction.cuh" +/* Perform bit extract on a batch of 32 bit LWE ciphertexts. + * See the corresponding function on 64 bit LWE ciphertexts for more details. + */ void cuda_extract_bits_32( void *v_stream, uint32_t gpu_index, void *list_lwe_array_out, void *lwe_array_in, void *lwe_array_in_buffer, @@ -97,6 +100,51 @@ void cuda_extract_bits_32( } } +/* Perform bit extract on a batch of 64 bit lwe ciphertexts. + * - `v_stream` is a void pointer to the Cuda stream to be used in the kernel + * launch + * - `gpu_index` is the index of the GPU to be used in the kernel launch + * - 'number_of_bits' will be extracted from each ciphertext + * starting at the bit number 'delta_log' (0-indexed) included. + * Output bits are ordered from the MSB to LSB. 
Every extracted bit is
+ * represented as an LWE ciphertext, containing the encryption of the bit scaled
+ * by q/2.
+ * - 'list_lwe_array_out' output batch of LWE ciphertexts, one for each bit of
+ * every input ciphertext
+ * - 'lwe_array_in' batch of input LWE ciphertexts, with size:
+ * ('lwe_dimension_in' + 1) * number_of_samples * sizeof(u64)
+ * The following 5 parameters are used during the calculations; they are not
+ * actual inputs of the function, just memory allocated for the computation.
+ * This way, memory can be allocated once and reused across different calls to
+ * the extract bits function.
+ * - 'lwe_array_in_buffer' same size as 'lwe_array_in'
+ * - 'lwe_array_in_shifted_buffer' same size as 'lwe_array_in'
+ * - 'lwe_array_out_ks_buffer' with size:
+ * ('lwe_dimension_out' + 1) * number_of_samples * sizeof(u64)
+ * - 'lwe_array_out_pbs_buffer' same size as 'lwe_array_in'
+ * - 'lut_pbs' with size:
+ * (glwe_dimension + 1) * (lwe_dimension_in + 1) * sizeof(u64)
+ * The other inputs are:
+ * - 'lut_vector_indexes' stores the index corresponding to which test
+ * vector to use
+ * - 'ksk' keyswitch key
+ * - 'fourier_bsk' complex compressed bsk in the fourier domain
+ * - 'lwe_dimension_in' input LWE ciphertext dimension, supported input
+ * dimensions are: {512, 1024, 2048, 4096, 8192}
+ * - 'lwe_dimension_out' output LWE ciphertext dimension
+ * - 'glwe_dimension' GLWE dimension, only glwe_dimension = 1 is supported
+ * for now
+ * - 'base_log_bsk' base_log for bootstrapping
+ * - 'level_count_bsk' decomposition level count for bootstrapping
+ * - 'base_log_ksk' base_log for keyswitch
+ * - 'level_count_ksk' decomposition level for keyswitch
+ * - 'number_of_samples' number of input LWE ciphertexts
+ * - 'max_shared_memory' maximum amount of shared memory to be used inside
+ * device functions
+ *
+ * This function calls the corresponding template instantiation of the wrapper
+ * host function, which manages the calls to the device functions.
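+ *
+ * As an illustration (a sketch only; the variable names below are
+ * hypothetical), the buffer sizes listed above translate to the following
+ * allocations in bytes:
+ *
+ *   size_t in_bytes  = (lwe_dimension_in + 1) * number_of_samples
+ *                      * sizeof(uint64_t); // lwe_array_in and its buffers
+ *   size_t ks_bytes  = (lwe_dimension_out + 1) * number_of_samples
+ *                      * sizeof(uint64_t); // lwe_array_out_ks_buffer
+ *   size_t lut_bytes = (glwe_dimension + 1) * (lwe_dimension_in + 1)
+ *                      * sizeof(uint64_t); // lut_pbs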
+ */
void cuda_extract_bits_64(
    void *v_stream, uint32_t gpu_index, void *list_lwe_array_out,
    void *lwe_array_in, void *lwe_array_in_buffer,
diff --git a/src/bit_extraction.cuh b/src/bit_extraction.cuh
index 37fbfc3a1..f8d6539a0 100644
--- a/src/bit_extraction.cuh
+++ b/src/bit_extraction.cuh
@@ -11,10 +11,12 @@
 #include "polynomial/parameters.cuh"
 #include "utils/timer.cuh"
-// only works for big lwe for ks+bs case
-// state_lwe_buffer is copied from big lwe input
-// shifted_lwe_buffer is scalar multiplication of lwe input
-// blockIdx.x refers to input ciphertext id
+/*
+ * Copies a batch of input LWE ciphertexts into two different buffers:
+ * one buffer holds the input multiplied by a shift value,
+ * the other holds an unmodified copy.
+ * Works for ciphertexts with sizes supported by params::degree.
+ */
 template 
__global__ void copy_and_shift_lwe(Torus *dst_copy, Torus *dst_shift,
                                    Torus *src, Torus value) {
@@ -37,8 +39,10 @@ __global__ void copy_and_shift_lwe(Torus *dst_copy, Torus *dst_shift,
   }
 }
-// only works for small lwe in ks+bs case
-// function copies lwe when length is not a power of two
+/*
+ * Copies a batch of LWE ciphertexts when the size is not supported by
+ * params::degree.
+ */
 template 
__global__ void copy_small_lwe(Torus *dst, Torus *src, uint32_t small_lwe_size,
                               uint32_t number_of_bits, uint32_t lwe_id) {
@@ -62,24 +66,27 @@ __global__ void copy_small_lwe(Torus *dst, Torus *src, uint32_t small_lwe_size,
     cur_dst[tid] = cur_src[tid];
 }
-// only used in extract bits for one ciphertext
-// should be called with one block and one thread
-// NOTE: check if putting this functionality in copy_small_lwe or
-// fill_pbs_lut vector is faster
+/*
+ * Wrapping-adds value to the body of each ciphertext.
+ * Should be called with a block size of 1;
+ * blockIdx.x refers to the id of the ciphertext.
+ * NOTE: check if putting this functionality in copy_small_lwe or fill_pbs_lut
+ * is faster
+ */
 template 
__global__ void add_to_body(Torus *lwe, size_t lwe_dimension, Torus value) {
  lwe[blockIdx.x * (lwe_dimension + 1) + lwe_dimension] += value;
}
-// Add alpha where alpha = delta*2^{bit_idx-1} to end up with an encryption of 0
-// if the extracted bit was 0 and 1 in the other case
-//
-// Remove the extracted bit from the state LWE to get a 0 at the extracted bit
-// location.
-//
-// Shift on padding bit for next iteration, that's why
-// alpha= 1ll << (ciphertext_n_bits - delta_log - bit_idx - 2) is used
-// instead of alpha= 1ll << (ciphertext_n_bits - delta_log - bit_idx - 1)
+/*
+ * Add alpha, where alpha = delta*2^{bit_idx-1}, to end up with an encryption
+ * of 0 if the extracted bit was 0, and of 1 otherwise.
+ * Remove the extracted bit from the state LWE to get a 0 at the extracted bit
+ * location.
+ * Shift on padding bit for next iteration, that's why
+ * alpha= 1ll << (ciphertext_n_bits - delta_log - bit_idx - 2) is used
+ * instead of alpha= 1ll << (ciphertext_n_bits - delta_log - bit_idx - 1)
+ */
 template 
__global__ void add_sub_and_mul_lwe(Torus *shifted_lwe, Torus *state_lwe,
                                    Torus *pbs_lwe_array_out, Torus add_value,
@@ -104,9 +111,11 @@ __global__ void add_sub_and_mul_lwe(Torus *shifted_lwe, Torus *state_lwe,
   }
 }
-// Fill lut(only body) for the current bit (equivalent to trivial encryption as
-// mask is 0s)
-// The LUT is filled with value
+/*
+ * Fill the lut (only the body) for the current bit, equivalent to a trivial
+ * encryption as the mask is 0s.
+ * blockIdx.x refers to the id of the lut vector.
+ */
 template 
__global__ void fill_lut_body_for_current_bit(Torus *lut, Torus value) {
@@ -119,6 +128,11 @@ __global__ void fill_lut_body_for_current_bit(Torus *lut, Torus value) {
   }
 }
+/*
+ * Host function for cuda extract bits.
+ * It executes the device functions in a specific order and manages the
+ * parallelism.
+ */
 template 
__host__ void host_extract_bits(
    void *v_stream, uint32_t gpu_index, Torus *list_lwe_array_out,
@@ -137,6 +151,7 @@ __host__ void host_extract_bits(
   int blocks = 1;
   int threads = params::degree / params::opt;
+  // shift lwe on padding bit and copy in new buffer
   copy_and_shift_lwe<<>>(
       lwe_array_in_buffer, lwe_array_in_shifted_buffer, lwe_array_in,
       1ll << (ciphertext_n_bits - delta_log - 1));
@@ -157,11 +172,15 @@
     break;
   }
+  // Add q/4 to center the error while computing a negacyclic LUT
   add_to_body<<<1, 1, 0, *stream>>>(lwe_array_out_ks_buffer,
                                     lwe_dimension_out,
                                     1ll << (ciphertext_n_bits - 2));
   checkCudaErrors(cudaGetLastError());
+  // Fill lut for the current bit (equivalent to trivial encryption as mask is
+  // 0s). The LUT is filled with -alpha in each coefficient where alpha =
+  // delta*2^{bit_idx-1}
   fill_lut_body_for_current_bit
      <<>>(
          lut_pbs, 0ll - 1ll << (delta_log - 1 + bit_idx));
@@ -173,6 +192,8 @@
       lwe_dimension_out, lwe_dimension_in, base_log_bsk, level_count_bsk,
       number_of_samples, 1, max_shared_memory);
+  // Add alpha where alpha = delta*2^{bit_idx-1} to end up with an encryption
+  // of 0 if the extracted bit was 0, and of 1 otherwise
   add_sub_and_mul_lwe<<<1, threads, 0, *stream>>>(
       lwe_array_in_shifted_buffer, lwe_array_in_buffer,
       lwe_array_out_pbs_buffer, 1ll << (delta_log - 1 + bit_idx),
diff --git a/src/bootstrap_amortized.cu b/src/bootstrap_amortized.cu
index 11d06e985..612f5c33b 100644
--- a/src/bootstrap_amortized.cu
+++ b/src/bootstrap_amortized.cu
@@ -1,60 +1,7 @@
 #include "bootstrap_amortized.cuh"
-/* Perform bootstrapping on a batch of input LWE ciphertexts
- *
- * - lwe_array_out: output batch of num_samples bootstrapped ciphertexts c =
- * (a0,..an-1,b) where n is the LWE dimension
- * - lut_vector: should hold as many test vectors of size polynomial_size
- * as there are input ciphertexts, but actually holds
- * num_lut_vectors vectors to reduce memory usage
- * - lut_vector_indexes: stores the index corresponding to
- * which test vector to use for each sample in
- * lut_vector
- * - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n
- * mask values + 1 body value
- * - bootstrapping_key: RGSW encryption of the LWE secret key sk1
- * under secret key sk2
- * bsk = Z + sk1 H
- * where H is the gadget matrix and Z is a matrix (k+1).l
- * containing GLWE encryptions of 0 under sk2.
- * bsk is thus a tensor of size (k+1)^2.l.N.n - * where l is the number of decomposition levels and - * k is the GLWE dimension, N is the polynomial size for - * GLWE. The polynomial size for GLWE and the test vector - * are the same because they have to be in the same ring - * to be multiplied. - * Note: it is necessary to generate (k+1).k.l.N.n - * uniformly random coefficients for the zero encryptions - * - input_lwe_dimension: size of the Torus vector used to encrypt the input - * LWE ciphertexts - referred to as n above (~ 600) - * - polynomial_size: size of the test polynomial (test vector) and size of the - * GLWE polynomial (~1024) - * - base_log: log base used for the gadget matrix - B = 2^base_log (~8) - * - level_count: number of decomposition levels in the gadget matrix (~4) - * - num_samples: number of encrypted input messages - * - num_lut_vectors: parameter to set the actual number of test vectors to be - * used - * - q: number of bytes in the integer representation (32 or 64) - * - * This function calls a wrapper to a device kernel that performs the - * bootstrapping: - * - the kernel is templatized based on integer discretization and - * polynomial degree - * - num_samples blocks of threads are launched, where each thread is going - * to handle one or more polynomial coefficients at each stage: - * - perform the blind rotation - * - round the result - * - decompose into level_count levels, then for each level: - * - switch to the FFT domain - * - multiply with the bootstrapping key - * - come back to the coefficients representation - * - between each stage a synchronization of the threads is necessary - * - in case the device has enough shared memory, temporary arrays used for - * the different stages (accumulators) are stored into the shared memory - * - the accumulators serve to combine the results for all decomposition - * levels - * - the constant memory (64K) is used for storing the roots of identity - * values for the FFT +/* Perform the programmable bootstrapping on a batch of input u32 LWE + * ciphertexts. See the corresponding operation on 64 bits for more details. */ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32( @@ -115,6 +62,71 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32( } } +/* Perform the programmable bootstrapping on a batch of input u64 LWE + * ciphertexts. This functions performs best for large numbers of inputs (> 10). + * - `v_stream` is a void pointer to the Cuda stream to be used in the kernel + * launch + * - `gpu_index` is the index of the GPU to be used in the kernel launch + * - lwe_array_out: output batch of num_samples bootstrapped ciphertexts c = + * (a0,..an-1,b) where n is the LWE dimension + * - lut_vector: should hold as many test vectors of size polynomial_size + * as there are input ciphertexts, but actually holds + * num_lut_vectors vectors to reduce memory usage + * - lut_vector_indexes: stores the index corresponding to + * which test vector of lut_vector to use for each LWE input in + * lwe_array_in + * - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n + * mask values + 1 body value + * - bootstrapping_key: GGSW encryption of the LWE secret key sk1 + * under secret key sk2 + * bsk = Z + sk1 H + * where H is the gadget matrix and Z is a matrix (k+1).l + * containing GLWE encryptions of 0 under sk2. + * bsk is thus a tensor of size (k+1)^2.l.N.n + * where l is the number of decomposition levels and + * k is the GLWE dimension, N is the polynomial size for + * GLWE. 
The polynomial size for GLWE and the test vector + * are the same because they have to be in the same ring + * to be multiplied. + * - input_lwe_dimension: size of the Torus vector used to encrypt the input + * LWE ciphertexts - referred to as n above (~ 600) + * - polynomial_size: size of the test polynomial (test vector) and size of the + * GLWE polynomials (~1024) (where `size` refers to the polynomial degree + 1). + * - base_log: log of the base used for the gadget matrix - B = 2^base_log (~8) + * - level_count: number of decomposition levels in the gadget matrix (~4) + * - num_samples: number of encrypted input messages + * - num_lut_vectors: parameter to set the actual number of test vectors to be + * used + * - lwe_idx: the index of the LWE input to consider for the GPU of index + * gpu_index. In case of multi-GPU computing, it is assumed that only a part of + * the input LWE array is copied to each GPU, but the whole LUT array is copied + * (because the case when the number of LUTs is smaller than the number of input + * LWEs is not trivial to take into account in the data repartition on the + * GPUs). `lwe_idx` is used to determine which LUT to consider for a given LWE + * input in the LUT array `lut_vector`. + * - 'max_shared_memory' maximum amount of shared memory to be used inside + * device functions + * + * This function calls a wrapper to a device kernel that performs the + * bootstrapping: + * - the kernel is templatized based on integer discretization and + * polynomial degree + * - num_samples blocks of threads are launched, where each thread is going + * to handle one or more polynomial coefficients at each stage: + * - perform the blind rotation + * - round the result + * - decompose into level_count levels, then for each level: + * - switch to the FFT domain + * - multiply with the bootstrapping key + * - come back to the coefficients representation + * - between each stage a synchronization of the threads is necessary + * - in case the device has enough shared memory, temporary arrays used for + * the different stages (accumulators) are stored into the shared memory + * - the accumulators serve to combine the results for all decomposition + * levels + * - the constant memory (64K) is used for storing the roots of identity + * values for the FFT + */ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64( void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector, void *lut_vector_indexes, void *lwe_array_in, void *bootstrapping_key, diff --git a/src/bootstrap_low_latency.cu b/src/bootstrap_low_latency.cu index 92cd6943b..246963314 100644 --- a/src/bootstrap_low_latency.cu +++ b/src/bootstrap_low_latency.cu @@ -1,60 +1,11 @@ #include "bootstrap_low_latency.cuh" -/* Perform bootstrapping on a batch of input LWE ciphertexts - * - * - lwe_array_out: output batch of num_samples bootstrapped ciphertexts c = - * (a0,..an-1,b) where n is the LWE dimension - * - lut_vector: should hold as many test vectors of size polynomial_size - * as there are input ciphertexts, but actually holds - * num_lut_vectors vectors to reduce memory usage - * - lut_vector_indexes: stores the index corresponding to - * which test vector to use for each sample in - * lut_vector - * - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n - * mask values + 1 body value - * - bootstrapping_key: RGSW encryption of the LWE secret key sk1 - * under secret key sk2 - * bsk = Z + sk1 H - * where H is the gadget matrix and Z is a matrix (k+1).l - * containing GLWE encryptions of 
0 under sk2. - * bsk is thus a tensor of size (k+1)^2.l.N.n - * where l is the number of decomposition levels and - * k is the GLWE dimension, N is the polynomial size for - * GLWE. The polynomial size for GLWE and the test vector - * are the same because they have to be in the same ring - * to be multiplied. - * Note: it is necessary to generate (k+1).k.l.N.n - * uniformly random coefficients for the zero encryptions - * - lwe_dimension: size of the Torus vector used to encrypt the input - * LWE ciphertexts - referred to as n above (~ 600) - * - polynomial_size: size of the test polynomial (test vector) and size of the - * GLWE polynomial (~1024) - * - base_log: log base used for the gadget matrix - B = 2^base_log (~8) - * - level_count: number of decomposition levels in the gadget matrix (~4) - * - num_samples: number of encrypted input messages - * - num_lut_vectors: parameter to set the actual number of test vectors to be - * used - * - q: number of bytes in the integer representation (32 or 64) - * - * This function calls a wrapper to a device kernel that performs the - * bootstrapping: - * - the kernel is templatized based on integer discretization and - * polynomial degree - * - num_samples blocks of threads are launched, where each thread is going - * to handle one or more polynomial coefficients at each stage: - * - perform the blind rotation - * - round the result - * - decompose into level_count levels, then for each level: - * - switch to the FFT domain - * - multiply with the bootstrapping key - * - come back to the coefficients representation - * - between each stage a synchronization of the threads is necessary - * - in case the device has enough shared memory, temporary arrays used for - * the different stages (accumulators) are stored into the shared memory - * - the accumulators serve to combine the results for all decomposition - * levels - * - the constant memory (64K) is used for storing the roots of identity - * values for the FFT +/* Perform bootstrapping on a batch of input u32 LWE ciphertexts. + * This function performs best for small numbers of inputs. Beyond a certain + * number of inputs (the exact number depends on the cryptographic parameters), + * the kernel cannot be launched and it is necessary to split the kernel call + * into several calls on smaller batches of inputs. For more details on this + * operation, head on to the equivalent u64 operation. */ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32( void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector, @@ -124,6 +75,82 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32( } } +/* Perform bootstrapping on a batch of input u64 LWE ciphertexts. + * This function performs best for small numbers of inputs. Beyond a certain + * number of inputs (the exact number depends on the cryptographic parameters), + * the kernel cannot be launched and it is necessary to split the kernel call + * into several calls on smaller batches of inputs. 
+ * + * - `v_stream` is a void pointer to the Cuda stream to be used in the kernel + * launch + * - `gpu_index` is the index of the GPU to be used in the kernel launch + * - lwe_array_out: output batch of num_samples bootstrapped ciphertexts c = + * (a0,..an-1,b) where n is the LWE dimension + * - lut_vector: should hold as many test vectors of size polynomial_size + * as there are input ciphertexts, but actually holds + * num_lut_vectors vectors to reduce memory usage + * - lut_vector_indexes: stores the index corresponding to + * which test vector to use for each sample in + * lut_vector + * - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n + * mask values + 1 body value + * - bootstrapping_key: GGSW encryption of the LWE secret key sk1 + * under secret key sk2 + * bsk = Z + sk1 H + * where H is the gadget matrix and Z is a matrix (k+1).l + * containing GLWE encryptions of 0 under sk2. + * bsk is thus a tensor of size (k+1)^2.l.N.n + * where l is the number of decomposition levels and + * k is the GLWE dimension, N is the polynomial size for + * GLWE. The polynomial size for GLWE and the test vector + * are the same because they have to be in the same ring + * to be multiplied. + * - lwe_dimension: size of the Torus vector used to encrypt the input + * LWE ciphertexts - referred to as n above (~ 600) + * - glwe_dimension: size of the polynomial vector used to encrypt the LUT + * GLWE ciphertexts - referred to as k above. Only the value 1 is supported for + * this parameter. + * - polynomial_size: size of the test polynomial (test vector) and size of the + * GLWE polynomial (~1024) + * - base_log: log base used for the gadget matrix - B = 2^base_log (~8) + * - level_count: number of decomposition levels in the gadget matrix (~4) + * - num_samples: number of encrypted input messages + * - num_lut_vectors: parameter to set the actual number of test vectors to be + * used + * - lwe_idx: the index of the LWE input to consider for the GPU of index + * gpu_index. In case of multi-GPU computing, it is assumed that only a part of + * the input LWE array is copied to each GPU, but the whole LUT array is copied + * (because the case when the number of LUTs is smaller than the number of input + * LWEs is not trivial to take into account in the data repartition on the + * GPUs). `lwe_idx` is used to determine which LUT to consider for a given LWE + * input in the LUT array `lut_vector`. + * - 'max_shared_memory' maximum amount of shared memory to be used inside + * device functions + * + * This function calls a wrapper to a device kernel that performs the + * bootstrapping: + * - the kernel is templatized based on integer discretization and + * polynomial degree + * - num_samples * level_count * (glwe_dimension + 1) blocks of threads are + * launched, where each thread is going to handle one or more polynomial + * coefficients at each stage, for a given level of decomposition, either for + * the LUT mask or its body: + * - perform the blind rotation + * - round the result + * - get the decomposition for the current level + * - switch to the FFT domain + * - multiply with the bootstrapping key + * - come back to the coefficients representation + * - between each stage a synchronization of the threads is necessary (some + * synchronizations happen at the block level, some happen between blocks, using + * cooperative groups). 
+ * - in case the device has enough shared memory, temporary arrays used for + * the different stages (accumulators) are stored into the shared memory + * - the accumulators serve to combine the results for all decomposition + * levels + * - the constant memory (64K) is used for storing the roots of identity + * values for the FFT + */ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64( void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector, void *lut_vector_indexes, void *lwe_array_in, void *bootstrapping_key, diff --git a/src/circuit_bootstrap.cu b/src/circuit_bootstrap.cu index d43d531cc..df3622ec7 100644 --- a/src/circuit_bootstrap.cu +++ b/src/circuit_bootstrap.cu @@ -1,5 +1,9 @@ #include "circuit_bootstrap.cuh" +/* + * Perform circuit bootstrapping for the batch of 32 bit LWE ciphertexts. + * Head out to the equivalent operation on 64 bits for more details. + */ void cuda_circuit_bootstrap_32( void *v_stream, uint32_t gpu_index, void *ggsw_out, void *lwe_array_in, void *fourier_bsk, void *fp_ksk_array, void *lwe_array_in_shifted_buffer, @@ -88,6 +92,42 @@ void cuda_circuit_bootstrap_32( } } +/* + * Perform circuit bootstrapping on a batch of 64 bit input LWE ciphertexts. + * - `v_stream` is a void pointer to the Cuda stream to be used in the kernel + * launch + * - `gpu_index` is the index of the GPU to be used in the kernel launch + * - 'ggsw_out' output batch of ggsw with size: + * 'number_of_samples' * 'level_cbs' * ('glwe_dimension' + 1)^2 * + * polynomial_size * sizeof(u64) + * - 'lwe_array_in' input batch of lwe ciphertexts, with size: + * 'number_of_samples' * '(lwe_dimension' + 1) * sizeof(u64) + * - 'fourier_bsk' bootstrapping key in fourier domain with size: + * 'lwe_dimension' * 'level_bsk' * ('glwe_dimension' + 1)^2 * + * 'polynomial_size' / 2 * sizeof(double2) + * - 'fp_ksk_array' batch of fp-keyswitch keys with size: + * ('polynomial_size' + 1) * 'level_pksk' * ('glwe_dimension' + 1)^2 * + * 'polynomial_size' * sizeof(u64) + * The following 5 parameters are used during calculations, they are not actual + * inputs of the function, they are just allocated memory for calculation + * process, like this, memory can be allocated once and can be used as much + * as needed for different calls of circuit_bootstrap function + * - 'lwe_array_in_shifted_buffer' with size: + * 'number_of_samples' * 'level_cbs' * ('lwe_dimension' + 1) * sizeof(u64) + * - 'lut_vector' with size: + * 'level_cbs' * ('glwe_dimension' + 1) * 'polynomial_size' * sizeof(u64) + * - 'lut_vector_indexes' stores the index corresponding to which test + * vector to use + * - 'lwe_array_out_pbs_buffer' with size + * 'number_of_samples' * 'level_cbs' * ('polynomial_size' + 1) * sizeof(u64) + * - 'lwe_array_in_fp_ks_buffer' with size + * 'number_of_samples' * 'level_cbs' * ('glwe_dimension' + 1) * + * ('polynomial_size' + 1) * sizeof(u64) + * + * This function calls a wrapper to a device kernel that performs the + * circuit bootstrap. The kernel is templatized based on integer discretization + * and polynomial degree. 
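+ *
+ * As a sketch of how the output could be allocated before the call (the
+ * variable name d_ggsw_out is a hypothetical example, following the size
+ * given above for 'ggsw_out'):
+ *
+ *   uint64_t *d_ggsw_out;
+ *   size_t ggsw_bytes = (size_t)number_of_samples * level_cbs
+ *                       * (glwe_dimension + 1) * (glwe_dimension + 1)
+ *                       * polynomial_size * sizeof(uint64_t);
+ *   cudaMalloc((void **)&d_ggsw_out, ggsw_bytes);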
+ */
void cuda_circuit_bootstrap_64(
    void *v_stream, uint32_t gpu_index, void *ggsw_out, void *lwe_array_in,
    void *fourier_bsk, void *fp_ksk_array, void *lwe_array_in_shifted_buffer,
diff --git a/src/circuit_bootstrap.cuh b/src/circuit_bootstrap.cuh
index 71419b3cc..9b3aa36a6 100644
--- a/src/circuit_bootstrap.cuh
+++ b/src/circuit_bootstrap.cuh
@@ -10,9 +10,11 @@
 #include "polynomial/parameters.cuh"
 #include "utils/timer.cuh"
-// works for lwe with generic sizes
-// shifted_lwe_buffer is scalar multiplication of lwe input
-// blockIdx.x refers to input ciphertext id
+/*
+ * Scalar multiplication of a batch of lwe ciphertexts by value.
+ * Works for any size of lwe input.
+ * blockIdx.x refers to the input ciphertext id.
+ */
 template 
__global__ void shift_lwe_cbs(Torus *dst_shift, Torus *src, Torus value,
                              size_t lwe_size) {
@@ -35,9 +37,14 @@ __global__ void shift_lwe_cbs(Torus *dst_shift, Torus *src, Torus value,
   cur_dst[tid] = cur_src[tid] * value;
 }
-// Fill lut (equivalent to trivial encryption as mask is 0s)
-// The LUT is filled with -alpha in each coefficient where
-// alpha = 2^{log(q) - 1 - base_log * level}
+/*
+ * Fill the lut, equivalent to a trivial encryption as the mask is 0s.
+ * The LUT is filled with -alpha in each coefficient, where
+ * alpha = 2^{log(q) - 1 - base_log * level}.
+ * blockIdx.x refers to the lut id.
+ * value is not passed as a parameter but computed inside the function,
+ * because it depends on the lut id.
+ */
 template 
__global__ void fill_lut_body_for_cbs(Torus *lut, uint32_t ciphertext_n_bits,
                                      uint32_t base_log_cbs) {
@@ -55,6 +62,19 @@ __global__ void fill_lut_body_for_cbs(Torus *lut, uint32_t ciphertext_n_bits,
   }
 }
+/*
+ * Copy the pbs result (glwe_dimension + 1) times to be used as input of the
+ * fp-ks: each input ciphertext from lwe_src is copied (glwe_dimension + 1)
+ * times inside lwe_dst, and then value is added to the body.
+ * blockIdx.x refers to the destination lwe ciphertext id: 'dst_lwe_id'
+ * 'src_lwe_id' = 'dst_lwe_id' / (glwe_dimension + 1)
+ *
+ * example: glwe_dimension = 1
+ *   src_0     ...     src_n
+ *   /   \            /    \
+ *  /     \          /      \
+ * dst_0  dst_1   dst_2n  dst_2n+1
+ */
 template 
__global__ void copy_add_lwe_cbs(Torus *lwe_dst, Torus *lwe_src,
                                 uint32_t ciphertext_n_bits,
@@ -77,6 +97,11 @@ __global__ void copy_add_lwe_cbs(Torus *lwe_dst, Torus *lwe_src,
   }
 }
+/*
+ * Host function for cuda circuit bootstrap.
+ * It executes the device functions in a specific order and manages the
+ * parallelism.
+ */
 template 
__host__ void host_circuit_bootstrap(
    void *v_stream, uint32_t gpu_index, Torus *ggsw_out, Torus *lwe_array_in,
diff --git a/src/keyswitch.cu b/src/keyswitch.cu
index c9b9377ac..51bc397fb 100644
--- a/src/keyswitch.cu
+++ b/src/keyswitch.cu
@@ -4,15 +4,8 @@
 #include 
-/* Perform keyswitch on a batch of input LWE ciphertexts for 32 bits
- *
- * - lwe_array_out: output batch of num_samples keyswitched ciphertexts c =
- * (a0,..an-1,b) where n is the LWE dimension
- * - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n
- * mask values + 1 body value
- *
- * This function calls a wrapper to a device kernel that performs the keyswitch
- * - num_samples blocks of threads are launched
+/* Perform keyswitch on a batch of 32-bit input LWE ciphertexts.
+ * See the equivalent operation on 64 bits for more details.
*/ void cuda_keyswitch_lwe_ciphertext_vector_32( void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in, @@ -24,12 +17,18 @@ void cuda_keyswitch_lwe_ciphertext_vector_32( lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples); } -/* Perform keyswitch on a batch of input LWE ciphertexts for 64 bits +/* Perform keyswitch on a batch of 64 bits input LWE ciphertexts. * + * - `v_stream` is a void pointer to the Cuda stream to be used in the kernel + * launch + * - `gpu_index` is the index of the GPU to be used in the kernel launch * - lwe_array_out: output batch of num_samples keyswitched ciphertexts c = - * (a0,..an-1,b) where n is the LWE dimension - * - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n - * mask values + 1 body value + * (a0,..an-1,b) where n is the output LWE dimension (lwe_dimension_out) + * - lwe_array_in: input batch of num_samples LWE ciphertexts, containing + * lwe_dimension_in mask values + 1 body value + * - ksk: the keyswitch key to be used in the operation + * - base log: the log of the base used in the decomposition (should be the one + * used to create the ksk) * * This function calls a wrapper to a device kernel that performs the keyswitch * - num_samples blocks of threads are launched @@ -44,6 +43,9 @@ void cuda_keyswitch_lwe_ciphertext_vector_64( lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples); } +/* Perform functional packing keyswitch on a batch of 32 bits input LWE + * ciphertexts. See the equivalent function on 64 bit inputs for more details. + */ void cuda_fp_keyswitch_lwe_to_glwe_32(void *v_stream, void *glwe_array_out, void *lwe_array_in, void *fp_ksk_array, uint32_t input_lwe_dimension, @@ -60,6 +62,27 @@ void cuda_fp_keyswitch_lwe_to_glwe_32(void *v_stream, void *glwe_array_out, output_glwe_dimension, output_polynomial_size, base_log, level_count, number_of_input_lwe, number_of_keys); } + +/* Perform functional packing keyswitch on a batch of 64 bits input LWE + * ciphertexts. + * + * - `v_stream` is a void pointer to the Cuda stream to be used in the kernel + * launch + * - `glwe_array_out`: output batch of keyswitched ciphertexts + * - `lwe_array_in`: input batch of num_samples LWE ciphertexts, containing + * lwe_dimension_in mask values + 1 body value + * - `fp_ksk_array`: the functional packing keyswitch keys to be used in the + * operation + * - `base log`: the log of the base used in the decomposition (should be the + * one used to create the ksk) + * - `level_count`: the number of levels used in the decomposition (should be + * the one used to create the fp_ksks). + * - `number_of_input_lwe`: the number of inputs + * - `number_of_keys`: the number of fp_ksks + * + * This function calls a wrapper to a device kernel that performs the functional + * packing keyswitch. + */ void cuda_fp_keyswitch_lwe_to_glwe_64(void *v_stream, void *glwe_array_out, void *lwe_array_in, void *fp_ksk_array, uint32_t input_lwe_dimension, diff --git a/src/multiplication.cu b/src/multiplication.cu index 397c16255..8b1d0371b 100644 --- a/src/multiplication.cu +++ b/src/multiplication.cu @@ -1,5 +1,9 @@ #include "multiplication.cuh" +/* + * Perform the multiplication of a u32 input LWE ciphertext vector with a u32 + * cleartext vector. See the equivalent operation on u64 data for more details. 
+ */ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32( void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in, void *cleartext_array_in, uint32_t input_lwe_dimension, @@ -11,6 +15,34 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32( static_cast(cleartext_array_in), input_lwe_dimension, input_lwe_ciphertext_count); } +/* + * Perform the multiplication of a u64 input LWE ciphertext vector with a u64 + * input cleartext vector. + * - `v_stream` is a void pointer to the Cuda stream to be used in the kernel + * launch + * - `gpu_index` is the index of the GPU to be used in the kernel launch + * - `lwe_array_out` is an array of size + * `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have + * been allocated on the GPU before calling this function, and that will hold + * the result of the computation. + * - `lwe_array_in` is the LWE ciphertext vector used as input, it should have + * been allocated and initialized before calling this function. It has the same + * size as the output array. + * - `cleartext_array_in` is the cleartext vector used as input, it should have + * been allocated and initialized before calling this function. It should be of + * size `input_lwe_ciphertext_count`. + * - `input_lwe_dimension` is the number of mask elements in the input and + * output LWE ciphertext vectors + * - `input_lwe_ciphertext_count` is the number of ciphertexts contained in the + * input LWE ciphertext vector, as well as in the output. It is also the number + * of cleartexts in the input cleartext vector. + * + * Each cleartext of the input cleartext vector is multiplied to the mask and + * body of the corresponding LWE ciphertext in the LWE ciphertext vector. The + * result of the operation is stored in the output LWE ciphertext vector. The + * two input vectors are unchanged. This function is a wrapper to a device + * function that performs the operation on the GPU. + */ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64( void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in, void *cleartext_array_in, uint32_t input_lwe_dimension, diff --git a/src/negation.cu b/src/negation.cu index bb5ebcf6f..1f0f10d4e 100644 --- a/src/negation.cu +++ b/src/negation.cu @@ -1,5 +1,9 @@ #include "negation.cuh" +/* + * Perform the negation of a u32 input LWE ciphertext vector. + * See the equivalent operation on u64 ciphertexts for more details. + */ void cuda_negate_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in, @@ -10,6 +14,29 @@ void cuda_negate_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index, static_cast(lwe_array_in), input_lwe_dimension, input_lwe_ciphertext_count); } + +/* + * Perform the negation of a u64 input LWE ciphertext vector. + * - `v_stream` is a void pointer to the Cuda stream to be used in the kernel + * launch + * - `gpu_index` is the index of the GPU to be used in the kernel launch + * - `lwe_array_out` is an array of size + * `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have + * been allocated on the GPU before calling this function, and that will hold + * the result of the computation. + * - `lwe_array_in` is the LWE ciphertext vector used as input, it should have + * been allocated and initialized before calling this function. It has the same + * size as the output array. 
+ * - `input_lwe_dimension` is the number of mask elements in the input and
+ * output ciphertext vectors
+ * - `input_lwe_ciphertext_count` is the number of ciphertexts contained in the
+ * input LWE ciphertext vector, as well as in the output.
+ *
+ * Each element (mask element or body) of the input LWE ciphertext vector is
+ * negated. The result is stored in the output LWE ciphertext vector. The input
+ * LWE ciphertext vector is left unchanged. This function is a wrapper to a
+ * device function that performs the operation on the GPU.
+ */
void cuda_negate_lwe_ciphertext_vector_64(void *v_stream, uint32_t gpu_index,
                                          void *lwe_array_out,
                                          void *lwe_array_in,
diff --git a/src/vertical_packing.cu b/src/vertical_packing.cu
index 1f5499698..9f148f9fd 100644
--- a/src/vertical_packing.cu
+++ b/src/vertical_packing.cu
@@ -1,5 +1,9 @@
 #include "vertical_packing.cuh"
+/*
+ * Perform the cmux tree operation on a batch of 32-bit input GGSW ciphertexts.
+ * See the equivalent function for 64-bit inputs for more details.
+ */
void cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
                       void *ggsw_in, void *lut_vector,
                       uint32_t glwe_dimension, uint32_t polynomial_size,
                       uint32_t base_log,
@@ -55,6 +59,33 @@ void cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
   }
 }
+/*
+ * Perform the Cmux tree operation on a batch of 64-bit input GGSW ciphertexts.
+ * - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
+ * launch
+ * - `gpu_index` is the index of the GPU to be used in the kernel launch
+ * - 'glwe_array_out' output batch of GLWE buffers for the Cmux tree; 'tau'
+ * GLWE ciphertexts will be output by the function
+ * - 'ggsw_in' batch of input GGSW ciphertexts, the function expects 'r' GGSW
+ * ciphertexts as input.
+ * - 'lut_vector' batch of test vectors (LUTs); there should be 2^r LUTs
+ * inside the 'lut_vector' parameter
+ * - 'glwe_dimension' GLWE dimension, supported values: {1}
+ * - 'polynomial_size' size of the test polynomial, supported values: {512,
+ * 1024, 2048, 4096, 8192}
+ * - 'base_log' base log parameter for the cmux block
+ * - 'level_count' decomposition level for the cmux block
+ * - 'r' number of input GGSW ciphertexts
+ * - 'tau' number of input LWE ciphertexts which were used to generate the GGSW
+ * ciphertexts stored in 'ggsw_in'; it is also the number of output GLWE
+ * ciphertexts
+ * - 'max_shared_memory' maximum shared memory amount to be used for the cmux
+ * kernel
+ *
+ * This function calls a wrapper to a device kernel that performs the
+ * Cmux tree. The kernel is templatized based on integer discretization and
+ * polynomial degree.
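+ *
+ * As an illustration of the expected array lengths in u64 elements (a sketch
+ * derived from the descriptions above, assuming each LUT is a polynomial of
+ * 'polynomial_size' coefficients and each output GLWE holds
+ * (glwe_dimension + 1) polynomials):
+ *
+ *   size_t lut_vector_len     = ((size_t)1 << r) * polynomial_size;
+ *   size_t glwe_array_out_len = (size_t)tau * (glwe_dimension + 1)
+ *                               * polynomial_size;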
+ */ void cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index, void *glwe_array_out, void *ggsw_in, void *lut_vector, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, @@ -110,6 +141,29 @@ void cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index, void *glwe_array_out, } } +/* + * Performs blind rotation on batch of 64-bit input ggsw ciphertexts + * - `v_stream` is a void pointer to the Cuda stream to be used in the kernel + * launch + * - `gpu_index` is the index of the GPU to be used in the kernel launch + * - 'lwe_out' batch of output lwe ciphertexts, there should be 'tau' + * ciphertexts inside 'lwe_out' + * - 'ggsw_in' batch of input ggsw ciphertexts, function expects 'mbr_size' + * ggsw ciphertexts inside 'ggsw_in' + * - 'lut_vector' list of test vectors, function expects 'tau' test vectors + * inside 'lut_vector' parameter + * - 'glwe_dimension' glwe dimension, supported values : {1} + * - 'polynomial_size' size of test polynomial supported sizes: {512, 1024, + * 2048, 4096, 8192} + * - 'base_log' base log parameter + * - 'l_gadget' decomposition level + * - 'max_shared_memory' maximum number of shared memory to be used in + * device functions(kernels) + * + * This function calls a wrapper to a device kernel that performs the + * blind rotation and sample extraction. The kernel is templatized based on + * integer discretization and polynomial degree. + */ void cuda_blind_rotate_and_sample_extraction_64( void *v_stream, uint32_t gpu_index, void *lwe_out, void *ggsw_in, void *lut_vector, uint32_t mbr_size, uint32_t tau, uint32_t glwe_dimension, diff --git a/src/wop_bootstrap.cu b/src/wop_bootstrap.cu index 57fcb15c7..c144a3654 100644 --- a/src/wop_bootstrap.cu +++ b/src/wop_bootstrap.cu @@ -1,5 +1,32 @@ #include "wop_bootstrap.cuh" +/* + * Entry point for cuda circuit bootstrap + vertical packing for batches of + * input 64 bit LWE ciphertexts. + * - `v_stream` is a void pointer to the Cuda stream to be used in the kernel + * launch + * - `gpu_index` is the index of the GPU to be used in the kernel launch + * - 'lwe_array_out' list of output lwe ciphertexts + * - 'lwe_array_in' list of input lwe_ciphertexts + * - 'fourier_bsk' bootstrapping key in fourier domain, expected half size + * compressed complex key. + * - 'cbs_fpksk' list of private functional packing keyswitch keys + * - 'lut_vector' list of test vectors + * - 'polynomial_size' size of the test polynomial, supported sizes: + * {512, 1024, 2048, 4096, 8192} + * - 'glwe_dimension' supported dimensions: {1} + * - 'lwe_dimension' dimension of input LWE ciphertexts + * - 'level_count_bsk' decomposition level for bootstrapping + * - 'base_log_bsk' base log parameter for bootstrapping + * - 'level_count_pksk' decomposition level for fp-keyswitch + * - 'base_log_pksk' base log parameter for fp-keyswitch + * - 'level_count_cbs' level of circuit bootstrap + * - 'base_log_cbs' base log parameter for circuit bootstrap + * - 'number_of_inputs' number of input LWE ciphertexts + * - 'max_shared_memory' maximum shared memory amount to be used in + * bootstrapping kernel + * + */ void cuda_circuit_bootstrap_vertical_packing_64( void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in, void *fourier_bsk, void *cbs_fpksk, void *lut_vector, @@ -77,6 +104,40 @@ void cuda_circuit_bootstrap_vertical_packing_64( } } +/* + * Entry point for entire without padding programmable bootstrap on 64 bit input + * LWE ciphertexts. 
+ * - `v_stream` is a void pointer to the Cuda stream to be used in the kernel + * launch + * - `gpu_index` is the index of the GPU to be used in the kernel launch + * - 'lwe_array_out' list of output lwe ciphertexts + * - 'lwe_array_in' list of input lwe_ciphertexts + * - 'lut_vector' list of test vectors + * - 'fourier_bsk' bootstrapping key in fourier domain, expected half size + * compressed complex key. + * - 'ksk' keyswitch key to use inside extract bits block + * - 'cbs_fpksk' list of fp-keyswitch keys + * - 'glwe_dimension' supported dimensions: {1} + * - 'lwe_dimension' dimension of input lwe ciphertexts + * - 'polynomial_size' size of the test polynomial, supported sizes: + * {512, 1024, 2048, 4096, 8192} + * - 'base_log_bsk' base log parameter for bootstrapping + * - 'level_count_bsk' decomposition level for bootstrapping + * - 'base_log_ksk' base log parameter for keyswitch + * - 'level_count_ksk' decomposition level for keyswitch + * - 'base_log_pksk' base log parameter for fp-keyswitch + * - 'level_count_pksk' decomposition level for fp-keyswitch + * - 'base_log_cbs' base log parameter for circuit bootstrap + * - 'level_count_cbs' level of circuit bootstrap + * - 'number_of_bits_of_message_including_padding' number of bits to extract + * from each input lwe ciphertext including padding bit + * - 'number_of_bits_to_extract' number of bits to extract + * from each input lwe ciphertext without padding bit + * - 'number_of_inputs' number of input lwe ciphertexts + * - 'max_shared_memory' maximum shared memory amount to be used in + * bootstrapping kernel + * + */ void cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in, void *lut_vector, void *fourier_bsk, void *ksk, void *cbs_fpksk, uint32_t glwe_dimension,