mirror of
https://github.com/zama-ai/concrete.git
synced 2026-02-08 19:44:57 -05:00
docs(cuda): add Rust doc for all concrete-cuda entry points
This commit is contained in:
committed by
Agnès Leroy
parent
8327cd7fff
commit
bc90576454
@@ -1,5 +1,9 @@
|
||||
#include "addition.cuh"
|
||||
|
||||
/*
|
||||
* Perform the addition of two u32 input LWE ciphertext vectors.
|
||||
* See the equivalent operation on u64 ciphertexts for more details.
|
||||
*/
|
||||
void cuda_add_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index,
|
||||
void *lwe_array_out,
|
||||
void *lwe_array_in_1,
|
||||
@@ -12,6 +16,33 @@ void cuda_add_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index,
|
||||
static_cast<uint32_t *>(lwe_array_in_2), input_lwe_dimension,
|
||||
input_lwe_ciphertext_count);
|
||||
}
|
||||
|
||||
/*
|
||||
* Perform the addition of two u64 input LWE ciphertext vectors.
|
||||
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
|
||||
* launch
|
||||
* - `gpu_index` is the index of the GPU to be used in the kernel launch
|
||||
* - `lwe_array_out` is an array of size
|
||||
* `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have
|
||||
* been allocated on the GPU before calling this function, and that will hold
|
||||
* the result of the computation.
|
||||
* - `lwe_array_in_1` is the first LWE ciphertext vector used as input, it
|
||||
* should have been allocated and initialized before calling this function. It
|
||||
* has the same size as the output array.
|
||||
* - `lwe_array_in_2` is the second LWE ciphertext vector used as input, it
|
||||
* should have been allocated and initialized before calling this function. It
|
||||
* has the same size as the output array.
|
||||
* - `input_lwe_dimension` is the number of mask elements in the two input and
|
||||
* in the output ciphertext vectors
|
||||
* - `input_lwe_ciphertext_count` is the number of ciphertexts contained in each
|
||||
* input LWE ciphertext vector, as well as in the output.
|
||||
*
|
||||
* Each element (mask element or body) of the input LWE ciphertext vector 1 is
|
||||
* added to the corresponding element in the input LWE ciphertext 2. The result
|
||||
* is stored in the output LWE ciphertext vector. The two input LWE ciphertext
|
||||
* vectors are left unchanged. This function is a wrapper to a device function
|
||||
* that performs the operation on the GPU.
|
||||
*/
|
||||
void cuda_add_lwe_ciphertext_vector_64(void *v_stream, uint32_t gpu_index,
|
||||
void *lwe_array_out,
|
||||
void *lwe_array_in_1,
|
||||
@@ -24,6 +55,10 @@ void cuda_add_lwe_ciphertext_vector_64(void *v_stream, uint32_t gpu_index,
|
||||
static_cast<uint64_t *>(lwe_array_in_2), input_lwe_dimension,
|
||||
input_lwe_ciphertext_count);
|
||||
}
|
||||
/*
|
||||
* Perform the addition of a u32 input LWE ciphertext vector with a u32
|
||||
* plaintext vector. See the equivalent operation on u64 data for more details.
|
||||
*/
|
||||
void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
|
||||
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
|
||||
void *plaintext_array_in, uint32_t input_lwe_dimension,
|
||||
@@ -35,6 +70,34 @@ void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
|
||||
static_cast<uint32_t *>(plaintext_array_in),
|
||||
input_lwe_dimension, input_lwe_ciphertext_count);
|
||||
}
|
||||
/*
|
||||
* Perform the addition of a u64 input LWE ciphertext vector with a u64 input
|
||||
* plaintext vector.
|
||||
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
|
||||
* launch
|
||||
* - `gpu_index` is the index of the GPU to be used in the kernel launch
|
||||
* - `lwe_array_out` is an array of size
|
||||
* `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have
|
||||
* been allocated on the GPU before calling this function, and that will hold
|
||||
* the result of the computation.
|
||||
* - `lwe_array_in` is the LWE ciphertext vector used as input, it should have
|
||||
* been allocated and initialized before calling this function. It has the same
|
||||
* size as the output array.
|
||||
* - `plaintext_array_in` is the plaintext vector used as input, it should have
|
||||
* been allocated and initialized before calling this function. It should be of
|
||||
* size `input_lwe_ciphertext_count`.
|
||||
* - `input_lwe_dimension` is the number of mask elements in the input and
|
||||
* output LWE ciphertext vectors
|
||||
* - `input_lwe_ciphertext_count` is the number of ciphertexts contained in the
|
||||
* input LWE ciphertext vector, as well as in the output. It is also the number
|
||||
* of plaintexts in the input plaintext vector.
|
||||
*
|
||||
* Each plaintext of the input plaintext vector is added to the body of the
|
||||
* corresponding LWE ciphertext in the LWE ciphertext vector. The result of the
|
||||
* operation is stored in the output LWE ciphertext vector. The two input
|
||||
* vectors are unchanged. This function is a wrapper to a device function that
|
||||
* performs the operation on the GPU.
|
||||
*/
|
||||
void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
|
||||
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
|
||||
void *plaintext_array_in, uint32_t input_lwe_dimension,
|
||||
|
||||
@@ -1,5 +1,8 @@
|
||||
#include "bit_extraction.cuh"
|
||||
|
||||
/* Perform bit extract on a batch of 32 bit LWE ciphertexts.
|
||||
* See the corresponding function on 64 bit LWE ciphertexts for more details.
|
||||
*/
|
||||
void cuda_extract_bits_32(
|
||||
void *v_stream, uint32_t gpu_index, void *list_lwe_array_out,
|
||||
void *lwe_array_in, void *lwe_array_in_buffer,
|
||||
@@ -97,6 +100,51 @@ void cuda_extract_bits_32(
|
||||
}
|
||||
}
|
||||
|
||||
/* Perform bit extract on a batch of 64 bit lwe ciphertexts.
|
||||
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
|
||||
* launch
|
||||
* - `gpu_index` is the index of the GPU to be used in the kernel launch
|
||||
* - 'number_of_bits' will be extracted from each ciphertext
|
||||
* starting at the bit number 'delta_log' (0-indexed) included.
|
||||
* Output bits are ordered from the MSB to LSB. Every extracted bit is
|
||||
* represented as an LWE ciphertext, containing the encryption of the bit scaled
|
||||
* by q/2.
|
||||
* - 'list_lwe_array_out' output batch LWE ciphertexts for each bit of every
|
||||
* input ciphertext
|
||||
* - 'lwe_array_in' batch of input LWE ciphertexts, with size -
|
||||
* ('lwe_dimension_in' + 1) * number_of_samples * sizeof(u64)
|
||||
* The following 5 parameters are used during calculations, they are not actual
|
||||
* inputs of the function they are just allocated memory for calculation
|
||||
* process, like this, memory can be allocated once and can be used as much
|
||||
* as needed for different calls of extract_bit function.
|
||||
* - 'lwe_array_in_buffer' same size as 'lwe_array_in'
|
||||
* - 'lwe_array_in_shifted_buffer' same size as 'lwe_array_in'
|
||||
* - 'lwe_array_out_ks_buffer' with size:
|
||||
* ('lwe_dimension_out' + 1) * number_of_samples * sizeof(u64)
|
||||
* - 'lwe_array_out_pbs_buffer' same size as 'lwe_array_in'
|
||||
* - 'lut_pbs' with size:
|
||||
* (glwe_dimension + 1) * (lwe_dimension_in + 1) * sizeof(u64)
|
||||
* The other inputs are:
|
||||
* - 'lut_vector_indexes' stores the index corresponding to which test
|
||||
* vector to use
|
||||
* - 'ksk' keyswitch key
|
||||
* - 'fourier_bsk' complex compressed bsk in fourier domain
|
||||
* - 'lwe_dimension_in' input LWE ciphertext dimension, supported input
|
||||
* dimensions are: {512, 1024,2048, 4096, 8192}
|
||||
* - 'lwe_dimension_out' output LWE ciphertext dimension
|
||||
* - 'glwe_dimension' GLWE dimension, only glwe_dimension = 1 is supported
|
||||
* for now
|
||||
* - 'base_log_bsk' base_log for bootstrapping
|
||||
* - 'level_count_bsk' decomposition level count for bootstrapping
|
||||
* - 'base_log_ksk' base_log for keyswitch
|
||||
* - 'level_count_ksk' decomposition level for keyswitch
|
||||
* - 'number_of_samples' number of input LWE ciphertexts
|
||||
* - 'max_shared_memory' maximum amount of shared memory to be used inside
|
||||
* device functions
|
||||
*
|
||||
* This function will call corresponding template of wrapper host function which
|
||||
* will manage the calls of device functions.
|
||||
*/
|
||||
void cuda_extract_bits_64(
|
||||
void *v_stream, uint32_t gpu_index, void *list_lwe_array_out,
|
||||
void *lwe_array_in, void *lwe_array_in_buffer,
|
||||
|
||||
@@ -11,10 +11,12 @@
|
||||
#include "polynomial/parameters.cuh"
|
||||
#include "utils/timer.cuh"
|
||||
|
||||
// only works for big lwe for ks+bs case
|
||||
// state_lwe_buffer is copied from big lwe input
|
||||
// shifted_lwe_buffer is scalar multiplication of lwe input
|
||||
// blockIdx.x refers to input ciphertext id
|
||||
/*
|
||||
* Function copies batch lwe input to two different buffers,
|
||||
* one is shifted by value
|
||||
* one is copied without any modification
|
||||
* works for ciphertexts with sizes supported by params::degree
|
||||
*/
|
||||
template <typename Torus, class params>
|
||||
__global__ void copy_and_shift_lwe(Torus *dst_copy, Torus *dst_shift,
|
||||
Torus *src, Torus value) {
|
||||
@@ -37,8 +39,10 @@ __global__ void copy_and_shift_lwe(Torus *dst_copy, Torus *dst_shift,
|
||||
}
|
||||
}
|
||||
|
||||
// only works for small lwe in ks+bs case
|
||||
// function copies lwe when length is not a power of two
|
||||
/*
|
||||
* Function copies batch of lwe to lwe when size is not supported by
|
||||
* params::degree
|
||||
*/
|
||||
template <typename Torus>
|
||||
__global__ void copy_small_lwe(Torus *dst, Torus *src, uint32_t small_lwe_size,
|
||||
uint32_t number_of_bits, uint32_t lwe_id) {
|
||||
@@ -62,24 +66,27 @@ __global__ void copy_small_lwe(Torus *dst, Torus *src, uint32_t small_lwe_size,
|
||||
cur_dst[tid] = cur_src[tid];
|
||||
}
|
||||
|
||||
// only used in extract bits for one ciphertext
|
||||
// should be called with one block and one thread
|
||||
// NOTE: check if putting this functionality in copy_small_lwe or
|
||||
// fill_pbs_lut vector is faster
|
||||
/*
|
||||
* Function used to wrapping add value on the body of ciphertexts,
|
||||
* should be called with blocksize.x = 1;
|
||||
* blickIdx.x refers id of ciphertext
|
||||
* NOTE: check if putting thi functionality in copy_small_lwe or fill_pbs_lut
|
||||
* is faster
|
||||
*/
|
||||
template <typename Torus>
|
||||
__global__ void add_to_body(Torus *lwe, size_t lwe_dimension, Torus value) {
|
||||
lwe[blockIdx.x * (lwe_dimension + 1) + lwe_dimension] += value;
|
||||
}
|
||||
|
||||
// Add alpha where alpha = delta*2^{bit_idx-1} to end up with an encryption of 0
|
||||
// if the extracted bit was 0 and 1 in the other case
|
||||
//
|
||||
// Remove the extracted bit from the state LWE to get a 0 at the extracted bit
|
||||
// location.
|
||||
//
|
||||
// Shift on padding bit for next iteration, that's why
|
||||
// alpha= 1ll << (ciphertext_n_bits - delta_log - bit_idx - 2) is used
|
||||
// instead of alpha= 1ll << (ciphertext_n_bits - delta_log - bit_idx - 1)
|
||||
/*
|
||||
* Add alpha where alpha = delta*2^{bit_idx-1} to end up with an encryption of 0
|
||||
* if the extracted bit was 0 and 1 in the other case
|
||||
* Remove the extracted bit from the state LWE to get a 0 at the extracted bit
|
||||
* location.
|
||||
* Shift on padding bit for next iteration, that's why
|
||||
* alpha= 1ll << (ciphertext_n_bits - delta_log - bit_idx - 2) is used
|
||||
* instead of alpha= 1ll << (ciphertext_n_bits - delta_log - bit_idx - 1)
|
||||
*/
|
||||
template <typename Torus, class params>
|
||||
__global__ void add_sub_and_mul_lwe(Torus *shifted_lwe, Torus *state_lwe,
|
||||
Torus *pbs_lwe_array_out, Torus add_value,
|
||||
@@ -104,9 +111,11 @@ __global__ void add_sub_and_mul_lwe(Torus *shifted_lwe, Torus *state_lwe,
|
||||
}
|
||||
}
|
||||
|
||||
// Fill lut(only body) for the current bit (equivalent to trivial encryption as
|
||||
// mask is 0s)
|
||||
// The LUT is filled with value
|
||||
/*
|
||||
* Fill lut(only body) for the current bit, equivalent to trivial encryption as
|
||||
* msk is 0s
|
||||
* blockIdx.x refers id of lut vector
|
||||
*/
|
||||
template <typename Torus, class params>
|
||||
__global__ void fill_lut_body_for_current_bit(Torus *lut, Torus value) {
|
||||
|
||||
@@ -119,6 +128,11 @@ __global__ void fill_lut_body_for_current_bit(Torus *lut, Torus value) {
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Host function for cuda extract bits.
|
||||
* it executes device functions in specific order and manages
|
||||
* parallelism
|
||||
*/
|
||||
template <typename Torus, class params>
|
||||
__host__ void host_extract_bits(
|
||||
void *v_stream, uint32_t gpu_index, Torus *list_lwe_array_out,
|
||||
@@ -137,6 +151,7 @@ __host__ void host_extract_bits(
|
||||
int blocks = 1;
|
||||
int threads = params::degree / params::opt;
|
||||
|
||||
// shift lwe on padding bit and copy in new buffer
|
||||
copy_and_shift_lwe<Torus, params><<<blocks, threads, 0, *stream>>>(
|
||||
lwe_array_in_buffer, lwe_array_in_shifted_buffer, lwe_array_in,
|
||||
1ll << (ciphertext_n_bits - delta_log - 1));
|
||||
@@ -157,11 +172,15 @@ __host__ void host_extract_bits(
|
||||
break;
|
||||
}
|
||||
|
||||
// Add q/4 to center the error while computing a negacyclic LUT
|
||||
add_to_body<Torus><<<1, 1, 0, *stream>>>(lwe_array_out_ks_buffer,
|
||||
lwe_dimension_out,
|
||||
1ll << (ciphertext_n_bits - 2));
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
|
||||
// Fill lut for the current bit (equivalent to trivial encryption as mask is
|
||||
// 0s) The LUT is filled with -alpha in each coefficient where alpha =
|
||||
// delta*2^{bit_idx-1}
|
||||
fill_lut_body_for_current_bit<Torus, params>
|
||||
<<<blocks, threads, 0, *stream>>>(
|
||||
lut_pbs, 0ll - 1ll << (delta_log - 1 + bit_idx));
|
||||
@@ -173,6 +192,8 @@ __host__ void host_extract_bits(
|
||||
lwe_dimension_out, lwe_dimension_in, base_log_bsk, level_count_bsk,
|
||||
number_of_samples, 1, max_shared_memory);
|
||||
|
||||
// Add alpha where alpha = delta*2^{bit_idx-1} to end up with an encryption
|
||||
// of 0 if the extracted bit was 0 and 1 in the other case
|
||||
add_sub_and_mul_lwe<Torus, params><<<1, threads, 0, *stream>>>(
|
||||
lwe_array_in_shifted_buffer, lwe_array_in_buffer,
|
||||
lwe_array_out_pbs_buffer, 1ll << (delta_log - 1 + bit_idx),
|
||||
|
||||
@@ -1,60 +1,7 @@
|
||||
#include "bootstrap_amortized.cuh"
|
||||
|
||||
/* Perform bootstrapping on a batch of input LWE ciphertexts
|
||||
*
|
||||
* - lwe_array_out: output batch of num_samples bootstrapped ciphertexts c =
|
||||
* (a0,..an-1,b) where n is the LWE dimension
|
||||
* - lut_vector: should hold as many test vectors of size polynomial_size
|
||||
* as there are input ciphertexts, but actually holds
|
||||
* num_lut_vectors vectors to reduce memory usage
|
||||
* - lut_vector_indexes: stores the index corresponding to
|
||||
* which test vector to use for each sample in
|
||||
* lut_vector
|
||||
* - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n
|
||||
* mask values + 1 body value
|
||||
* - bootstrapping_key: RGSW encryption of the LWE secret key sk1
|
||||
* under secret key sk2
|
||||
* bsk = Z + sk1 H
|
||||
* where H is the gadget matrix and Z is a matrix (k+1).l
|
||||
* containing GLWE encryptions of 0 under sk2.
|
||||
* bsk is thus a tensor of size (k+1)^2.l.N.n
|
||||
* where l is the number of decomposition levels and
|
||||
* k is the GLWE dimension, N is the polynomial size for
|
||||
* GLWE. The polynomial size for GLWE and the test vector
|
||||
* are the same because they have to be in the same ring
|
||||
* to be multiplied.
|
||||
* Note: it is necessary to generate (k+1).k.l.N.n
|
||||
* uniformly random coefficients for the zero encryptions
|
||||
* - input_lwe_dimension: size of the Torus vector used to encrypt the input
|
||||
* LWE ciphertexts - referred to as n above (~ 600)
|
||||
* - polynomial_size: size of the test polynomial (test vector) and size of the
|
||||
* GLWE polynomial (~1024)
|
||||
* - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
|
||||
* - level_count: number of decomposition levels in the gadget matrix (~4)
|
||||
* - num_samples: number of encrypted input messages
|
||||
* - num_lut_vectors: parameter to set the actual number of test vectors to be
|
||||
* used
|
||||
* - q: number of bytes in the integer representation (32 or 64)
|
||||
*
|
||||
* This function calls a wrapper to a device kernel that performs the
|
||||
* bootstrapping:
|
||||
* - the kernel is templatized based on integer discretization and
|
||||
* polynomial degree
|
||||
* - num_samples blocks of threads are launched, where each thread is going
|
||||
* to handle one or more polynomial coefficients at each stage:
|
||||
* - perform the blind rotation
|
||||
* - round the result
|
||||
* - decompose into level_count levels, then for each level:
|
||||
* - switch to the FFT domain
|
||||
* - multiply with the bootstrapping key
|
||||
* - come back to the coefficients representation
|
||||
* - between each stage a synchronization of the threads is necessary
|
||||
* - in case the device has enough shared memory, temporary arrays used for
|
||||
* the different stages (accumulators) are stored into the shared memory
|
||||
* - the accumulators serve to combine the results for all decomposition
|
||||
* levels
|
||||
* - the constant memory (64K) is used for storing the roots of identity
|
||||
* values for the FFT
|
||||
/* Perform the programmable bootstrapping on a batch of input u32 LWE
|
||||
* ciphertexts. See the corresponding operation on 64 bits for more details.
|
||||
*/
|
||||
|
||||
void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
@@ -115,6 +62,71 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
}
|
||||
}
|
||||
|
||||
/* Perform the programmable bootstrapping on a batch of input u64 LWE
|
||||
* ciphertexts. This functions performs best for large numbers of inputs (> 10).
|
||||
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
|
||||
* launch
|
||||
* - `gpu_index` is the index of the GPU to be used in the kernel launch
|
||||
* - lwe_array_out: output batch of num_samples bootstrapped ciphertexts c =
|
||||
* (a0,..an-1,b) where n is the LWE dimension
|
||||
* - lut_vector: should hold as many test vectors of size polynomial_size
|
||||
* as there are input ciphertexts, but actually holds
|
||||
* num_lut_vectors vectors to reduce memory usage
|
||||
* - lut_vector_indexes: stores the index corresponding to
|
||||
* which test vector of lut_vector to use for each LWE input in
|
||||
* lwe_array_in
|
||||
* - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n
|
||||
* mask values + 1 body value
|
||||
* - bootstrapping_key: GGSW encryption of the LWE secret key sk1
|
||||
* under secret key sk2
|
||||
* bsk = Z + sk1 H
|
||||
* where H is the gadget matrix and Z is a matrix (k+1).l
|
||||
* containing GLWE encryptions of 0 under sk2.
|
||||
* bsk is thus a tensor of size (k+1)^2.l.N.n
|
||||
* where l is the number of decomposition levels and
|
||||
* k is the GLWE dimension, N is the polynomial size for
|
||||
* GLWE. The polynomial size for GLWE and the test vector
|
||||
* are the same because they have to be in the same ring
|
||||
* to be multiplied.
|
||||
* - input_lwe_dimension: size of the Torus vector used to encrypt the input
|
||||
* LWE ciphertexts - referred to as n above (~ 600)
|
||||
* - polynomial_size: size of the test polynomial (test vector) and size of the
|
||||
* GLWE polynomials (~1024) (where `size` refers to the polynomial degree + 1).
|
||||
* - base_log: log of the base used for the gadget matrix - B = 2^base_log (~8)
|
||||
* - level_count: number of decomposition levels in the gadget matrix (~4)
|
||||
* - num_samples: number of encrypted input messages
|
||||
* - num_lut_vectors: parameter to set the actual number of test vectors to be
|
||||
* used
|
||||
* - lwe_idx: the index of the LWE input to consider for the GPU of index
|
||||
* gpu_index. In case of multi-GPU computing, it is assumed that only a part of
|
||||
* the input LWE array is copied to each GPU, but the whole LUT array is copied
|
||||
* (because the case when the number of LUTs is smaller than the number of input
|
||||
* LWEs is not trivial to take into account in the data repartition on the
|
||||
* GPUs). `lwe_idx` is used to determine which LUT to consider for a given LWE
|
||||
* input in the LUT array `lut_vector`.
|
||||
* - 'max_shared_memory' maximum amount of shared memory to be used inside
|
||||
* device functions
|
||||
*
|
||||
* This function calls a wrapper to a device kernel that performs the
|
||||
* bootstrapping:
|
||||
* - the kernel is templatized based on integer discretization and
|
||||
* polynomial degree
|
||||
* - num_samples blocks of threads are launched, where each thread is going
|
||||
* to handle one or more polynomial coefficients at each stage:
|
||||
* - perform the blind rotation
|
||||
* - round the result
|
||||
* - decompose into level_count levels, then for each level:
|
||||
* - switch to the FFT domain
|
||||
* - multiply with the bootstrapping key
|
||||
* - come back to the coefficients representation
|
||||
* - between each stage a synchronization of the threads is necessary
|
||||
* - in case the device has enough shared memory, temporary arrays used for
|
||||
* the different stages (accumulators) are stored into the shared memory
|
||||
* - the accumulators serve to combine the results for all decomposition
|
||||
* levels
|
||||
* - the constant memory (64K) is used for storing the roots of identity
|
||||
* values for the FFT
|
||||
*/
|
||||
void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
|
||||
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector,
|
||||
void *lut_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
|
||||
|
||||
@@ -1,60 +1,11 @@
|
||||
#include "bootstrap_low_latency.cuh"
|
||||
|
||||
/* Perform bootstrapping on a batch of input LWE ciphertexts
|
||||
*
|
||||
* - lwe_array_out: output batch of num_samples bootstrapped ciphertexts c =
|
||||
* (a0,..an-1,b) where n is the LWE dimension
|
||||
* - lut_vector: should hold as many test vectors of size polynomial_size
|
||||
* as there are input ciphertexts, but actually holds
|
||||
* num_lut_vectors vectors to reduce memory usage
|
||||
* - lut_vector_indexes: stores the index corresponding to
|
||||
* which test vector to use for each sample in
|
||||
* lut_vector
|
||||
* - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n
|
||||
* mask values + 1 body value
|
||||
* - bootstrapping_key: RGSW encryption of the LWE secret key sk1
|
||||
* under secret key sk2
|
||||
* bsk = Z + sk1 H
|
||||
* where H is the gadget matrix and Z is a matrix (k+1).l
|
||||
* containing GLWE encryptions of 0 under sk2.
|
||||
* bsk is thus a tensor of size (k+1)^2.l.N.n
|
||||
* where l is the number of decomposition levels and
|
||||
* k is the GLWE dimension, N is the polynomial size for
|
||||
* GLWE. The polynomial size for GLWE and the test vector
|
||||
* are the same because they have to be in the same ring
|
||||
* to be multiplied.
|
||||
* Note: it is necessary to generate (k+1).k.l.N.n
|
||||
* uniformly random coefficients for the zero encryptions
|
||||
* - lwe_dimension: size of the Torus vector used to encrypt the input
|
||||
* LWE ciphertexts - referred to as n above (~ 600)
|
||||
* - polynomial_size: size of the test polynomial (test vector) and size of the
|
||||
* GLWE polynomial (~1024)
|
||||
* - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
|
||||
* - level_count: number of decomposition levels in the gadget matrix (~4)
|
||||
* - num_samples: number of encrypted input messages
|
||||
* - num_lut_vectors: parameter to set the actual number of test vectors to be
|
||||
* used
|
||||
* - q: number of bytes in the integer representation (32 or 64)
|
||||
*
|
||||
* This function calls a wrapper to a device kernel that performs the
|
||||
* bootstrapping:
|
||||
* - the kernel is templatized based on integer discretization and
|
||||
* polynomial degree
|
||||
* - num_samples blocks of threads are launched, where each thread is going
|
||||
* to handle one or more polynomial coefficients at each stage:
|
||||
* - perform the blind rotation
|
||||
* - round the result
|
||||
* - decompose into level_count levels, then for each level:
|
||||
* - switch to the FFT domain
|
||||
* - multiply with the bootstrapping key
|
||||
* - come back to the coefficients representation
|
||||
* - between each stage a synchronization of the threads is necessary
|
||||
* - in case the device has enough shared memory, temporary arrays used for
|
||||
* the different stages (accumulators) are stored into the shared memory
|
||||
* - the accumulators serve to combine the results for all decomposition
|
||||
* levels
|
||||
* - the constant memory (64K) is used for storing the roots of identity
|
||||
* values for the FFT
|
||||
/* Perform bootstrapping on a batch of input u32 LWE ciphertexts.
|
||||
* This function performs best for small numbers of inputs. Beyond a certain
|
||||
* number of inputs (the exact number depends on the cryptographic parameters),
|
||||
* the kernel cannot be launched and it is necessary to split the kernel call
|
||||
* into several calls on smaller batches of inputs. For more details on this
|
||||
* operation, head on to the equivalent u64 operation.
|
||||
*/
|
||||
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
|
||||
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector,
|
||||
@@ -124,6 +75,82 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
|
||||
}
|
||||
}
|
||||
|
||||
/* Perform bootstrapping on a batch of input u64 LWE ciphertexts.
|
||||
* This function performs best for small numbers of inputs. Beyond a certain
|
||||
* number of inputs (the exact number depends on the cryptographic parameters),
|
||||
* the kernel cannot be launched and it is necessary to split the kernel call
|
||||
* into several calls on smaller batches of inputs.
|
||||
*
|
||||
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
|
||||
* launch
|
||||
* - `gpu_index` is the index of the GPU to be used in the kernel launch
|
||||
* - lwe_array_out: output batch of num_samples bootstrapped ciphertexts c =
|
||||
* (a0,..an-1,b) where n is the LWE dimension
|
||||
* - lut_vector: should hold as many test vectors of size polynomial_size
|
||||
* as there are input ciphertexts, but actually holds
|
||||
* num_lut_vectors vectors to reduce memory usage
|
||||
* - lut_vector_indexes: stores the index corresponding to
|
||||
* which test vector to use for each sample in
|
||||
* lut_vector
|
||||
* - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n
|
||||
* mask values + 1 body value
|
||||
* - bootstrapping_key: GGSW encryption of the LWE secret key sk1
|
||||
* under secret key sk2
|
||||
* bsk = Z + sk1 H
|
||||
* where H is the gadget matrix and Z is a matrix (k+1).l
|
||||
* containing GLWE encryptions of 0 under sk2.
|
||||
* bsk is thus a tensor of size (k+1)^2.l.N.n
|
||||
* where l is the number of decomposition levels and
|
||||
* k is the GLWE dimension, N is the polynomial size for
|
||||
* GLWE. The polynomial size for GLWE and the test vector
|
||||
* are the same because they have to be in the same ring
|
||||
* to be multiplied.
|
||||
* - lwe_dimension: size of the Torus vector used to encrypt the input
|
||||
* LWE ciphertexts - referred to as n above (~ 600)
|
||||
* - glwe_dimension: size of the polynomial vector used to encrypt the LUT
|
||||
* GLWE ciphertexts - referred to as k above. Only the value 1 is supported for
|
||||
* this parameter.
|
||||
* - polynomial_size: size of the test polynomial (test vector) and size of the
|
||||
* GLWE polynomial (~1024)
|
||||
* - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
|
||||
* - level_count: number of decomposition levels in the gadget matrix (~4)
|
||||
* - num_samples: number of encrypted input messages
|
||||
* - num_lut_vectors: parameter to set the actual number of test vectors to be
|
||||
* used
|
||||
* - lwe_idx: the index of the LWE input to consider for the GPU of index
|
||||
* gpu_index. In case of multi-GPU computing, it is assumed that only a part of
|
||||
* the input LWE array is copied to each GPU, but the whole LUT array is copied
|
||||
* (because the case when the number of LUTs is smaller than the number of input
|
||||
* LWEs is not trivial to take into account in the data repartition on the
|
||||
* GPUs). `lwe_idx` is used to determine which LUT to consider for a given LWE
|
||||
* input in the LUT array `lut_vector`.
|
||||
* - 'max_shared_memory' maximum amount of shared memory to be used inside
|
||||
* device functions
|
||||
*
|
||||
* This function calls a wrapper to a device kernel that performs the
|
||||
* bootstrapping:
|
||||
* - the kernel is templatized based on integer discretization and
|
||||
* polynomial degree
|
||||
* - num_samples * level_count * (glwe_dimension + 1) blocks of threads are
|
||||
* launched, where each thread is going to handle one or more polynomial
|
||||
* coefficients at each stage, for a given level of decomposition, either for
|
||||
* the LUT mask or its body:
|
||||
* - perform the blind rotation
|
||||
* - round the result
|
||||
* - get the decomposition for the current level
|
||||
* - switch to the FFT domain
|
||||
* - multiply with the bootstrapping key
|
||||
* - come back to the coefficients representation
|
||||
* - between each stage a synchronization of the threads is necessary (some
|
||||
* synchronizations happen at the block level, some happen between blocks, using
|
||||
* cooperative groups).
|
||||
* - in case the device has enough shared memory, temporary arrays used for
|
||||
* the different stages (accumulators) are stored into the shared memory
|
||||
* - the accumulators serve to combine the results for all decomposition
|
||||
* levels
|
||||
* - the constant memory (64K) is used for storing the roots of identity
|
||||
* values for the FFT
|
||||
*/
|
||||
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
|
||||
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector,
|
||||
void *lut_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
|
||||
|
||||
@@ -1,5 +1,9 @@
|
||||
#include "circuit_bootstrap.cuh"
|
||||
|
||||
/*
|
||||
* Perform circuit bootstrapping for the batch of 32 bit LWE ciphertexts.
|
||||
* Head out to the equivalent operation on 64 bits for more details.
|
||||
*/
|
||||
void cuda_circuit_bootstrap_32(
|
||||
void *v_stream, uint32_t gpu_index, void *ggsw_out, void *lwe_array_in,
|
||||
void *fourier_bsk, void *fp_ksk_array, void *lwe_array_in_shifted_buffer,
|
||||
@@ -88,6 +92,42 @@ void cuda_circuit_bootstrap_32(
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Perform circuit bootstrapping on a batch of 64 bit input LWE ciphertexts.
|
||||
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
|
||||
* launch
|
||||
* - `gpu_index` is the index of the GPU to be used in the kernel launch
|
||||
* - 'ggsw_out' output batch of ggsw with size:
|
||||
* 'number_of_samples' * 'level_cbs' * ('glwe_dimension' + 1)^2 *
|
||||
* polynomial_size * sizeof(u64)
|
||||
* - 'lwe_array_in' input batch of lwe ciphertexts, with size:
|
||||
* 'number_of_samples' * '(lwe_dimension' + 1) * sizeof(u64)
|
||||
* - 'fourier_bsk' bootstrapping key in fourier domain with size:
|
||||
* 'lwe_dimension' * 'level_bsk' * ('glwe_dimension' + 1)^2 *
|
||||
* 'polynomial_size' / 2 * sizeof(double2)
|
||||
* - 'fp_ksk_array' batch of fp-keyswitch keys with size:
|
||||
* ('polynomial_size' + 1) * 'level_pksk' * ('glwe_dimension' + 1)^2 *
|
||||
* 'polynomial_size' * sizeof(u64)
|
||||
* The following 5 parameters are used during calculations, they are not actual
|
||||
* inputs of the function, they are just allocated memory for calculation
|
||||
* process, like this, memory can be allocated once and can be used as much
|
||||
* as needed for different calls of circuit_bootstrap function
|
||||
* - 'lwe_array_in_shifted_buffer' with size:
|
||||
* 'number_of_samples' * 'level_cbs' * ('lwe_dimension' + 1) * sizeof(u64)
|
||||
* - 'lut_vector' with size:
|
||||
* 'level_cbs' * ('glwe_dimension' + 1) * 'polynomial_size' * sizeof(u64)
|
||||
* - 'lut_vector_indexes' stores the index corresponding to which test
|
||||
* vector to use
|
||||
* - 'lwe_array_out_pbs_buffer' with size
|
||||
* 'number_of_samples' * 'level_cbs' * ('polynomial_size' + 1) * sizeof(u64)
|
||||
* - 'lwe_array_in_fp_ks_buffer' with size
|
||||
* 'number_of_samples' * 'level_cbs' * ('glwe_dimension' + 1) *
|
||||
* ('polynomial_size' + 1) * sizeof(u64)
|
||||
*
|
||||
* This function calls a wrapper to a device kernel that performs the
|
||||
* circuit bootstrap. The kernel is templatized based on integer discretization
|
||||
* and polynomial degree.
|
||||
*/
|
||||
void cuda_circuit_bootstrap_64(
|
||||
void *v_stream, uint32_t gpu_index, void *ggsw_out, void *lwe_array_in,
|
||||
void *fourier_bsk, void *fp_ksk_array, void *lwe_array_in_shifted_buffer,
|
||||
|
||||
@@ -10,9 +10,11 @@
|
||||
#include "polynomial/parameters.cuh"
|
||||
#include "utils/timer.cuh"
|
||||
|
||||
// works for lwe with generic sizes
|
||||
// shifted_lwe_buffer is scalar multiplication of lwe input
|
||||
// blockIdx.x refers to input ciphertext id
|
||||
/*
|
||||
* scalar multiplication to value for batch of lwe_ciphertext
|
||||
* works for any size of lwe input
|
||||
* blockIdx.x refers to input ciphertext it
|
||||
*/
|
||||
template <typename Torus, class params>
|
||||
__global__ void shift_lwe_cbs(Torus *dst_shift, Torus *src, Torus value,
|
||||
size_t lwe_size) {
|
||||
@@ -35,9 +37,14 @@ __global__ void shift_lwe_cbs(Torus *dst_shift, Torus *src, Torus value,
|
||||
cur_dst[tid] = cur_src[tid] * value;
|
||||
}
|
||||
|
||||
// Fill lut (equivalent to trivial encryption as mask is 0s)
|
||||
// The LUT is filled with -alpha in each coefficient where
|
||||
// alpha = 2^{log(q) - 1 - base_log * level}
|
||||
/*
|
||||
* Fill lut, equivalent to trivial encryption as mask is 0s.
|
||||
* The LUT is filled with -alpha in each coefficient where
|
||||
* alpha = 2^{log(q) - 1 - base_log * level}
|
||||
* blockIdx.x refers to lut id
|
||||
* value is not passed and calculated inside function because lut id is one
|
||||
* of the variable.
|
||||
*/
|
||||
template <typename Torus, class params>
|
||||
__global__ void fill_lut_body_for_cbs(Torus *lut, uint32_t ciphertext_n_bits,
|
||||
uint32_t base_log_cbs) {
|
||||
@@ -55,6 +62,19 @@ __global__ void fill_lut_body_for_cbs(Torus *lut, uint32_t ciphertext_n_bits,
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* copy pbs result (glwe_dimension + 1) times to be an input of fp-ks
|
||||
* each of the input ciphertext from lwe_src is copied (glwe_dimension + 1)
|
||||
* times inside lwe_dst, and then value is added to the body.
|
||||
* blockIdx.x refers to destination lwe ciphertext id: 'dst_lwe_id'
|
||||
* 'src_lwe_id' = 'dst_lwe_id' / (glwe_dimension + 1)
|
||||
*
|
||||
* example: glwe_dimension = 1
|
||||
* src_0 ... src_n
|
||||
* / \ / \
|
||||
* / \ / \
|
||||
* dst_0 dst_1 dst_2n dst_2n+1
|
||||
*/
|
||||
template <typename Torus, class params>
|
||||
__global__ void copy_add_lwe_cbs(Torus *lwe_dst, Torus *lwe_src,
|
||||
uint32_t ciphertext_n_bits,
|
||||
@@ -77,6 +97,11 @@ __global__ void copy_add_lwe_cbs(Torus *lwe_dst, Torus *lwe_src,
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Host function for cuda circuit bootstrap.
|
||||
* It executes device functions in specific order and manages
|
||||
* parallelism
|
||||
*/
|
||||
template <typename Torus, class params>
|
||||
__host__ void host_circuit_bootstrap(
|
||||
void *v_stream, uint32_t gpu_index, Torus *ggsw_out, Torus *lwe_array_in,
|
||||
|
||||
@@ -4,15 +4,8 @@
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
/* Perform keyswitch on a batch of input LWE ciphertexts for 32 bits
|
||||
*
|
||||
* - lwe_array_out: output batch of num_samples keyswitched ciphertexts c =
|
||||
* (a0,..an-1,b) where n is the LWE dimension
|
||||
* - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n
|
||||
* mask values + 1 body value
|
||||
*
|
||||
* This function calls a wrapper to a device kernel that performs the keyswitch
|
||||
* - num_samples blocks of threads are launched
|
||||
/* Perform keyswitch on a batch of 32 bits input LWE ciphertexts.
|
||||
* Head out to the equivalent operation on 64 bits for more details.
|
||||
*/
|
||||
void cuda_keyswitch_lwe_ciphertext_vector_32(
|
||||
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
|
||||
@@ -24,12 +17,18 @@ void cuda_keyswitch_lwe_ciphertext_vector_32(
|
||||
lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples);
|
||||
}
|
||||
|
||||
/* Perform keyswitch on a batch of input LWE ciphertexts for 64 bits
|
||||
/* Perform keyswitch on a batch of 64 bits input LWE ciphertexts.
|
||||
*
|
||||
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
|
||||
* launch
|
||||
* - `gpu_index` is the index of the GPU to be used in the kernel launch
|
||||
* - lwe_array_out: output batch of num_samples keyswitched ciphertexts c =
|
||||
* (a0,..an-1,b) where n is the LWE dimension
|
||||
* - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n
|
||||
* mask values + 1 body value
|
||||
* (a0,..an-1,b) where n is the output LWE dimension (lwe_dimension_out)
|
||||
* - lwe_array_in: input batch of num_samples LWE ciphertexts, containing
|
||||
* lwe_dimension_in mask values + 1 body value
|
||||
* - ksk: the keyswitch key to be used in the operation
|
||||
* - base log: the log of the base used in the decomposition (should be the one
|
||||
* used to create the ksk)
|
||||
*
|
||||
* This function calls a wrapper to a device kernel that performs the keyswitch
|
||||
* - num_samples blocks of threads are launched
|
||||
@@ -44,6 +43,9 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
|
||||
lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples);
|
||||
}
|
||||
|
||||
/* Perform functional packing keyswitch on a batch of 32 bits input LWE
|
||||
* ciphertexts. See the equivalent function on 64 bit inputs for more details.
|
||||
*/
|
||||
void cuda_fp_keyswitch_lwe_to_glwe_32(void *v_stream, void *glwe_array_out,
|
||||
void *lwe_array_in, void *fp_ksk_array,
|
||||
uint32_t input_lwe_dimension,
|
||||
@@ -60,6 +62,27 @@ void cuda_fp_keyswitch_lwe_to_glwe_32(void *v_stream, void *glwe_array_out,
|
||||
output_glwe_dimension, output_polynomial_size, base_log, level_count,
|
||||
number_of_input_lwe, number_of_keys);
|
||||
}
|
||||
|
||||
/* Perform functional packing keyswitch on a batch of 64 bits input LWE
|
||||
* ciphertexts.
|
||||
*
|
||||
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
|
||||
* launch
|
||||
* - `glwe_array_out`: output batch of keyswitched ciphertexts
|
||||
* - `lwe_array_in`: input batch of num_samples LWE ciphertexts, containing
|
||||
* lwe_dimension_in mask values + 1 body value
|
||||
* - `fp_ksk_array`: the functional packing keyswitch keys to be used in the
|
||||
* operation
|
||||
* - `base log`: the log of the base used in the decomposition (should be the
|
||||
* one used to create the ksk)
|
||||
* - `level_count`: the number of levels used in the decomposition (should be
|
||||
* the one used to create the fp_ksks).
|
||||
* - `number_of_input_lwe`: the number of inputs
|
||||
* - `number_of_keys`: the number of fp_ksks
|
||||
*
|
||||
* This function calls a wrapper to a device kernel that performs the functional
|
||||
* packing keyswitch.
|
||||
*/
|
||||
void cuda_fp_keyswitch_lwe_to_glwe_64(void *v_stream, void *glwe_array_out,
|
||||
void *lwe_array_in, void *fp_ksk_array,
|
||||
uint32_t input_lwe_dimension,
|
||||
|
||||
@@ -1,5 +1,9 @@
|
||||
#include "multiplication.cuh"
|
||||
|
||||
/*
|
||||
* Perform the multiplication of a u32 input LWE ciphertext vector with a u32
|
||||
* cleartext vector. See the equivalent operation on u64 data for more details.
|
||||
*/
|
||||
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
|
||||
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
|
||||
void *cleartext_array_in, uint32_t input_lwe_dimension,
|
||||
@@ -11,6 +15,34 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
|
||||
static_cast<uint32_t *>(cleartext_array_in), input_lwe_dimension,
|
||||
input_lwe_ciphertext_count);
|
||||
}
|
||||
/*
|
||||
* Perform the multiplication of a u64 input LWE ciphertext vector with a u64
|
||||
* input cleartext vector.
|
||||
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
|
||||
* launch
|
||||
* - `gpu_index` is the index of the GPU to be used in the kernel launch
|
||||
* - `lwe_array_out` is an array of size
|
||||
* `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have
|
||||
* been allocated on the GPU before calling this function, and that will hold
|
||||
* the result of the computation.
|
||||
* - `lwe_array_in` is the LWE ciphertext vector used as input, it should have
|
||||
* been allocated and initialized before calling this function. It has the same
|
||||
* size as the output array.
|
||||
* - `cleartext_array_in` is the cleartext vector used as input, it should have
|
||||
* been allocated and initialized before calling this function. It should be of
|
||||
* size `input_lwe_ciphertext_count`.
|
||||
* - `input_lwe_dimension` is the number of mask elements in the input and
|
||||
* output LWE ciphertext vectors
|
||||
* - `input_lwe_ciphertext_count` is the number of ciphertexts contained in the
|
||||
* input LWE ciphertext vector, as well as in the output. It is also the number
|
||||
* of cleartexts in the input cleartext vector.
|
||||
*
|
||||
* Each cleartext of the input cleartext vector is multiplied to the mask and
|
||||
* body of the corresponding LWE ciphertext in the LWE ciphertext vector. The
|
||||
* result of the operation is stored in the output LWE ciphertext vector. The
|
||||
* two input vectors are unchanged. This function is a wrapper to a device
|
||||
* function that performs the operation on the GPU.
|
||||
*/
|
||||
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
|
||||
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
|
||||
void *cleartext_array_in, uint32_t input_lwe_dimension,
|
||||
|
||||
@@ -1,5 +1,9 @@
|
||||
#include "negation.cuh"
|
||||
|
||||
/*
|
||||
* Perform the negation of a u32 input LWE ciphertext vector.
|
||||
* See the equivalent operation on u64 ciphertexts for more details.
|
||||
*/
|
||||
void cuda_negate_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index,
|
||||
void *lwe_array_out,
|
||||
void *lwe_array_in,
|
||||
@@ -10,6 +14,29 @@ void cuda_negate_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index,
|
||||
static_cast<uint32_t *>(lwe_array_in), input_lwe_dimension,
|
||||
input_lwe_ciphertext_count);
|
||||
}
|
||||
|
||||
/*
|
||||
* Perform the negation of a u64 input LWE ciphertext vector.
|
||||
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
|
||||
* launch
|
||||
* - `gpu_index` is the index of the GPU to be used in the kernel launch
|
||||
* - `lwe_array_out` is an array of size
|
||||
* `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have
|
||||
* been allocated on the GPU before calling this function, and that will hold
|
||||
* the result of the computation.
|
||||
* - `lwe_array_in` is the LWE ciphertext vector used as input, it should have
|
||||
* been allocated and initialized before calling this function. It has the same
|
||||
* size as the output array.
|
||||
* - `input_lwe_dimension` is the number of mask elements in the two input and
|
||||
* in the output ciphertext vectors
|
||||
* - `input_lwe_ciphertext_count` is the number of ciphertexts contained in each
|
||||
* input LWE ciphertext vector, as well as in the output.
|
||||
*
|
||||
* Each element (mask element or body) of the input LWE ciphertext vector is
|
||||
* negated. The result is stored in the output LWE ciphertext vector. The input
|
||||
* LWE ciphertext vector is left unchanged. This function is a wrapper to a
|
||||
* device function that performs the operation on the GPU.
|
||||
*/
|
||||
void cuda_negate_lwe_ciphertext_vector_64(void *v_stream, uint32_t gpu_index,
|
||||
void *lwe_array_out,
|
||||
void *lwe_array_in,
|
||||
|
||||
@@ -1,5 +1,9 @@
|
||||
#include "vertical_packing.cuh"
|
||||
|
||||
/*
|
||||
* Perform cmux tree on a batch of 32-bit input GGSW ciphertexts.
|
||||
* Check the equivalent function for 64-bit inputs for more details.
|
||||
*/
|
||||
void cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
|
||||
void *ggsw_in, void *lut_vector, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log,
|
||||
@@ -55,6 +59,33 @@ void cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Perform Cmux tree on a batch of 64-bit input GGSW ciphertexts
|
||||
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
|
||||
* launch
|
||||
* - `gpu_index` is the index of the GPU to be used in the kernel launch
|
||||
* - 'glwe_array_out' output batch of GLWE buffer for Cmux tree, 'tau' GLWE's
|
||||
* will be the output of the function
|
||||
* - 'ggsw_in' batch of input GGSW ciphertexts, function expects 'r' GGSW
|
||||
* ciphertexts as input.
|
||||
* - 'lut_vector' batch of test vectors (LUTs) there should be 2^r LUTs
|
||||
* inside 'lut_vector' parameter
|
||||
* - 'glwe_dimension' GLWE dimension, supported values: {1}
|
||||
* - 'polynomial_size' size of the test polynomial, supported values: {512,
|
||||
* 1024, 2048, 4096, 8192}
|
||||
* - 'base_log' base log parameter for cmux block
|
||||
* - 'level_count' decomposition level for cmux block
|
||||
* - 'r' number of input GGSW ciphertexts
|
||||
* - 'tau' number of input LWE ciphertext which were used to generate GGSW
|
||||
* ciphertexts stored in 'ggsw_in', it is also an amount of output GLWE
|
||||
* ciphertexts
|
||||
* - 'max_shared_memory' maximum shared memory amount to be used for cmux
|
||||
* kernel
|
||||
*
|
||||
* This function calls a wrapper to a device kernel that performs the
|
||||
* Cmux tree. The kernel is templatized based on integer discretization and
|
||||
* polynomial degree.
|
||||
*/
|
||||
void cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
|
||||
void *ggsw_in, void *lut_vector, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log,
|
||||
@@ -110,6 +141,29 @@ void cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Performs blind rotation on batch of 64-bit input ggsw ciphertexts
|
||||
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
|
||||
* launch
|
||||
* - `gpu_index` is the index of the GPU to be used in the kernel launch
|
||||
* - 'lwe_out' batch of output lwe ciphertexts, there should be 'tau'
|
||||
* ciphertexts inside 'lwe_out'
|
||||
* - 'ggsw_in' batch of input ggsw ciphertexts, function expects 'mbr_size'
|
||||
* ggsw ciphertexts inside 'ggsw_in'
|
||||
* - 'lut_vector' list of test vectors, function expects 'tau' test vectors
|
||||
* inside 'lut_vector' parameter
|
||||
* - 'glwe_dimension' glwe dimension, supported values : {1}
|
||||
* - 'polynomial_size' size of test polynomial supported sizes: {512, 1024,
|
||||
* 2048, 4096, 8192}
|
||||
* - 'base_log' base log parameter
|
||||
* - 'l_gadget' decomposition level
|
||||
* - 'max_shared_memory' maximum number of shared memory to be used in
|
||||
* device functions(kernels)
|
||||
*
|
||||
* This function calls a wrapper to a device kernel that performs the
|
||||
* blind rotation and sample extraction. The kernel is templatized based on
|
||||
* integer discretization and polynomial degree.
|
||||
*/
|
||||
void cuda_blind_rotate_and_sample_extraction_64(
|
||||
void *v_stream, uint32_t gpu_index, void *lwe_out, void *ggsw_in,
|
||||
void *lut_vector, uint32_t mbr_size, uint32_t tau, uint32_t glwe_dimension,
|
||||
|
||||
@@ -1,5 +1,32 @@
|
||||
#include "wop_bootstrap.cuh"
|
||||
|
||||
/*
|
||||
* Entry point for cuda circuit bootstrap + vertical packing for batches of
|
||||
* input 64 bit LWE ciphertexts.
|
||||
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
|
||||
* launch
|
||||
* - `gpu_index` is the index of the GPU to be used in the kernel launch
|
||||
* - 'lwe_array_out' list of output lwe ciphertexts
|
||||
* - 'lwe_array_in' list of input lwe_ciphertexts
|
||||
* - 'fourier_bsk' bootstrapping key in fourier domain, expected half size
|
||||
* compressed complex key.
|
||||
* - 'cbs_fpksk' list of private functional packing keyswitch keys
|
||||
* - 'lut_vector' list of test vectors
|
||||
* - 'polynomial_size' size of the test polynomial, supported sizes:
|
||||
* {512, 1024, 2048, 4096, 8192}
|
||||
* - 'glwe_dimension' supported dimensions: {1}
|
||||
* - 'lwe_dimension' dimension of input LWE ciphertexts
|
||||
* - 'level_count_bsk' decomposition level for bootstrapping
|
||||
* - 'base_log_bsk' base log parameter for bootstrapping
|
||||
* - 'level_count_pksk' decomposition level for fp-keyswitch
|
||||
* - 'base_log_pksk' base log parameter for fp-keyswitch
|
||||
* - 'level_count_cbs' level of circuit bootstrap
|
||||
* - 'base_log_cbs' base log parameter for circuit bootstrap
|
||||
* - 'number_of_inputs' number of input LWE ciphertexts
|
||||
* - 'max_shared_memory' maximum shared memory amount to be used in
|
||||
* bootstrapping kernel
|
||||
*
|
||||
*/
|
||||
void cuda_circuit_bootstrap_vertical_packing_64(
|
||||
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
|
||||
void *fourier_bsk, void *cbs_fpksk, void *lut_vector,
|
||||
@@ -77,6 +104,40 @@ void cuda_circuit_bootstrap_vertical_packing_64(
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Entry point for entire without padding programmable bootstrap on 64 bit input
|
||||
* LWE ciphertexts.
|
||||
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
|
||||
* launch
|
||||
* - `gpu_index` is the index of the GPU to be used in the kernel launch
|
||||
* - 'lwe_array_out' list of output lwe ciphertexts
|
||||
* - 'lwe_array_in' list of input lwe_ciphertexts
|
||||
* - 'lut_vector' list of test vectors
|
||||
* - 'fourier_bsk' bootstrapping key in fourier domain, expected half size
|
||||
* compressed complex key.
|
||||
* - 'ksk' keyswitch key to use inside extract bits block
|
||||
* - 'cbs_fpksk' list of fp-keyswitch keys
|
||||
* - 'glwe_dimension' supported dimensions: {1}
|
||||
* - 'lwe_dimension' dimension of input lwe ciphertexts
|
||||
* - 'polynomial_size' size of the test polynomial, supported sizes:
|
||||
* {512, 1024, 2048, 4096, 8192}
|
||||
* - 'base_log_bsk' base log parameter for bootstrapping
|
||||
* - 'level_count_bsk' decomposition level for bootstrapping
|
||||
* - 'base_log_ksk' base log parameter for keyswitch
|
||||
* - 'level_count_ksk' decomposition level for keyswitch
|
||||
* - 'base_log_pksk' base log parameter for fp-keyswitch
|
||||
* - 'level_count_pksk' decomposition level for fp-keyswitch
|
||||
* - 'base_log_cbs' base log parameter for circuit bootstrap
|
||||
* - 'level_count_cbs' level of circuit bootstrap
|
||||
* - 'number_of_bits_of_message_including_padding' number of bits to extract
|
||||
* from each input lwe ciphertext including padding bit
|
||||
* - 'number_of_bits_to_extract' number of bits to extract
|
||||
* from each input lwe ciphertext without padding bit
|
||||
* - 'number_of_inputs' number of input lwe ciphertexts
|
||||
* - 'max_shared_memory' maximum shared memory amount to be used in
|
||||
* bootstrapping kernel
|
||||
*
|
||||
*/
|
||||
void cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
void *lwe_array_in, void *lut_vector, void *fourier_bsk,
|
||||
void *ksk, void *cbs_fpksk, uint32_t glwe_dimension,
|
||||
|
||||
Reference in New Issue
Block a user