docs(cuda): add Rust doc for all concrete-cuda entry points

Beka Barbakadze
2023-01-11 16:14:33 +04:00
committed by Agnès Leroy
parent 8327cd7fff
commit bc90576454
13 changed files with 585 additions and 151 deletions

View File

@@ -1,5 +1,9 @@
#include "addition.cuh"
/*
* Perform the addition of two u32 input LWE ciphertext vectors.
* See the equivalent operation on u64 ciphertexts for more details.
*/
void cuda_add_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index,
void *lwe_array_out,
void *lwe_array_in_1,
@@ -12,6 +16,33 @@ void cuda_add_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index,
static_cast<uint32_t *>(lwe_array_in_2), input_lwe_dimension,
input_lwe_ciphertext_count);
}
/*
* Perform the addition of two u64 input LWE ciphertext vectors.
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - `lwe_array_out` is an array of size
* `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have
* been allocated on the GPU before calling this function, and that will hold
* the result of the computation.
* - `lwe_array_in_1` is the first LWE ciphertext vector used as input, it
* should have been allocated and initialized before calling this function. It
* has the same size as the output array.
* - `lwe_array_in_2` is the second LWE ciphertext vector used as input, it
* should have been allocated and initialized before calling this function. It
* has the same size as the output array.
* - `input_lwe_dimension` is the number of mask elements in the two input and
* in the output ciphertext vectors
* - `input_lwe_ciphertext_count` is the number of ciphertexts contained in each
* input LWE ciphertext vector, as well as in the output.
*
* Each element (mask element or body) of the input LWE ciphertext vector 1 is
* added to the corresponding element in the input LWE ciphertext 2. The result
* is stored in the output LWE ciphertext vector. The two input LWE ciphertext
* vectors are left unchanged. This function is a wrapper to a device function
* that performs the operation on the GPU.
*/
void cuda_add_lwe_ciphertext_vector_64(void *v_stream, uint32_t gpu_index,
void *lwe_array_out,
void *lwe_array_in_1,
@@ -24,6 +55,10 @@ void cuda_add_lwe_ciphertext_vector_64(void *v_stream, uint32_t gpu_index,
static_cast<uint64_t *>(lwe_array_in_2), input_lwe_dimension,
input_lwe_ciphertext_count);
}
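/*
 * Illustrative usage sketch (hypothetical host code, not a library function):
 * drives the u64 entry point above, assuming `v_stream` points to a
 * `cudaStream_t` as the stream description suggests. Buffer contents and
 * parameter values are placeholders.
 */
void example_add_u64(cudaStream_t *stream, uint32_t gpu_index) {
  uint32_t input_lwe_dimension = 600;
  uint32_t input_lwe_ciphertext_count = 100;
  size_t bytes = (size_t)(input_lwe_dimension + 1) *
                 input_lwe_ciphertext_count * sizeof(uint64_t);
  void *lwe_array_out, *lwe_array_in_1, *lwe_array_in_2;
  cudaSetDevice(gpu_index);
  cudaMalloc(&lwe_array_out, bytes);
  cudaMalloc(&lwe_array_in_1, bytes); // must be filled with ciphertext data
  cudaMalloc(&lwe_array_in_2, bytes); // before the call
  cuda_add_lwe_ciphertext_vector_64(stream, gpu_index, lwe_array_out,
                                    lwe_array_in_1, lwe_array_in_2,
                                    input_lwe_dimension,
                                    input_lwe_ciphertext_count);
}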
/*
* Perform the addition of a u32 input LWE ciphertext vector with a u32
* plaintext vector. See the equivalent operation on u64 data for more details.
*/
void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void *plaintext_array_in, uint32_t input_lwe_dimension,
@@ -35,6 +70,34 @@ void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
static_cast<uint32_t *>(plaintext_array_in),
input_lwe_dimension, input_lwe_ciphertext_count);
}
/*
* Perform the addition of a u64 input LWE ciphertext vector with a u64 input
* plaintext vector.
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - `lwe_array_out` is an array of size
* `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have
* been allocated on the GPU before calling this function, and that will hold
* the result of the computation.
* - `lwe_array_in` is the LWE ciphertext vector used as input, it should have
* been allocated and initialized before calling this function. It has the same
* size as the output array.
* - `plaintext_array_in` is the plaintext vector used as input, it should have
* been allocated and initialized before calling this function. It should be of
* size `input_lwe_ciphertext_count`.
* - `input_lwe_dimension` is the number of mask elements in the input and
* output LWE ciphertext vectors
* - `input_lwe_ciphertext_count` is the number of ciphertexts contained in the
* input LWE ciphertext vector, as well as in the output. It is also the number
* of plaintexts in the input plaintext vector.
*
* Each plaintext of the input plaintext vector is added to the body of the
* corresponding LWE ciphertext in the LWE ciphertext vector. The result of the
* operation is stored in the output LWE ciphertext vector. The two input
* vectors are unchanged. This function is a wrapper to a device function that
* performs the operation on the GPU.
*/
void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void *plaintext_array_in, uint32_t input_lwe_dimension,
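/*
 * Reference sketch (hypothetical CPU-side code) restating the semantics
 * documented above: only the body (last element) of each ciphertext receives
 * the corresponding plaintext.
 */
void reference_add_plaintext_u64(uint64_t *out, const uint64_t *in,
                                 const uint64_t *plaintexts,
                                 uint32_t lwe_dimension, uint32_t count) {
  uint32_t lwe_size = lwe_dimension + 1;
  for (uint32_t i = 0; i < count; i++) {
    for (uint32_t j = 0; j < lwe_size; j++)
      out[i * lwe_size + j] = in[i * lwe_size + j];
    // wrapping addition on the body only; the mask is left untouched
    out[i * lwe_size + lwe_dimension] += plaintexts[i];
  }
}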

View File

@@ -1,5 +1,8 @@
#include "bit_extraction.cuh"
/* Perform bit extraction on a batch of 32 bit LWE ciphertexts.
* See the corresponding function on 64 bit LWE ciphertexts for more details.
*/
void cuda_extract_bits_32(
void *v_stream, uint32_t gpu_index, void *list_lwe_array_out,
void *lwe_array_in, void *lwe_array_in_buffer,
@@ -97,6 +100,51 @@ void cuda_extract_bits_32(
}
}
/* Perform bit extraction on a batch of 64 bit LWE ciphertexts.
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
 * - 'number_of_bits' number of bits to extract from each ciphertext,
 * starting at bit index 'delta_log' (0-indexed), inclusive.
* Output bits are ordered from the MSB to LSB. Every extracted bit is
* represented as an LWE ciphertext, containing the encryption of the bit scaled
* by q/2.
* - 'list_lwe_array_out' output batch LWE ciphertexts for each bit of every
* input ciphertext
 * - 'lwe_array_in' batch of input LWE ciphertexts, with size:
 * ('lwe_dimension_in' + 1) * number_of_samples * sizeof(u64)
 * The following 5 parameters are scratch buffers: they are not actual
 * inputs of the function, just memory allocated for the computation. This
 * way, memory can be allocated once and reused across different calls of
 * the extract bits function.
* - 'lwe_array_in_buffer' same size as 'lwe_array_in'
* - 'lwe_array_in_shifted_buffer' same size as 'lwe_array_in'
* - 'lwe_array_out_ks_buffer' with size:
* ('lwe_dimension_out' + 1) * number_of_samples * sizeof(u64)
* - 'lwe_array_out_pbs_buffer' same size as 'lwe_array_in'
* - 'lut_pbs' with size:
* (glwe_dimension + 1) * (lwe_dimension_in + 1) * sizeof(u64)
* The other inputs are:
* - 'lut_vector_indexes' stores the index corresponding to which test
* vector to use
* - 'ksk' keyswitch key
* - 'fourier_bsk' complex compressed bsk in fourier domain
 * - 'lwe_dimension_in' input LWE ciphertext dimension, supported input
 * dimensions are: {512, 1024, 2048, 4096, 8192}
* - 'lwe_dimension_out' output LWE ciphertext dimension
* - 'glwe_dimension' GLWE dimension, only glwe_dimension = 1 is supported
* for now
* - 'base_log_bsk' base_log for bootstrapping
* - 'level_count_bsk' decomposition level count for bootstrapping
* - 'base_log_ksk' base_log for keyswitch
* - 'level_count_ksk' decomposition level for keyswitch
* - 'number_of_samples' number of input LWE ciphertexts
* - 'max_shared_memory' maximum amount of shared memory to be used inside
* device functions
*
 * This function calls the corresponding template instantiation of the wrapper
 * host function, which manages the calls to the device functions.
*/
void cuda_extract_bits_64(
void *v_stream, uint32_t gpu_index, void *list_lwe_array_out,
void *lwe_array_in, void *lwe_array_in_buffer,
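/*
 * Hedged sizing sketch (hypothetical helper) for the buffers described above,
 * u64 case. The output size assumes one keyswitched LWE ciphertext per
 * extracted bit per sample; that matches the description of
 * 'list_lwe_array_out' but is not spelled out explicitly.
 */
size_t extract_bits_output_bytes(uint32_t lwe_dimension_out,
                                 uint32_t number_of_bits,
                                 uint32_t number_of_samples) {
  // Scratch buffers, per the doc comment: lwe_array_in_buffer,
  // lwe_array_in_shifted_buffer and lwe_array_out_pbs_buffer each take
  // (lwe_dimension_in + 1) * number_of_samples * sizeof(u64); the ks buffer
  // takes (lwe_dimension_out + 1) * number_of_samples * sizeof(u64).
  return (size_t)(lwe_dimension_out + 1) * number_of_bits *
         number_of_samples * sizeof(uint64_t); // assumption, see above
}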

View File

@@ -11,10 +11,12 @@
#include "polynomial/parameters.cuh"
#include "utils/timer.cuh"
// only works for big lwe for ks+bs case
// state_lwe_buffer is copied from big lwe input
// shifted_lwe_buffer is scalar multiplication of lwe input
// blockIdx.x refers to input ciphertext id
/*
 * Copies a batch of LWE inputs into two buffers: one holds the input shifted
 * (multiplied) by value, the other an unmodified copy.
 * Works for ciphertexts whose size is supported by params::degree.
 */
template <typename Torus, class params>
__global__ void copy_and_shift_lwe(Torus *dst_copy, Torus *dst_shift,
Torus *src, Torus value) {
@@ -37,8 +39,10 @@ __global__ void copy_and_shift_lwe(Torus *dst_copy, Torus *dst_shift,
}
}
// only works for small lwe in ks+bs case
// function copies lwe when length is not a power of two
/*
 * Copies a batch of LWE ciphertexts when the LWE size is not supported by
 * params::degree.
 */
template <typename Torus>
__global__ void copy_small_lwe(Torus *dst, Torus *src, uint32_t small_lwe_size,
uint32_t number_of_bits, uint32_t lwe_id) {
@@ -62,24 +66,27 @@ __global__ void copy_small_lwe(Torus *dst, Torus *src, uint32_t small_lwe_size,
cur_dst[tid] = cur_src[tid];
}
// only used in extract bits for one ciphertext
// should be called with one block and one thread
// NOTE: check if putting this functionality in copy_small_lwe or
// fill_pbs_lut vector is faster
/*
 * Performs a wrapping addition of value onto the body of each ciphertext.
 * Should be called with blockDim.x = 1;
 * blockIdx.x refers to the id of the ciphertext.
 * NOTE: check if putting this functionality in copy_small_lwe or fill_pbs_lut
 * is faster
 */
template <typename Torus>
__global__ void add_to_body(Torus *lwe, size_t lwe_dimension, Torus value) {
lwe[blockIdx.x * (lwe_dimension + 1) + lwe_dimension] += value;
}
// Add alpha where alpha = delta*2^{bit_idx-1} to end up with an encryption of 0
// if the extracted bit was 0 and 1 in the other case
//
// Remove the extracted bit from the state LWE to get a 0 at the extracted bit
// location.
//
// Shift on padding bit for next iteration, that's why
// alpha= 1ll << (ciphertext_n_bits - delta_log - bit_idx - 2) is used
// instead of alpha= 1ll << (ciphertext_n_bits - delta_log - bit_idx - 1)
/*
 * Add alpha where alpha = delta*2^{bit_idx-1} to end up with an encryption of
 * 0 if the extracted bit was 0, and of 1 otherwise.
* Remove the extracted bit from the state LWE to get a 0 at the extracted bit
* location.
* Shift on padding bit for next iteration, that's why
* alpha= 1ll << (ciphertext_n_bits - delta_log - bit_idx - 2) is used
* instead of alpha= 1ll << (ciphertext_n_bits - delta_log - bit_idx - 1)
*/
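// Worked example (illustrative, with hypothetical parameter values): for
// 64-bit ciphertexts (ciphertext_n_bits = 64), delta_log = 60 and bit_idx = 0,
// the shift used is alpha = 1ll << (64 - 60 - 0 - 2) = 4 rather than
// 1ll << (64 - 60 - 0 - 1) = 8; the halved value accounts for the shift onto
// the padding bit for the next iteration.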
template <typename Torus, class params>
__global__ void add_sub_and_mul_lwe(Torus *shifted_lwe, Torus *state_lwe,
Torus *pbs_lwe_array_out, Torus add_value,
@@ -104,9 +111,11 @@ __global__ void add_sub_and_mul_lwe(Torus *shifted_lwe, Torus *state_lwe,
}
}
// Fill lut(only body) for the current bit (equivalent to trivial encryption as
// mask is 0s)
// The LUT is filled with value
/*
 * Fill the lut (body only) for the current bit, equivalent to a trivial
 * encryption as the mask is 0s.
 * blockIdx.x refers to the id of the lut vector.
 */
template <typename Torus, class params>
__global__ void fill_lut_body_for_current_bit(Torus *lut, Torus value) {
@@ -119,6 +128,11 @@ __global__ void fill_lut_body_for_current_bit(Torus *lut, Torus value) {
}
}
/*
* Host function for cuda extract bits.
 * It executes the device functions in a specific order and manages
 * parallelism.
*/
template <typename Torus, class params>
__host__ void host_extract_bits(
void *v_stream, uint32_t gpu_index, Torus *list_lwe_array_out,
@@ -137,6 +151,7 @@ __host__ void host_extract_bits(
int blocks = 1;
int threads = params::degree / params::opt;
// shift lwe on padding bit and copy in new buffer
copy_and_shift_lwe<Torus, params><<<blocks, threads, 0, *stream>>>(
lwe_array_in_buffer, lwe_array_in_shifted_buffer, lwe_array_in,
1ll << (ciphertext_n_bits - delta_log - 1));
@@ -157,11 +172,15 @@ __host__ void host_extract_bits(
break;
}
// Add q/4 to center the error while computing a negacyclic LUT
add_to_body<Torus><<<1, 1, 0, *stream>>>(lwe_array_out_ks_buffer,
lwe_dimension_out,
1ll << (ciphertext_n_bits - 2));
checkCudaErrors(cudaGetLastError());
// Fill lut for the current bit (equivalent to trivial encryption as mask is
// 0s). The LUT is filled with -alpha in each coefficient where alpha =
// delta*2^{bit_idx-1}
fill_lut_body_for_current_bit<Torus, params>
<<<blocks, threads, 0, *stream>>>(
lut_pbs, 0ll - 1ll << (delta_log - 1 + bit_idx));
@@ -173,6 +192,8 @@ __host__ void host_extract_bits(
lwe_dimension_out, lwe_dimension_in, base_log_bsk, level_count_bsk,
number_of_samples, 1, max_shared_memory);
// Add alpha where alpha = delta*2^{bit_idx-1} to end up with an encryption
// of 0 if the extracted bit was 0 and 1 in the other case
add_sub_and_mul_lwe<Torus, params><<<1, threads, 0, *stream>>>(
lwe_array_in_shifted_buffer, lwe_array_in_buffer,
lwe_array_out_pbs_buffer, 1ll << (delta_log - 1 + bit_idx),

View File

@@ -1,60 +1,7 @@
#include "bootstrap_amortized.cuh"
/* Perform bootstrapping on a batch of input LWE ciphertexts
*
* - lwe_array_out: output batch of num_samples bootstrapped ciphertexts c =
* (a0,..an-1,b) where n is the LWE dimension
* - lut_vector: should hold as many test vectors of size polynomial_size
* as there are input ciphertexts, but actually holds
* num_lut_vectors vectors to reduce memory usage
* - lut_vector_indexes: stores the index corresponding to
* which test vector to use for each sample in
* lut_vector
* - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n
* mask values + 1 body value
* - bootstrapping_key: RGSW encryption of the LWE secret key sk1
* under secret key sk2
* bsk = Z + sk1 H
* where H is the gadget matrix and Z is a matrix (k+1).l
* containing GLWE encryptions of 0 under sk2.
* bsk is thus a tensor of size (k+1)^2.l.N.n
* where l is the number of decomposition levels and
* k is the GLWE dimension, N is the polynomial size for
* GLWE. The polynomial size for GLWE and the test vector
* are the same because they have to be in the same ring
* to be multiplied.
* Note: it is necessary to generate (k+1).k.l.N.n
* uniformly random coefficients for the zero encryptions
* - input_lwe_dimension: size of the Torus vector used to encrypt the input
* LWE ciphertexts - referred to as n above (~ 600)
* - polynomial_size: size of the test polynomial (test vector) and size of the
* GLWE polynomial (~1024)
* - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
* - level_count: number of decomposition levels in the gadget matrix (~4)
* - num_samples: number of encrypted input messages
* - num_lut_vectors: parameter to set the actual number of test vectors to be
* used
* - q: number of bytes in the integer representation (32 or 64)
*
* This function calls a wrapper to a device kernel that performs the
* bootstrapping:
* - the kernel is templatized based on integer discretization and
* polynomial degree
* - num_samples blocks of threads are launched, where each thread is going
* to handle one or more polynomial coefficients at each stage:
* - perform the blind rotation
* - round the result
* - decompose into level_count levels, then for each level:
* - switch to the FFT domain
* - multiply with the bootstrapping key
* - come back to the coefficients representation
* - between each stage a synchronization of the threads is necessary
* - in case the device has enough shared memory, temporary arrays used for
* the different stages (accumulators) are stored into the shared memory
* - the accumulators serve to combine the results for all decomposition
* levels
* - the constant memory (64K) is used for storing the roots of identity
* values for the FFT
/* Perform the programmable bootstrapping on a batch of input u32 LWE
* ciphertexts. See the corresponding operation on 64 bits for more details.
*/
void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
@@ -115,6 +62,71 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
}
}
/* Perform the programmable bootstrapping on a batch of input u64 LWE
 * ciphertexts. This function performs best for large numbers of inputs (> 10).
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - lwe_array_out: output batch of num_samples bootstrapped ciphertexts c =
* (a0,..an-1,b) where n is the LWE dimension
* - lut_vector: should hold as many test vectors of size polynomial_size
* as there are input ciphertexts, but actually holds
* num_lut_vectors vectors to reduce memory usage
* - lut_vector_indexes: stores the index corresponding to
* which test vector of lut_vector to use for each LWE input in
* lwe_array_in
* - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n
* mask values + 1 body value
* - bootstrapping_key: GGSW encryption of the LWE secret key sk1
* under secret key sk2
* bsk = Z + sk1 H
* where H is the gadget matrix and Z is a matrix (k+1).l
* containing GLWE encryptions of 0 under sk2.
* bsk is thus a tensor of size (k+1)^2.l.N.n
* where l is the number of decomposition levels and
* k is the GLWE dimension, N is the polynomial size for
* GLWE. The polynomial size for GLWE and the test vector
* are the same because they have to be in the same ring
* to be multiplied.
* - input_lwe_dimension: size of the Torus vector used to encrypt the input
* LWE ciphertexts - referred to as n above (~ 600)
* - polynomial_size: size of the test polynomial (test vector) and size of the
* GLWE polynomials (~1024) (where `size` refers to the polynomial degree + 1).
* - base_log: log of the base used for the gadget matrix - B = 2^base_log (~8)
* - level_count: number of decomposition levels in the gadget matrix (~4)
* - num_samples: number of encrypted input messages
* - num_lut_vectors: parameter to set the actual number of test vectors to be
* used
* - lwe_idx: the index of the LWE input to consider for the GPU of index
* gpu_index. In case of multi-GPU computing, it is assumed that only a part of
* the input LWE array is copied to each GPU, but the whole LUT array is copied
* (because the case when the number of LUTs is smaller than the number of input
* LWEs is not trivial to take into account in the data repartition on the
* GPUs). `lwe_idx` is used to determine which LUT to consider for a given LWE
* input in the LUT array `lut_vector`.
* - 'max_shared_memory' maximum amount of shared memory to be used inside
* device functions
*
* This function calls a wrapper to a device kernel that performs the
* bootstrapping:
* - the kernel is templatized based on integer discretization and
* polynomial degree
* - num_samples blocks of threads are launched, where each thread is going
* to handle one or more polynomial coefficients at each stage:
* - perform the blind rotation
* - round the result
* - decompose into level_count levels, then for each level:
* - switch to the FFT domain
* - multiply with the bootstrapping key
* - come back to the coefficients representation
* - between each stage a synchronization of the threads is necessary
* - in case the device has enough shared memory, temporary arrays used for
* the different stages (accumulators) are stored into the shared memory
* - the accumulators serve to combine the results for all decomposition
* levels
* - the constant memory (64K) is used for storing the roots of identity
* values for the FFT
*/
void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector,
void *lut_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
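/*
 * Hedged multi-GPU sketch (hypothetical host code) of the slicing that
 * `lwe_idx` supports, per the description above. All names and values are
 * placeholders.
 */
void example_multi_gpu_split(uint32_t num_gpus, uint32_t num_samples) {
  uint32_t samples_per_gpu = num_samples / num_gpus; // assume it divides evenly
  for (uint32_t g = 0; g < num_gpus; g++) {
    uint32_t lwe_idx = g * samples_per_gpu; // first input handled by GPU g
    // Copy the g-th slice of the input LWE array and the *whole* LUT array to
    // GPU g, then call cuda_bootstrap_amortized_lwe_ciphertext_vector_64 with
    // num_samples = samples_per_gpu and this lwe_idx.
  }
}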

View File

@@ -1,60 +1,11 @@
#include "bootstrap_low_latency.cuh"
/* Perform bootstrapping on a batch of input LWE ciphertexts
*
* - lwe_array_out: output batch of num_samples bootstrapped ciphertexts c =
* (a0,..an-1,b) where n is the LWE dimension
* - lut_vector: should hold as many test vectors of size polynomial_size
* as there are input ciphertexts, but actually holds
* num_lut_vectors vectors to reduce memory usage
* - lut_vector_indexes: stores the index corresponding to
* which test vector to use for each sample in
* lut_vector
* - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n
* mask values + 1 body value
* - bootstrapping_key: RGSW encryption of the LWE secret key sk1
* under secret key sk2
* bsk = Z + sk1 H
* where H is the gadget matrix and Z is a matrix (k+1).l
* containing GLWE encryptions of 0 under sk2.
* bsk is thus a tensor of size (k+1)^2.l.N.n
* where l is the number of decomposition levels and
* k is the GLWE dimension, N is the polynomial size for
* GLWE. The polynomial size for GLWE and the test vector
* are the same because they have to be in the same ring
* to be multiplied.
* Note: it is necessary to generate (k+1).k.l.N.n
* uniformly random coefficients for the zero encryptions
* - lwe_dimension: size of the Torus vector used to encrypt the input
* LWE ciphertexts - referred to as n above (~ 600)
* - polynomial_size: size of the test polynomial (test vector) and size of the
* GLWE polynomial (~1024)
* - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
* - level_count: number of decomposition levels in the gadget matrix (~4)
* - num_samples: number of encrypted input messages
* - num_lut_vectors: parameter to set the actual number of test vectors to be
* used
* - q: number of bytes in the integer representation (32 or 64)
*
* This function calls a wrapper to a device kernel that performs the
* bootstrapping:
* - the kernel is templatized based on integer discretization and
* polynomial degree
* - num_samples blocks of threads are launched, where each thread is going
* to handle one or more polynomial coefficients at each stage:
* - perform the blind rotation
* - round the result
* - decompose into level_count levels, then for each level:
* - switch to the FFT domain
* - multiply with the bootstrapping key
* - come back to the coefficients representation
* - between each stage a synchronization of the threads is necessary
* - in case the device has enough shared memory, temporary arrays used for
* the different stages (accumulators) are stored into the shared memory
* - the accumulators serve to combine the results for all decomposition
* levels
* - the constant memory (64K) is used for storing the roots of identity
* values for the FFT
/* Perform bootstrapping on a batch of input u32 LWE ciphertexts.
* This function performs best for small numbers of inputs. Beyond a certain
* number of inputs (the exact number depends on the cryptographic parameters),
* the kernel cannot be launched and it is necessary to split the kernel call
* into several calls on smaller batches of inputs. For more details on this
 * operation, see the equivalent u64 operation.
*/
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector,
@@ -124,6 +75,82 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
}
}
/* Perform bootstrapping on a batch of input u64 LWE ciphertexts.
* This function performs best for small numbers of inputs. Beyond a certain
* number of inputs (the exact number depends on the cryptographic parameters),
* the kernel cannot be launched and it is necessary to split the kernel call
* into several calls on smaller batches of inputs.
*
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - lwe_array_out: output batch of num_samples bootstrapped ciphertexts c =
* (a0,..an-1,b) where n is the LWE dimension
* - lut_vector: should hold as many test vectors of size polynomial_size
* as there are input ciphertexts, but actually holds
* num_lut_vectors vectors to reduce memory usage
* - lut_vector_indexes: stores the index corresponding to
* which test vector to use for each sample in
* lut_vector
* - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n
* mask values + 1 body value
* - bootstrapping_key: GGSW encryption of the LWE secret key sk1
* under secret key sk2
* bsk = Z + sk1 H
* where H is the gadget matrix and Z is a matrix (k+1).l
* containing GLWE encryptions of 0 under sk2.
* bsk is thus a tensor of size (k+1)^2.l.N.n
* where l is the number of decomposition levels and
* k is the GLWE dimension, N is the polynomial size for
* GLWE. The polynomial size for GLWE and the test vector
* are the same because they have to be in the same ring
* to be multiplied.
* - lwe_dimension: size of the Torus vector used to encrypt the input
* LWE ciphertexts - referred to as n above (~ 600)
* - glwe_dimension: size of the polynomial vector used to encrypt the LUT
* GLWE ciphertexts - referred to as k above. Only the value 1 is supported for
* this parameter.
* - polynomial_size: size of the test polynomial (test vector) and size of the
* GLWE polynomial (~1024)
* - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
* - level_count: number of decomposition levels in the gadget matrix (~4)
* - num_samples: number of encrypted input messages
* - num_lut_vectors: parameter to set the actual number of test vectors to be
* used
* - lwe_idx: the index of the LWE input to consider for the GPU of index
* gpu_index. In case of multi-GPU computing, it is assumed that only a part of
* the input LWE array is copied to each GPU, but the whole LUT array is copied
* (because the case when the number of LUTs is smaller than the number of input
* LWEs is not trivial to take into account in the data repartition on the
* GPUs). `lwe_idx` is used to determine which LUT to consider for a given LWE
* input in the LUT array `lut_vector`.
* - 'max_shared_memory' maximum amount of shared memory to be used inside
* device functions
*
* This function calls a wrapper to a device kernel that performs the
* bootstrapping:
* - the kernel is templatized based on integer discretization and
* polynomial degree
* - num_samples * level_count * (glwe_dimension + 1) blocks of threads are
* launched, where each thread is going to handle one or more polynomial
* coefficients at each stage, for a given level of decomposition, either for
* the LUT mask or its body:
* - perform the blind rotation
* - round the result
* - get the decomposition for the current level
* - switch to the FFT domain
* - multiply with the bootstrapping key
* - come back to the coefficients representation
* - between each stage a synchronization of the threads is necessary (some
* synchronizations happen at the block level, some happen between blocks, using
* cooperative groups).
* - in case the device has enough shared memory, temporary arrays used for
* the different stages (accumulators) are stored into the shared memory
* - the accumulators serve to combine the results for all decomposition
* levels
* - the constant memory (64K) is used for storing the roots of identity
* values for the FFT
*/
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector,
void *lut_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
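/*
 * Illustrative launch-size arithmetic (hypothetical helper) for the splitting
 * constraint above; the example values are placeholders.
 */
uint32_t low_latency_pbs_blocks(uint32_t num_samples, uint32_t level_count,
                                uint32_t glwe_dimension) {
  // e.g. num_samples = 128, level_count = 2, glwe_dimension = 1 gives
  // 128 * 2 * 2 = 512 blocks; a batch whose block count exceeds what the
  // device can co-schedule must be split into several smaller calls.
  return num_samples * level_count * (glwe_dimension + 1);
}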

View File

@@ -1,5 +1,9 @@
#include "circuit_bootstrap.cuh"
/*
* Perform circuit bootstrapping for the batch of 32 bit LWE ciphertexts.
 * See the equivalent operation on 64 bits for more details.
*/
void cuda_circuit_bootstrap_32(
void *v_stream, uint32_t gpu_index, void *ggsw_out, void *lwe_array_in,
void *fourier_bsk, void *fp_ksk_array, void *lwe_array_in_shifted_buffer,
@@ -88,6 +92,42 @@ void cuda_circuit_bootstrap_32(
}
}
/*
* Perform circuit bootstrapping on a batch of 64 bit input LWE ciphertexts.
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - 'ggsw_out' output batch of ggsw with size:
* 'number_of_samples' * 'level_cbs' * ('glwe_dimension' + 1)^2 *
* polynomial_size * sizeof(u64)
* - 'lwe_array_in' input batch of lwe ciphertexts, with size:
 * 'number_of_samples' * ('lwe_dimension' + 1) * sizeof(u64)
* - 'fourier_bsk' bootstrapping key in fourier domain with size:
* 'lwe_dimension' * 'level_bsk' * ('glwe_dimension' + 1)^2 *
* 'polynomial_size' / 2 * sizeof(double2)
* - 'fp_ksk_array' batch of fp-keyswitch keys with size:
* ('polynomial_size' + 1) * 'level_pksk' * ('glwe_dimension' + 1)^2 *
* 'polynomial_size' * sizeof(u64)
 * The following 5 parameters are scratch buffers: they are not actual
 * inputs of the function, just memory allocated for the computation. This
 * way, memory can be allocated once and reused across different calls of
 * the circuit_bootstrap function.
* - 'lwe_array_in_shifted_buffer' with size:
* 'number_of_samples' * 'level_cbs' * ('lwe_dimension' + 1) * sizeof(u64)
* - 'lut_vector' with size:
* 'level_cbs' * ('glwe_dimension' + 1) * 'polynomial_size' * sizeof(u64)
* - 'lut_vector_indexes' stores the index corresponding to which test
* vector to use
 * - 'lwe_array_out_pbs_buffer' with size:
 * 'number_of_samples' * 'level_cbs' * ('polynomial_size' + 1) * sizeof(u64)
 * - 'lwe_array_in_fp_ks_buffer' with size:
* 'number_of_samples' * 'level_cbs' * ('glwe_dimension' + 1) *
* ('polynomial_size' + 1) * sizeof(u64)
*
* This function calls a wrapper to a device kernel that performs the
* circuit bootstrap. The kernel is templatized based on integer discretization
* and polynomial degree.
*/
void cuda_circuit_bootstrap_64(
void *v_stream, uint32_t gpu_index, void *ggsw_out, void *lwe_array_in,
void *fourier_bsk, void *fp_ksk_array, void *lwe_array_in_shifted_buffer,
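/*
 * Hedged sizing sketch (hypothetical helper) following the 'ggsw_out' size
 * formula in the comment above.
 */
size_t cbs_ggsw_out_bytes(uint32_t number_of_samples, uint32_t level_cbs,
                          uint32_t glwe_dimension, uint32_t polynomial_size) {
  return (size_t)number_of_samples * level_cbs * (glwe_dimension + 1) *
         (glwe_dimension + 1) * polynomial_size * sizeof(uint64_t);
}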

View File

@@ -10,9 +10,11 @@
#include "polynomial/parameters.cuh"
#include "utils/timer.cuh"
// works for lwe with generic sizes
// shifted_lwe_buffer is scalar multiplication of lwe input
// blockIdx.x refers to input ciphertext id
/*
 * Scalar multiplication by value for a batch of LWE ciphertexts.
 * Works for any size of LWE input.
 * blockIdx.x refers to the input ciphertext id.
 */
template <typename Torus, class params>
__global__ void shift_lwe_cbs(Torus *dst_shift, Torus *src, Torus value,
size_t lwe_size) {
@@ -35,9 +37,14 @@ __global__ void shift_lwe_cbs(Torus *dst_shift, Torus *src, Torus value,
cur_dst[tid] = cur_src[tid] * value;
}
// Fill lut (equivalent to trivial encryption as mask is 0s)
// The LUT is filled with -alpha in each coefficient where
// alpha = 2^{log(q) - 1 - base_log * level}
/*
 * Fill the lut, equivalent to a trivial encryption as the mask is 0s.
 * The LUT is filled with -alpha in each coefficient where
 * alpha = 2^{log(q) - 1 - base_log * level}
 * blockIdx.x refers to the lut id.
 * The value is not passed as a parameter but computed inside the function,
 * because it depends on the lut id.
 */
template <typename Torus, class params>
__global__ void fill_lut_body_for_cbs(Torus *lut, uint32_t ciphertext_n_bits,
uint32_t base_log_cbs) {
@@ -55,6 +62,19 @@ __global__ void fill_lut_body_for_cbs(Torus *lut, uint32_t ciphertext_n_bits,
}
}
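// Illustrative arithmetic (hypothetical values): on a 64-bit torus
// (log(q) = 64), with base_log_cbs = 10 and level = 1, each LUT coefficient
// is filled with -alpha where alpha = 2^(64 - 1 - 10 * 1) = 2^53.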
/*
 * Copy each PBS result (glwe_dimension + 1) times to serve as input of the
 * fp-ks: each input ciphertext from lwe_src is copied (glwe_dimension + 1)
 * times inside lwe_dst, and then value is added to the body.
 * blockIdx.x refers to the destination lwe ciphertext id: 'dst_lwe_id'
 * 'src_lwe_id' = 'dst_lwe_id' / (glwe_dimension + 1)
*
* example: glwe_dimension = 1
* src_0 ... src_n
* / \ / \
* / \ / \
* dst_0 dst_1 dst_2n dst_2n+1
*/
template <typename Torus, class params>
__global__ void copy_add_lwe_cbs(Torus *lwe_dst, Torus *lwe_src,
uint32_t ciphertext_n_bits,
@@ -77,6 +97,11 @@ __global__ void copy_add_lwe_cbs(Torus *lwe_dst, Torus *lwe_src,
}
}
/*
* Host function for cuda circuit bootstrap.
 * It executes the device functions in a specific order and manages
 * parallelism.
*/
template <typename Torus, class params>
__host__ void host_circuit_bootstrap(
void *v_stream, uint32_t gpu_index, Torus *ggsw_out, Torus *lwe_array_in,

View File

@@ -4,15 +4,8 @@
#include <cstdint>
/* Perform keyswitch on a batch of input LWE ciphertexts for 32 bits
*
* - lwe_array_out: output batch of num_samples keyswitched ciphertexts c =
* (a0,..an-1,b) where n is the LWE dimension
* - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n
* mask values + 1 body value
*
* This function calls a wrapper to a device kernel that performs the keyswitch
* - num_samples blocks of threads are launched
/* Perform keyswitch on a batch of 32 bit input LWE ciphertexts.
 * See the equivalent operation on 64 bits for more details.
*/
void cuda_keyswitch_lwe_ciphertext_vector_32(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
@@ -24,12 +17,18 @@ void cuda_keyswitch_lwe_ciphertext_vector_32(
lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples);
}
/* Perform keyswitch on a batch of input LWE ciphertexts for 64 bits
/* Perform keyswitch on a batch of 64 bit input LWE ciphertexts.
*
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - lwe_array_out: output batch of num_samples keyswitched ciphertexts c =
* (a0,..an-1,b) where n is the LWE dimension
* - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n
* mask values + 1 body value
* (a0,..an-1,b) where n is the output LWE dimension (lwe_dimension_out)
* - lwe_array_in: input batch of num_samples LWE ciphertexts, containing
* lwe_dimension_in mask values + 1 body value
* - ksk: the keyswitch key to be used in the operation
 * - base_log: the log of the base used in the decomposition (should be the one
* used to create the ksk)
*
* This function calls a wrapper to a device kernel that performs the keyswitch
* - num_samples blocks of threads are launched
@@ -44,6 +43,9 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples);
}
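/*
 * Hedged sizing note (hypothetical helpers): the input and output buffers use
 * different LWE dimensions, per the description above.
 */
size_t ks_in_bytes(uint32_t lwe_dimension_in, uint32_t num_samples) {
  return (size_t)(lwe_dimension_in + 1) * num_samples * sizeof(uint64_t);
}
size_t ks_out_bytes(uint32_t lwe_dimension_out, uint32_t num_samples) {
  return (size_t)(lwe_dimension_out + 1) * num_samples * sizeof(uint64_t);
}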
/* Perform functional packing keyswitch on a batch of 32 bit input LWE
* ciphertexts. See the equivalent function on 64 bit inputs for more details.
*/
void cuda_fp_keyswitch_lwe_to_glwe_32(void *v_stream, void *glwe_array_out,
void *lwe_array_in, void *fp_ksk_array,
uint32_t input_lwe_dimension,
@@ -60,6 +62,27 @@ void cuda_fp_keyswitch_lwe_to_glwe_32(void *v_stream, void *glwe_array_out,
output_glwe_dimension, output_polynomial_size, base_log, level_count,
number_of_input_lwe, number_of_keys);
}
/* Perform functional packing keyswitch on a batch of 64 bit input LWE
* ciphertexts.
*
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `glwe_array_out`: output batch of keyswitched ciphertexts
* - `lwe_array_in`: input batch of num_samples LWE ciphertexts, containing
* lwe_dimension_in mask values + 1 body value
* - `fp_ksk_array`: the functional packing keyswitch keys to be used in the
* operation
 * - `base_log`: the log of the base used in the decomposition (should be the
 * one used to create the fp_ksks)
* - `level_count`: the number of levels used in the decomposition (should be
* the one used to create the fp_ksks).
* - `number_of_input_lwe`: the number of inputs
* - `number_of_keys`: the number of fp_ksks
*
* This function calls a wrapper to a device kernel that performs the functional
* packing keyswitch.
*/
void cuda_fp_keyswitch_lwe_to_glwe_64(void *v_stream, void *glwe_array_out,
void *lwe_array_in, void *fp_ksk_array,
uint32_t input_lwe_dimension,

View File

@@ -1,5 +1,9 @@
#include "multiplication.cuh"
/*
* Perform the multiplication of a u32 input LWE ciphertext vector with a u32
* cleartext vector. See the equivalent operation on u64 data for more details.
*/
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void *cleartext_array_in, uint32_t input_lwe_dimension,
@@ -11,6 +15,34 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
static_cast<uint32_t *>(cleartext_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
}
/*
* Perform the multiplication of a u64 input LWE ciphertext vector with a u64
* input cleartext vector.
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - `lwe_array_out` is an array of size
* `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have
* been allocated on the GPU before calling this function, and that will hold
* the result of the computation.
* - `lwe_array_in` is the LWE ciphertext vector used as input, it should have
* been allocated and initialized before calling this function. It has the same
* size as the output array.
* - `cleartext_array_in` is the cleartext vector used as input, it should have
* been allocated and initialized before calling this function. It should be of
* size `input_lwe_ciphertext_count`.
* - `input_lwe_dimension` is the number of mask elements in the input and
* output LWE ciphertext vectors
* - `input_lwe_ciphertext_count` is the number of ciphertexts contained in the
* input LWE ciphertext vector, as well as in the output. It is also the number
* of cleartexts in the input cleartext vector.
*
* Each cleartext of the input cleartext vector is multiplied to the mask and
* body of the corresponding LWE ciphertext in the LWE ciphertext vector. The
* result of the operation is stored in the output LWE ciphertext vector. The
* two input vectors are unchanged. This function is a wrapper to a device
* function that performs the operation on the GPU.
*/
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void *cleartext_array_in, uint32_t input_lwe_dimension,
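/*
 * Reference sketch (hypothetical CPU-side code) restating the semantics
 * documented above: unlike plaintext addition, the cleartext multiplies every
 * element of the ciphertext, mask and body alike.
 */
void reference_mult_cleartext_u64(uint64_t *out, const uint64_t *in,
                                  const uint64_t *cleartexts,
                                  uint32_t lwe_dimension, uint32_t count) {
  uint32_t lwe_size = lwe_dimension + 1;
  for (uint32_t i = 0; i < count; i++)
    for (uint32_t j = 0; j < lwe_size; j++)
      out[i * lwe_size + j] = in[i * lwe_size + j] * cleartexts[i]; // wrapping
}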

View File

@@ -1,5 +1,9 @@
#include "negation.cuh"
/*
* Perform the negation of a u32 input LWE ciphertext vector.
* See the equivalent operation on u64 ciphertexts for more details.
*/
void cuda_negate_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index,
void *lwe_array_out,
void *lwe_array_in,
@@ -10,6 +14,29 @@ void cuda_negate_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index,
static_cast<uint32_t *>(lwe_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
}
/*
* Perform the negation of a u64 input LWE ciphertext vector.
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - `lwe_array_out` is an array of size
* `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have
* been allocated on the GPU before calling this function, and that will hold
* the result of the computation.
* - `lwe_array_in` is the LWE ciphertext vector used as input, it should have
* been allocated and initialized before calling this function. It has the same
* size as the output array.
* - `input_lwe_dimension` is the number of mask elements in the two input and
* in the output ciphertext vectors
* - `input_lwe_ciphertext_count` is the number of ciphertexts contained in each
* input LWE ciphertext vector, as well as in the output.
*
* Each element (mask element or body) of the input LWE ciphertext vector is
* negated. The result is stored in the output LWE ciphertext vector. The input
* LWE ciphertext vector is left unchanged. This function is a wrapper to a
* device function that performs the operation on the GPU.
*/
void cuda_negate_lwe_ciphertext_vector_64(void *v_stream, uint32_t gpu_index,
void *lwe_array_out,
void *lwe_array_in,
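/*
 * Reference sketch (hypothetical CPU-side code) restating the semantics
 * documented above; negation wraps on the u64 torus.
 */
void reference_negate_u64(uint64_t *out, const uint64_t *in,
                          uint32_t lwe_dimension, uint32_t count) {
  size_t n = (size_t)(lwe_dimension + 1) * count;
  for (size_t k = 0; k < n; k++)
    out[k] = (uint64_t)0 - in[k]; // every mask element and body is negated
}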

View File

@@ -1,5 +1,9 @@
#include "vertical_packing.cuh"
/*
* Perform cmux tree on a batch of 32-bit input GGSW ciphertexts.
* Check the equivalent function for 64-bit inputs for more details.
*/
void cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
void *ggsw_in, void *lut_vector, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log,
@@ -55,6 +59,33 @@ void cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
}
}
/*
* Perform Cmux tree on a batch of 64-bit input GGSW ciphertexts
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
 * - 'glwe_array_out' output batch of GLWE buffers for the Cmux tree; the
 * function outputs 'tau' GLWE ciphertexts
* - 'ggsw_in' batch of input GGSW ciphertexts, function expects 'r' GGSW
* ciphertexts as input.
 * - 'lut_vector' batch of test vectors (LUTs); there should be 2^r LUTs
 * inside the 'lut_vector' parameter
* - 'glwe_dimension' GLWE dimension, supported values: {1}
* - 'polynomial_size' size of the test polynomial, supported values: {512,
* 1024, 2048, 4096, 8192}
* - 'base_log' base log parameter for cmux block
* - 'level_count' decomposition level for cmux block
* - 'r' number of input GGSW ciphertexts
 * - 'tau' number of input LWE ciphertexts which were used to generate the
 * GGSW ciphertexts stored in 'ggsw_in'; it is also the number of output GLWE
 * ciphertexts
* - 'max_shared_memory' maximum shared memory amount to be used for cmux
* kernel
*
* This function calls a wrapper to a device kernel that performs the
* Cmux tree. The kernel is templatized based on integer discretization and
* polynomial degree.
*/
void cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
void *ggsw_in, void *lut_vector, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log,
@@ -110,6 +141,29 @@ void cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
}
}
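/*
 * Illustrative sizing (hypothetical helper): with 'r' input GGSW ciphertexts
 * the tree selects among 2^r LUTs, per the description above. The per-LUT
 * size below assumes one polynomial of polynomial_size coefficients per LUT,
 * which is an assumption, not a statement from the doc comment.
 */
size_t cmux_tree_lut_vector_bytes(uint32_t r, uint32_t polynomial_size) {
  return ((size_t)1 << r) * polynomial_size * sizeof(uint64_t);
}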
/*
 * Perform blind rotation and sample extraction on a batch of 64-bit input
 * GGSW ciphertexts.
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
 * - 'lwe_out' batch of output LWE ciphertexts; there should be 'tau'
 * ciphertexts inside 'lwe_out'
 * - 'ggsw_in' batch of input GGSW ciphertexts; the function expects 'mbr_size'
 * GGSW ciphertexts inside 'ggsw_in'
 * - 'lut_vector' list of test vectors; the function expects 'tau' test vectors
 * inside the 'lut_vector' parameter
 * - 'glwe_dimension' GLWE dimension, supported values: {1}
 * - 'polynomial_size' size of the test polynomial, supported sizes: {512,
 * 1024, 2048, 4096, 8192}
* - 'base_log' base log parameter
* - 'l_gadget' decomposition level
 * - 'max_shared_memory' maximum amount of shared memory to be used in
 * device functions (kernels)
*
* This function calls a wrapper to a device kernel that performs the
* blind rotation and sample extraction. The kernel is templatized based on
* integer discretization and polynomial degree.
*/
void cuda_blind_rotate_and_sample_extraction_64(
void *v_stream, uint32_t gpu_index, void *lwe_out, void *ggsw_in,
void *lut_vector, uint32_t mbr_size, uint32_t tau, uint32_t glwe_dimension,

View File

@@ -1,5 +1,32 @@
#include "wop_bootstrap.cuh"
/*
* Entry point for cuda circuit bootstrap + vertical packing for batches of
* input 64 bit LWE ciphertexts.
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
 * - 'lwe_array_out' list of output LWE ciphertexts
 * - 'lwe_array_in' list of input LWE ciphertexts
 * - 'fourier_bsk' bootstrapping key in the fourier domain, expected to be a
 * half-size compressed complex key.
* - 'cbs_fpksk' list of private functional packing keyswitch keys
* - 'lut_vector' list of test vectors
* - 'polynomial_size' size of the test polynomial, supported sizes:
* {512, 1024, 2048, 4096, 8192}
* - 'glwe_dimension' supported dimensions: {1}
* - 'lwe_dimension' dimension of input LWE ciphertexts
* - 'level_count_bsk' decomposition level for bootstrapping
* - 'base_log_bsk' base log parameter for bootstrapping
* - 'level_count_pksk' decomposition level for fp-keyswitch
* - 'base_log_pksk' base log parameter for fp-keyswitch
* - 'level_count_cbs' level of circuit bootstrap
* - 'base_log_cbs' base log parameter for circuit bootstrap
* - 'number_of_inputs' number of input LWE ciphertexts
* - 'max_shared_memory' maximum shared memory amount to be used in
* bootstrapping kernel
 */
void cuda_circuit_bootstrap_vertical_packing_64(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void *fourier_bsk, void *cbs_fpksk, void *lut_vector,
@@ -77,6 +104,40 @@ void cuda_circuit_bootstrap_vertical_packing_64(
}
}
/*
 * Entry point for the whole without-padding programmable bootstrap (wop PBS)
 * on 64 bit input LWE ciphertexts.
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
 * - 'lwe_array_out' list of output LWE ciphertexts
 * - 'lwe_array_in' list of input LWE ciphertexts
* - 'lut_vector' list of test vectors
 * - 'fourier_bsk' bootstrapping key in the fourier domain, expected to be a
 * half-size compressed complex key.
* - 'ksk' keyswitch key to use inside extract bits block
* - 'cbs_fpksk' list of fp-keyswitch keys
* - 'glwe_dimension' supported dimensions: {1}
* - 'lwe_dimension' dimension of input lwe ciphertexts
* - 'polynomial_size' size of the test polynomial, supported sizes:
* {512, 1024, 2048, 4096, 8192}
* - 'base_log_bsk' base log parameter for bootstrapping
* - 'level_count_bsk' decomposition level for bootstrapping
* - 'base_log_ksk' base log parameter for keyswitch
* - 'level_count_ksk' decomposition level for keyswitch
* - 'base_log_pksk' base log parameter for fp-keyswitch
* - 'level_count_pksk' decomposition level for fp-keyswitch
* - 'base_log_cbs' base log parameter for circuit bootstrap
* - 'level_count_cbs' level of circuit bootstrap
* - 'number_of_bits_of_message_including_padding' number of bits to extract
* from each input lwe ciphertext including padding bit
* - 'number_of_bits_to_extract' number of bits to extract
* from each input lwe ciphertext without padding bit
* - 'number_of_inputs' number of input lwe ciphertexts
* - 'max_shared_memory' maximum shared memory amount to be used in
* bootstrapping kernel
 */
void cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_array_in, void *lut_vector, void *fourier_bsk,
void *ksk, void *cbs_fpksk, uint32_t glwe_dimension,
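/*
 * Illustrative note (hypothetical values): for a 4-bit message plus the
 * padding bit, 'number_of_bits_of_message_including_padding' would be 5 and
 * 'number_of_bits_to_extract' would be 4, matching the two definitions above.
 */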