Mirror of https://github.com/zama-ai/tfhe-rs.git, synced 2026-01-09 14:47:56 -05:00
feat(gpu): implement 128-bit multi-bit PBS
@@ -66,6 +66,9 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
    uint32_t num_many_lut, uint32_t lut_stride);

template <typename Torus>
uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_128_keybundle(
    uint32_t polynomial_size);
template <typename Torus>
uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle(
    uint32_t polynomial_size);
template <typename Torus>
@@ -95,8 +98,12 @@ uint64_t get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(

template <typename Torus, class params>
uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
                            uint32_t polynomial_size);
                            uint32_t polynomial_size,
                            uint64_t full_sm_keybundle);
template <typename Torus, class params>
uint32_t get_lwe_chunk_size_128(uint32_t gpu_index, uint32_t max_num_pbs,
                                uint32_t polynomial_size,
                                uint64_t full_sm_keybundle);
template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
  int8_t *d_mem_keybundle = NULL;
  int8_t *d_mem_acc_step_one = NULL;
@@ -281,4 +288,146 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
  }
};

template <typename InputTorus>
struct pbs_buffer_128<InputTorus, PBS_TYPE::MULTI_BIT> {
  int8_t *d_mem_keybundle = NULL;
  int8_t *d_mem_acc_step_one = NULL;
  int8_t *d_mem_acc_step_two = NULL;
  int8_t *d_mem_acc_cg = NULL;
  int8_t *d_mem_acc_tbc = NULL;
  uint32_t lwe_chunk_size;
  double *keybundle_fft;
  __uint128_t *global_accumulator;
  double *global_join_buffer;

  PBS_VARIANT pbs_variant;
  bool gpu_memory_allocated;

  pbs_buffer_128(cudaStream_t stream, uint32_t gpu_index,
                 uint32_t glwe_dimension, uint32_t polynomial_size,
                 uint32_t level_count, uint32_t input_lwe_ciphertext_count,
                 uint32_t lwe_chunk_size, PBS_VARIANT pbs_variant,
                 bool allocate_gpu_memory, uint64_t *size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
    cuda_set_device(gpu_index);

    this->pbs_variant = pbs_variant;
    this->lwe_chunk_size = lwe_chunk_size;
    auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);

    // default
    uint64_t full_sm_keybundle =
        get_buffer_size_full_sm_multibit_programmable_bootstrap_128_keybundle<
            __uint128_t>(polynomial_size);
    uint64_t full_sm_accumulate_step_one =
        get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one<
            __uint128_t>(polynomial_size);
    uint64_t full_sm_accumulate_step_two =
        get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two<
            __uint128_t>(polynomial_size);
    uint64_t partial_sm_accumulate_step_one =
        get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one<
            __uint128_t>(polynomial_size);
    // cg
    uint64_t full_sm_cg_accumulate =
        get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<__uint128_t>(
            polynomial_size);
    uint64_t partial_sm_cg_accumulate =
        get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap<
            __uint128_t>(polynomial_size);

    auto num_blocks_keybundle = input_lwe_ciphertext_count * lwe_chunk_size *
                                (glwe_dimension + 1) * (glwe_dimension + 1) *
                                level_count;
    auto num_blocks_acc_step_one =
        level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count;
    auto num_blocks_acc_step_two =
        input_lwe_ciphertext_count * (glwe_dimension + 1);
    auto num_blocks_acc_cg =
        level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count;

    // Keybundle
    if (max_shared_memory < full_sm_keybundle)
      d_mem_keybundle = (int8_t *)cuda_malloc_with_size_tracking_async(
          num_blocks_keybundle * full_sm_keybundle, stream, gpu_index,
          size_tracker, allocate_gpu_memory);

    switch (pbs_variant) {
    case PBS_VARIANT::CG:
      // Accumulator CG
      if (max_shared_memory < partial_sm_cg_accumulate)
        d_mem_acc_cg = (int8_t *)cuda_malloc_with_size_tracking_async(
            num_blocks_acc_cg * full_sm_cg_accumulate, stream, gpu_index,
            size_tracker, allocate_gpu_memory);
      else if (max_shared_memory < full_sm_cg_accumulate)
        d_mem_acc_cg = (int8_t *)cuda_malloc_with_size_tracking_async(
            num_blocks_acc_cg * partial_sm_cg_accumulate, stream, gpu_index,
            size_tracker, allocate_gpu_memory);
      break;
    case PBS_VARIANT::DEFAULT:
      // Accumulator step one
      if (max_shared_memory < partial_sm_accumulate_step_one)
        d_mem_acc_step_one = (int8_t *)cuda_malloc_with_size_tracking_async(
            num_blocks_acc_step_one * full_sm_accumulate_step_one, stream,
            gpu_index, size_tracker, allocate_gpu_memory);
      else if (max_shared_memory < full_sm_accumulate_step_one)
        d_mem_acc_step_one = (int8_t *)cuda_malloc_with_size_tracking_async(
            num_blocks_acc_step_one * partial_sm_accumulate_step_one, stream,
            gpu_index, size_tracker, allocate_gpu_memory);

      // Accumulator step two
      if (max_shared_memory < full_sm_accumulate_step_two)
        d_mem_acc_step_two = (int8_t *)cuda_malloc_with_size_tracking_async(
            num_blocks_acc_step_two * full_sm_accumulate_step_two, stream,
            gpu_index, size_tracker, allocate_gpu_memory);
      break;
    default:
      PANIC("Cuda error (PBS): unsupported implementation variant.")
    }

    keybundle_fft = (double *)cuda_malloc_with_size_tracking_async(
        num_blocks_keybundle * (polynomial_size / 2) * 4 * sizeof(double),
        stream, gpu_index, size_tracker, allocate_gpu_memory);
    global_accumulator = (__uint128_t *)cuda_malloc_with_size_tracking_async(
        input_lwe_ciphertext_count * (glwe_dimension + 1) * polynomial_size *
            sizeof(__uint128_t),
        stream, gpu_index, size_tracker, allocate_gpu_memory);
    global_join_buffer = (double *)cuda_malloc_with_size_tracking_async(
        level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count *
            (polynomial_size / 2) * 4 * sizeof(double),
        stream, gpu_index, size_tracker, allocate_gpu_memory);
  }

  void release(cudaStream_t stream, uint32_t gpu_index) {

    if (d_mem_keybundle)
      cuda_drop_with_size_tracking_async(d_mem_keybundle, stream, gpu_index,
                                         gpu_memory_allocated);
    switch (pbs_variant) {
    case DEFAULT:
      if (d_mem_acc_step_one)
        cuda_drop_with_size_tracking_async(d_mem_acc_step_one, stream,
                                           gpu_index, gpu_memory_allocated);
      if (d_mem_acc_step_two)
        cuda_drop_with_size_tracking_async(d_mem_acc_step_two, stream,
                                           gpu_index, gpu_memory_allocated);
      break;
    case CG:
      if (d_mem_acc_cg)
        cuda_drop_with_size_tracking_async(d_mem_acc_cg, stream, gpu_index,
                                           gpu_memory_allocated);
      break;
    default:
      PANIC("Cuda error (PBS): unsupported implementation variant.")
    }

    cuda_drop_with_size_tracking_async(keybundle_fft, stream, gpu_index,
                                       gpu_memory_allocated);
    cuda_drop_with_size_tracking_async(global_accumulator, stream, gpu_index,
                                       gpu_memory_allocated);
    cuda_drop_with_size_tracking_async(global_join_buffer, stream, gpu_index,
                                       gpu_memory_allocated);
  }
};

#endif // CUDA_MULTI_BIT_UTILITIES_H
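Note (illustrative, not part of the diff): the three persistent device buffers allocated by the pbs_buffer_128 constructor above can be sized for the GPU multi-bit noise-squashing parameters introduced later in this commit — glwe_dimension = 2, polynomial_size = 2048, level_count = 3 — assuming a single input ciphertext and an lwe_chunk_size of 1 (the real chunk size is picked at runtime by get_lwe_chunk_size_128).

// Illustrative arithmetic only; the parameter values are assumptions, see above.
constexpr uint64_t k = 2, N = 2048, levels = 3, count = 1, chunk = 1;
constexpr uint64_t keybundle_fft_bytes =
    count * chunk * (k + 1) * (k + 1) * levels * (N / 2) * 4 * sizeof(double);
constexpr uint64_t global_accumulator_bytes =
    count * (k + 1) * N * sizeof(__uint128_t);
constexpr uint64_t global_join_buffer_bytes =
    levels * (k + 1) * count * (N / 2) * 4 * sizeof(double);
static_assert(keybundle_fft_bytes == 884736, "864 KiB per chunk element");
static_assert(global_accumulator_bytes == 98304, "96 KiB");
static_assert(global_join_buffer_bytes == 294912, "288 KiB");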
@@ -240,7 +240,10 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
  }
};

template <typename InputTorus, PBS_TYPE pbs_type> struct pbs_buffer_128 {
template <typename Torus, PBS_TYPE pbs_type> struct pbs_buffer_128;

template <typename InputTorus>
struct pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> {
  int8_t *d_mem;

  __uint128_t *global_accumulator;
@@ -15,6 +15,11 @@ void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
    uint32_t polynomial_size, uint32_t grouping_factor);

void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_128(
    void *stream, uint32_t gpu_index, void *dest, void const *src,
    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
    uint32_t polynomial_size, uint32_t grouping_factor);

uint64_t scratch_cuda_multi_bit_programmable_bootstrap_64(
    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
@@ -33,6 +38,25 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
                                                   uint32_t gpu_index,
                                                   int8_t **pbs_buffer);

uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64(
    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_output_indexes, void const *lut_vector,
    void const *lut_vector_indexes, void const *lwe_array_in,
    void const *lwe_input_indexes, void const *bootstrapping_key,
    int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
    uint32_t lut_stride);

void cleanup_cuda_multi_bit_programmable_bootstrap_128(void *stream,
                                                       const uint32_t gpu_index,
                                                       int8_t **buffer);
}

#endif // CUDA_MULTI_BIT_H
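Note (illustrative sketch, not part of the diff): the new C entry points declared above are meant to be chained host-side as convert key, allocate scratch, bootstrap, then cleanup. The device/host pointers and the numeric values below are placeholders; the numbers echo the group-4 GPU noise-squashing parameters added later in this commit and are not prescriptive.

// Minimal wiring sketch of the 128-bit multi-bit PBS C API (assumed usage).
void run_multi_bit_pbs_128_once(void *stream, uint32_t gpu_index,
                                void *d_lwe_out, void const *d_out_indexes,
                                void const *d_lut, void const *d_lut_indexes,
                                void const *d_lwe_in, void const *d_in_indexes,
                                void const *h_bsk_standard, void *d_bsk) {
  uint32_t lwe_dimension = 920, glwe_dimension = 2, polynomial_size = 2048;
  uint32_t grouping_factor = 4, base_log = 23, level_count = 3;
  uint32_t num_samples = 1, num_many_lut = 1, lut_stride = 0;

  // 1) Copy the standard-domain 128-bit bootstrapping key to the GPU.
  cuda_convert_lwe_multi_bit_programmable_bootstrap_key_128(
      stream, gpu_index, d_bsk, h_bsk_standard, lwe_dimension, glwe_dimension,
      level_count, polynomial_size, grouping_factor);

  // 2) Allocate the scratch buffer (this also selects the CG or DEFAULT variant).
  int8_t *buffer = nullptr;
  scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64(
      stream, gpu_index, &buffer, glwe_dimension, polynomial_size, level_count,
      num_samples, /*allocate_gpu_memory=*/true);

  // 3) Run the bootstrap over the batch.
  cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
      stream, gpu_index, d_lwe_out, d_out_indexes, d_lut, d_lut_indexes,
      d_lwe_in, d_in_indexes, d_bsk, buffer, lwe_dimension, glwe_dimension,
      polynomial_size, grouping_factor, base_log, level_count, num_samples,
      num_many_lut, lut_stride);

  // 4) Release the scratch buffer.
  cleanup_cuda_multi_bit_programmable_bootstrap_128(stream, gpu_index, &buffer);
}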
@@ -1,5 +1,6 @@
file(GLOB_RECURSE SOURCES "*.cu")
add_library(tfhe_cuda_backend STATIC ${SOURCES})
add_library(tfhe_cuda_backend STATIC ${SOURCES} pbs/programmable_bootstrap_multibit_128.cuh
            pbs/programmable_bootstrap_multibit_128.cu)
set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(tfhe_cuda_backend PUBLIC cudart OpenMP::OpenMP_CXX)
target_include_directories(tfhe_cuda_backend PRIVATE .)
@@ -66,6 +66,13 @@ __device__ inline void typecast_torus_to_double<uint64_t>(uint64_t x,
  r = __ll2double_rn(x);
}

template <>
__device__ inline void typecast_torus_to_double<__uint128_t>(__uint128_t x,
                                                             double &r) {
  // We truncate x
  r = __ll2double_rn(static_cast<uint64_t>(x));
}
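Note (illustrative, not part of the diff): the __uint128_t specialization above deliberately drops the high word before converting. A host-side equivalent of just the truncation step would be the following; the device intrinsic __ll2double_rn then additionally interprets that low word as a signed 64-bit value.

#include <cstdint>
// Keeps only x mod 2^64 (the low 64 bits of the 128-bit torus element).
inline uint64_t truncate_u128_low_word(unsigned __int128 x) {
  return static_cast<uint64_t>(x);
}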
template <typename T>
__device__ inline T init_decomposer_state(T input, uint32_t base_log,
                                          uint32_t level_count) {
@@ -234,6 +234,29 @@ __device__ void convert_u128_to_f128_as_torus(
  }
}

// params is expected to be full degree not half degree
// same as convert_u128_to_f128_as_torus() but expects input to be on registers
template <class params>
__device__ void convert_u128_on_regs_to_f128_as_torus(
    double *out_re_hi, double *out_re_lo, double *out_im_hi, double *out_im_lo,
    const __uint128_t *in_re_on_regs, const __uint128_t *in_im_on_regs) {

  const double normalization = pow(2., -128.);
  Index tid = threadIdx.x;
  // #pragma unroll
  for (Index i = 0; i < params::opt / 2; i++) {
    auto out_re = u128_to_signed_to_f128(in_re_on_regs[i]);
    auto out_im = u128_to_signed_to_f128(in_im_on_regs[i]);

    out_re_hi[tid] = out_re.hi * normalization;
    out_re_lo[tid] = out_re.lo * normalization;
    out_im_hi[tid] = out_im.hi * normalization;
    out_im_lo[tid] = out_im.lo * normalization;

    tid += params::degree / params::opt;
  }
}

template <class params>
__device__ void
convert_f128_to_u128_as_torus(__uint128_t *out_re, __uint128_t *out_im,
@@ -272,7 +295,7 @@ batch_convert_u128_to_f128_as_integer(double *out_re_hi, double *out_re_lo,
}

// params is expected to be full degree not half degree
// converts standqard input into complex<128> represented by 4 double
// converts standard input into complex<128> represented by 4 double
// with following pattern: [re_hi_0, re_hi_1, ... re_hi_n, re_lo_0, re_lo_1,
// ... re_lo_n, im_hi_0, im_hi_1, ..., im_hi_n, im_lo_0, im_lo_1, ..., im_lo_n]
template <class params>
@@ -291,7 +314,7 @@ batch_convert_u128_to_f128_as_torus(double *out_re_hi, double *out_re_lo,
}

// params is expected to be full degree not half degree
// converts standqard input into complex<128> represented by 4 double
// converts standard input into complex<128> represented by 4 double
// with following pattern: [re_hi_0, re_lo_0, im_hi_0, im_lo_0, re_hi_1,
// re_lo_1, im_hi_1, im_lo_1,
// ...,re_hi_n, re_lo_n, im_hi_n, im_lo_n, ]
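Note (illustrative, not part of the diff): the two comments above describe a planar and an interleaved layout for the four doubles (re_hi, re_lo, im_hi, im_lo) of each complex coefficient. The index helpers below are hypothetical names, only meant to make the two orderings concrete for a half-degree n and coefficient index i.

#include <cstddef>
// Planar: [re_hi_0..re_hi_n, re_lo_0..re_lo_n, im_hi_0..im_hi_n, im_lo_0..im_lo_n]
inline size_t planar_index(size_t field, size_t i, size_t n) {
  return field * n + i; // field: 0 = re_hi, 1 = re_lo, 2 = im_hi, 3 = im_lo
}
// Interleaved: [re_hi_0, re_lo_0, im_hi_0, im_lo_0, re_hi_1, re_lo_1, ...]
inline size_t interleaved_index(size_t field, size_t i) {
  return 4 * i + field;
}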
@@ -35,6 +35,20 @@ void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
      static_cast<cudaStream_t>(stream), gpu_index);
}

void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_128(
    void *stream, uint32_t gpu_index, void *dest, void const *src,
    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
    uint32_t polynomial_size, uint32_t grouping_factor) {
  uint32_t total_polynomials = input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) *
                               level_count * (1 << grouping_factor) /
                               grouping_factor;
  size_t buffer_size =
      total_polynomials * polynomial_size * sizeof(__uint128_t);

  cuda_memcpy_async_to_gpu((__uint128_t *)dest, (__uint128_t *)src, buffer_size,
                           static_cast<cudaStream_t>(stream), gpu_index);
}
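Note (illustrative, not part of the diff): plugging the group-4 GPU noise-squashing parameters added later in this commit into the copy size computed above (input_lwe_dim = 920, glwe_dim = 2, level_count = 3, polynomial_size = 2048, grouping_factor = 4) gives roughly 3 GiB for the 128-bit bootstrapping key.

// Illustrative size check; parameter values are assumptions, see above.
constexpr uint64_t kTotalPolys =
    920ull * (2 + 1) * (2 + 1) * 3 * (1ull << 4) / 4; // 99,360 polynomials
constexpr uint64_t kBskBytes = kTotalPolys * 2048 * 16; // sizeof(__uint128_t)
static_assert(kBskBytes == 3255828480ull, "about 3.03 GiB per 128-bit BSK");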
// We need these lines so the compiler knows how to specialize these functions
template __device__ const uint64_t *
get_ith_mask_kth_block(const uint64_t *ptr, int i, int k, int level,
@@ -80,6 +94,14 @@ template __device__ double2 *get_ith_body_kth_block(double2 *ptr, int i, int k,
                                                    int glwe_dimension,
                                                    uint32_t level_count);

template __device__ const __uint128_t *
get_multi_bit_ith_lwe_gth_group_kth_block(const __uint128_t *ptr, int g, int i,
                                          int k, int level,
                                          uint32_t grouping_factor,
                                          uint32_t polynomial_size,
                                          uint32_t glwe_dimension,
                                          uint32_t level_count);

template __device__ const uint64_t *get_multi_bit_ith_lwe_gth_group_kth_block(
    const uint64_t *ptr, int g, int i, int k, int level,
    uint32_t grouping_factor, uint32_t polynomial_size, uint32_t glwe_dimension,
@@ -83,6 +83,62 @@ mul_ggsw_glwe_in_fourier_domain(double2 *fft, double2 *join_buffer,
  __syncthreads();
}

/** Perform the matrix multiplication between the GGSW and the GLWE,
 * each block operating on a single level for mask and body.
 * Both operands should be at fourier domain
 *
 * This function assumes:
 * - Thread blocks at dimension z relates to the decomposition level.
 * - Thread blocks at dimension y relates to the glwe dimension.
 * - polynomial_size / params::opt threads are available per block
 */
template <typename G, class params>
__device__ void mul_ggsw_glwe_in_fourier_domain_128(
    double *fft, double *join_buffer,
    const double *__restrict__ bootstrapping_key, int iteration, G &group,
    bool support_dsm = false) {
  const uint32_t polynomial_size = params::degree;
  const uint32_t glwe_dimension = gridDim.y - 1;
  const uint32_t level_count = gridDim.z;

  // The first product is used to initialize level_join_buffer
  auto this_block_rank = get_this_block_rank<G>(group, support_dsm);

  // Continues multiplying fft by every polynomial in that particular bsk level
  // Each y-block accumulates in a different polynomial at each iteration
  auto bsk_slice = get_ith_mask_kth_block_128(
      bootstrapping_key, iteration, blockIdx.y, blockIdx.z, polynomial_size,
      glwe_dimension, level_count);
  for (int j = 0; j < glwe_dimension + 1; j++) {
    int idx = (j + this_block_rank) % (glwe_dimension + 1);

    auto bsk_poly = bsk_slice + idx * polynomial_size / 2 * 4;
    auto buffer_slice = get_join_buffer_element_128<G>(
        blockIdx.z, idx, group, join_buffer, polynomial_size, glwe_dimension,
        support_dsm);

    polynomial_product_accumulate_in_fourier_domain_128<params>(
        buffer_slice, fft, bsk_poly, j == 0);
    group.sync();
  }

  // -----------------------------------------------------------------
  // All blocks are synchronized here; after this sync, level_join_buffer has
  // the values needed from every other block

  // accumulate rest of the products into fft buffer
  for (int l = 0; l < level_count; l++) {
    auto cur_src_acc = get_join_buffer_element_128<G>(
        l, blockIdx.y, group, join_buffer, polynomial_size, glwe_dimension,
        support_dsm);

    polynomial_accumulate_in_fourier_domain_128<params>(fft, cur_src_acc,
                                                        l == 0);
  }

  __syncthreads();
}
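Note (illustrative, not part of the diff): the docstring above fixes the y- and z-dimensions of the grid (gridDim.y - 1 == glwe_dimension, gridDim.z == level_count); treating the x-dimension as the sample index is an assumption here, the real grids are built in the host_* wrappers.

// Hypothetical helper: grid shape implied by mul_ggsw_glwe_in_fourier_domain_128.
// With glwe_dimension = 2 and level_count = 3 this yields a (num_samples, 3, 3) grid.
dim3 pbs_128_accumulate_grid(uint32_t num_samples, uint32_t glwe_dimension,
                             uint32_t level_count) {
  return dim3(num_samples, glwe_dimension + 1, level_count);
}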
template <typename Torus>
void execute_pbs_async(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,

@@ -280,8 +280,9 @@ __host__ uint64_t scratch_cg_multi_bit_programmable_bootstrap(
    check_cuda_error(cudaGetLastError());
  }

  auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
      gpu_index, input_lwe_ciphertext_count, polynomial_size);
  auto lwe_chunk_size =
      get_lwe_chunk_size<Torus, params>(gpu_index, input_lwe_ciphertext_count,
                                        polynomial_size, full_sm_keybundle);
  uint64_t size_tracker = 0;
  *buffer = new pbs_buffer<Torus, MULTI_BIT>(
      stream, gpu_index, glwe_dimension, polynomial_size, level_count,

@@ -18,62 +18,6 @@
#include "programmable_bootstrap.cuh"
#include "types/complex/operations.cuh"

/** Perform the matrix multiplication between the GGSW and the GLWE,
 * each block operating on a single level for mask and body.
 * Both operands should be at fourier domain
 *
 * This function assumes:
 * - Thread blocks at dimension z relates to the decomposition level.
 * - Thread blocks at dimension y relates to the glwe dimension.
 * - polynomial_size / params::opt threads are available per block
 */
template <typename G, class params>
__device__ void mul_ggsw_glwe_in_fourier_domain_128(
    double *fft, double *join_buffer,
    const double *__restrict__ bootstrapping_key, int iteration, G &group,
    bool support_dsm = false) {
  const uint32_t polynomial_size = params::degree;
  const uint32_t glwe_dimension = gridDim.y - 1;
  const uint32_t level_count = gridDim.z;

  // The first product is used to initialize level_join_buffer
  auto this_block_rank = get_this_block_rank<G>(group, support_dsm);

  // Continues multiplying fft by every polynomial in that particular bsk level
  // Each y-block accumulates in a different polynomial at each iteration
  auto bsk_slice = get_ith_mask_kth_block_128(
      bootstrapping_key, iteration, blockIdx.y, blockIdx.z, polynomial_size,
      glwe_dimension, level_count);
  for (int j = 0; j < glwe_dimension + 1; j++) {
    int idx = (j + this_block_rank) % (glwe_dimension + 1);

    auto bsk_poly = bsk_slice + idx * polynomial_size / 2 * 4;
    auto buffer_slice = get_join_buffer_element_128<G>(
        blockIdx.z, idx, group, join_buffer, polynomial_size, glwe_dimension,
        support_dsm);

    polynomial_product_accumulate_in_fourier_domain_128<params>(
        buffer_slice, fft, bsk_poly, j == 0);
    group.sync();
  }

  // -----------------------------------------------------------------
  // All blocks are synchronized here; after this sync, level_join_buffer has
  // the values needed from every other block

  // accumulate rest of the products into fft buffer
  for (int l = 0; l < level_count; l++) {
    auto cur_src_acc = get_join_buffer_element_128<G>(
        l, blockIdx.y, group, join_buffer, polynomial_size, glwe_dimension,
        support_dsm);

    polynomial_accumulate_in_fourier_domain_128<params>(fft, cur_src_acc,
                                                        l == 0);
  }

  __syncthreads();
}

template <typename InputTorus, class params, sharedMemDegree SMD,
          bool first_iter>
__global__ void __launch_bounds__(params::degree / params::opt)
@@ -174,9 +118,6 @@ __global__ void __launch_bounds__(params::degree / params::opt)
          accumulator);
      gadget_acc.decompose_and_compress_level_128(accumulator_fft, blockIdx.z);

      // We are using the same memory space for accumulator_fft and
      // accumulator_rotated, so we need to synchronize here to make sure they
      // don't modify the same memory space at the same time
      // Switch to the FFT space
      auto acc_fft_re_hi = accumulator_fft + 0 * params::degree / 2;
      auto acc_fft_re_lo = accumulator_fft + 1 * params::degree / 2;

@@ -455,11 +455,8 @@ void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
 */
template <typename Torus, class params>
uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
                            uint32_t polynomial_size) {

  uint64_t full_sm_keybundle =
      get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
          polynomial_size);
                            uint32_t polynomial_size,
                            uint64_t full_sm_keybundle) {

  int max_blocks_per_sm;
  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);

@@ -521,8 +521,9 @@ __host__ uint64_t scratch_multi_bit_programmable_bootstrap(
    check_cuda_error(cudaGetLastError());
  }

  auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
      gpu_index, input_lwe_ciphertext_count, polynomial_size);
  auto lwe_chunk_size =
      get_lwe_chunk_size<Torus, params>(gpu_index, input_lwe_ciphertext_count,
                                        polynomial_size, full_sm_keybundle);
  uint64_t size_tracker = 0;
  *buffer = new pbs_buffer<Torus, MULTI_BIT>(
      stream, gpu_index, glwe_dimension, polynomial_size, level_count,

@@ -0,0 +1,361 @@
#include "programmable_bootstrap_cg_multibit.cuh"
#include "programmable_bootstrap_multibit_128.cuh"

template <typename InputTorus>
uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128(
    void *stream, uint32_t gpu_index,
    pbs_buffer_128<InputTorus, MULTI_BIT> **buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

  switch (polynomial_size) {
  case 256:
    return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
                                                        AmortizedDegree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        allocate_gpu_memory);
  case 512:
    return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
                                                        AmortizedDegree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        allocate_gpu_memory);
  case 1024:
    return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
                                                        AmortizedDegree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        allocate_gpu_memory);
  case 2048:
    return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
                                                        AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        allocate_gpu_memory);
  case 4096:
    return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
                                                        AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        allocate_gpu_memory);
  default:
    PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
          "N's are powers of two"
          " in the interval [256..4096].")
  }
}

template <typename InputTorus>
uint64_t scratch_cuda_cg_multi_bit_programmable_bootstrap_128(
    void *stream, uint32_t gpu_index,
    pbs_buffer_128<InputTorus, MULTI_BIT> **buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

  switch (polynomial_size) {
  case 256:
    return scratch_cg_multi_bit_programmable_bootstrap_128<
        InputTorus, AmortizedDegree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        allocate_gpu_memory);
  case 512:
    return scratch_cg_multi_bit_programmable_bootstrap_128<
        InputTorus, AmortizedDegree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        allocate_gpu_memory);
  case 1024:
    return scratch_cg_multi_bit_programmable_bootstrap_128<
        InputTorus, AmortizedDegree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        allocate_gpu_memory);
  case 2048:
    return scratch_cg_multi_bit_programmable_bootstrap_128<
        InputTorus, AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        allocate_gpu_memory);
  case 4096:
    return scratch_cg_multi_bit_programmable_bootstrap_128<
        InputTorus, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        allocate_gpu_memory);
  default:
    PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
          "N's are powers of two"
          " in the interval [256..4096].")
  }
}

uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64(
    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

  bool supports_cg =
      supports_cooperative_groups_on_multibit_programmable_bootstrap<
          __uint128_t>(glwe_dimension, polynomial_size, level_count,
                       input_lwe_ciphertext_count,
                       cuda_get_max_shared_memory(gpu_index));

  if (supports_cg)
    return scratch_cuda_cg_multi_bit_programmable_bootstrap_128<uint64_t>(
        stream, gpu_index,
        reinterpret_cast<pbs_buffer_128<uint64_t, MULTI_BIT> **>(buffer),
        glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, allocate_gpu_memory);
  else
    return scratch_cuda_multi_bit_programmable_bootstrap_128<uint64_t>(
        stream, gpu_index,
        reinterpret_cast<pbs_buffer_128<uint64_t, MULTI_BIT> **>(buffer),
        glwe_dimension, polynomial_size, level_count,
        input_lwe_ciphertext_count, allocate_gpu_memory);
}

template <typename InputTorus>
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
    void *stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
    InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector,
    InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in,
    InputTorus const *lwe_input_indexes, __uint128_t const *bootstrapping_key,
    pbs_buffer_128<InputTorus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
    uint32_t num_many_lut, uint32_t lut_stride) {

  switch (polynomial_size) {
  case 256:
    host_multi_bit_programmable_bootstrap_128<InputTorus, AmortizedDegree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
        num_samples, num_many_lut, lut_stride);
    break;
  case 512:
    host_multi_bit_programmable_bootstrap_128<InputTorus, AmortizedDegree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
        num_samples, num_many_lut, lut_stride);
    break;
  case 1024:
    host_multi_bit_programmable_bootstrap_128<InputTorus,
                                              AmortizedDegree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
        num_samples, num_many_lut, lut_stride);
    break;
  case 2048:
    host_multi_bit_programmable_bootstrap_128<InputTorus,
                                              AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
        num_samples, num_many_lut, lut_stride);
    break;
  case 4096:
    host_multi_bit_programmable_bootstrap_128<InputTorus,
                                              AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
        num_samples, num_many_lut, lut_stride);
    break;
  default:
    PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
          "N's are powers of two"
          " in the interval [256..4096].")
  }
}

template <typename InputTorus>
void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
    void *stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
    InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector,
    InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in,
    InputTorus const *lwe_input_indexes, __uint128_t const *bootstrapping_key,
    pbs_buffer_128<InputTorus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
    uint32_t num_many_lut, uint32_t lut_stride) {

  switch (polynomial_size) {
  case 256:
    host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
                                                 AmortizedDegree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
        num_samples, num_many_lut, lut_stride);
    break;
  case 512:
    host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
                                                 AmortizedDegree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
        num_samples, num_many_lut, lut_stride);
    break;
  case 1024:
    host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
                                                 AmortizedDegree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
        num_samples, num_many_lut, lut_stride);
    break;
  case 2048:
    host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
                                                 AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
        num_samples, num_many_lut, lut_stride);
    break;
  case 4096:
    host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
                                                 AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
        num_samples, num_many_lut, lut_stride);
    break;
  default:
    PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
          "N's are powers of two"
          " in the interval [256..4096].")
  }
}

void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_output_indexes, void const *lut_vector,
    void const *lut_vector_indexes, void const *lwe_array_in,
    void const *lwe_input_indexes, void const *bootstrapping_key,
    int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
    uint32_t lut_stride) {

  if (base_log > 64)
    PANIC("Cuda error (multi-bit PBS): base log should be <= 64")

  auto *buffer =
      reinterpret_cast<pbs_buffer_128<uint64_t, MULTI_BIT> *>(mem_ptr);
  switch (buffer->pbs_variant) {
  case PBS_VARIANT::CG:
    cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128<
        uint64_t>(stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
                  static_cast<const uint64_t *>(lwe_output_indexes),
                  static_cast<const __uint128_t *>(lut_vector),
                  static_cast<const uint64_t *>(lut_vector_indexes),
                  static_cast<const uint64_t *>(lwe_array_in),
                  static_cast<const uint64_t *>(lwe_input_indexes),
                  static_cast<const __uint128_t *>(bootstrapping_key), buffer,
                  lwe_dimension, glwe_dimension, polynomial_size,
                  grouping_factor, base_log, level_count, num_samples,
                  num_many_lut, lut_stride);
    break;
  case PBS_VARIANT::DEFAULT:
    cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128<uint64_t>(
        stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
        static_cast<const uint64_t *>(lwe_output_indexes),
        static_cast<const __uint128_t *>(lut_vector),
        static_cast<const uint64_t *>(lut_vector_indexes),
        static_cast<const uint64_t *>(lwe_array_in),
        static_cast<const uint64_t *>(lwe_input_indexes),
        static_cast<const __uint128_t *>(bootstrapping_key), buffer,
        lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
        base_log, level_count, num_samples, num_many_lut, lut_stride);
    break;
  default:
    PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.")
  }
}

void cleanup_cuda_multi_bit_programmable_bootstrap_128(void *stream,
                                                       const uint32_t gpu_index,
                                                       int8_t **buffer) {
  const auto x =
      reinterpret_cast<pbs_buffer_128<uint64_t, MULTI_BIT> *>(*buffer);
  x->release(static_cast<cudaStream_t>(stream), gpu_index);
}

/**
 * Computes divisors of the product of num_sms (streaming multiprocessors on the
 * GPU) and max_blocks_per_sm (maximum active blocks per SM to launch
 * device_multi_bit_programmable_bootstrap_keybundle) smaller than its square
 * root, based on max_num_pbs. If log2(max_num_pbs) <= 13, selects the first
 * suitable divisor. If greater, calculates an offset as max(1,log2(max_num_pbs)
 * - 13) for additional logic.
 *
 * The value 13 was empirically determined based on memory requirements for
 * benchmarking on an RTX 4090 GPU, balancing performance and resource use.
 */
template <typename Torus, class params>
uint32_t get_lwe_chunk_size_128(uint32_t gpu_index, uint32_t max_num_pbs,
                                uint32_t polynomial_size,
                                uint64_t full_sm_keybundle) {

  int max_blocks_per_sm;
  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
  cuda_set_device(gpu_index);
  if (max_shared_memory < full_sm_keybundle)
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &max_blocks_per_sm,
        device_multi_bit_programmable_bootstrap_keybundle_128<Torus, params,
                                                              NOSM>,
        polynomial_size / params::opt, full_sm_keybundle);
  else
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &max_blocks_per_sm,
        device_multi_bit_programmable_bootstrap_keybundle_128<Torus, params,
                                                              FULLSM>,
        polynomial_size / params::opt, 0);

  int num_sms = 0;
  check_cuda_error(cudaDeviceGetAttribute(
      &num_sms, cudaDevAttrMultiProcessorCount, gpu_index));

  int x = num_sms * max_blocks_per_sm;
  int count = 0;

  int divisor = 1;
  int ith_divisor = 0;

#if CUDA_ARCH < 900
  // We pick a smaller divisor on GPUs other than H100, so 256-bit integer
  // multiplication can run
  int log2_max_num_pbs = log2_int(max_num_pbs);
  if (log2_max_num_pbs > 13)
    ith_divisor = log2_max_num_pbs - 11;
#endif

  for (int i = sqrt(x); i >= 1; i--) {
    if (x % i == 0) {
      if (count == ith_divisor) {
        divisor = i;
        break;
      } else {
        count++;
      }
    }
  }

  return divisor;
}
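Note (worked example, not part of the diff; the occupancy numbers below are assumed, not measured):

// Suppose num_sms = 132 and max_blocks_per_sm = 2, so x = 264 and the loop
// starts at i = (int)sqrt(264) = 16. Divisors of 264 not larger than 16, in
// the order the loop visits them: 12, 11, 8, 6, 4, 3, 2, 1.
//   ith_divisor = 0 (log2(max_num_pbs) <= 13, or CUDA_ARCH >= 900) -> chunk size 12
//   ith_divisor = 4 (log2(max_num_pbs) = 15 on a pre-Hopper GPU)   -> chunk size 4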
File diff suppressed because it is too large
@@ -283,8 +283,9 @@ __host__ uint64_t scratch_tbc_multi_bit_programmable_bootstrap(
    check_cuda_error(cudaGetLastError());
  }

  auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
      gpu_index, input_lwe_ciphertext_count, polynomial_size);
  auto lwe_chunk_size =
      get_lwe_chunk_size<Torus, params>(gpu_index, input_lwe_ciphertext_count,
                                        polynomial_size, full_sm_keybundle);
  uint64_t size_tracker = 0;
  *buffer = new pbs_buffer<uint64_t, MULTI_BIT>(
      stream, gpu_index, glwe_dimension, polynomial_size, level_count,

@@ -5,15 +5,15 @@
#include <stdio.h>
#include <type_traits>

template <typename T> inline __device__ const char *get_format();
template <typename T> __device__ inline const char *get_format();

template <> inline __device__ const char *get_format<int>() { return "%d, "; }
template <> __device__ inline const char *get_format<int>() { return "%d, "; }

template <> inline __device__ const char *get_format<unsigned int>() {
template <> __device__ inline const char *get_format<unsigned int>() {
  return "%u, ";
}

template <> inline __device__ const char *get_format<uint64_t>() {
template <> __device__ inline const char *get_format<uint64_t>() {
  return "%lu, ";
}

@@ -23,6 +23,15 @@ template <typename T> __global__ void print_debug_kernel(const T *src, int N) {
  }
}

template <>
__global__ inline void print_debug_kernel(const __uint128_t *src, int N) {
  for (int i = 0; i < N; i++) {
    uint64_t low = static_cast<uint64_t>(src[i]);
    uint64_t high = static_cast<uint64_t>(src[i] >> 64);
    printf("(%llu, %llu), ", high, low);
  }
}

template <>
__global__ inline void print_debug_kernel(const double2 *src, int N) {
  for (int i = 0; i < N; i++) {

@@ -2188,6 +2188,19 @@ unsafe extern "C" {
        grouping_factor: u32,
    );
}
unsafe extern "C" {
    pub fn cuda_convert_lwe_multi_bit_programmable_bootstrap_key_128(
        stream: *mut ffi::c_void,
        gpu_index: u32,
        dest: *mut ffi::c_void,
        src: *const ffi::c_void,
        input_lwe_dim: u32,
        glwe_dim: u32,
        level_count: u32,
        polynomial_size: u32,
        grouping_factor: u32,
    );
}
unsafe extern "C" {
    pub fn scratch_cuda_multi_bit_programmable_bootstrap_64(
        stream: *mut ffi::c_void,
@@ -2230,3 +2243,45 @@ unsafe extern "C" {
        pbs_buffer: *mut *mut i8,
    );
}
unsafe extern "C" {
    pub fn scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64(
        stream: *mut ffi::c_void,
        gpu_index: u32,
        buffer: *mut *mut i8,
        glwe_dimension: u32,
        polynomial_size: u32,
        level_count: u32,
        input_lwe_ciphertext_count: u32,
        allocate_gpu_memory: bool,
    ) -> u64;
}
unsafe extern "C" {
    pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
        stream: *mut ffi::c_void,
        gpu_index: u32,
        lwe_array_out: *mut ffi::c_void,
        lwe_output_indexes: *const ffi::c_void,
        lut_vector: *const ffi::c_void,
        lut_vector_indexes: *const ffi::c_void,
        lwe_array_in: *const ffi::c_void,
        lwe_input_indexes: *const ffi::c_void,
        bootstrapping_key: *const ffi::c_void,
        mem_ptr: *mut i8,
        lwe_dimension: u32,
        glwe_dimension: u32,
        polynomial_size: u32,
        grouping_factor: u32,
        base_log: u32,
        level_count: u32,
        num_samples: u32,
        num_many_lut: u32,
        lut_stride: u32,
    );
}
unsafe extern "C" {
    pub fn cleanup_cuda_multi_bit_programmable_bootstrap_128(
        stream: *mut ffi::c_void,
        gpu_index: u32,
        buffer: *mut *mut i8,
    );
}

@@ -165,7 +165,7 @@ fn pbs_128(c: &mut Criterion) {
mod cuda {
    use benchmark::utilities::{
        cuda_local_keys_core, cuda_local_streams_core, get_bench_type, throughput_num_threads,
        write_to_json, BenchmarkType, CpuKeys, CpuKeysBuilder, CryptoParametersRecord,
        write_to_json, BenchmarkType, CpuKeys, CpuKeysBuilder, CryptoParametersRecord, CudaIndexes,
        CudaLocalKeys, OperatorType,
    };
    use criterion::{black_box, Criterion, Throughput};
@@ -173,12 +173,14 @@ mod cuda {
    use tfhe::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
    use tfhe::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
    use tfhe::core_crypto::gpu::{
        cuda_multi_bit_programmable_bootstrap_128_lwe_ciphertext,
        cuda_programmable_bootstrap_128_lwe_ciphertext, get_number_of_gpus, CudaStreams,
    };
    use tfhe::core_crypto::prelude::*;
    use tfhe::shortint::engine::ShortintEngine;
    use tfhe::shortint::parameters::{
        ModulusSwitchType, NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
        NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
        PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
    };
    use tfhe::shortint::server_key::ModulusSwitchNoiseReductionKey;
@@ -441,14 +443,281 @@ mod cuda {
        );
    }

    fn cuda_multi_bit_pbs_128(c: &mut Criterion) {
        let bench_name = "core_crypto::cuda::multi_bit_pbs128";
        let mut bench_group = c.benchmark_group(bench_name);
        bench_group
            .sample_size(10)
            .measurement_time(std::time::Duration::from_secs(30));

        type Scalar = u128;
        let input_params = PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
        let squash_params =
            NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;

        let lwe_noise_distribution_u64 = DynamicDistribution::new_t_uniform(46);
        let ct_modulus_u64: CiphertextModulus<u64> = CiphertextModulus::new_native();

        let params_name = "PARAMS_SWITCH_SQUASH";

        let mut boxed_seeder = new_seeder();
        let seeder = boxed_seeder.as_mut();

        let mut secret_generator =
            SecretRandomGenerator::<DefaultRandomGenerator>::new(seeder.seed());

        let mut encryption_generator =
            EncryptionRandomGenerator::<DefaultRandomGenerator>::new(seeder.seed(), seeder);

        let input_lwe_secret_key =
            LweSecretKey::generate_new_binary(input_params.lwe_dimension, &mut secret_generator);

        let output_glwe_secret_key = GlweSecretKey::<Vec<Scalar>>::generate_new_binary(
            squash_params.glwe_dimension,
            squash_params.polynomial_size,
            &mut secret_generator,
        );

        let output_lwe_secret_key = output_glwe_secret_key.clone().into_lwe_secret_key();

        let multi_bit_bsk = LweMultiBitBootstrapKey::new(
            Scalar::ZERO,
            squash_params.glwe_dimension.to_glwe_size(),
            squash_params.polynomial_size,
            squash_params.decomp_base_log,
            squash_params.decomp_level_count,
            input_params.lwe_dimension,
            squash_params.grouping_factor,
            squash_params.ciphertext_modulus,
        );

        let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new()
            .multi_bit_bootstrap_key(multi_bit_bsk)
            .build();

        let message_modulus: u64 = 1 << 4;
        let input_message: u64 = 3;
        let delta: u64 = (1 << (u64::BITS - 1)) / message_modulus;
        let plaintext = Plaintext(input_message * delta);

        let bench_id;

        match get_bench_type() {
            BenchmarkType::Latency => {
                let streams = CudaStreams::new_multi_gpu();
                let gpu_keys = CudaLocalKeys::from_cpu_keys(&cpu_keys, None, &streams);

                let lwe_ciphertext_in: LweCiphertextOwned<u64> =
                    allocate_and_encrypt_new_lwe_ciphertext(
                        &input_lwe_secret_key,
                        plaintext,
                        lwe_noise_distribution_u64,
                        ct_modulus_u64,
                        &mut encryption_generator,
                    );
                let lwe_ciphertext_in_gpu =
                    CudaLweCiphertextList::from_lwe_ciphertext(&lwe_ciphertext_in, &streams);

                let accumulator: GlweCiphertextOwned<Scalar> = GlweCiphertextOwned::new(
                    Scalar::ONE,
                    squash_params.glwe_dimension.to_glwe_size(),
                    squash_params.polynomial_size,
                    squash_params.ciphertext_modulus,
                );
                let accumulator_gpu =
                    CudaGlweCiphertextList::from_glwe_ciphertext(&accumulator, &streams);

                let out_pbs_ct = LweCiphertext::new(
                    Scalar::ZERO,
                    output_lwe_secret_key.lwe_dimension().to_lwe_size(),
                    squash_params.ciphertext_modulus,
                );
                let mut out_pbs_ct_gpu =
                    CudaLweCiphertextList::from_lwe_ciphertext(&out_pbs_ct, &streams);

                let h_indexes = [0];
                let cuda_indexes = CudaIndexes::new(&h_indexes, &streams, 0);

                bench_id = format!("{bench_name}::{params_name}");
                {
                    bench_group.bench_function(&bench_id, |b| {
                        b.iter(|| {
                            cuda_multi_bit_programmable_bootstrap_128_lwe_ciphertext(
                                &lwe_ciphertext_in_gpu,
                                &mut out_pbs_ct_gpu,
                                &accumulator_gpu,
                                &cuda_indexes.d_lut,
                                &cuda_indexes.d_output,
                                &cuda_indexes.d_input,
                                gpu_keys.multi_bit_bsk.as_ref().unwrap(),
                                &streams,
                            );
                            black_box(&mut out_pbs_ct_gpu);
                        })
                    });
                }
            }
            BenchmarkType::Throughput => {
                let gpu_keys_vec = cuda_local_keys_core(&cpu_keys, None);
                let gpu_count = get_number_of_gpus() as usize;

                bench_id = format!("{bench_name}::throughput::{params_name}");
                let blocks: usize = 1;
                let elements = throughput_num_threads(blocks, 1);
                let elements_per_stream = elements as usize / gpu_count;
                bench_group.throughput(Throughput::Elements(elements));
                bench_group.bench_function(&bench_id, |b| {
                    let setup_encrypted_values = || {
                        let local_streams = cuda_local_streams_core();

                        let plaintext_list =
                            PlaintextList::new(u64::ZERO, PlaintextCount(elements_per_stream));

                        let input_cts = (0..gpu_count)
                            .map(|i| {
                                let mut input_ct_list = LweCiphertextList::new(
                                    u64::ZERO,
                                    input_lwe_secret_key.lwe_dimension().to_lwe_size(),
                                    LweCiphertextCount(elements_per_stream),
                                    ct_modulus_u64,
                                );

                                encrypt_lwe_ciphertext_list(
                                    &input_lwe_secret_key,
                                    &mut input_ct_list,
                                    &plaintext_list,
                                    lwe_noise_distribution_u64,
                                    &mut encryption_generator,
                                );

                                CudaLweCiphertextList::from_lwe_ciphertext_list(
                                    &input_ct_list,
                                    &local_streams[i],
                                )
                            })
                            .collect::<Vec<_>>();

                        let accumulators = (0..gpu_count)
                            .map(|i| {
                                let accumulator = GlweCiphertextOwned::new(
                                    Scalar::ONE,
                                    squash_params.glwe_dimension.to_glwe_size(),
                                    squash_params.polynomial_size,
                                    squash_params.ciphertext_modulus,
                                );
                                CudaGlweCiphertextList::from_glwe_ciphertext(
                                    &accumulator,
                                    &local_streams[i],
                                )
                            })
                            .collect::<Vec<_>>();

                        // Allocate the LweCiphertext to store the result of the PBS
                        let output_cts = (0..gpu_count)
                            .map(|i| {
                                let output_ct_list = LweCiphertextList::new(
                                    Scalar::ZERO,
                                    output_lwe_secret_key.lwe_dimension().to_lwe_size(),
                                    LweCiphertextCount(elements_per_stream),
                                    squash_params.ciphertext_modulus,
                                );
                                CudaLweCiphertextList::from_lwe_ciphertext_list(
                                    &output_ct_list,
                                    &local_streams[i],
                                )
                            })
                            .collect::<Vec<_>>();

                        let h_indexes = (0..(elements / gpu_count as u64))
                            .map(CastFrom::cast_from)
                            .collect::<Vec<_>>();
                        let cuda_indexes_vec = (0..gpu_count)
                            .map(|i| CudaIndexes::new(&h_indexes, &local_streams[i], 0))
                            .collect::<Vec<_>>();
                        local_streams.iter().for_each(|stream| stream.synchronize());

                        (
                            input_cts,
                            output_cts,
                            accumulators,
                            cuda_indexes_vec,
                            local_streams,
                        )
                    };

                    b.iter_batched(
                        setup_encrypted_values,
                        |(
                            input_cts,
                            mut output_cts,
                            accumulators,
                            cuda_indexes_vec,
                            local_streams,
                        )| {
                            (0..gpu_count)
                                .into_par_iter()
                                .zip(input_cts.par_iter())
                                .zip(output_cts.par_iter_mut())
                                .zip(accumulators.par_iter())
                                .zip(local_streams.par_iter())
                                .for_each(
                                    |((((i, input_ct), output_ct), accumulator), local_stream)| {
                                        cuda_multi_bit_programmable_bootstrap_128_lwe_ciphertext(
                                            input_ct,
                                            output_ct,
                                            accumulator,
                                            &cuda_indexes_vec[i].d_lut,
                                            &cuda_indexes_vec[i].d_output,
                                            &cuda_indexes_vec[i].d_input,
                                            gpu_keys_vec[i].multi_bit_bsk.as_ref().unwrap(),
                                            local_stream,
                                        );
                                    },
                                )
                        },
                        criterion::BatchSize::SmallInput,
                    );
                });
            }
        };

        let params_record = CryptoParametersRecord {
            lwe_dimension: Some(input_params.lwe_dimension),
            glwe_dimension: Some(squash_params.glwe_dimension),
            polynomial_size: Some(squash_params.polynomial_size),
            lwe_noise_distribution: Some(lwe_noise_distribution_u64),
            glwe_noise_distribution: Some(input_params.glwe_noise_distribution),
            pbs_base_log: Some(squash_params.decomp_base_log),
            pbs_level: Some(squash_params.decomp_level_count),
            ciphertext_modulus: Some(input_params.ciphertext_modulus),
            ..Default::default()
        };

        let bit_size = (message_modulus as u32).ilog2();
        write_to_json(
            &bench_id,
            params_record,
            params_name,
            "pbs",
            &OperatorType::Atomic,
            bit_size,
            vec![bit_size],
        );
    }

    pub fn cuda_pbs128_group() {
        let mut criterion: Criterion<_> = Criterion::default().configure_from_args();
        cuda_pbs_128(&mut criterion);
    }

    pub fn cuda_multi_bit_pbs128_group() {
        let mut criterion: Criterion<_> = Criterion::default().configure_from_args();
        cuda_multi_bit_pbs_128(&mut criterion);
    }
}

#[cfg(feature = "gpu")]
use cuda::cuda_pbs128_group;
use cuda::{cuda_multi_bit_pbs128_group, cuda_pbs128_group};

pub fn pbs128_group() {
    let mut criterion: Criterion<_> = Criterion::default().configure_from_args();
@@ -458,6 +727,7 @@ pub fn pbs128_group() {
#[cfg(feature = "gpu")]
fn go_through_gpu_bench_groups() {
    cuda_pbs128_group();
    cuda_multi_bit_pbs128_group();
}

#[cfg(not(feature = "gpu"))]

@@ -521,7 +521,7 @@ mod cuda_utils {
    pub ksk: Option<CudaLweKeyswitchKey<T>>,
    pub pksk: Option<CudaLwePackingKeyswitchKey<T>>,
    pub bsk: Option<CudaLweBootstrapKey>,
    pub multi_bit_bsk: Option<CudaLweMultiBitBootstrapKey>,
    pub multi_bit_bsk: Option<CudaLweMultiBitBootstrapKey<T>>,
}

#[allow(dead_code)]

@@ -13,6 +13,8 @@ use tfhe::shortint::parameters::current_params::{
|
||||
VEC_ALL_COMPRESSION_PARAMETERS, VEC_ALL_HPU_PARAMETERS, VEC_ALL_KS32_PARAMETERS,
|
||||
VEC_ALL_MULTI_BIT_PBS_PARAMETERS, VEC_ALL_NOISE_SQUASHING_PARAMETERS,
|
||||
};
|
||||
use tfhe::shortint::parameters::noise_squashing::NoiseSquashingMultiBitParameters;
|
||||
use tfhe::shortint::parameters::v1_3::VEC_ALL_NOISE_SQUASHING_MULTI_BIT_PARAMETERS;
|
||||
use tfhe::shortint::parameters::{
|
||||
CompactPublicKeyEncryptionParameters, CompressionParameters, NoiseSquashingParameters,
|
||||
};
|
||||
@@ -214,6 +216,36 @@ impl ParamDetails<u128> for NoiseSquashingParameters {
|
||||
}
|
||||
}
|
||||
|
||||
impl ParamDetails<u128> for NoiseSquashingMultiBitParameters {
|
||||
fn lwe_dimension(&self) -> LweDimension {
|
||||
panic!("lwe_dimension not applicable for NoiseSquashingMultiBitParameters")
|
||||
}
|
||||
|
||||
fn glwe_dimension(&self) -> GlweDimension {
|
||||
self.glwe_dimension
|
||||
}
|
||||
|
||||
fn lwe_noise_distribution(&self) -> DynamicDistribution<u128> {
|
||||
panic!("lwe_noise_distribution not applicable for NoiseSquashingMultiBitParameters")
|
||||
}
|
||||
|
||||
fn glwe_noise_distribution(&self) -> DynamicDistribution<u128> {
|
||||
self.glwe_noise_distribution
|
||||
}
|
||||
|
||||
fn polynomial_size(&self) -> PolynomialSize {
|
||||
self.polynomial_size
|
||||
}
|
||||
|
||||
fn lwe_ciphertext_modulus(&self) -> ParamModulus {
|
||||
panic!("lwe_ciphertext_modulus not applicable for NoiseSquashingMultiBitParameters")
|
||||
}
|
||||
|
||||
fn glwe_ciphertext_modulus(&self) -> ParamModulus {
|
||||
ParamModulus::from_ciphertext_modulus(self.ciphertext_modulus)
|
||||
}
|
||||
}
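
The impl above follows the accessor pattern shared by the other ParamDetails impls: accessors that have no meaning for GLWE-only squashing parameters simply panic. A minimal standalone sketch of that pattern (illustrative only, not tfhe-rs code):

// Standalone sketch: GLWE-only parameter types expose the shared trait but panic on
// the LWE accessors, in the same way as the impl above.
trait ParamDetailsLike {
    fn lwe_dimension(&self) -> usize;
    fn glwe_dimension(&self) -> usize;
}

struct GlweOnlyParams {
    glwe_dimension: usize,
}

impl ParamDetailsLike for GlweOnlyParams {
    fn lwe_dimension(&self) -> usize {
        panic!("lwe_dimension not applicable for GlweOnlyParams")
    }

    fn glwe_dimension(&self) -> usize {
        self.glwe_dimension
    }
}

fn main() {
    let p = GlweOnlyParams { glwe_dimension: 2 };
    assert_eq!(p.glwe_dimension(), 2);
}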

#[derive(Eq, PartialEq, Hash)]
enum ParametersFormat {
    Lwe,
@@ -493,6 +525,16 @@ fn main() {
        ParametersFormat::Glwe,
    );

    let noise_squasing_multi_bit_params: Vec<_> = VEC_ALL_NOISE_SQUASHING_MULTI_BIT_PARAMETERS
        .into_iter()
        .map(|p| (*p.0, Some(p.1)))
        .collect();
    write_all_params_in_file(
        "shortint_noise_squashing_multi_bit_parameters_lattice_estimator.sage",
        &noise_squasing_multi_bit_params,
        ParametersFormat::Glwe,
    );

    let ks32_params: Vec<_> = VEC_ALL_KS32_PARAMETERS
        .into_iter()
        .map(|p| (AtomicPatternParameters::from(*p.0), Some(p.1)))

@@ -178,6 +178,36 @@ pub const DUMMY_31_U32: ClassicTestParams<u32> = ClassicTestParams {
    ciphertext_modulus: CiphertextModulus::new(1 << 31),
};

#[cfg(feature = "gpu")]
pub const NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128:
    NoiseSquashingMultiBitTestParameters<u128> = NoiseSquashingMultiBitTestParameters {
    glwe_dimension: GlweDimension(2),
    polynomial_size: PolynomialSize(2048),
    glwe_noise_distribution: DynamicDistribution::new_t_uniform(30),
    decomp_base_log: DecompositionBaseLog(23),
    decomp_level_count: DecompositionLevelCount(3),
    grouping_factor: LweBskGroupingFactor(4),
    message_modulus_log: MessageModulusLog(4),
    ciphertext_modulus: CiphertextModulus::<u128>::new_native(),
};

#[cfg(feature = "gpu")]
pub const PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128: MultiBitTestParams<
    u64,
> = MultiBitTestParams {
    input_lwe_dimension: LweDimension(920),
    lwe_noise_distribution: DynamicDistribution::new_t_uniform(45),
    decomp_base_log: DecompositionBaseLog(22),
    decomp_level_count: DecompositionLevelCount(1),
    glwe_dimension: GlweDimension(1),
    polynomial_size: PolynomialSize(2048),
    glwe_noise_distribution: DynamicDistribution::new_t_uniform(17),
    message_modulus_log: MessageModulusLog(4),
    ciphertext_modulus: CiphertextModulus::new_native(),
    grouping_factor: LweBskGroupingFactor(4),
    thread_count: ThreadCount(5),
};
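
A quick consistency note on the test constants above (stated here as an assumption for clarity, it is not enforced in this hunk): a multi-bit bootstrap key processes the input LWE secret key in groups of grouping_factor bits, so the input LWE dimension must be divisible by the grouping factor. The parameters above satisfy this, since 920 = 4 * 230; an illustrative check:

// Illustrative check only: 920 (input LWE dimension) split into groups of 4 key bits.
fn main() {
    let input_lwe_dimension: usize = 920;
    let grouping_factor: usize = 4;
    assert_eq!(input_lwe_dimension % grouping_factor, 0);
    let key_bit_groups = input_lwe_dimension / grouping_factor;
    println!("{key_bit_groups} groups of {grouping_factor} secret key bits");
}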

pub const MULTI_BIT_2_2_2_PARAMS: MultiBitTestParams<u64> = MultiBitTestParams {
    input_lwe_dimension: LweDimension(818),
    lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(

@@ -3,6 +3,8 @@ use crate::core_crypto::entities::*;
use crate::core_crypto::prelude::{CastFrom, CastInto, UnsignedInteger};
use crate::keycache::NamedParam;
#[cfg(feature = "gpu")]
use crate::shortint::parameters::ModulusSwitchNoiseReductionParams;
#[cfg(feature = "gpu")]
use crate::shortint::parameters::ModulusSwitchType;
use serde::{Deserialize, Serialize};

@@ -22,6 +24,13 @@ pub struct MultiBitBootstrapKeys<Scalar: UnsignedInteger> {
    pub fbsk: FourierLweMultiBitBootstrapKeyOwned,
}

#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct MultiBitStdBootstrapKeys<Scalar: UnsignedInteger> {
    pub small_lwe_sk: LweSecretKey<Vec<Scalar>>,
    pub big_lwe_sk: LweSecretKey<Vec<Scalar>>,
    pub bsk: LweMultiBitBootstrapKeyOwned<Scalar>,
}

// Fourier key is generated afterward in order to use generic test function
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct FftBootstrapKeys<Scalar: UnsignedInteger> {
@@ -81,6 +90,18 @@ pub struct MultiBitTestParams<Scalar: UnsignedInteger> {
    pub thread_count: ThreadCount,
}

#[derive(Clone, Copy, Debug, Serialize, Deserialize)]
pub struct NoiseSquashingMultiBitTestParameters<Scalar: UnsignedInteger> {
    pub glwe_dimension: GlweDimension,
    pub polynomial_size: PolynomialSize,
    pub glwe_noise_distribution: DynamicDistribution<Scalar>,
    pub decomp_base_log: DecompositionBaseLog,
    pub decomp_level_count: DecompositionLevelCount,
    pub grouping_factor: LweBskGroupingFactor,
    pub message_modulus_log: MessageModulusLog,
    pub ciphertext_modulus: CiphertextModulus<Scalar>,
}

// PartialEq is implemented manually because thread_count doesn't affect key generation and we want
// to change its value in test without the need of regenerating keys in the key cache.
impl<Scalar: UnsignedInteger> PartialEq for MultiBitTestParams<Scalar> {
@@ -141,6 +162,21 @@ pub struct NoiseSquashingTestParams<Scalar: UnsignedInteger> {
    pub modulus_switch_noise_reduction_params: ModulusSwitchType,
    pub ciphertext_modulus: CiphertextModulus<Scalar>,
}
// Parameters to test NoiseSquashing implementation
#[cfg(feature = "gpu")]
#[derive(Clone, Copy, Debug, PartialEq, Serialize, Deserialize)]
pub struct NoiseSquashingMultiBitTestParams<Scalar: UnsignedInteger> {
    pub lwe_dimension: LweDimension,
    pub glwe_dimension: GlweDimension,
    pub polynomial_size: PolynomialSize,
    pub lwe_noise_distribution: DynamicDistribution<Scalar>,
    pub glwe_noise_distribution: DynamicDistribution<Scalar>,
    pub pbs_base_log: DecompositionBaseLog,
    pub pbs_level: DecompositionLevelCount,
    pub grouping_factor: LweBskGroupingFactor,
    pub modulus_switch_noise_reduction_params: Option<ModulusSwitchNoiseReductionParams>,
    pub ciphertext_modulus: CiphertextModulus<Scalar>,
}

#[derive(Copy, Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct PackingKeySwitchTestParams<Scalar: UnsignedInteger> {

@@ -17,7 +17,7 @@ pub unsafe fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_async<Scalar>
    lut_indexes: &CudaVec<Scalar>,
    output_indexes: &CudaVec<Scalar>,
    input_indexes: &CudaVec<Scalar>,
    multi_bit_bsk: &CudaLweMultiBitBootstrapKey,
    multi_bit_bsk: &CudaLweMultiBitBootstrapKey<Scalar>,
    streams: &CudaStreams,
) where
    // CastInto required for PBS modulus switch which returns a usize
@@ -151,7 +151,7 @@ pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext<Scalar>(
    lut_indexes: &CudaVec<Scalar>,
    output_indexes: &CudaVec<Scalar>,
    input_indexes: &CudaVec<Scalar>,
    multi_bit_bsk: &CudaLweMultiBitBootstrapKey,
    multi_bit_bsk: &CudaLweMultiBitBootstrapKey<Scalar>,
    streams: &CudaStreams,
) where
    // CastInto required for PBS modulus switch which returns a usize
@@ -171,3 +171,162 @@ pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext<Scalar>(
    }
    streams.synchronize();
}

/// # Safety
///
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must not
///   be dropped until streams is synchronised
#[allow(clippy::too_many_arguments)]
pub unsafe fn cuda_multi_bit_programmable_bootstrap_128_lwe_ciphertext_async<OutputScalar>(
    input: &CudaLweCiphertextList<u64>,
    output: &mut CudaLweCiphertextList<OutputScalar>,
    accumulator: &CudaGlweCiphertextList<OutputScalar>,
    lut_indexes: &CudaVec<u64>,
    output_indexes: &CudaVec<u64>,
    input_indexes: &CudaVec<u64>,
    multi_bit_bsk: &CudaLweMultiBitBootstrapKey<OutputScalar>,
    streams: &CudaStreams,
) where
    // CastInto required for PBS modulus switch which returns a usize
    OutputScalar: UnsignedTorus + CastInto<usize>,
{
    assert_eq!(
        input.lwe_dimension(),
        multi_bit_bsk.input_lwe_dimension(),
        "Mismatched input LweDimension. LweCiphertext input LweDimension {:?}. \
        FourierLweMultiBitBootstrapKey input LweDimension {:?}.",
        input.lwe_dimension(),
        multi_bit_bsk.input_lwe_dimension(),
    );

    assert_eq!(
        output.lwe_dimension(),
        multi_bit_bsk.output_lwe_dimension(),
        "Mismatched output LweDimension. LweCiphertext output LweDimension {:?}. \
        FourierLweMultiBitBootstrapKey output LweDimension {:?}.",
        output.lwe_dimension(),
        multi_bit_bsk.output_lwe_dimension(),
    );

    assert_eq!(
        accumulator.glwe_dimension(),
        multi_bit_bsk.glwe_dimension(),
        "Mismatched GlweSize. Accumulator GlweSize {:?}. \
        FourierLweMultiBitBootstrapKey GlweSize {:?}.",
        accumulator.glwe_dimension(),
        multi_bit_bsk.glwe_dimension(),
    );

    assert_eq!(
        accumulator.polynomial_size(),
        multi_bit_bsk.polynomial_size(),
        "Mismatched PolynomialSize. Accumulator PolynomialSize {:?}. \
        FourierLweMultiBitBootstrapKey PolynomialSize {:?}.",
        accumulator.polynomial_size(),
        multi_bit_bsk.polynomial_size(),
    );

    assert_eq!(
        output.ciphertext_modulus(),
        accumulator.ciphertext_modulus(),
        "Mismatched CiphertextModulus between output ({:?}) and accumulator ({:?})",
        input.ciphertext_modulus(),
        accumulator.ciphertext_modulus(),
    );
    assert_eq!(
        streams.gpu_indexes[0],
        multi_bit_bsk.d_vec.gpu_index(0),
        "GPU error: first stream is on GPU {}, first bsk pointer is on GPU {}",
        streams.gpu_indexes[0].get(),
        multi_bit_bsk.d_vec.gpu_index(0).get(),
    );
    assert_eq!(
        streams.gpu_indexes[0],
        input.0.d_vec.gpu_index(0),
        "GPU error: first stream is on GPU {}, first input pointer is on GPU {}",
        streams.gpu_indexes[0].get(),
        input.0.d_vec.gpu_index(0).get(),
    );
    assert_eq!(
        streams.gpu_indexes[0],
        output.0.d_vec.gpu_index(0),
        "GPU error: first stream is on GPU {}, first output pointer is on GPU {}",
        streams.gpu_indexes[0].get(),
        output.0.d_vec.gpu_index(0).get(),
    );
    assert_eq!(
        streams.gpu_indexes[0],
        accumulator.0.d_vec.gpu_index(0),
        "GPU error: first stream is on GPU {}, first accumulator pointer is on GPU {}",
        streams.gpu_indexes[0].get(),
        accumulator.0.d_vec.gpu_index(0).get(),
    );
    assert_eq!(
        streams.gpu_indexes[0],
        input_indexes.gpu_index(0),
        "GPU error: first stream is on GPU {}, first input indexes pointer is on GPU {}",
        streams.gpu_indexes[0].get(),
        input_indexes.gpu_index(0).get(),
    );
    assert_eq!(
        streams.gpu_indexes[0],
        output_indexes.gpu_index(0),
        "GPU error: first stream is on GPU {}, first output indexes pointer is on GPU {}",
        streams.gpu_indexes[0].get(),
        output_indexes.gpu_index(0).get(),
    );
    assert_eq!(
        streams.gpu_indexes[0],
        lut_indexes.gpu_index(0),
        "GPU error: first stream is on GPU {}, first lut indexes pointer is on GPU {}",
        streams.gpu_indexes[0].get(),
        lut_indexes.gpu_index(0).get(),
    );

    programmable_bootstrap_multi_bit_async(
        streams,
        &mut output.0.d_vec,
        output_indexes,
        &accumulator.0.d_vec,
        lut_indexes,
        &input.0.d_vec,
        input_indexes,
        &multi_bit_bsk.d_vec,
        input.lwe_dimension(),
        multi_bit_bsk.glwe_dimension(),
        multi_bit_bsk.polynomial_size(),
        multi_bit_bsk.decomp_base_log(),
        multi_bit_bsk.decomp_level_count(),
        multi_bit_bsk.grouping_factor(),
        input.lwe_ciphertext_count().0 as u32,
    );
}

#[allow(clippy::too_many_arguments)]
pub fn cuda_multi_bit_programmable_bootstrap_128_lwe_ciphertext<Scalar>(
    input: &CudaLweCiphertextList<u64>,
    output: &mut CudaLweCiphertextList<Scalar>,
    accumulator: &CudaGlweCiphertextList<Scalar>,
    lut_indexes: &CudaVec<u64>,
    output_indexes: &CudaVec<u64>,
    input_indexes: &CudaVec<u64>,
    multi_bit_bsk: &CudaLweMultiBitBootstrapKey<Scalar>,
    streams: &CudaStreams,
) where
    // CastInto required for PBS modulus switch which returns a usize
    Scalar: UnsignedTorus + CastInto<usize>,
{
    unsafe {
        cuda_multi_bit_programmable_bootstrap_128_lwe_ciphertext_async(
            input,
            output,
            accumulator,
            lut_indexes,
            output_indexes,
            input_indexes,
            multi_bit_bsk,
            streams,
        );
    }
    streams.synchronize();
}
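
The 128-bit entry point mirrors the existing 64-bit one: an unsafe `_async` function that only enqueues work on the CUDA streams, plus a safe wrapper that calls it and then synchronizes. A minimal standalone sketch of that split (illustrative only, not the tfhe-rs API):

// Sketch of the "unsafe async + safe synchronizing wrapper" structure used above.
struct Stream;

impl Stream {
    fn synchronize(&self) {
        // In the real code this waits for all queued GPU work on the stream.
    }
}

/// # Safety
/// The caller must keep `data` alive and unread until the stream is synchronized.
unsafe fn launch_async(_stream: &Stream, data: &mut [u64]) {
    // Stand-in for enqueueing a kernel that writes into `data`.
    for x in data.iter_mut() {
        *x += 1;
    }
}

fn launch(stream: &Stream, data: &mut [u64]) {
    unsafe { launch_async(stream, data) };
    stream.synchronize();
}

fn main() {
    let stream = Stream;
    let mut data = vec![0u64; 4];
    launch(&stream, &mut data);
    assert_eq!(data, vec![1u64; 4]);
}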

@@ -0,0 +1,187 @@
use super::*;
use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
use crate::core_crypto::gpu::lwe_multi_bit_bootstrap_key::CudaLweMultiBitBootstrapKey;
use crate::core_crypto::gpu::vec::{CudaVec, GpuIndex};
use crate::core_crypto::gpu::{
    cuda_multi_bit_programmable_bootstrap_128_lwe_ciphertext, CudaStreams,
};
use crate::core_crypto::prelude::misc::check_encrypted_content_respects_mod;
use itertools::Itertools;

fn execute_multibit_bootstrap_u128(
    squash_params: NoiseSquashingMultiBitTestParameters<u128>,
    input_params: MultiBitTestParams<u64>,
) {
    let input_lwe_dimension = input_params.input_lwe_dimension;
    let lwe_noise_distribution = input_params.lwe_noise_distribution;
    let glwe_noise_distribution = squash_params.glwe_noise_distribution;
    let ciphertext_modulus = squash_params.ciphertext_modulus;
    let ciphertext_modulus_64 = CiphertextModulus::new_native();
    let msg_modulus = input_params.message_modulus_log;
    let encoding_with_padding = get_encoding_with_padding(ciphertext_modulus);
    let encoding_with_padding_64: u64 = get_encoding_with_padding(ciphertext_modulus_64);
    let glwe_dimension = squash_params.glwe_dimension;
    let polynomial_size = squash_params.polynomial_size;
    let decomp_base_log = squash_params.decomp_base_log;
    let decomp_level_count = squash_params.decomp_level_count;
    let grouping_factor = squash_params.grouping_factor;

    let gpu_index = 0;
    let stream = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));

    let mut rsc = TestResources::new();

    let f = |x: u128| x % msg_modulus.0 as u128;

    let delta = encoding_with_padding / msg_modulus.0 as u128;
    let delta_64 = encoding_with_padding_64 / msg_modulus.0 as u64;
    let mut msg = msg_modulus.0 as u64;
    const NB_TESTS: usize = 10;
    let number_of_messages = 1;

    let accumulator = generate_programmable_bootstrap_glwe_lut(
        polynomial_size,
        glwe_dimension.to_glwe_size(),
        msg_modulus.0.cast_into(),
        ciphertext_modulus,
        delta,
        f,
    );

    assert!(check_encrypted_content_respects_mod(
        &accumulator,
        ciphertext_modulus
    ));

    // Create the LweSecretKey
    let small_lwe_sk: LweSecretKeyOwned<u128> = allocate_and_generate_new_binary_lwe_secret_key(
        input_lwe_dimension,
        &mut rsc.secret_random_generator,
    );
    let input_lwe_secret_key = LweSecretKey::from_container(
        small_lwe_sk
            .clone()
            .into_container()
            .iter()
            .copied()
            .map(|x| x as u64)
            .collect::<Vec<_>>(),
    );
    let output_glwe_secret_key: GlweSecretKeyOwned<u128> =
        allocate_and_generate_new_binary_glwe_secret_key(
            glwe_dimension,
            polynomial_size,
            &mut rsc.secret_random_generator,
        );
    let output_lwe_secret_key = output_glwe_secret_key.clone().into_lwe_secret_key();
    let output_lwe_dimension = output_lwe_secret_key.lwe_dimension();

    let mut bsk = LweMultiBitBootstrapKey::new(
        0u128,
        glwe_dimension.to_glwe_size(),
        polynomial_size,
        decomp_base_log,
        decomp_level_count,
        input_lwe_dimension,
        grouping_factor,
        ciphertext_modulus,
    );

    par_generate_lwe_multi_bit_bootstrap_key(
        &small_lwe_sk,
        &output_glwe_secret_key,
        &mut bsk,
        glwe_noise_distribution,
        &mut rsc.encryption_random_generator,
    );

    assert!(check_encrypted_content_respects_mod(
        &*bsk,
        ciphertext_modulus
    ));

    let d_bsk = CudaLweMultiBitBootstrapKey::from_lwe_multi_bit_bootstrap_key(&bsk, &stream);

    while msg != 0 {
        msg -= 1;
        for _ in 0..NB_TESTS {
            let plaintext = Plaintext(msg * delta_64);

            let lwe_ciphertext_in = allocate_and_encrypt_new_lwe_ciphertext(
                &input_lwe_secret_key,
                plaintext,
                lwe_noise_distribution,
                ciphertext_modulus_64,
                &mut rsc.encryption_random_generator,
            );

            assert!(check_encrypted_content_respects_mod(
                &lwe_ciphertext_in,
                ciphertext_modulus_64
            ));

            let d_lwe_ciphertext_in =
                CudaLweCiphertextList::from_lwe_ciphertext(&lwe_ciphertext_in, &stream);
            let mut d_out_pbs_ct = CudaLweCiphertextList::new(
                output_lwe_dimension,
                LweCiphertextCount(1),
                ciphertext_modulus,
                &stream,
            );
            let d_accumulator = CudaGlweCiphertextList::from_glwe_ciphertext(&accumulator, &stream);

            let mut test_vector_indexes: Vec<u64> = vec![0; number_of_messages];
            for (i, ind) in test_vector_indexes.iter_mut().enumerate() {
                *ind = <usize as CastInto<u64>>::cast_into(i);
            }

            let mut d_test_vector_indexes =
                unsafe { CudaVec::<u64>::new_async(number_of_messages, &stream, 0) };
            unsafe { d_test_vector_indexes.copy_from_cpu_async(&test_vector_indexes, &stream, 0) };

            let num_blocks = d_lwe_ciphertext_in.0.lwe_ciphertext_count.0;
            let lwe_indexes_usize: Vec<usize> = (0..num_blocks).collect_vec();
            let lwe_indexes = lwe_indexes_usize
                .iter()
                .map(|&x| <usize as CastInto<u64>>::cast_into(x))
                .collect_vec();
            let mut d_output_indexes = unsafe { CudaVec::<u64>::new_async(num_blocks, &stream, 0) };
            let mut d_input_indexes = unsafe { CudaVec::<u64>::new_async(num_blocks, &stream, 0) };
            unsafe {
                d_input_indexes.copy_from_cpu_async(&lwe_indexes, &stream, 0);
                d_output_indexes.copy_from_cpu_async(&lwe_indexes, &stream, 0);
            }

            cuda_multi_bit_programmable_bootstrap_128_lwe_ciphertext(
                &d_lwe_ciphertext_in,
                &mut d_out_pbs_ct,
                &d_accumulator,
                &d_test_vector_indexes,
                &d_output_indexes,
                &d_input_indexes,
                &d_bsk,
                &stream,
            );

            let out_pbs_ct = d_out_pbs_ct.into_lwe_ciphertext(&stream);
            assert!(check_encrypted_content_respects_mod(
                &out_pbs_ct,
                ciphertext_modulus
            ));

            let decrypted = decrypt_lwe_ciphertext(&output_lwe_secret_key, &out_pbs_ct);

            let decoded = round_decode(decrypted.0, delta) % msg_modulus.0 as u128;
            assert_eq!(decoded, f(msg as u128));
        }
    }
}

#[test]
fn test_multibit_bootstrap_u128_with_squashing() {
    execute_multibit_bootstrap_u128(
        NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
        PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
    );
}
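
The decode step in the test above relies on the usual plaintext encoding: with one bit of padding the encoding constant is half the native ciphertext modulus, delta is that constant divided by the message modulus, and decoding rounds back to the nearest multiple of delta. A worked standalone sketch under those assumptions (round_decode here is a local helper, not the tfhe-rs function):

// Illustrative arithmetic only, assuming a native u128 modulus and one padding bit.
fn round_decode(value: u128, delta: u128) -> u128 {
    // Round to the nearest multiple of delta, then rescale.
    (value + delta / 2) / delta
}

fn main() {
    let msg_modulus: u128 = 1 << 4; // MessageModulusLog(4)
    let encoding_with_padding: u128 = 1 << 127; // half of the native 2^128 modulus
    let delta = encoding_with_padding / msg_modulus; // 2^123

    let msg: u128 = 5;
    let plaintext = msg * delta;
    // A small noise term well below delta / 2 must not change the decoded value.
    let noisy = plaintext + delta / 8;
    assert_eq!(round_decode(noisy, delta) % msg_modulus, msg);
}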

@@ -7,11 +7,13 @@ mod glwe_sample_extraction;
mod lwe_keyswitch;
mod lwe_linear_algebra;
mod lwe_multi_bit_programmable_bootstrapping;
mod lwe_multi_bit_programmable_bootstrapping_128;
mod lwe_packing_keyswitch;
mod lwe_programmable_bootstrapping;
mod lwe_programmable_bootstrapping_128;
mod modulus_switch_noise_reduction;
mod noise_distribution;

pub struct CudaPackingKeySwitchKeys<Scalar: UnsignedInteger> {
    pub lwe_sk: LweSecretKey<Vec<Scalar>>,
    pub glwe_sk: GlweSecretKey<Vec<Scalar>>,

@@ -10,9 +10,9 @@ use crate::core_crypto::prelude::{

/// A structure representing a vector of GLWE ciphertexts with 64 bits of precision on the GPU.
#[derive(Debug)]
pub struct CudaLweMultiBitBootstrapKey {
pub struct CudaLweMultiBitBootstrapKey<Scalar: UnsignedInteger> {
    // Pointers to GPU data
    pub(crate) d_vec: CudaVec<u64>,
    pub(crate) d_vec: CudaVec<Scalar>,
    // Lwe dimension
    pub(crate) input_lwe_dimension: LweDimension,
    // Glwe dimension
@@ -27,14 +27,11 @@ pub struct CudaLweMultiBitBootstrapKey {
    pub(crate) grouping_factor: LweBskGroupingFactor,
}

impl CudaLweMultiBitBootstrapKey {
    pub fn from_lwe_multi_bit_bootstrap_key<InputBskCont: Container>(
impl<Scalar: UnsignedInteger> CudaLweMultiBitBootstrapKey<Scalar> {
    pub fn from_lwe_multi_bit_bootstrap_key<InputBskCont: Container<Element = Scalar>>(
        bsk: &LweMultiBitBootstrapKey<InputBskCont>,
        streams: &CudaStreams,
    ) -> Self
    where
        InputBskCont::Element: UnsignedInteger,
    {
    ) -> Self {
        let input_lwe_dimension = bsk.input_lwe_dimension();
        let polynomial_size = bsk.polynomial_size();
        let decomp_level_count = bsk.decomposition_level_count();
@@ -43,7 +40,7 @@ impl CudaLweMultiBitBootstrapKey {
        let grouping_factor = bsk.grouping_factor();

        // Allocate memory
        let mut d_vec = CudaVec::<u64>::new_multi_gpu(
        let mut d_vec = CudaVec::<InputBskCont::Element>::new_multi_gpu(
            lwe_multi_bit_bootstrap_key_size(
                input_lwe_dimension,
                glwe_dimension.to_glwe_size(),

@@ -14,9 +14,11 @@ use crate::core_crypto::prelude::{
};
pub use algorithms::*;
pub use entities::*;
use std::any::{Any, TypeId};
use std::ffi::c_void;
use tfhe_cuda_backend::bindings::*;
use tfhe_cuda_backend::cuda_bind::*;

pub struct CudaStreams {
    pub ptr: Vec<*mut c_void>,
    pub gpu_indexes: Vec<GpuIndex>,
@@ -311,15 +313,18 @@ pub unsafe fn programmable_bootstrap_128_async<T: UnsignedInteger>(
/// [CudaStreams::synchronize] __must__ be called as soon as synchronization is
/// required
#[allow(clippy::too_many_arguments)]
pub unsafe fn programmable_bootstrap_multi_bit_async<T: UnsignedInteger>(
pub unsafe fn programmable_bootstrap_multi_bit_async<
    T: UnsignedInteger,
    B: Any + UnsignedInteger,
>(
    streams: &CudaStreams,
    lwe_array_out: &mut CudaVec<T>,
    lwe_array_out: &mut CudaVec<B>,
    output_indexes: &CudaVec<T>,
    test_vector: &CudaVec<T>,
    test_vector: &CudaVec<B>,
    test_vector_indexes: &CudaVec<T>,
    lwe_array_in: &CudaVec<T>,
    input_indexes: &CudaVec<T>,
    bootstrapping_key: &CudaVec<u64>,
    bootstrapping_key: &CudaVec<B>,
    lwe_dimension: LweDimension,
    glwe_dimension: GlweDimension,
    polynomial_size: PolynomialSize,
@@ -331,6 +336,44 @@ pub unsafe fn programmable_bootstrap_multi_bit_async<T: UnsignedInteger>(
    let num_many_lut = 1u32;
    let lut_stride = 0u32;
    let mut pbs_buffer: *mut i8 = std::ptr::null_mut();
    if TypeId::of::<B>() == TypeId::of::<u128>() {
        scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64(
            streams.ptr[0],
            streams.gpu_indexes[0].get(),
            std::ptr::addr_of_mut!(pbs_buffer),
            glwe_dimension.0 as u32,
            polynomial_size.0 as u32,
            level.0 as u32,
            num_samples,
            true,
        );
        cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
            streams.ptr[0],
            streams.gpu_indexes[0].get(),
            lwe_array_out.as_mut_c_ptr(0),
            output_indexes.as_c_ptr(0),
            test_vector.as_c_ptr(0),
            test_vector_indexes.as_c_ptr(0),
            lwe_array_in.as_c_ptr(0),
            input_indexes.as_c_ptr(0),
            bootstrapping_key.as_c_ptr(0),
            pbs_buffer,
            lwe_dimension.0 as u32,
            glwe_dimension.0 as u32,
            polynomial_size.0 as u32,
            grouping_factor.0 as u32,
            base_log.0 as u32,
            level.0 as u32,
            num_samples,
            num_many_lut,
            lut_stride,
        );
        cleanup_cuda_multi_bit_programmable_bootstrap_128(
            streams.ptr[0],
            streams.gpu_indexes[0].get(),
            std::ptr::addr_of_mut!(pbs_buffer),
        );
    } else if TypeId::of::<B>() == TypeId::of::<u64>() {
        scratch_cuda_multi_bit_programmable_bootstrap_64(
            streams.ptr[0],
            streams.gpu_indexes[0].get(),
@@ -367,6 +410,9 @@ pub unsafe fn programmable_bootstrap_multi_bit_async<T: UnsignedInteger>(
            streams.gpu_indexes[0].get(),
            std::ptr::addr_of_mut!(pbs_buffer),
        );
    } else {
        panic!("Unsupported torus size")
    }
}
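
The generic entry point above keeps a single Rust signature and picks the CUDA backend at runtime from the concrete scalar type. A standalone sketch of that TypeId-based dispatch (illustrative; in the real function the branches call the 64-bit and 128-bit CUDA bindings):

use std::any::{Any, TypeId};

// Route to a backend based on the concrete torus type, panicking for anything else,
// in the same spirit as the if/else chain above.
fn select_backend<B: Any>() -> &'static str {
    if TypeId::of::<B>() == TypeId::of::<u128>() {
        "multi-bit PBS 128-bit path"
    } else if TypeId::of::<B>() == TypeId::of::<u64>() {
        "multi-bit PBS 64-bit path"
    } else {
        panic!("Unsupported torus size")
    }
}

fn main() {
    assert_eq!(select_backend::<u64>(), "multi-bit PBS 64-bit path");
    assert_eq!(select_backend::<u128>(), "multi-bit PBS 128-bit path");
}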

#[allow(clippy::too_many_arguments)]
@@ -607,9 +653,9 @@ pub unsafe fn convert_lwe_programmable_bootstrap_key_async<T: UnsignedInteger>(
/// [CudaStreams::synchronize] __must__ be called as soon as synchronization is
/// required
#[allow(clippy::too_many_arguments)]
pub unsafe fn convert_lwe_multi_bit_programmable_bootstrap_key_async<T: UnsignedInteger>(
pub unsafe fn convert_lwe_multi_bit_programmable_bootstrap_key_async<T: Any + UnsignedInteger>(
    streams: &CudaStreams,
    dest: &mut CudaVec<u64>,
    dest: &mut CudaVec<T>,
    src: &[T],
    input_lwe_dim: LweDimension,
    glwe_dim: GlweDimension,
@@ -620,6 +666,20 @@ pub unsafe fn convert_lwe_multi_bit_programmable_bootstrap_key_async<T: Unsigned
    let size = std::mem::size_of_val(src);
    for (i, &stream_ptr) in streams.ptr.iter().enumerate() {
        assert_eq!(dest.len() * std::mem::size_of::<T>(), size);

        if TypeId::of::<T>() == TypeId::of::<u128>() {
            cuda_convert_lwe_multi_bit_programmable_bootstrap_key_128(
                stream_ptr,
                streams.gpu_indexes[i].get(),
                dest.as_mut_c_ptr(i as u32),
                src.as_ptr().cast(),
                input_lwe_dim.0 as u32,
                glwe_dim.0 as u32,
                l_gadget.0 as u32,
                polynomial_size.0 as u32,
                grouping_factor.0 as u32,
            );
        } else if TypeId::of::<T>() == TypeId::of::<u64>() {
            cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
                stream_ptr,
                streams.gpu_indexes[i].get(),
@@ -631,6 +691,9 @@ pub unsafe fn convert_lwe_multi_bit_programmable_bootstrap_key_async<T: Unsigned
                polynomial_size.0 as u32,
                grouping_factor.0 as u32,
            );
        } else {
            panic!("Unsupported torus size for bsk conversion")
        }
    }
}


@@ -30,7 +30,7 @@ pub struct CudaCompressionKey {
}

pub struct CudaDecompressionKey {
    pub blind_rotate_key: CudaBootstrappingKey,
    pub blind_rotate_key: CudaBootstrappingKey<u64>,
    pub lwe_per_glwe: LweCiphertextCount,
    pub glwe_dimension: GlweDimension,
    pub polynomial_size: PolynomialSize,

@@ -22,9 +22,9 @@ use crate::shortint::{CarryModulus, CiphertextModulus, MessageModulus, PBSOrder}

mod radix;

pub enum CudaBootstrappingKey {
pub enum CudaBootstrappingKey<Scalar: UnsignedInteger> {
    Classic(CudaLweBootstrapKey),
    MultiBit(CudaLweMultiBitBootstrapKey),
    MultiBit(CudaLweMultiBitBootstrapKey<Scalar>),
}
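
Making the key wrappers generic over the scalar is what lets the same containers hold u64 key material for the regular path and u128 material for noise squashing. A minimal sketch of that shape (the names here are placeholders, not the tfhe-rs types):

// Placeholder types standing in for the CUDA key wrappers made generic above.
#[derive(Debug)]
struct GpuMultiBitKey<Scalar> {
    data: Vec<Scalar>, // stands in for the device vector holding the key
}

#[derive(Debug)]
enum GpuBootstrappingKey<Scalar> {
    MultiBit(GpuMultiBitKey<Scalar>),
}

fn main() {
    // Same container type, two different torus widths.
    let k64 = GpuBootstrappingKey::MultiBit(GpuMultiBitKey::<u64> { data: vec![0; 4] });
    let k128 = GpuBootstrappingKey::MultiBit(GpuMultiBitKey::<u128> { data: vec![0; 4] });
    println!("{k64:?}");
    println!("{k128:?}");
}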

/// A structure containing the server public key.
@@ -34,7 +34,7 @@ pub enum CudaBootstrappingKey {
// #[derive(PartialEq, Serialize, Deserialize)]
pub struct CudaServerKey {
    pub key_switching_key: CudaLweKeyswitchKey<u64>,
    pub bootstrapping_key: CudaBootstrappingKey,
    pub bootstrapping_key: CudaBootstrappingKey<u64>,
    // Size of the message buffer
    pub message_modulus: MessageModulus,
    // Size of the carry buffer

@@ -1,6 +1,6 @@
use crate::core_crypto::prelude::*;
use crate::shortint::parameters::noise_squashing::{
    NoiseSquashingCompressionParameters, NoiseSquashingParameters,
    NoiseSquashingCompressionParameters, NoiseSquashingMultiBitParameters, NoiseSquashingParameters,
};
use crate::shortint::parameters::{
    CoreCiphertextModulus, ModulusSwitchNoiseReductionParams, ModulusSwitchType,
@@ -69,3 +69,8 @@ pub enum NoiseSquashingParametersVersions {
pub enum NoiseSquashingCompressionParametersVersions {
    V0(NoiseSquashingCompressionParameters),
}

#[derive(VersionsDispatch)]
pub enum NoiseSquashingMultiBitParametersVersions {
    V0(NoiseSquashingMultiBitParameters),
}

@@ -1,18 +1,19 @@
use std::sync::LazyLock;

use super::atomic_pattern::AtomicPatternParameters;
use crate::keycache::utils::named_params_impl;
use crate::keycache::*;
#[cfg(tarpaulin)]
use crate::shortint::parameters::coverage_parameters::*;
use crate::shortint::parameters::current_params::*;
use crate::shortint::parameters::noise_squashing::NoiseSquashingMultiBitParameters;
use crate::shortint::parameters::parameters_wopbs::*;
use crate::shortint::parameters::v1_3::V1_3_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
use crate::shortint::parameters::*;
use crate::shortint::wopbs::WopbsKey;
use crate::shortint::{ClientKey, KeySwitchingKey, ServerKey};
use serde::{Deserialize, Serialize};

use super::atomic_pattern::AtomicPatternParameters;

named_params_impl!( ShortintParameterSet =>
    V1_3_PARAM_MESSAGE_1_CARRY_0_KS_PBS_GAUSSIAN_2M128,
    V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
@@ -492,6 +493,10 @@ named_params_impl!( NoiseSquashingParameters =>
    V1_3_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
);

named_params_impl!( NoiseSquashingMultiBitParameters =>
    V1_3_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
);

impl From<AtomicPatternParameters> for (ClientKey, ServerKey) {
    fn from(param: AtomicPatternParameters) -> Self {
        let param_set = ShortintParameterSet::from(param);

@@ -46,8 +46,13 @@ use current_params::multi_bit::tuniform::p_fail_2_minus_64::ks_pbs_gpu::{
};
use current_params::noise_squashing::p_fail_2_minus_128::V1_3_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;

use super::current_params::V1_3_NOISE_SQUASHING_COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
use super::current_params::{
    V1_3_NOISE_SQUASHING_COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
    V1_3_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
};
use super::NoiseSquashingCompressionParameters;
use crate::shortint::parameters::noise_squashing::NoiseSquashingMultiBitParameters;

// Aliases

// Compute Gaussian
@@ -125,6 +130,10 @@ pub const NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128:
    NoiseSquashingParameters =
    V1_3_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;

pub const NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128:
    NoiseSquashingMultiBitParameters =
    V1_3_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;

// GPU 2^-64
// GPU TUniform
pub const PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_1_CARRY_1_KS_PBS_TUNIFORM_2M64:

@@ -1,3 +1,4 @@
use crate::core_crypto::prelude::LweBskGroupingFactor;
use crate::shortint::backward_compatibility::parameters::noise_squashing::*;
use crate::shortint::parameters::{
    CarryModulus, CoreCiphertextModulus, DecompositionBaseLog, DecompositionLevelCount,
@@ -34,3 +35,17 @@ pub struct NoiseSquashingCompressionParameters {
    pub carry_modulus: CarryModulus,
    pub ciphertext_modulus: CoreCiphertextModulus<u128>,
}

#[derive(Copy, Clone, Debug, PartialEq, Serialize, Deserialize, Versionize)]
#[versionize(NoiseSquashingMultiBitParametersVersions)]
pub struct NoiseSquashingMultiBitParameters {
    pub glwe_dimension: GlweDimension,
    pub polynomial_size: PolynomialSize,
    pub glwe_noise_distribution: DynamicDistribution<u128>,
    pub decomp_base_log: DecompositionBaseLog,
    pub decomp_level_count: DecompositionLevelCount,
    pub grouping_factor: LweBskGroupingFactor,
    pub message_modulus: MessageModulus,
    pub carry_modulus: CarryModulus,
    pub ciphertext_modulus: CoreCiphertextModulus<u128>,
}

@@ -43,6 +43,7 @@ pub use noise_squashing::p_fail_2_minus_128::*;
#[cfg(feature = "hpu")]
pub use hpu::*;

use crate::shortint::parameters::noise_squashing::NoiseSquashingMultiBitParameters;
use crate::shortint::parameters::{
    ClassicPBSParameters, CompactPublicKeyEncryptionParameters, CompressionParameters,
    KeySwitch32PBSParameters, MultiBitPBSParameters, NoiseSquashingCompressionParameters,
@@ -1700,6 +1701,14 @@ pub const VEC_ALL_NOISE_SQUASHING_PARAMETERS: [(&NoiseSquashingParameters, &str)
    ),
];

pub const VEC_ALL_NOISE_SQUASHING_MULTI_BIT_PARAMETERS: [(
    &NoiseSquashingMultiBitParameters,
    &str,
); 1] = [(
    &V1_3_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
    "V1_3_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128",
)];

/// All [`NoiseSquashingCompressionParameters`] in this module.
pub const VEC_ALL_NOISE_SQUASHING_COMPRESSION_PARAMETERS: [(
    &NoiseSquashingCompressionParameters,

@@ -1,6 +1,7 @@
use crate::shortint::parameters::noise_squashing::NoiseSquashingMultiBitParameters;
use crate::shortint::parameters::{
    CarryModulus, CoreCiphertextModulus, DecompositionBaseLog, DecompositionLevelCount,
    DynamicDistribution, GlweDimension, LweCiphertextCount, MessageModulus,
    DynamicDistribution, GlweDimension, LweBskGroupingFactor, LweCiphertextCount, MessageModulus,
    ModulusSwitchNoiseReductionParams, ModulusSwitchType, NoiseEstimationMeasureBound,
    NoiseSquashingCompressionParameters, NoiseSquashingParameters, PolynomialSize, RSigmaFactor,
    Variance,
@@ -58,3 +59,16 @@ pub const V1_3_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128
    carry_modulus: CarryModulus(4),
    ciphertext_modulus: CoreCiphertextModulus::<u128>::new_native(),
};

pub const V1_3_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128:
    NoiseSquashingMultiBitParameters = NoiseSquashingMultiBitParameters {
    glwe_dimension: GlweDimension(2),
    polynomial_size: PolynomialSize(2048),
    glwe_noise_distribution: DynamicDistribution::new_t_uniform(30),
    decomp_base_log: DecompositionBaseLog(23),
    decomp_level_count: DecompositionLevelCount(3),
    grouping_factor: LweBskGroupingFactor(4),
    message_modulus: MessageModulus(4),
    carry_modulus: CarryModulus(4),
    ciphertext_modulus: CoreCiphertextModulus::<u128>::new_native(),
};
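
As a quick sanity check on the constant just above (illustrative arithmetic, not part of the diff): the gadget decomposition spans decomp_base_log * decomp_level_count = 23 * 3 = 69 bits, which fits comfortably within the 128-bit torus used after noise squashing.

fn main() {
    let decomp_base_log: u32 = 23;
    let decomp_level_count: u32 = 3;
    let torus_bits: u32 = 128;
    let decomposed_bits = decomp_base_log * decomp_level_count;
    assert!(decomposed_bits <= torus_bits);
    println!("decomposition covers {decomposed_bits} of {torus_bits} torus bits");
}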