feat(gpu): implement 128-bit multi-bit PBS

Pedro Alves
2025-06-11 11:56:06 -03:00
committed by Pedro Alves
parent e1beea5ecb
commit 8c88678ee8
35 changed files with 2751 additions and 156 deletions

View File

@@ -66,6 +66,9 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
uint32_t num_many_lut, uint32_t lut_stride);
template <typename Torus>
uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_128_keybundle(
uint32_t polynomial_size);
template <typename Torus>
uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle(
uint32_t polynomial_size);
template <typename Torus>
@@ -95,8 +98,12 @@ uint64_t get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
template <typename Torus, class params>
uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
uint32_t polynomial_size);
uint32_t polynomial_size,
uint64_t full_sm_keybundle);
template <typename Torus, class params>
uint32_t get_lwe_chunk_size_128(uint32_t gpu_index, uint32_t max_num_pbs,
uint32_t polynomial_size,
uint64_t full_sm_keybundle);
template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
int8_t *d_mem_keybundle = NULL;
int8_t *d_mem_acc_step_one = NULL;
@@ -281,4 +288,146 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
}
};
template <typename InputTorus>
struct pbs_buffer_128<InputTorus, PBS_TYPE::MULTI_BIT> {
int8_t *d_mem_keybundle = NULL;
int8_t *d_mem_acc_step_one = NULL;
int8_t *d_mem_acc_step_two = NULL;
int8_t *d_mem_acc_cg = NULL;
int8_t *d_mem_acc_tbc = NULL;
uint32_t lwe_chunk_size;
double *keybundle_fft;
__uint128_t *global_accumulator;
double *global_join_buffer;
PBS_VARIANT pbs_variant;
bool gpu_memory_allocated;
pbs_buffer_128(cudaStream_t stream, uint32_t gpu_index,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
uint32_t lwe_chunk_size, PBS_VARIANT pbs_variant,
bool allocate_gpu_memory, uint64_t *size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
cuda_set_device(gpu_index);
this->pbs_variant = pbs_variant;
this->lwe_chunk_size = lwe_chunk_size;
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
// default
uint64_t full_sm_keybundle =
get_buffer_size_full_sm_multibit_programmable_bootstrap_128_keybundle<
__uint128_t>(polynomial_size);
uint64_t full_sm_accumulate_step_one =
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one<
__uint128_t>(polynomial_size);
uint64_t full_sm_accumulate_step_two =
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two<
__uint128_t>(polynomial_size);
uint64_t partial_sm_accumulate_step_one =
get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one<
__uint128_t>(polynomial_size);
// cg
uint64_t full_sm_cg_accumulate =
get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<__uint128_t>(
polynomial_size);
uint64_t partial_sm_cg_accumulate =
get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap<
__uint128_t>(polynomial_size);
auto num_blocks_keybundle = input_lwe_ciphertext_count * lwe_chunk_size *
(glwe_dimension + 1) * (glwe_dimension + 1) *
level_count;
auto num_blocks_acc_step_one =
level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count;
auto num_blocks_acc_step_two =
input_lwe_ciphertext_count * (glwe_dimension + 1);
auto num_blocks_acc_cg =
level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count;
// Keybundle
if (max_shared_memory < full_sm_keybundle)
d_mem_keybundle = (int8_t *)cuda_malloc_with_size_tracking_async(
num_blocks_keybundle * full_sm_keybundle, stream, gpu_index,
size_tracker, allocate_gpu_memory);
switch (pbs_variant) {
case PBS_VARIANT::CG:
// Accumulator CG
if (max_shared_memory < partial_sm_cg_accumulate)
d_mem_acc_cg = (int8_t *)cuda_malloc_with_size_tracking_async(
num_blocks_acc_cg * full_sm_cg_accumulate, stream, gpu_index,
size_tracker, allocate_gpu_memory);
else if (max_shared_memory < full_sm_cg_accumulate)
d_mem_acc_cg = (int8_t *)cuda_malloc_with_size_tracking_async(
num_blocks_acc_cg * partial_sm_cg_accumulate, stream, gpu_index,
size_tracker, allocate_gpu_memory);
break;
case PBS_VARIANT::DEFAULT:
// Accumulator step one
if (max_shared_memory < partial_sm_accumulate_step_one)
d_mem_acc_step_one = (int8_t *)cuda_malloc_with_size_tracking_async(
num_blocks_acc_step_one * full_sm_accumulate_step_one, stream,
gpu_index, size_tracker, allocate_gpu_memory);
else if (max_shared_memory < full_sm_accumulate_step_one)
d_mem_acc_step_one = (int8_t *)cuda_malloc_with_size_tracking_async(
num_blocks_acc_step_one * partial_sm_accumulate_step_one, stream,
gpu_index, size_tracker, allocate_gpu_memory);
// Accumulator step two
if (max_shared_memory < full_sm_accumulate_step_two)
d_mem_acc_step_two = (int8_t *)cuda_malloc_with_size_tracking_async(
num_blocks_acc_step_two * full_sm_accumulate_step_two, stream,
gpu_index, size_tracker, allocate_gpu_memory);
break;
default:
PANIC("Cuda error (PBS): unsupported implementation variant.")
}
keybundle_fft = (double *)cuda_malloc_with_size_tracking_async(
num_blocks_keybundle * (polynomial_size / 2) * 4 * sizeof(double),
stream, gpu_index, size_tracker, allocate_gpu_memory);
global_accumulator = (__uint128_t *)cuda_malloc_with_size_tracking_async(
input_lwe_ciphertext_count * (glwe_dimension + 1) * polynomial_size *
sizeof(__uint128_t),
stream, gpu_index, size_tracker, allocate_gpu_memory);
global_join_buffer = (double *)cuda_malloc_with_size_tracking_async(
level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count *
(polynomial_size / 2) * 4 * sizeof(double),
stream, gpu_index, size_tracker, allocate_gpu_memory);
}
void release(cudaStream_t stream, uint32_t gpu_index) {
if (d_mem_keybundle)
cuda_drop_with_size_tracking_async(d_mem_keybundle, stream, gpu_index,
gpu_memory_allocated);
switch (pbs_variant) {
case DEFAULT:
if (d_mem_acc_step_one)
cuda_drop_with_size_tracking_async(d_mem_acc_step_one, stream,
gpu_index, gpu_memory_allocated);
if (d_mem_acc_step_two)
cuda_drop_with_size_tracking_async(d_mem_acc_step_two, stream,
gpu_index, gpu_memory_allocated);
break;
case CG:
if (d_mem_acc_cg)
cuda_drop_with_size_tracking_async(d_mem_acc_cg, stream, gpu_index,
gpu_memory_allocated);
break;
default:
PANIC("Cuda error (PBS): unsupported implementation variant.")
}
cuda_drop_with_size_tracking_async(keybundle_fft, stream, gpu_index,
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(global_accumulator, stream, gpu_index,
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(global_join_buffer, stream, gpu_index,
gpu_memory_allocated);
}
};
#endif // CUDA_MULTI_BIT_UTILITIES_H
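The constructor above follows a three-tier shared-memory strategy: when the device cannot hold even the partial layout, the kernel runs with no shared memory and a full-sized scratch buffer is allocated in global memory per block; when it holds the partial but not the full layout, a partial-sized scratch is allocated instead; otherwise no global fallback is needed. A minimal sketch of that decision, reusing the backend's sharedMemDegree notion (the PARTIALSM spelling is an assumption here, only NOSM and FULLSM appear in this commit):

enum sharedMemDegree { NOSM, PARTIALSM, FULLSM }; // as used by the kernels

inline sharedMemDegree pick_tier(uint64_t max_shared_memory,
                                 uint64_t partial_sm, uint64_t full_sm) {
  if (max_shared_memory < partial_sm)
    return NOSM;      // full-sized scratch per block lives in global memory
  if (max_shared_memory < full_sm)
    return PARTIALSM; // partial-sized scratch per block in global memory
  return FULLSM;      // kernel runs entirely out of shared memory
}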

View File

@@ -240,7 +240,10 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
}
};
template <typename InputTorus, PBS_TYPE pbs_type> struct pbs_buffer_128 {
template <typename Torus, PBS_TYPE pbs_type> struct pbs_buffer_128;
template <typename InputTorus>
struct pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> {
int8_t *d_mem;
__uint128_t *global_accumulator;

View File

@@ -15,6 +15,11 @@ void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size, uint32_t grouping_factor);
void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_128(
void *stream, uint32_t gpu_index, void *dest, void const *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size, uint32_t grouping_factor);
uint64_t scratch_cuda_multi_bit_programmable_bootstrap_64(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
@@ -33,6 +38,25 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
uint32_t gpu_index,
int8_t **pbs_buffer);
uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride);
void cleanup_cuda_multi_bit_programmable_bootstrap_128(void *stream,
const uint32_t gpu_index,
int8_t **buffer);
}
#endif // CUDA_MULTI_BIT_H
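Taken together, the new entry points define the buffer lifecycle for the 128-bit multi-bit PBS: scratch, bootstrap, cleanup. A hedged sketch of the intended call sequence; all device pointers are caller-provided placeholders, and the numeric values merely mirror the GPU noise-squashing parameters added later in this commit rather than anything mandated by the API:

#include <cstdint>

void run_multi_bit_pbs_128(void *stream, uint32_t gpu_index, void *d_bsk_128,
                           void *d_out, const void *d_out_idx,
                           const void *d_lut, const void *d_lut_idx,
                           const void *d_in, const void *d_in_idx) {
  // Placeholder parameter shape (from the squashing parameter set below):
  uint32_t lwe_dim = 920, glwe_dim = 2, poly_size = 2048, grouping = 4;
  uint32_t base_log = 23, levels = 3, num_samples = 1;

  int8_t *buffer = nullptr;
  // 1) Allocate scratch; the return value reports bytes reserved on the GPU.
  scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64(
      stream, gpu_index, &buffer, glwe_dim, poly_size, levels, num_samples,
      /*allocate_gpu_memory=*/true);
  // 2) Bootstrap the 64-bit inputs into 128-bit outputs.
  cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
      stream, gpu_index, d_out, d_out_idx, d_lut, d_lut_idx, d_in, d_in_idx,
      d_bsk_128, buffer, lwe_dim, glwe_dim, poly_size, grouping, base_log,
      levels, num_samples, /*num_many_lut=*/1, /*lut_stride=*/0);
  // 3) Release the scratch buffer on the same stream.
  cleanup_cuda_multi_bit_programmable_bootstrap_128(stream, gpu_index, &buffer);
}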

View File

@@ -1,5 +1,6 @@
file(GLOB_RECURSE SOURCES "*.cu")
add_library(tfhe_cuda_backend STATIC ${SOURCES})
add_library(tfhe_cuda_backend STATIC ${SOURCES} pbs/programmable_bootstrap_multibit_128.cuh
pbs/programmable_bootstrap_multibit_128.cu)
set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(tfhe_cuda_backend PUBLIC cudart OpenMP::OpenMP_CXX)
target_include_directories(tfhe_cuda_backend PRIVATE .)

View File

@@ -66,6 +66,13 @@ __device__ inline void typecast_torus_to_double<uint64_t>(uint64_t x,
r = __ll2double_rn(x);
}
template <>
__device__ inline void typecast_torus_to_double<__uint128_t>(__uint128_t x,
double &r) {
// We truncate x, keeping only its low 64 bits
r = __ll2double_rn(static_cast<uint64_t>(x));
}
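The truncation keeps only the low 64 bits of the 128-bit torus value before the signed conversion, mirroring the uint64_t overload above. A quick illustration (not part of the commit):

__uint128_t x = (static_cast<__uint128_t>(1) << 64) | 5u; // 2^64 + 5
uint64_t low = static_cast<uint64_t>(x);                  // == 5
// typecast_torus_to_double<__uint128_t>(x, r) therefore writes the same r
// as typecast_torus_to_double<uint64_t>(5, r).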
template <typename T>
__device__ inline T init_decomposer_state(T input, uint32_t base_log,
uint32_t level_count) {

View File

@@ -234,6 +234,29 @@ __device__ void convert_u128_to_f128_as_torus(
}
}
// params is expected to be full degree not half degree
// same as convert_u128_to_f128_as_torus() but expects input to be on registers
template <class params>
__device__ void convert_u128_on_regs_to_f128_as_torus(
double *out_re_hi, double *out_re_lo, double *out_im_hi, double *out_im_lo,
const __uint128_t *in_re_on_regs, const __uint128_t *in_im_on_regs) {
const double normalization = pow(2., -128.);
Index tid = threadIdx.x;
// #pragma unroll
for (Index i = 0; i < params::opt / 2; i++) {
auto out_re = u128_to_signed_to_f128(in_re_on_regs[i]);
auto out_im = u128_to_signed_to_f128(in_im_on_regs[i]);
out_re_hi[tid] = out_re.hi * normalization;
out_re_lo[tid] = out_re.lo * normalization;
out_im_hi[tid] = out_im.hi * normalization;
out_im_lo[tid] = out_im.lo * normalization;
tid += params::degree / params::opt;
}
}
template <class params>
__device__ void
convert_f128_to_u128_as_torus(__uint128_t *out_re, __uint128_t *out_im,
@@ -272,7 +295,7 @@ batch_convert_u128_to_f128_as_integer(double *out_re_hi, double *out_re_lo,
}
// params is expected to be full degree not half degree
// converts standqard input into complex<128> represented by 4 double
// converts standard input into complex<128> represented by 4 double
// with following pattern: [re_hi_0, re_hi_1, ... re_hi_n, re_lo_0, re_lo_1,
// ... re_lo_n, im_hi_0, im_hi_1, ..., im_hi_n, im_lo_0, im_lo_1, ..., im_lo_n]
template <class params>
@@ -291,7 +314,7 @@ batch_convert_u128_to_f128_as_torus(double *out_re_hi, double *out_re_lo,
}
// params is expected to be full degree not half degree
// converts standqard input into complex<128> represented by 4 double
// converts standard input into complex<128> represented by 4 double
// with following pattern: [re_hi_0, re_lo_0, im_hi_0, im_lo_0, re_hi_1,
// re_lo_1, im_hi_1, im_lo_1,
// ...,re_hi_n, re_lo_n, im_hi_n, im_lo_n, ]
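These converters rely on a double-double ("f128") representation: each 128-bit torus coefficient becomes a hi/lo pair of doubles scaled by 2^-128 so it lands on the real torus. A minimal host-side sketch of that split; the behavior attributed to u128_to_signed_to_f128 is an assumption (the helper is defined elsewhere in the backend), and overflow at the ±2^127 boundary is ignored:

#include <cmath>
struct f128 { double hi, lo; };

f128 u128_to_signed_to_f128_sketch(__uint128_t x) {
  __int128 s = static_cast<__int128>(x); // torus values are read as signed
  double hi = static_cast<double>(s);    // nearest double to s
  // Residual of the first conversion; exactly representable away from the
  // +/-2^127 boundary (edge cases ignored in this sketch).
  double lo = static_cast<double>(s - static_cast<__int128>(hi));
  return {hi, lo};
}

void u128_to_torus_doubles(double &out_hi, double &out_lo, __uint128_t in) {
  const double normalization = std::pow(2., -128.); // same factor as above
  f128 v = u128_to_signed_to_f128_sketch(in);
  out_hi = v.hi * normalization; // high part of the torus value
  out_lo = v.lo * normalization; // low-order correction term
}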

View File

@@ -35,6 +35,20 @@ void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
static_cast<cudaStream_t>(stream), gpu_index);
}
void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_128(
void *stream, uint32_t gpu_index, void *dest, void const *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size, uint32_t grouping_factor) {
uint32_t total_polynomials = input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) *
level_count * (1 << grouping_factor) /
grouping_factor;
size_t buffer_size =
total_polynomials * polynomial_size * sizeof(__uint128_t);
cuda_memcpy_async_to_gpu((__uint128_t *)dest, (__uint128_t *)src, buffer_size,
static_cast<cudaStream_t>(stream), gpu_index);
}
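The polynomial count scales with 2^grouping_factor / grouping_factor, so 128-bit multi-bit keys grow quickly. A worked example of the formula above (illustrative only), plugging in the GPU multi-bit noise-squashing parameters added later in this commit:

// input_lwe_dim = 920, glwe_dim = 2, level_count = 3,
// polynomial_size = 2048, grouping_factor = 4
uint32_t total_polynomials = 920 * (2 + 1) * (2 + 1) * 3 * (1 << 4) / 4;
// = 920 * 9 * 3 * 4 = 99,360 polynomials
size_t buffer_size = (size_t)total_polynomials * 2048 * sizeof(__uint128_t);
// = 99,360 * 2048 * 16 bytes, about 3.03 GiB copied to the GPU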
// We need these lines so the compiler knows how to specialize these functions
template __device__ const uint64_t *
get_ith_mask_kth_block(const uint64_t *ptr, int i, int k, int level,
@@ -80,6 +94,14 @@ template __device__ double2 *get_ith_body_kth_block(double2 *ptr, int i, int k,
int glwe_dimension,
uint32_t level_count);
template __device__ const __uint128_t *
get_multi_bit_ith_lwe_gth_group_kth_block(const __uint128_t *ptr, int g, int i,
int k, int level,
uint32_t grouping_factor,
uint32_t polynomial_size,
uint32_t glwe_dimension,
uint32_t level_count);
template __device__ const uint64_t *get_multi_bit_ith_lwe_gth_group_kth_block(
const uint64_t *ptr, int g, int i, int k, int level,
uint32_t grouping_factor, uint32_t polynomial_size, uint32_t glwe_dimension,

View File

@@ -83,6 +83,62 @@ mul_ggsw_glwe_in_fourier_domain(double2 *fft, double2 *join_buffer,
__syncthreads();
}
/** Perform the matrix multiplication between the GGSW and the GLWE,
* each block operating on a single level for mask and body.
* Both operands should be in the Fourier domain.
*
* This function assumes:
* - Thread blocks at dimension z relate to the decomposition level.
* - Thread blocks at dimension y relate to the glwe dimension.
* - polynomial_size / params::opt threads are available per block
*/
template <typename G, class params>
__device__ void mul_ggsw_glwe_in_fourier_domain_128(
double *fft, double *join_buffer,
const double *__restrict__ bootstrapping_key, int iteration, G &group,
bool support_dsm = false) {
const uint32_t polynomial_size = params::degree;
const uint32_t glwe_dimension = gridDim.y - 1;
const uint32_t level_count = gridDim.z;
// The first product is used to initialize level_join_buffer
auto this_block_rank = get_this_block_rank<G>(group, support_dsm);
// Continues multiplying fft by every polynomial in that particular bsk level
// Each y-block accumulates in a different polynomial at each iteration
auto bsk_slice = get_ith_mask_kth_block_128(
bootstrapping_key, iteration, blockIdx.y, blockIdx.z, polynomial_size,
glwe_dimension, level_count);
for (int j = 0; j < glwe_dimension + 1; j++) {
int idx = (j + this_block_rank) % (glwe_dimension + 1);
auto bsk_poly = bsk_slice + idx * polynomial_size / 2 * 4;
auto buffer_slice = get_join_buffer_element_128<G>(
blockIdx.z, idx, group, join_buffer, polynomial_size, glwe_dimension,
support_dsm);
polynomial_product_accumulate_in_fourier_domain_128<params>(
buffer_slice, fft, bsk_poly, j == 0);
group.sync();
}
// -----------------------------------------------------------------
// All blocks are synchronized here; after this sync, level_join_buffer has
// the values needed from every other block
// accumulate rest of the products into fft buffer
for (int l = 0; l < level_count; l++) {
auto cur_src_acc = get_join_buffer_element_128<G>(
l, blockIdx.y, group, join_buffer, polynomial_size, glwe_dimension,
support_dsm);
polynomial_accumulate_in_fourier_domain_128<params>(fft, cur_src_acc,
l == 0);
}
__syncthreads();
}
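The launch geometry the documentation block pins down is worth making explicit. A sketch of a caller-side configuration consistent with those assumptions; the kernel name and the shared_bytes variable are hypothetical:

dim3 grid(num_samples,          // x: one slab per input ciphertext
          glwe_dimension + 1,   // y: spans the GLWE size
          level_count);         // z: spans the decomposition levels
dim3 block(polynomial_size / params::opt);
accumulate_kernel_128<params><<<grid, block, shared_bytes, stream>>>(/* ... */);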
template <typename Torus>
void execute_pbs_async(
cudaStream_t const *streams, uint32_t const *gpu_indexes,

View File

@@ -280,8 +280,9 @@ __host__ uint64_t scratch_cg_multi_bit_programmable_bootstrap(
check_cuda_error(cudaGetLastError());
}
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
gpu_index, input_lwe_ciphertext_count, polynomial_size);
auto lwe_chunk_size =
get_lwe_chunk_size<Torus, params>(gpu_index, input_lwe_ciphertext_count,
polynomial_size, full_sm_keybundle);
uint64_t size_tracker = 0;
*buffer = new pbs_buffer<Torus, MULTI_BIT>(
stream, gpu_index, glwe_dimension, polynomial_size, level_count,

View File

@@ -18,62 +18,6 @@
#include "programmable_bootstrap.cuh"
#include "types/complex/operations.cuh"
/** Perform the matrix multiplication between the GGSW and the GLWE,
* each block operating on a single level for mask and body.
* Both operands should be at fourier domain
*
* This function assumes:
* - Thread blocks at dimension z relates to the decomposition level.
* - Thread blocks at dimension y relates to the glwe dimension.
* - polynomial_size / params::opt threads are available per block
*/
template <typename G, class params>
__device__ void mul_ggsw_glwe_in_fourier_domain_128(
double *fft, double *join_buffer,
const double *__restrict__ bootstrapping_key, int iteration, G &group,
bool support_dsm = false) {
const uint32_t polynomial_size = params::degree;
const uint32_t glwe_dimension = gridDim.y - 1;
const uint32_t level_count = gridDim.z;
// The first product is used to initialize level_join_buffer
auto this_block_rank = get_this_block_rank<G>(group, support_dsm);
// Continues multiplying fft by every polynomial in that particular bsk level
// Each y-block accumulates in a different polynomial at each iteration
auto bsk_slice = get_ith_mask_kth_block_128(
bootstrapping_key, iteration, blockIdx.y, blockIdx.z, polynomial_size,
glwe_dimension, level_count);
for (int j = 0; j < glwe_dimension + 1; j++) {
int idx = (j + this_block_rank) % (glwe_dimension + 1);
auto bsk_poly = bsk_slice + idx * polynomial_size / 2 * 4;
auto buffer_slice = get_join_buffer_element_128<G>(
blockIdx.z, idx, group, join_buffer, polynomial_size, glwe_dimension,
support_dsm);
polynomial_product_accumulate_in_fourier_domain_128<params>(
buffer_slice, fft, bsk_poly, j == 0);
group.sync();
}
// -----------------------------------------------------------------
// All blocks are synchronized here; after this sync, level_join_buffer has
// the values needed from every other block
// accumulate rest of the products into fft buffer
for (int l = 0; l < level_count; l++) {
auto cur_src_acc = get_join_buffer_element_128<G>(
l, blockIdx.y, group, join_buffer, polynomial_size, glwe_dimension,
support_dsm);
polynomial_accumulate_in_fourier_domain_128<params>(fft, cur_src_acc,
l == 0);
}
__syncthreads();
}
template <typename InputTorus, class params, sharedMemDegree SMD,
bool first_iter>
__global__ void __launch_bounds__(params::degree / params::opt)
@@ -174,9 +118,6 @@ __global__ void __launch_bounds__(params::degree / params::opt)
accumulator);
gadget_acc.decompose_and_compress_level_128(accumulator_fft, blockIdx.z);
// We are using the same memory space for accumulator_fft and
// accumulator_rotated, so we need to synchronize here to make sure they
// don't modify the same memory space at the same time
// Switch to the FFT space
auto acc_fft_re_hi = accumulator_fft + 0 * params::degree / 2;
auto acc_fft_re_lo = accumulator_fft + 1 * params::degree / 2;

View File

@@ -455,11 +455,8 @@ void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
*/
template <typename Torus, class params>
uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
uint32_t polynomial_size) {
uint64_t full_sm_keybundle =
get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
polynomial_size);
uint32_t polynomial_size,
uint64_t full_sm_keybundle) {
int max_blocks_per_sm;
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);

View File

@@ -521,8 +521,9 @@ __host__ uint64_t scratch_multi_bit_programmable_bootstrap(
check_cuda_error(cudaGetLastError());
}
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
gpu_index, input_lwe_ciphertext_count, polynomial_size);
auto lwe_chunk_size =
get_lwe_chunk_size<Torus, params>(gpu_index, input_lwe_ciphertext_count,
polynomial_size, full_sm_keybundle);
uint64_t size_tracker = 0;
*buffer = new pbs_buffer<Torus, MULTI_BIT>(
stream, gpu_index, glwe_dimension, polynomial_size, level_count,

View File

@@ -0,0 +1,361 @@
#include "programmable_bootstrap_cg_multibit.cuh"
#include "programmable_bootstrap_multibit_128.cuh"
template <typename InputTorus>
uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128(
void *stream, uint32_t gpu_index,
pbs_buffer_128<InputTorus, MULTI_BIT> **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
switch (polynomial_size) {
case 256:
return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 512:
return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 1024:
return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 2048:
return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 4096:
return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
"N's are powers of two"
" in the interval [256..4096].")
}
}
template <typename InputTorus>
uint64_t scratch_cuda_cg_multi_bit_programmable_bootstrap_128(
void *stream, uint32_t gpu_index,
pbs_buffer_128<InputTorus, MULTI_BIT> **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
switch (polynomial_size) {
case 256:
return scratch_cg_multi_bit_programmable_bootstrap_128<
InputTorus, AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 512:
return scratch_cg_multi_bit_programmable_bootstrap_128<
InputTorus, AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 1024:
return scratch_cg_multi_bit_programmable_bootstrap_128<
InputTorus, AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 2048:
return scratch_cg_multi_bit_programmable_bootstrap_128<
InputTorus, AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 4096:
return scratch_cg_multi_bit_programmable_bootstrap_128<
InputTorus, AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
"N's are powers of two"
" in the interval [256..4096].")
}
}
uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
bool supports_cg =
supports_cooperative_groups_on_multibit_programmable_bootstrap<
__uint128_t>(glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count,
cuda_get_max_shared_memory(gpu_index));
if (supports_cg)
return scratch_cuda_cg_multi_bit_programmable_bootstrap_128<uint64_t>(
stream, gpu_index,
reinterpret_cast<pbs_buffer_128<uint64_t, MULTI_BIT> **>(buffer),
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory);
else
return scratch_cuda_multi_bit_programmable_bootstrap_128<uint64_t>(
stream, gpu_index,
reinterpret_cast<pbs_buffer_128<uint64_t, MULTI_BIT> **>(buffer),
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory);
}
template <typename InputTorus>
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
void *stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector,
InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in,
InputTorus const *lwe_input_indexes, __uint128_t const *bootstrapping_key,
pbs_buffer_128<InputTorus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_many_lut, uint32_t lut_stride) {
switch (polynomial_size) {
case 256:
host_multi_bit_programmable_bootstrap_128<InputTorus, AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
case 512:
host_multi_bit_programmable_bootstrap_128<InputTorus, AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
case 1024:
host_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
case 2048:
host_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
case 4096:
host_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
"N's are powers of two"
" in the interval [256..4096].")
}
}
template <typename InputTorus>
void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
void *stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector,
InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in,
InputTorus const *lwe_input_indexes, __uint128_t const *bootstrapping_key,
pbs_buffer_128<InputTorus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_many_lut, uint32_t lut_stride) {
switch (polynomial_size) {
case 256:
host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
case 512:
host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
case 1024:
host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
case 2048:
host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
case 4096:
host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
"N's are powers of two"
" in the interval [256..4096].")
}
}
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride) {
if (base_log > 64)
PANIC("Cuda error (multi-bit PBS): base log should be <= 64")
auto *buffer =
reinterpret_cast<pbs_buffer_128<uint64_t, MULTI_BIT> *>(mem_ptr);
switch (buffer->pbs_variant) {
case PBS_VARIANT::CG:
cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128<
uint64_t>(stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const __uint128_t *>(lut_vector),
static_cast<const uint64_t *>(lut_vector_indexes),
static_cast<const uint64_t *>(lwe_array_in),
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const __uint128_t *>(bootstrapping_key), buffer,
lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, num_samples,
num_many_lut, lut_stride);
break;
case PBS_VARIANT::DEFAULT:
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128<uint64_t>(
stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const __uint128_t *>(lut_vector),
static_cast<const uint64_t *>(lut_vector_indexes),
static_cast<const uint64_t *>(lwe_array_in),
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const __uint128_t *>(bootstrapping_key), buffer,
lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_many_lut, lut_stride);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.")
}
}
void cleanup_cuda_multi_bit_programmable_bootstrap_128(void *stream,
const uint32_t gpu_index,
int8_t **buffer) {
const auto x =
reinterpret_cast<pbs_buffer_128<uint64_t, MULTI_BIT> *>(*buffer);
x->release(static_cast<cudaStream_t>(stream), gpu_index);
}
/**
* Computes divisors of the product of num_sms (streaming multiprocessors on the
* GPU) and max_blocks_per_sm (maximum active blocks per SM when launching
* device_multi_bit_programmable_bootstrap_keybundle_128) that are no larger
* than the square root of that product, scanning downwards. If
* log2(max_num_pbs) <= 13, the first (largest) such divisor is selected. If
* greater, and only on architectures below SM 9.0, the scan skips
* log2(max_num_pbs) - 11 divisors before selecting, yielding a smaller LWE
* chunk size.
*
* The value 13 was empirically determined based on memory requirements for
* benchmarking on an RTX 4090 GPU, balancing performance and resource use.
*/
template <typename Torus, class params>
uint32_t get_lwe_chunk_size_128(uint32_t gpu_index, uint32_t max_num_pbs,
uint32_t polynomial_size,
uint64_t full_sm_keybundle) {
int max_blocks_per_sm;
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
cuda_set_device(gpu_index);
if (max_shared_memory < full_sm_keybundle)
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_blocks_per_sm,
device_multi_bit_programmable_bootstrap_keybundle_128<Torus, params,
NOSM>,
polynomial_size / params::opt, full_sm_keybundle);
else
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_blocks_per_sm,
device_multi_bit_programmable_bootstrap_keybundle_128<Torus, params,
FULLSM>,
polynomial_size / params::opt, 0);
int num_sms = 0;
check_cuda_error(cudaDeviceGetAttribute(
&num_sms, cudaDevAttrMultiProcessorCount, gpu_index));
int x = num_sms * max_blocks_per_sm;
int count = 0;
int divisor = 1;
int ith_divisor = 0;
#if CUDA_ARCH < 900
// We pick a smaller divisor on GPUs other than H100 so that 256-bit
// integer multiplication can still run within memory limits
int log2_max_num_pbs = log2_int(max_num_pbs);
if (log2_max_num_pbs > 13)
ith_divisor = log2_max_num_pbs - 11;
#endif
for (int i = sqrt(x); i >= 1; i--) {
if (x % i == 0) {
if (count == ith_divisor) {
divisor = i;
break;
} else {
count++;
}
}
}
return divisor;
}
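A worked example of the scan above, on a hypothetical device: with num_sms = 132 and max_blocks_per_sm = 2, x = 264 = 2^3 * 3 * 11, and the loop visits i = 16 down to 1, hitting the divisors 12, 11, 8, 6, 4, 3, 2, 1 in that order. The default ith_divisor = 0 returns 12; on a pre-SM90 GPU with log2(max_num_pbs) = 14, ith_divisor = 3 skips three hits and returns 6, so the keybundle kernel works on smaller LWE chunks.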

File diff suppressed because it is too large

View File

@@ -283,8 +283,9 @@ __host__ uint64_t scratch_tbc_multi_bit_programmable_bootstrap(
check_cuda_error(cudaGetLastError());
}
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
gpu_index, input_lwe_ciphertext_count, polynomial_size);
auto lwe_chunk_size =
get_lwe_chunk_size<Torus, params>(gpu_index, input_lwe_ciphertext_count,
polynomial_size, full_sm_keybundle);
uint64_t size_tracker = 0;
*buffer = new pbs_buffer<uint64_t, MULTI_BIT>(
stream, gpu_index, glwe_dimension, polynomial_size, level_count,

View File

@@ -5,15 +5,15 @@
#include <stdio.h>
#include <type_traits>
template <typename T> inline __device__ const char *get_format();
template <typename T> __device__ inline const char *get_format();
template <> inline __device__ const char *get_format<int>() { return "%d, "; }
template <> __device__ inline const char *get_format<int>() { return "%d, "; }
template <> inline __device__ const char *get_format<unsigned int>() {
template <> __device__ inline const char *get_format<unsigned int>() {
return "%u, ";
}
template <> inline __device__ const char *get_format<uint64_t>() {
template <> __device__ inline const char *get_format<uint64_t>() {
return "%lu, ";
}
@@ -23,6 +23,15 @@ template <typename T> __global__ void print_debug_kernel(const T *src, int N) {
}
}
template <>
__global__ inline void print_debug_kernel(const __uint128_t *src, int N) {
for (int i = 0; i < N; i++) {
uint64_t low = static_cast<uint64_t>(src[i]);
uint64_t high = static_cast<uint64_t>(src[i] >> 64);
printf("(%llu, %llu), ", high, low);
}
}
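Since printf has no native 128-bit conversion, the specialization prints each value as a (high, low) pair of 64-bit halves. A hedged usage sketch; the helper name is hypothetical:

void dump_u128(const __uint128_t *d_data, int n) {
  print_debug_kernel<<<1, 1>>>(d_data, n); // single thread walks the array
  cudaDeviceSynchronize();                 // flush device-side printf output
}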
template <>
__global__ inline void print_debug_kernel(const double2 *src, int N) {
for (int i = 0; i < N; i++) {

View File

@@ -2188,6 +2188,19 @@ unsafe extern "C" {
grouping_factor: u32,
);
}
unsafe extern "C" {
pub fn cuda_convert_lwe_multi_bit_programmable_bootstrap_key_128(
stream: *mut ffi::c_void,
gpu_index: u32,
dest: *mut ffi::c_void,
src: *const ffi::c_void,
input_lwe_dim: u32,
glwe_dim: u32,
level_count: u32,
polynomial_size: u32,
grouping_factor: u32,
);
}
unsafe extern "C" {
pub fn scratch_cuda_multi_bit_programmable_bootstrap_64(
stream: *mut ffi::c_void,
@@ -2230,3 +2243,45 @@ unsafe extern "C" {
pbs_buffer: *mut *mut i8,
);
}
unsafe extern "C" {
pub fn scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64(
stream: *mut ffi::c_void,
gpu_index: u32,
buffer: *mut *mut i8,
glwe_dimension: u32,
polynomial_size: u32,
level_count: u32,
input_lwe_ciphertext_count: u32,
allocate_gpu_memory: bool,
) -> u64;
}
unsafe extern "C" {
pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
stream: *mut ffi::c_void,
gpu_index: u32,
lwe_array_out: *mut ffi::c_void,
lwe_output_indexes: *const ffi::c_void,
lut_vector: *const ffi::c_void,
lut_vector_indexes: *const ffi::c_void,
lwe_array_in: *const ffi::c_void,
lwe_input_indexes: *const ffi::c_void,
bootstrapping_key: *const ffi::c_void,
mem_ptr: *mut i8,
lwe_dimension: u32,
glwe_dimension: u32,
polynomial_size: u32,
grouping_factor: u32,
base_log: u32,
level_count: u32,
num_samples: u32,
num_many_lut: u32,
lut_stride: u32,
);
}
unsafe extern "C" {
pub fn cleanup_cuda_multi_bit_programmable_bootstrap_128(
stream: *mut ffi::c_void,
gpu_index: u32,
buffer: *mut *mut i8,
);
}

View File

@@ -165,7 +165,7 @@ fn pbs_128(c: &mut Criterion) {
mod cuda {
use benchmark::utilities::{
cuda_local_keys_core, cuda_local_streams_core, get_bench_type, throughput_num_threads,
write_to_json, BenchmarkType, CpuKeys, CpuKeysBuilder, CryptoParametersRecord,
write_to_json, BenchmarkType, CpuKeys, CpuKeysBuilder, CryptoParametersRecord, CudaIndexes,
CudaLocalKeys, OperatorType,
};
use criterion::{black_box, Criterion, Throughput};
@@ -173,12 +173,14 @@ mod cuda {
use tfhe::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
use tfhe::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
use tfhe::core_crypto::gpu::{
cuda_multi_bit_programmable_bootstrap_128_lwe_ciphertext,
cuda_programmable_bootstrap_128_lwe_ciphertext, get_number_of_gpus, CudaStreams,
};
use tfhe::core_crypto::prelude::*;
use tfhe::shortint::engine::ShortintEngine;
use tfhe::shortint::parameters::{
ModulusSwitchType, NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
};
use tfhe::shortint::server_key::ModulusSwitchNoiseReductionKey;
@@ -441,14 +443,281 @@ mod cuda {
);
}
fn cuda_multi_bit_pbs_128(c: &mut Criterion) {
let bench_name = "core_crypto::cuda::multi_bit_pbs128";
let mut bench_group = c.benchmark_group(bench_name);
bench_group
.sample_size(10)
.measurement_time(std::time::Duration::from_secs(30));
type Scalar = u128;
let input_params = PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
let squash_params =
NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
let lwe_noise_distribution_u64 = DynamicDistribution::new_t_uniform(46);
let ct_modulus_u64: CiphertextModulus<u64> = CiphertextModulus::new_native();
let params_name = "PARAMS_SWITCH_SQUASH";
let mut boxed_seeder = new_seeder();
let seeder = boxed_seeder.as_mut();
let mut secret_generator =
SecretRandomGenerator::<DefaultRandomGenerator>::new(seeder.seed());
let mut encryption_generator =
EncryptionRandomGenerator::<DefaultRandomGenerator>::new(seeder.seed(), seeder);
let input_lwe_secret_key =
LweSecretKey::generate_new_binary(input_params.lwe_dimension, &mut secret_generator);
let output_glwe_secret_key = GlweSecretKey::<Vec<Scalar>>::generate_new_binary(
squash_params.glwe_dimension,
squash_params.polynomial_size,
&mut secret_generator,
);
let output_lwe_secret_key = output_glwe_secret_key.clone().into_lwe_secret_key();
let multi_bit_bsk = LweMultiBitBootstrapKey::new(
Scalar::ZERO,
squash_params.glwe_dimension.to_glwe_size(),
squash_params.polynomial_size,
squash_params.decomp_base_log,
squash_params.decomp_level_count,
input_params.lwe_dimension,
squash_params.grouping_factor,
squash_params.ciphertext_modulus,
);
let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new()
.multi_bit_bootstrap_key(multi_bit_bsk)
.build();
let message_modulus: u64 = 1 << 4;
let input_message: u64 = 3;
let delta: u64 = (1 << (u64::BITS - 1)) / message_modulus;
let plaintext = Plaintext(input_message * delta);
let bench_id;
match get_bench_type() {
BenchmarkType::Latency => {
let streams = CudaStreams::new_multi_gpu();
let gpu_keys = CudaLocalKeys::from_cpu_keys(&cpu_keys, None, &streams);
let lwe_ciphertext_in: LweCiphertextOwned<u64> =
allocate_and_encrypt_new_lwe_ciphertext(
&input_lwe_secret_key,
plaintext,
lwe_noise_distribution_u64,
ct_modulus_u64,
&mut encryption_generator,
);
let lwe_ciphertext_in_gpu =
CudaLweCiphertextList::from_lwe_ciphertext(&lwe_ciphertext_in, &streams);
let accumulator: GlweCiphertextOwned<Scalar> = GlweCiphertextOwned::new(
Scalar::ONE,
squash_params.glwe_dimension.to_glwe_size(),
squash_params.polynomial_size,
squash_params.ciphertext_modulus,
);
let accumulator_gpu =
CudaGlweCiphertextList::from_glwe_ciphertext(&accumulator, &streams);
let out_pbs_ct = LweCiphertext::new(
Scalar::ZERO,
output_lwe_secret_key.lwe_dimension().to_lwe_size(),
squash_params.ciphertext_modulus,
);
let mut out_pbs_ct_gpu =
CudaLweCiphertextList::from_lwe_ciphertext(&out_pbs_ct, &streams);
let h_indexes = [0];
let cuda_indexes = CudaIndexes::new(&h_indexes, &streams, 0);
bench_id = format!("{bench_name}::{params_name}");
{
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
cuda_multi_bit_programmable_bootstrap_128_lwe_ciphertext(
&lwe_ciphertext_in_gpu,
&mut out_pbs_ct_gpu,
&accumulator_gpu,
&cuda_indexes.d_lut,
&cuda_indexes.d_output,
&cuda_indexes.d_input,
gpu_keys.multi_bit_bsk.as_ref().unwrap(),
&streams,
);
black_box(&mut out_pbs_ct_gpu);
})
});
}
}
BenchmarkType::Throughput => {
let gpu_keys_vec = cuda_local_keys_core(&cpu_keys, None);
let gpu_count = get_number_of_gpus() as usize;
bench_id = format!("{bench_name}::throughput::{params_name}");
let blocks: usize = 1;
let elements = throughput_num_threads(blocks, 1);
let elements_per_stream = elements as usize / gpu_count;
bench_group.throughput(Throughput::Elements(elements));
bench_group.bench_function(&bench_id, |b| {
let setup_encrypted_values = || {
let local_streams = cuda_local_streams_core();
let plaintext_list =
PlaintextList::new(u64::ZERO, PlaintextCount(elements_per_stream));
let input_cts = (0..gpu_count)
.map(|i| {
let mut input_ct_list = LweCiphertextList::new(
u64::ZERO,
input_lwe_secret_key.lwe_dimension().to_lwe_size(),
LweCiphertextCount(elements_per_stream),
ct_modulus_u64,
);
encrypt_lwe_ciphertext_list(
&input_lwe_secret_key,
&mut input_ct_list,
&plaintext_list,
lwe_noise_distribution_u64,
&mut encryption_generator,
);
CudaLweCiphertextList::from_lwe_ciphertext_list(
&input_ct_list,
&local_streams[i],
)
})
.collect::<Vec<_>>();
let accumulators = (0..gpu_count)
.map(|i| {
let accumulator = GlweCiphertextOwned::new(
Scalar::ONE,
squash_params.glwe_dimension.to_glwe_size(),
squash_params.polynomial_size,
squash_params.ciphertext_modulus,
);
CudaGlweCiphertextList::from_glwe_ciphertext(
&accumulator,
&local_streams[i],
)
})
.collect::<Vec<_>>();
// Allocate the LweCiphertext to store the result of the PBS
let output_cts = (0..gpu_count)
.map(|i| {
let output_ct_list = LweCiphertextList::new(
Scalar::ZERO,
output_lwe_secret_key.lwe_dimension().to_lwe_size(),
LweCiphertextCount(elements_per_stream),
squash_params.ciphertext_modulus,
);
CudaLweCiphertextList::from_lwe_ciphertext_list(
&output_ct_list,
&local_streams[i],
)
})
.collect::<Vec<_>>();
let h_indexes = (0..(elements / gpu_count as u64))
.map(CastFrom::cast_from)
.collect::<Vec<_>>();
let cuda_indexes_vec = (0..gpu_count)
.map(|i| CudaIndexes::new(&h_indexes, &local_streams[i], 0))
.collect::<Vec<_>>();
local_streams.iter().for_each(|stream| stream.synchronize());
(
input_cts,
output_cts,
accumulators,
cuda_indexes_vec,
local_streams,
)
};
b.iter_batched(
setup_encrypted_values,
|(
input_cts,
mut output_cts,
accumulators,
cuda_indexes_vec,
local_streams,
)| {
(0..gpu_count)
.into_par_iter()
.zip(input_cts.par_iter())
.zip(output_cts.par_iter_mut())
.zip(accumulators.par_iter())
.zip(local_streams.par_iter())
.for_each(
|((((i, input_ct), output_ct), accumulator), local_stream)| {
cuda_multi_bit_programmable_bootstrap_128_lwe_ciphertext(
input_ct,
output_ct,
accumulator,
&cuda_indexes_vec[i].d_lut,
&cuda_indexes_vec[i].d_output,
&cuda_indexes_vec[i].d_input,
gpu_keys_vec[i].multi_bit_bsk.as_ref().unwrap(),
local_stream,
);
},
)
},
criterion::BatchSize::SmallInput,
);
});
}
};
let params_record = CryptoParametersRecord {
lwe_dimension: Some(input_params.lwe_dimension),
glwe_dimension: Some(squash_params.glwe_dimension),
polynomial_size: Some(squash_params.polynomial_size),
lwe_noise_distribution: Some(lwe_noise_distribution_u64),
glwe_noise_distribution: Some(input_params.glwe_noise_distribution),
pbs_base_log: Some(squash_params.decomp_base_log),
pbs_level: Some(squash_params.decomp_level_count),
ciphertext_modulus: Some(input_params.ciphertext_modulus),
..Default::default()
};
let bit_size = (message_modulus as u32).ilog2();
write_to_json(
&bench_id,
params_record,
params_name,
"pbs",
&OperatorType::Atomic,
bit_size,
vec![bit_size],
);
}
pub fn cuda_pbs128_group() {
let mut criterion: Criterion<_> = Criterion::default().configure_from_args();
cuda_pbs_128(&mut criterion);
}
pub fn cuda_multi_bit_pbs128_group() {
let mut criterion: Criterion<_> = Criterion::default().configure_from_args();
cuda_multi_bit_pbs_128(&mut criterion);
}
}
#[cfg(feature = "gpu")]
use cuda::cuda_pbs128_group;
use cuda::{cuda_multi_bit_pbs128_group, cuda_pbs128_group};
pub fn pbs128_group() {
let mut criterion: Criterion<_> = Criterion::default().configure_from_args();
@@ -458,6 +727,7 @@ pub fn pbs128_group() {
#[cfg(feature = "gpu")]
fn go_through_gpu_bench_groups() {
cuda_pbs128_group();
cuda_multi_bit_pbs128_group();
}
#[cfg(not(feature = "gpu"))]

View File

@@ -521,7 +521,7 @@ mod cuda_utils {
pub ksk: Option<CudaLweKeyswitchKey<T>>,
pub pksk: Option<CudaLwePackingKeyswitchKey<T>>,
pub bsk: Option<CudaLweBootstrapKey>,
pub multi_bit_bsk: Option<CudaLweMultiBitBootstrapKey>,
pub multi_bit_bsk: Option<CudaLweMultiBitBootstrapKey<T>>,
}
#[allow(dead_code)]

View File

@@ -13,6 +13,8 @@ use tfhe::shortint::parameters::current_params::{
VEC_ALL_COMPRESSION_PARAMETERS, VEC_ALL_HPU_PARAMETERS, VEC_ALL_KS32_PARAMETERS,
VEC_ALL_MULTI_BIT_PBS_PARAMETERS, VEC_ALL_NOISE_SQUASHING_PARAMETERS,
};
use tfhe::shortint::parameters::noise_squashing::NoiseSquashingMultiBitParameters;
use tfhe::shortint::parameters::v1_3::VEC_ALL_NOISE_SQUASHING_MULTI_BIT_PARAMETERS;
use tfhe::shortint::parameters::{
CompactPublicKeyEncryptionParameters, CompressionParameters, NoiseSquashingParameters,
};
@@ -214,6 +216,36 @@ impl ParamDetails<u128> for NoiseSquashingParameters {
}
}
impl ParamDetails<u128> for NoiseSquashingMultiBitParameters {
fn lwe_dimension(&self) -> LweDimension {
panic!("lwe_dimension not applicable for NoiseSquashingMultiBitParameters")
}
fn glwe_dimension(&self) -> GlweDimension {
self.glwe_dimension
}
fn lwe_noise_distribution(&self) -> DynamicDistribution<u128> {
panic!("lwe_noise_distribution not applicable for NoiseSquashingMultiBitParameters")
}
fn glwe_noise_distribution(&self) -> DynamicDistribution<u128> {
self.glwe_noise_distribution
}
fn polynomial_size(&self) -> PolynomialSize {
self.polynomial_size
}
fn lwe_ciphertext_modulus(&self) -> ParamModulus {
panic!("lwe_ciphertext_modulus not applicable for NoiseSquashingMultiBitParameters")
}
fn glwe_ciphertext_modulus(&self) -> ParamModulus {
ParamModulus::from_ciphertext_modulus(self.ciphertext_modulus)
}
}
#[derive(Eq, PartialEq, Hash)]
enum ParametersFormat {
Lwe,
@@ -493,6 +525,16 @@ fn main() {
ParametersFormat::Glwe,
);
let noise_squashing_multi_bit_params: Vec<_> = VEC_ALL_NOISE_SQUASHING_MULTI_BIT_PARAMETERS
.into_iter()
.map(|p| (*p.0, Some(p.1)))
.collect();
write_all_params_in_file(
"shortint_noise_squashing_multi_bit_parameters_lattice_estimator.sage",
&noise_squashing_multi_bit_params,
ParametersFormat::Glwe,
);
let ks32_params: Vec<_> = VEC_ALL_KS32_PARAMETERS
.into_iter()
.map(|p| (AtomicPatternParameters::from(*p.0), Some(p.1)))

View File

@@ -178,6 +178,36 @@ pub const DUMMY_31_U32: ClassicTestParams<u32> = ClassicTestParams {
ciphertext_modulus: CiphertextModulus::new(1 << 31),
};
#[cfg(feature = "gpu")]
pub const NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128:
NoiseSquashingMultiBitTestParameters<u128> = NoiseSquashingMultiBitTestParameters {
glwe_dimension: GlweDimension(2),
polynomial_size: PolynomialSize(2048),
glwe_noise_distribution: DynamicDistribution::new_t_uniform(30),
decomp_base_log: DecompositionBaseLog(23),
decomp_level_count: DecompositionLevelCount(3),
grouping_factor: LweBskGroupingFactor(4),
message_modulus_log: MessageModulusLog(4),
ciphertext_modulus: CiphertextModulus::<u128>::new_native(),
};
#[cfg(feature = "gpu")]
pub const PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128: MultiBitTestParams<
u64,
> = MultiBitTestParams {
input_lwe_dimension: LweDimension(920),
lwe_noise_distribution: DynamicDistribution::new_t_uniform(45),
decomp_base_log: DecompositionBaseLog(22),
decomp_level_count: DecompositionLevelCount(1),
glwe_dimension: GlweDimension(1),
polynomial_size: PolynomialSize(2048),
glwe_noise_distribution: DynamicDistribution::new_t_uniform(17),
message_modulus_log: MessageModulusLog(4),
ciphertext_modulus: CiphertextModulus::new_native(),
grouping_factor: LweBskGroupingFactor(4),
thread_count: ThreadCount(5),
};
pub const MULTI_BIT_2_2_2_PARAMS: MultiBitTestParams<u64> = MultiBitTestParams {
input_lwe_dimension: LweDimension(818),
lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(

View File

@@ -3,6 +3,8 @@ use crate::core_crypto::entities::*;
use crate::core_crypto::prelude::{CastFrom, CastInto, UnsignedInteger};
use crate::keycache::NamedParam;
#[cfg(feature = "gpu")]
use crate::shortint::parameters::ModulusSwitchNoiseReductionParams;
#[cfg(feature = "gpu")]
use crate::shortint::parameters::ModulusSwitchType;
use serde::{Deserialize, Serialize};
@@ -22,6 +24,13 @@ pub struct MultiBitBootstrapKeys<Scalar: UnsignedInteger> {
pub fbsk: FourierLweMultiBitBootstrapKeyOwned,
}
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct MultiBitStdBootstrapKeys<Scalar: UnsignedInteger> {
pub small_lwe_sk: LweSecretKey<Vec<Scalar>>,
pub big_lwe_sk: LweSecretKey<Vec<Scalar>>,
pub bsk: LweMultiBitBootstrapKeyOwned<Scalar>,
}
// Fourier key is generated afterward in order to use generic test function
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct FftBootstrapKeys<Scalar: UnsignedInteger> {
@@ -81,6 +90,18 @@ pub struct MultiBitTestParams<Scalar: UnsignedInteger> {
pub thread_count: ThreadCount,
}
#[derive(Clone, Copy, Debug, Serialize, Deserialize)]
pub struct NoiseSquashingMultiBitTestParameters<Scalar: UnsignedInteger> {
pub glwe_dimension: GlweDimension,
pub polynomial_size: PolynomialSize,
pub glwe_noise_distribution: DynamicDistribution<Scalar>,
pub decomp_base_log: DecompositionBaseLog,
pub decomp_level_count: DecompositionLevelCount,
pub grouping_factor: LweBskGroupingFactor,
pub message_modulus_log: MessageModulusLog,
pub ciphertext_modulus: CiphertextModulus<Scalar>,
}
// PartialEq is implemented manually because thread_count doesn't affect key generation and we want
// to change its value in test without the need of regenerating keys in the key cache.
impl<Scalar: UnsignedInteger> PartialEq for MultiBitTestParams<Scalar> {
@@ -141,6 +162,21 @@ pub struct NoiseSquashingTestParams<Scalar: UnsignedInteger> {
pub modulus_switch_noise_reduction_params: ModulusSwitchType,
pub ciphertext_modulus: CiphertextModulus<Scalar>,
}
// Parameters to test NoiseSquashing implementation
#[cfg(feature = "gpu")]
#[derive(Clone, Copy, Debug, PartialEq, Serialize, Deserialize)]
pub struct NoiseSquashingMultiBitTestParams<Scalar: UnsignedInteger> {
pub lwe_dimension: LweDimension,
pub glwe_dimension: GlweDimension,
pub polynomial_size: PolynomialSize,
pub lwe_noise_distribution: DynamicDistribution<Scalar>,
pub glwe_noise_distribution: DynamicDistribution<Scalar>,
pub pbs_base_log: DecompositionBaseLog,
pub pbs_level: DecompositionLevelCount,
pub grouping_factor: LweBskGroupingFactor,
pub modulus_switch_noise_reduction_params: Option<ModulusSwitchNoiseReductionParams>,
pub ciphertext_modulus: CiphertextModulus<Scalar>,
}
#[derive(Copy, Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct PackingKeySwitchTestParams<Scalar: UnsignedInteger> {

View File

@@ -17,7 +17,7 @@ pub unsafe fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_async<Scalar>
lut_indexes: &CudaVec<Scalar>,
output_indexes: &CudaVec<Scalar>,
input_indexes: &CudaVec<Scalar>,
multi_bit_bsk: &CudaLweMultiBitBootstrapKey,
multi_bit_bsk: &CudaLweMultiBitBootstrapKey<Scalar>,
streams: &CudaStreams,
) where
// CastInto required for PBS modulus switch which returns a usize
@@ -151,7 +151,7 @@ pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext<Scalar>(
lut_indexes: &CudaVec<Scalar>,
output_indexes: &CudaVec<Scalar>,
input_indexes: &CudaVec<Scalar>,
multi_bit_bsk: &CudaLweMultiBitBootstrapKey,
multi_bit_bsk: &CudaLweMultiBitBootstrapKey<Scalar>,
streams: &CudaStreams,
) where
// CastInto required for PBS modulus switch which returns a usize
@@ -171,3 +171,162 @@ pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext<Scalar>(
}
streams.synchronize();
}
/// # Safety
///
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must not
/// be dropped until streams is synchronised
#[allow(clippy::too_many_arguments)]
pub unsafe fn cuda_multi_bit_programmable_bootstrap_128_lwe_ciphertext_async<OutputScalar>(
input: &CudaLweCiphertextList<u64>,
output: &mut CudaLweCiphertextList<OutputScalar>,
accumulator: &CudaGlweCiphertextList<OutputScalar>,
lut_indexes: &CudaVec<u64>,
output_indexes: &CudaVec<u64>,
input_indexes: &CudaVec<u64>,
multi_bit_bsk: &CudaLweMultiBitBootstrapKey<OutputScalar>,
streams: &CudaStreams,
) where
// CastInto required for PBS modulus switch which returns a usize
OutputScalar: UnsignedTorus + CastInto<usize>,
{
assert_eq!(
input.lwe_dimension(),
multi_bit_bsk.input_lwe_dimension(),
"Mismatched input LweDimension. LweCiphertext input LweDimension {:?}. \
FourierLweMultiBitBootstrapKey input LweDimension {:?}.",
input.lwe_dimension(),
multi_bit_bsk.input_lwe_dimension(),
);
assert_eq!(
output.lwe_dimension(),
multi_bit_bsk.output_lwe_dimension(),
"Mismatched output LweDimension. LweCiphertext output LweDimension {:?}. \
FourierLweMultiBitBootstrapKey output LweDimension {:?}.",
output.lwe_dimension(),
multi_bit_bsk.output_lwe_dimension(),
);
assert_eq!(
accumulator.glwe_dimension(),
multi_bit_bsk.glwe_dimension(),
"Mismatched GlweSize. Accumulator GlweSize {:?}. \
FourierLweMultiBitBootstrapKey GlweSize {:?}.",
accumulator.glwe_dimension(),
multi_bit_bsk.glwe_dimension(),
);
assert_eq!(
accumulator.polynomial_size(),
multi_bit_bsk.polynomial_size(),
"Mismatched PolynomialSize. Accumulator PolynomialSize {:?}. \
FourierLweMultiBitBootstrapKey PolynomialSize {:?}.",
accumulator.polynomial_size(),
multi_bit_bsk.polynomial_size(),
);
assert_eq!(
output.ciphertext_modulus(),
accumulator.ciphertext_modulus(),
"Mismatched CiphertextModulus between output ({:?}) and accumulator ({:?})",
output.ciphertext_modulus(),
accumulator.ciphertext_modulus(),
);
assert_eq!(
streams.gpu_indexes[0],
multi_bit_bsk.d_vec.gpu_index(0),
"GPU error: first stream is on GPU {}, first bsk pointer is on GPU {}",
streams.gpu_indexes[0].get(),
multi_bit_bsk.d_vec.gpu_index(0).get(),
);
assert_eq!(
streams.gpu_indexes[0],
input.0.d_vec.gpu_index(0),
"GPU error: first stream is on GPU {}, first input pointer is on GPU {}",
streams.gpu_indexes[0].get(),
input.0.d_vec.gpu_index(0).get(),
);
assert_eq!(
streams.gpu_indexes[0],
output.0.d_vec.gpu_index(0),
"GPU error: first stream is on GPU {}, first output pointer is on GPU {}",
streams.gpu_indexes[0].get(),
output.0.d_vec.gpu_index(0).get(),
);
assert_eq!(
streams.gpu_indexes[0],
accumulator.0.d_vec.gpu_index(0),
"GPU error: first stream is on GPU {}, first accumulator pointer is on GPU {}",
streams.gpu_indexes[0].get(),
accumulator.0.d_vec.gpu_index(0).get(),
);
assert_eq!(
streams.gpu_indexes[0],
input_indexes.gpu_index(0),
"GPU error: first stream is on GPU {}, first input indexes pointer is on GPU {}",
streams.gpu_indexes[0].get(),
input_indexes.gpu_index(0).get(),
);
assert_eq!(
streams.gpu_indexes[0],
output_indexes.gpu_index(0),
"GPU error: first stream is on GPU {}, first output indexes pointer is on GPU {}",
streams.gpu_indexes[0].get(),
output_indexes.gpu_index(0).get(),
);
assert_eq!(
streams.gpu_indexes[0],
lut_indexes.gpu_index(0),
"GPU error: first stream is on GPU {}, first lut indexes pointer is on GPU {}",
streams.gpu_indexes[0].get(),
lut_indexes.gpu_index(0).get(),
);
programmable_bootstrap_multi_bit_async(
streams,
&mut output.0.d_vec,
output_indexes,
&accumulator.0.d_vec,
lut_indexes,
&input.0.d_vec,
input_indexes,
&multi_bit_bsk.d_vec,
input.lwe_dimension(),
multi_bit_bsk.glwe_dimension(),
multi_bit_bsk.polynomial_size(),
multi_bit_bsk.decomp_base_log(),
multi_bit_bsk.decomp_level_count(),
multi_bit_bsk.grouping_factor(),
input.lwe_ciphertext_count().0 as u32,
);
}
#[allow(clippy::too_many_arguments)]
pub fn cuda_multi_bit_programmable_bootstrap_128_lwe_ciphertext<Scalar>(
input: &CudaLweCiphertextList<u64>,
output: &mut CudaLweCiphertextList<Scalar>,
accumulator: &CudaGlweCiphertextList<Scalar>,
lut_indexes: &CudaVec<u64>,
output_indexes: &CudaVec<u64>,
input_indexes: &CudaVec<u64>,
multi_bit_bsk: &CudaLweMultiBitBootstrapKey<Scalar>,
streams: &CudaStreams,
) where
// CastInto required for PBS modulus switch which returns a usize
Scalar: UnsignedTorus + CastInto<usize>,
{
unsafe {
cuda_multi_bit_programmable_bootstrap_128_lwe_ciphertext_async(
input,
output,
accumulator,
lut_indexes,
output_indexes,
input_indexes,
multi_bit_bsk,
streams,
);
}
streams.synchronize();
}
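The safe wrapper above upholds the `# Safety` contract by pairing the async launch with an unconditional `streams.synchronize()` before returning. A minimal standalone sketch of that pattern, with a hypothetical `Stream` type standing in for `CudaStreams`:
struct Stream;
impl Stream {
    fn synchronize(&self) { /* would block until queued device work completes */ }
}
// Hypothetical async FFI launch; callers must keep inputs alive until sync.
unsafe fn launch_async(_streams: &Stream) { /* would enqueue device work */ }
fn launch(streams: &Stream) {
    unsafe { launch_async(streams) };
    streams.synchronize(); // inputs may only be dropped after this returns
}
fn main() {
    launch(&Stream);
}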

View File

@@ -0,0 +1,187 @@
use super::*;
use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
use crate::core_crypto::gpu::lwe_multi_bit_bootstrap_key::CudaLweMultiBitBootstrapKey;
use crate::core_crypto::gpu::vec::{CudaVec, GpuIndex};
use crate::core_crypto::gpu::{
cuda_multi_bit_programmable_bootstrap_128_lwe_ciphertext, CudaStreams,
};
use crate::core_crypto::prelude::misc::check_encrypted_content_respects_mod;
use itertools::Itertools;
fn execute_multibit_bootstrap_u128(
squash_params: NoiseSquashingMultiBitTestParameters<u128>,
input_params: MultiBitTestParams<u64>,
) {
let input_lwe_dimension = input_params.input_lwe_dimension;
let lwe_noise_distribution = input_params.lwe_noise_distribution;
let glwe_noise_distribution = squash_params.glwe_noise_distribution;
let ciphertext_modulus = squash_params.ciphertext_modulus;
let ciphertext_modulus_64 = CiphertextModulus::new_native();
let msg_modulus = input_params.message_modulus_log;
let encoding_with_padding = get_encoding_with_padding(ciphertext_modulus);
let encoding_with_padding_64: u64 = get_encoding_with_padding(ciphertext_modulus_64);
let glwe_dimension = squash_params.glwe_dimension;
let polynomial_size = squash_params.polynomial_size;
let decomp_base_log = squash_params.decomp_base_log;
let decomp_level_count = squash_params.decomp_level_count;
let grouping_factor = squash_params.grouping_factor;
let gpu_index = 0;
let stream = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
let mut rsc = TestResources::new();
let f = |x: u128| x % msg_modulus.0 as u128;
let delta = encoding_with_padding / msg_modulus.0 as u128;
let delta_64 = encoding_with_padding_64 / msg_modulus.0 as u64;
let mut msg = msg_modulus.0 as u64;
const NB_TESTS: usize = 10;
let number_of_messages = 1;
let accumulator = generate_programmable_bootstrap_glwe_lut(
polynomial_size,
glwe_dimension.to_glwe_size(),
msg_modulus.0.cast_into(),
ciphertext_modulus,
delta,
f,
);
assert!(check_encrypted_content_respects_mod(
&accumulator,
ciphertext_modulus
));
// Create the LweSecretKey
let small_lwe_sk: LweSecretKeyOwned<u128> = allocate_and_generate_new_binary_lwe_secret_key(
input_lwe_dimension,
&mut rsc.secret_random_generator,
);
let input_lwe_secret_key = LweSecretKey::from_container(
small_lwe_sk
.clone()
.into_container()
.iter()
.copied()
.map(|x| x as u64)
.collect::<Vec<_>>(),
);
let output_glwe_secret_key: GlweSecretKeyOwned<u128> =
allocate_and_generate_new_binary_glwe_secret_key(
glwe_dimension,
polynomial_size,
&mut rsc.secret_random_generator,
);
let output_lwe_secret_key = output_glwe_secret_key.clone().into_lwe_secret_key();
let output_lwe_dimension = output_lwe_secret_key.lwe_dimension();
let mut bsk = LweMultiBitBootstrapKey::new(
0u128,
glwe_dimension.to_glwe_size(),
polynomial_size,
decomp_base_log,
decomp_level_count,
input_lwe_dimension,
grouping_factor,
ciphertext_modulus,
);
par_generate_lwe_multi_bit_bootstrap_key(
&small_lwe_sk,
&output_glwe_secret_key,
&mut bsk,
glwe_noise_distribution,
&mut rsc.encryption_random_generator,
);
assert!(check_encrypted_content_respects_mod(
&*bsk,
ciphertext_modulus
));
let d_bsk = CudaLweMultiBitBootstrapKey::from_lwe_multi_bit_bootstrap_key(&bsk, &stream);
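// Sweep every message value from msg_modulus - 1 down to 0, bootstrapping each NB_TESTS times.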
while msg != 0 {
msg -= 1;
for _ in 0..NB_TESTS {
let plaintext = Plaintext(msg * delta_64);
let lwe_ciphertext_in = allocate_and_encrypt_new_lwe_ciphertext(
&input_lwe_secret_key,
plaintext,
lwe_noise_distribution,
ciphertext_modulus_64,
&mut rsc.encryption_random_generator,
);
assert!(check_encrypted_content_respects_mod(
&lwe_ciphertext_in,
ciphertext_modulus_64
));
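// Move the 64-bit input ciphertext to the GPU and allocate the 128-bit output list.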
let d_lwe_ciphertext_in =
CudaLweCiphertextList::from_lwe_ciphertext(&lwe_ciphertext_in, &stream);
let mut d_out_pbs_ct = CudaLweCiphertextList::new(
output_lwe_dimension,
LweCiphertextCount(1),
ciphertext_modulus,
&stream,
);
let d_accumulator = CudaGlweCiphertextList::from_glwe_ciphertext(&accumulator, &stream);
let mut test_vector_indexes: Vec<u64> = vec![0; number_of_messages];
for (i, ind) in test_vector_indexes.iter_mut().enumerate() {
*ind = <usize as CastInto<u64>>::cast_into(i);
}
let mut d_test_vector_indexes =
unsafe { CudaVec::<u64>::new_async(number_of_messages, &stream, 0) };
unsafe { d_test_vector_indexes.copy_from_cpu_async(&test_vector_indexes, &stream, 0) };
let num_blocks = d_lwe_ciphertext_in.0.lwe_ciphertext_count.0;
let lwe_indexes_usize: Vec<usize> = (0..num_blocks).collect_vec();
let lwe_indexes = lwe_indexes_usize
.iter()
.map(|&x| <usize as CastInto<u64>>::cast_into(x))
.collect_vec();
let mut d_output_indexes = unsafe { CudaVec::<u64>::new_async(num_blocks, &stream, 0) };
let mut d_input_indexes = unsafe { CudaVec::<u64>::new_async(num_blocks, &stream, 0) };
unsafe {
d_input_indexes.copy_from_cpu_async(&lwe_indexes, &stream, 0);
d_output_indexes.copy_from_cpu_async(&lwe_indexes, &stream, 0);
}
cuda_multi_bit_programmable_bootstrap_128_lwe_ciphertext(
&d_lwe_ciphertext_in,
&mut d_out_pbs_ct,
&d_accumulator,
&d_test_vector_indexes,
&d_output_indexes,
&d_input_indexes,
&d_bsk,
&stream,
);
let out_pbs_ct = d_out_pbs_ct.into_lwe_ciphertext(&stream);
assert!(check_encrypted_content_respects_mod(
&out_pbs_ct,
ciphertext_modulus
));
let decrypted = decrypt_lwe_ciphertext(&output_lwe_secret_key, &out_pbs_ct);
let decoded = round_decode(decrypted.0, delta) % msg_modulus.0 as u128;
assert_eq!(decoded, f(msg as u128));
}
}
}
#[test]
fn test_multibit_bootstrap_u128_with_squashing() {
execute_multibit_bootstrap_u128(
NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
);
}
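The test above keeps two deltas because messages are encrypted on the 64-bit torus and decoded on the 128-bit one after squashing. A minimal standalone sketch of that arithmetic, assuming `get_encoding_with_padding` returns half the native modulus (one bit of padding) and a message modulus of 4:
fn main() {
    let msg_modulus: u64 = 4; // MessageModulus(4): 2-bit messages
    let encoding_with_padding_64: u64 = 1u64 << 63; // assumed: half the native u64 modulus
    let encoding_with_padding_128: u128 = 1u128 << 127; // assumed: half the native u128 modulus
    let delta_64 = encoding_with_padding_64 / msg_modulus; // 2^61
    let delta_128 = encoding_with_padding_128 / msg_modulus as u128; // 2^125
    // A message m sits in the top bits of each torus, encoded as m * delta.
    let m: u64 = 3;
    assert_eq!((m * delta_64) >> 61, m);
    assert_eq!((m as u128 * delta_128) >> 125, m as u128);
}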

View File

@@ -7,11 +7,13 @@ mod glwe_sample_extraction;
mod lwe_keyswitch;
mod lwe_linear_algebra;
mod lwe_multi_bit_programmable_bootstrapping;
mod lwe_multi_bit_programmable_bootstrapping_128;
mod lwe_packing_keyswitch;
mod lwe_programmable_bootstrapping;
mod lwe_programmable_bootstrapping_128;
mod modulus_switch_noise_reduction;
mod noise_distribution;
pub struct CudaPackingKeySwitchKeys<Scalar: UnsignedInteger> {
pub lwe_sk: LweSecretKey<Vec<Scalar>>,
pub glwe_sk: GlweSecretKey<Vec<Scalar>>,

View File

@@ -10,9 +10,9 @@ use crate::core_crypto::prelude::{
/// A structure representing an LWE multi-bit bootstrap key on the GPU, generic over its torus precision.
#[derive(Debug)]
pub struct CudaLweMultiBitBootstrapKey {
pub struct CudaLweMultiBitBootstrapKey<Scalar: UnsignedInteger> {
// Pointers to GPU data
pub(crate) d_vec: CudaVec<u64>,
pub(crate) d_vec: CudaVec<Scalar>,
// Lwe dimension
pub(crate) input_lwe_dimension: LweDimension,
// Glwe dimension
@@ -27,14 +27,11 @@ pub struct CudaLweMultiBitBootstrapKey {
pub(crate) grouping_factor: LweBskGroupingFactor,
}
impl CudaLweMultiBitBootstrapKey {
pub fn from_lwe_multi_bit_bootstrap_key<InputBskCont: Container>(
impl<Scalar: UnsignedInteger> CudaLweMultiBitBootstrapKey<Scalar> {
pub fn from_lwe_multi_bit_bootstrap_key<InputBskCont: Container<Element = Scalar>>(
bsk: &LweMultiBitBootstrapKey<InputBskCont>,
streams: &CudaStreams,
) -> Self
where
InputBskCont::Element: UnsignedInteger,
{
) -> Self {
let input_lwe_dimension = bsk.input_lwe_dimension();
let polynomial_size = bsk.polynomial_size();
let decomp_level_count = bsk.decomposition_level_count();
@@ -43,7 +40,7 @@ impl CudaLweMultiBitBootstrapKey {
let grouping_factor = bsk.grouping_factor();
// Allocate memory
let mut d_vec = CudaVec::<u64>::new_multi_gpu(
let mut d_vec = CudaVec::<InputBskCont::Element>::new_multi_gpu(
lwe_multi_bit_bootstrap_key_size(
input_lwe_dimension,
glwe_dimension.to_glwe_size(),

View File

@@ -14,9 +14,11 @@ use crate::core_crypto::prelude::{
};
pub use algorithms::*;
pub use entities::*;
use std::any::{Any, TypeId};
use std::ffi::c_void;
use tfhe_cuda_backend::bindings::*;
use tfhe_cuda_backend::cuda_bind::*;
pub struct CudaStreams {
pub ptr: Vec<*mut c_void>,
pub gpu_indexes: Vec<GpuIndex>,
@@ -311,15 +313,18 @@ pub unsafe fn programmable_bootstrap_128_async<T: UnsignedInteger>(
/// [CudaStreams::synchronize] __must__ be called as soon as synchronization is
/// required
#[allow(clippy::too_many_arguments)]
pub unsafe fn programmable_bootstrap_multi_bit_async<T: UnsignedInteger>(
pub unsafe fn programmable_bootstrap_multi_bit_async<
T: UnsignedInteger,
B: Any + UnsignedInteger,
>(
streams: &CudaStreams,
lwe_array_out: &mut CudaVec<T>,
lwe_array_out: &mut CudaVec<B>,
output_indexes: &CudaVec<T>,
test_vector: &CudaVec<T>,
test_vector: &CudaVec<B>,
test_vector_indexes: &CudaVec<T>,
lwe_array_in: &CudaVec<T>,
input_indexes: &CudaVec<T>,
bootstrapping_key: &CudaVec<u64>,
bootstrapping_key: &CudaVec<B>,
lwe_dimension: LweDimension,
glwe_dimension: GlweDimension,
polynomial_size: PolynomialSize,
@@ -331,42 +336,83 @@ pub unsafe fn programmable_bootstrap_multi_bit_async<T: UnsignedInteger>(
let num_many_lut = 1u32;
let lut_stride = 0u32;
let mut pbs_buffer: *mut i8 = std::ptr::null_mut();
scratch_cuda_multi_bit_programmable_bootstrap_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
level.0 as u32,
num_samples,
true,
);
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
lwe_array_out.as_mut_c_ptr(0),
output_indexes.as_c_ptr(0),
test_vector.as_c_ptr(0),
test_vector_indexes.as_c_ptr(0),
lwe_array_in.as_c_ptr(0),
input_indexes.as_c_ptr(0),
bootstrapping_key.as_c_ptr(0),
pbs_buffer,
lwe_dimension.0 as u32,
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
grouping_factor.0 as u32,
base_log.0 as u32,
level.0 as u32,
num_samples,
num_many_lut,
lut_stride,
);
cleanup_cuda_multi_bit_programmable_bootstrap(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
);
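// Dispatch on the monomorphized torus type: stable Rust has no specialization,
// so the 128-bit and 64-bit backends are selected at runtime.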
if TypeId::of::<B>() == TypeId::of::<u128>() {
scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
level.0 as u32,
num_samples,
true,
);
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
streams.ptr[0],
streams.gpu_indexes[0].get(),
lwe_array_out.as_mut_c_ptr(0),
output_indexes.as_c_ptr(0),
test_vector.as_c_ptr(0),
test_vector_indexes.as_c_ptr(0),
lwe_array_in.as_c_ptr(0),
input_indexes.as_c_ptr(0),
bootstrapping_key.as_c_ptr(0),
pbs_buffer,
lwe_dimension.0 as u32,
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
grouping_factor.0 as u32,
base_log.0 as u32,
level.0 as u32,
num_samples,
num_many_lut,
lut_stride,
);
cleanup_cuda_multi_bit_programmable_bootstrap_128(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
);
} else if TypeId::of::<B>() == TypeId::of::<u64>() {
scratch_cuda_multi_bit_programmable_bootstrap_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
level.0 as u32,
num_samples,
true,
);
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
lwe_array_out.as_mut_c_ptr(0),
output_indexes.as_c_ptr(0),
test_vector.as_c_ptr(0),
test_vector_indexes.as_c_ptr(0),
lwe_array_in.as_c_ptr(0),
input_indexes.as_c_ptr(0),
bootstrapping_key.as_c_ptr(0),
pbs_buffer,
lwe_dimension.0 as u32,
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
grouping_factor.0 as u32,
base_log.0 as u32,
level.0 as u32,
num_samples,
num_many_lut,
lut_stride,
);
cleanup_cuda_multi_bit_programmable_bootstrap(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
);
} else {
panic!("Unsupported torus size")
}
}
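`programmable_bootstrap_multi_bit_async` now picks its CUDA entry points from the monomorphized type `B` at runtime. A standalone sketch of the `TypeId` dispatch pattern, with strings standing in for the FFI calls:
use std::any::{Any, TypeId};
// Recover the concrete torus type at runtime and route to the matching
// backend entry point; the returned strings stand in for the FFI calls.
fn select_pbs_path<B: Any>() -> &'static str {
    if TypeId::of::<B>() == TypeId::of::<u128>() {
        "cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128"
    } else if TypeId::of::<B>() == TypeId::of::<u64>() {
        "cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64"
    } else {
        panic!("Unsupported torus size")
    }
}
fn main() {
    assert_eq!(
        select_pbs_path::<u64>(),
        "cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64"
    );
    assert_eq!(
        select_pbs_path::<u128>(),
        "cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128"
    );
}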
#[allow(clippy::too_many_arguments)]
@@ -607,9 +653,9 @@ pub unsafe fn convert_lwe_programmable_bootstrap_key_async<T: UnsignedInteger>(
/// [CudaStreams::synchronize] __must__ be called as soon as synchronization is
/// required
#[allow(clippy::too_many_arguments)]
pub unsafe fn convert_lwe_multi_bit_programmable_bootstrap_key_async<T: UnsignedInteger>(
pub unsafe fn convert_lwe_multi_bit_programmable_bootstrap_key_async<T: Any + UnsignedInteger>(
streams: &CudaStreams,
dest: &mut CudaVec<u64>,
dest: &mut CudaVec<T>,
src: &[T],
input_lwe_dim: LweDimension,
glwe_dim: GlweDimension,
@@ -620,17 +666,34 @@ pub unsafe fn convert_lwe_multi_bit_programmable_bootstrap_key_async<T: Unsigned
let size = std::mem::size_of_val(src);
for (i, &stream_ptr) in streams.ptr.iter().enumerate() {
assert_eq!(dest.len() * std::mem::size_of::<T>(), size);
cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
stream_ptr,
streams.gpu_indexes[i].get(),
dest.as_mut_c_ptr(i as u32),
src.as_ptr().cast(),
input_lwe_dim.0 as u32,
glwe_dim.0 as u32,
l_gadget.0 as u32,
polynomial_size.0 as u32,
grouping_factor.0 as u32,
);
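// As above, route the key conversion to the entry point matching the concrete torus type T.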
if TypeId::of::<T>() == TypeId::of::<u128>() {
cuda_convert_lwe_multi_bit_programmable_bootstrap_key_128(
stream_ptr,
streams.gpu_indexes[i].get(),
dest.as_mut_c_ptr(i as u32),
src.as_ptr().cast(),
input_lwe_dim.0 as u32,
glwe_dim.0 as u32,
l_gadget.0 as u32,
polynomial_size.0 as u32,
grouping_factor.0 as u32,
);
} else if TypeId::of::<T>() == TypeId::of::<u64>() {
cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
stream_ptr,
streams.gpu_indexes[i].get(),
dest.as_mut_c_ptr(i as u32),
src.as_ptr().cast(),
input_lwe_dim.0 as u32,
glwe_dim.0 as u32,
l_gadget.0 as u32,
polynomial_size.0 as u32,
grouping_factor.0 as u32,
);
} else {
panic!("Unsupported torus size for bsk conversion")
}
}
}

View File

@@ -30,7 +30,7 @@ pub struct CudaCompressionKey {
}
pub struct CudaDecompressionKey {
pub blind_rotate_key: CudaBootstrappingKey,
pub blind_rotate_key: CudaBootstrappingKey<u64>,
pub lwe_per_glwe: LweCiphertextCount,
pub glwe_dimension: GlweDimension,
pub polynomial_size: PolynomialSize,

View File

@@ -22,9 +22,9 @@ use crate::shortint::{CarryModulus, CiphertextModulus, MessageModulus, PBSOrder}
mod radix;
pub enum CudaBootstrappingKey {
pub enum CudaBootstrappingKey<Scalar: UnsignedInteger> {
Classic(CudaLweBootstrapKey),
MultiBit(CudaLweMultiBitBootstrapKey),
MultiBit(CudaLweMultiBitBootstrapKey<Scalar>),
}
/// A structure containing the server public key.
@@ -34,7 +34,7 @@ pub enum CudaBootstrappingKey {
// #[derive(PartialEq, Serialize, Deserialize)]
pub struct CudaServerKey {
pub key_switching_key: CudaLweKeyswitchKey<u64>,
pub bootstrapping_key: CudaBootstrappingKey,
pub bootstrapping_key: CudaBootstrappingKey<u64>,
// Size of the message buffer
pub message_modulus: MessageModulus,
// Size of the carry buffer

View File

@@ -1,6 +1,6 @@
use crate::core_crypto::prelude::*;
use crate::shortint::parameters::noise_squashing::{
NoiseSquashingCompressionParameters, NoiseSquashingParameters,
NoiseSquashingCompressionParameters, NoiseSquashingMultiBitParameters, NoiseSquashingParameters,
};
use crate::shortint::parameters::{
CoreCiphertextModulus, ModulusSwitchNoiseReductionParams, ModulusSwitchType,
@@ -69,3 +69,8 @@ pub enum NoiseSquashingParametersVersions {
pub enum NoiseSquashingCompressionParametersVersions {
V0(NoiseSquashingCompressionParameters),
}
#[derive(VersionsDispatch)]
pub enum NoiseSquashingMultiBitParametersVersions {
V0(NoiseSquashingMultiBitParameters),
}

View File

@@ -1,18 +1,19 @@
use std::sync::LazyLock;
use super::atomic_pattern::AtomicPatternParameters;
use crate::keycache::utils::named_params_impl;
use crate::keycache::*;
#[cfg(tarpaulin)]
use crate::shortint::parameters::coverage_parameters::*;
use crate::shortint::parameters::current_params::*;
use crate::shortint::parameters::noise_squashing::NoiseSquashingMultiBitParameters;
use crate::shortint::parameters::parameters_wopbs::*;
use crate::shortint::parameters::v1_3::V1_3_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
use crate::shortint::parameters::*;
use crate::shortint::wopbs::WopbsKey;
use crate::shortint::{ClientKey, KeySwitchingKey, ServerKey};
use serde::{Deserialize, Serialize};
named_params_impl!( ShortintParameterSet =>
V1_3_PARAM_MESSAGE_1_CARRY_0_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
@@ -492,6 +493,10 @@ named_params_impl!( NoiseSquashingParameters =>
V1_3_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
);
named_params_impl!( NoiseSquashingMultiBitParameters =>
V1_3_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
);
impl From<AtomicPatternParameters> for (ClientKey, ServerKey) {
fn from(param: AtomicPatternParameters) -> Self {
let param_set = ShortintParameterSet::from(param);

View File

@@ -46,8 +46,13 @@ use current_params::multi_bit::tuniform::p_fail_2_minus_64::ks_pbs_gpu::{
};
use current_params::noise_squashing::p_fail_2_minus_128::V1_3_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
use super::current_params::V1_3_NOISE_SQUASHING_COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
use super::current_params::{
V1_3_NOISE_SQUASHING_COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
V1_3_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
};
use super::NoiseSquashingCompressionParameters;
use crate::shortint::parameters::noise_squashing::NoiseSquashingMultiBitParameters;
// Aliases
// Compute Gaussian
@@ -125,6 +130,10 @@ pub const NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128:
NoiseSquashingParameters =
V1_3_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
pub const NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128:
NoiseSquashingMultiBitParameters =
V1_3_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
// GPU 2^-64
// GPU TUniform
pub const PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_1_CARRY_1_KS_PBS_TUNIFORM_2M64:

View File

@@ -1,3 +1,4 @@
use crate::core_crypto::prelude::LweBskGroupingFactor;
use crate::shortint::backward_compatibility::parameters::noise_squashing::*;
use crate::shortint::parameters::{
CarryModulus, CoreCiphertextModulus, DecompositionBaseLog, DecompositionLevelCount,
@@ -34,3 +35,17 @@ pub struct NoiseSquashingCompressionParameters {
pub carry_modulus: CarryModulus,
pub ciphertext_modulus: CoreCiphertextModulus<u128>,
}
#[derive(Copy, Clone, Debug, PartialEq, Serialize, Deserialize, Versionize)]
#[versionize(NoiseSquashingMultiBitParametersVersions)]
pub struct NoiseSquashingMultiBitParameters {
pub glwe_dimension: GlweDimension,
pub polynomial_size: PolynomialSize,
pub glwe_noise_distribution: DynamicDistribution<u128>,
pub decomp_base_log: DecompositionBaseLog,
pub decomp_level_count: DecompositionLevelCount,
pub grouping_factor: LweBskGroupingFactor,
pub message_modulus: MessageModulus,
pub carry_modulus: CarryModulus,
pub ciphertext_modulus: CoreCiphertextModulus<u128>,
}

View File

@@ -43,6 +43,7 @@ pub use noise_squashing::p_fail_2_minus_128::*;
#[cfg(feature = "hpu")]
pub use hpu::*;
use crate::shortint::parameters::noise_squashing::NoiseSquashingMultiBitParameters;
use crate::shortint::parameters::{
ClassicPBSParameters, CompactPublicKeyEncryptionParameters, CompressionParameters,
KeySwitch32PBSParameters, MultiBitPBSParameters, NoiseSquashingCompressionParameters,
@@ -1700,6 +1701,14 @@ pub const VEC_ALL_NOISE_SQUASHING_PARAMETERS: [(&NoiseSquashingParameters, &str)
),
];
pub const VEC_ALL_NOISE_SQUASHING_MULTI_BIT_PARAMETERS: [(
&NoiseSquashingMultiBitParameters,
&str,
); 1] = [(
&V1_3_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
"V1_3_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128",
)];
/// All [`NoiseSquashingCompressionParameters`] in this module.
pub const VEC_ALL_NOISE_SQUASHING_COMPRESSION_PARAMETERS: [(
&NoiseSquashingCompressionParameters,

View File

@@ -1,6 +1,7 @@
use crate::shortint::parameters::noise_squashing::NoiseSquashingMultiBitParameters;
use crate::shortint::parameters::{
CarryModulus, CoreCiphertextModulus, DecompositionBaseLog, DecompositionLevelCount,
DynamicDistribution, GlweDimension, LweCiphertextCount, MessageModulus,
DynamicDistribution, GlweDimension, LweBskGroupingFactor, LweCiphertextCount, MessageModulus,
ModulusSwitchNoiseReductionParams, ModulusSwitchType, NoiseEstimationMeasureBound,
NoiseSquashingCompressionParameters, NoiseSquashingParameters, PolynomialSize, RSigmaFactor,
Variance,
@@ -58,3 +59,16 @@ pub const V1_3_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128
carry_modulus: CarryModulus(4),
ciphertext_modulus: CoreCiphertextModulus::<u128>::new_native(),
};
pub const V1_3_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128:
NoiseSquashingMultiBitParameters = NoiseSquashingMultiBitParameters {
glwe_dimension: GlweDimension(2),
polynomial_size: PolynomialSize(2048),
glwe_noise_distribution: DynamicDistribution::new_t_uniform(30),
decomp_base_log: DecompositionBaseLog(23),
decomp_level_count: DecompositionLevelCount(3),
grouping_factor: LweBskGroupingFactor(4),
message_modulus: MessageModulus(4),
carry_modulus: CarryModulus(4),
ciphertext_modulus: CoreCiphertextModulus::<u128>::new_native(),
};