diff --git a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h index 0e31e2c0c..d6f08067d 100644 --- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h +++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h @@ -66,6 +66,9 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( uint32_t num_many_lut, uint32_t lut_stride); template +uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_128_keybundle( + uint32_t polynomial_size); +template uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle( uint32_t polynomial_size); template @@ -95,8 +98,12 @@ uint64_t get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap( template uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs, - uint32_t polynomial_size); - + uint32_t polynomial_size, + uint64_t full_sm_keybundle); +template +uint32_t get_lwe_chunk_size_128(uint32_t gpu_index, uint32_t max_num_pbs, + uint32_t polynomial_size, + uint64_t full_sm_keybundle); template struct pbs_buffer { int8_t *d_mem_keybundle = NULL; int8_t *d_mem_acc_step_one = NULL; @@ -281,4 +288,146 @@ template struct pbs_buffer { } }; +template +struct pbs_buffer_128 { + int8_t *d_mem_keybundle = NULL; + int8_t *d_mem_acc_step_one = NULL; + int8_t *d_mem_acc_step_two = NULL; + int8_t *d_mem_acc_cg = NULL; + int8_t *d_mem_acc_tbc = NULL; + uint32_t lwe_chunk_size; + double *keybundle_fft; + __uint128_t *global_accumulator; + double *global_join_buffer; + + PBS_VARIANT pbs_variant; + bool gpu_memory_allocated; + + pbs_buffer_128(cudaStream_t stream, uint32_t gpu_index, + uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t level_count, uint32_t input_lwe_ciphertext_count, + uint32_t lwe_chunk_size, PBS_VARIANT pbs_variant, + bool allocate_gpu_memory, uint64_t *size_tracker) { + gpu_memory_allocated = allocate_gpu_memory; + cuda_set_device(gpu_index); + + this->pbs_variant = pbs_variant; + this->lwe_chunk_size = lwe_chunk_size; + auto max_shared_memory = cuda_get_max_shared_memory(gpu_index); + + // default + uint64_t full_sm_keybundle = + get_buffer_size_full_sm_multibit_programmable_bootstrap_128_keybundle< + __uint128_t>(polynomial_size); + uint64_t full_sm_accumulate_step_one = + get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one< + __uint128_t>(polynomial_size); + uint64_t full_sm_accumulate_step_two = + get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two< + __uint128_t>(polynomial_size); + uint64_t partial_sm_accumulate_step_one = + get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one< + __uint128_t>(polynomial_size); + // cg + uint64_t full_sm_cg_accumulate = + get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<__uint128_t>( + polynomial_size); + uint64_t partial_sm_cg_accumulate = + get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap< + __uint128_t>(polynomial_size); + + auto num_blocks_keybundle = input_lwe_ciphertext_count * lwe_chunk_size * + (glwe_dimension + 1) * (glwe_dimension + 1) * + level_count; + auto num_blocks_acc_step_one = + level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count; + auto num_blocks_acc_step_two = + input_lwe_ciphertext_count * (glwe_dimension + 1); + auto num_blocks_acc_cg = + level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count; + + // Keybundle + if (max_shared_memory < full_sm_keybundle) + d_mem_keybundle = (int8_t *)cuda_malloc_with_size_tracking_async( + 
num_blocks_keybundle * full_sm_keybundle, stream, gpu_index, + size_tracker, allocate_gpu_memory); + + switch (pbs_variant) { + case PBS_VARIANT::CG: + // Accumulator CG + if (max_shared_memory < partial_sm_cg_accumulate) + d_mem_acc_cg = (int8_t *)cuda_malloc_with_size_tracking_async( + num_blocks_acc_cg * full_sm_cg_accumulate, stream, gpu_index, + size_tracker, allocate_gpu_memory); + else if (max_shared_memory < full_sm_cg_accumulate) + d_mem_acc_cg = (int8_t *)cuda_malloc_with_size_tracking_async( + num_blocks_acc_cg * partial_sm_cg_accumulate, stream, gpu_index, + size_tracker, allocate_gpu_memory); + break; + case PBS_VARIANT::DEFAULT: + // Accumulator step one + if (max_shared_memory < partial_sm_accumulate_step_one) + d_mem_acc_step_one = (int8_t *)cuda_malloc_with_size_tracking_async( + num_blocks_acc_step_one * full_sm_accumulate_step_one, stream, + gpu_index, size_tracker, allocate_gpu_memory); + else if (max_shared_memory < full_sm_accumulate_step_one) + d_mem_acc_step_one = (int8_t *)cuda_malloc_with_size_tracking_async( + num_blocks_acc_step_one * partial_sm_accumulate_step_one, stream, + gpu_index, size_tracker, allocate_gpu_memory); + + // Accumulator step two + if (max_shared_memory < full_sm_accumulate_step_two) + d_mem_acc_step_two = (int8_t *)cuda_malloc_with_size_tracking_async( + num_blocks_acc_step_two * full_sm_accumulate_step_two, stream, + gpu_index, size_tracker, allocate_gpu_memory); + break; + default: + PANIC("Cuda error (PBS): unsupported implementation variant.") + } + + keybundle_fft = (double *)cuda_malloc_with_size_tracking_async( + num_blocks_keybundle * (polynomial_size / 2) * 4 * sizeof(double), + stream, gpu_index, size_tracker, allocate_gpu_memory); + global_accumulator = (__uint128_t *)cuda_malloc_with_size_tracking_async( + input_lwe_ciphertext_count * (glwe_dimension + 1) * polynomial_size * + sizeof(__uint128_t), + stream, gpu_index, size_tracker, allocate_gpu_memory); + global_join_buffer = (double *)cuda_malloc_with_size_tracking_async( + level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count * + (polynomial_size / 2) * 4 * sizeof(double), + stream, gpu_index, size_tracker, allocate_gpu_memory); + } + + void release(cudaStream_t stream, uint32_t gpu_index) { + + if (d_mem_keybundle) + cuda_drop_with_size_tracking_async(d_mem_keybundle, stream, gpu_index, + gpu_memory_allocated); + switch (pbs_variant) { + case DEFAULT: + if (d_mem_acc_step_one) + cuda_drop_with_size_tracking_async(d_mem_acc_step_one, stream, + gpu_index, gpu_memory_allocated); + if (d_mem_acc_step_two) + cuda_drop_with_size_tracking_async(d_mem_acc_step_two, stream, + gpu_index, gpu_memory_allocated); + break; + case CG: + if (d_mem_acc_cg) + cuda_drop_with_size_tracking_async(d_mem_acc_cg, stream, gpu_index, + gpu_memory_allocated); + break; + default: + PANIC("Cuda error (PBS): unsupported implementation variant.") + } + + cuda_drop_with_size_tracking_async(keybundle_fft, stream, gpu_index, + gpu_memory_allocated); + cuda_drop_with_size_tracking_async(global_accumulator, stream, gpu_index, + gpu_memory_allocated); + cuda_drop_with_size_tracking_async(global_join_buffer, stream, gpu_index, + gpu_memory_allocated); + } +}; + #endif // CUDA_MULTI_BIT_UTILITIES_H diff --git a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h index e9db8c86e..4b35c8acd 100644 --- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h +++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h @@ -240,7 
+240,10 @@ template struct pbs_buffer { } }; -template struct pbs_buffer_128 { +template struct pbs_buffer_128; + +template +struct pbs_buffer_128 { int8_t *d_mem; __uint128_t *global_accumulator; diff --git a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h index fcca1ce5e..68f2e2719 100644 --- a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h +++ b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h @@ -15,6 +15,11 @@ void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64( uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size, uint32_t grouping_factor); +void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_128( + void *stream, uint32_t gpu_index, void *dest, void const *src, + uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count, + uint32_t polynomial_size, uint32_t grouping_factor); + uint64_t scratch_cuda_multi_bit_programmable_bootstrap_64( void *stream, uint32_t gpu_index, int8_t **pbs_buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, @@ -33,6 +38,25 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream, uint32_t gpu_index, int8_t **pbs_buffer); + +uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64( + void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t level_count, + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory); + +void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128( + void *stream, uint32_t gpu_index, void *lwe_array_out, + void const *lwe_output_indexes, void const *lut_vector, + void const *lut_vector_indexes, void const *lwe_array_in, + void const *lwe_input_indexes, void const *bootstrapping_key, + int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, + uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, + uint32_t lut_stride); + +void cleanup_cuda_multi_bit_programmable_bootstrap_128(void *stream, + const uint32_t gpu_index, + int8_t **buffer); } #endif // CUDA_MULTI_BIT_H diff --git a/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt b/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt index 5fec699fb..f0924d411 100644 --- a/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt +++ b/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt @@ -1,5 +1,6 @@ file(GLOB_RECURSE SOURCES "*.cu") -add_library(tfhe_cuda_backend STATIC ${SOURCES}) +add_library(tfhe_cuda_backend STATIC ${SOURCES} pbs/programmable_bootstrap_multibit_128.cuh + pbs/programmable_bootstrap_multibit_128.cu) set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(tfhe_cuda_backend PUBLIC cudart OpenMP::OpenMP_CXX) target_include_directories(tfhe_cuda_backend PRIVATE .) 
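The new extern entry points declared in programmable_bootstrap_multibit.h follow the same scratch/run/cleanup lifecycle as the existing 64-bit multi-bit PBS. Below is a minimal host-side sketch of the intended call order, assuming the device buffers (d_bsk, d_lwe_in, d_lwe_out, d_lut and the index arrays) and the scalar parameters are prepared elsewhere; the variable names are illustrative only and not part of the backend API.

    // Illustrative call sequence only; buffer setup and error handling omitted.
    int8_t *pbs_scratch = nullptr;

    // 1) Upload the multi-bit bootstrapping key in its 128-bit torus layout.
    cuda_convert_lwe_multi_bit_programmable_bootstrap_key_128(
        stream, gpu_index, d_bsk, h_bsk, lwe_dimension, glwe_dimension,
        level_count, polynomial_size, grouping_factor);

    // 2) Allocate the PBS working buffer; the uint64_t return reports the
    //    tracked allocation size.
    scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64(
        stream, gpu_index, &pbs_scratch, glwe_dimension, polynomial_size,
        level_count, num_samples, /*allocate_gpu_memory=*/true);

    // 3) Bootstrap a batch of ciphertexts (64-bit input torus, 128-bit output).
    cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
        stream, gpu_index, d_lwe_out, d_lwe_out_indexes, d_lut, d_lut_indexes,
        d_lwe_in, d_lwe_in_indexes, d_bsk, pbs_scratch, lwe_dimension,
        glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
        num_samples, num_many_lut, lut_stride);

    // 4) Release the working buffer.
    cleanup_cuda_multi_bit_programmable_bootstrap_128(stream, gpu_index,
                                                      &pbs_scratch);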
diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh index 6080a5ffc..9d0205197 100644 --- a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh @@ -66,6 +66,13 @@ __device__ inline void typecast_torus_to_double(uint64_t x, r = __ll2double_rn(x); } +template <> +__device__ inline void typecast_torus_to_double<__uint128_t>(__uint128_t x, + double &r) { + // We truncate x + r = __ll2double_rn(static_cast(x)); +} + template __device__ inline T init_decomposer_state(T input, uint32_t base_log, uint32_t level_count) { diff --git a/backends/tfhe-cuda-backend/cuda/src/fft128/fft128.cuh b/backends/tfhe-cuda-backend/cuda/src/fft128/fft128.cuh index ab6ad0484..15bbd03c6 100644 --- a/backends/tfhe-cuda-backend/cuda/src/fft128/fft128.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/fft128/fft128.cuh @@ -234,6 +234,29 @@ __device__ void convert_u128_to_f128_as_torus( } } +// params is expected to be full degree not half degree +// same as convert_u128_to_f128_as_torus() but expects input to be on registers +template +__device__ void convert_u128_on_regs_to_f128_as_torus( + double *out_re_hi, double *out_re_lo, double *out_im_hi, double *out_im_lo, + const __uint128_t *in_re_on_regs, const __uint128_t *in_im_on_regs) { + + const double normalization = pow(2., -128.); + Index tid = threadIdx.x; + // #pragma unroll + for (Index i = 0; i < params::opt / 2; i++) { + auto out_re = u128_to_signed_to_f128(in_re_on_regs[i]); + auto out_im = u128_to_signed_to_f128(in_im_on_regs[i]); + + out_re_hi[tid] = out_re.hi * normalization; + out_re_lo[tid] = out_re.lo * normalization; + out_im_hi[tid] = out_im.hi * normalization; + out_im_lo[tid] = out_im.lo * normalization; + + tid += params::degree / params::opt; + } +} + template __device__ void convert_f128_to_u128_as_torus(__uint128_t *out_re, __uint128_t *out_im, @@ -272,7 +295,7 @@ batch_convert_u128_to_f128_as_integer(double *out_re_hi, double *out_re_lo, } // params is expected to be full degree not half degree -// converts standqard input into complex<128> represented by 4 double +// converts standard input into complex<128> represented by 4 double // with following pattern: [re_hi_0, re_hi_1, ... re_hi_n, re_lo_0, re_lo_1, // ... 
re_lo_n, im_hi_0, im_hi_1, ..., im_hi_n, im_lo_0, im_lo_1, ..., im_lo_n] template @@ -291,7 +314,7 @@ batch_convert_u128_to_f128_as_torus(double *out_re_hi, double *out_re_lo, } // params is expected to be full degree not half degree -// converts standqard input into complex<128> represented by 4 double +// converts standard input into complex<128> represented by 4 double // with following pattern: [re_hi_0, re_lo_0, im_hi_0, im_lo_0, re_hi_1, // re_lo_1, im_hi_1, im_lo_1, // ...,re_hi_n, re_lo_n, im_hi_n, im_lo_n, ] diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cu index ef6e651a7..b00182131 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cu +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cu @@ -35,6 +35,20 @@ void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64( static_cast(stream), gpu_index); } +void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_128( + void *stream, uint32_t gpu_index, void *dest, void const *src, + uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count, + uint32_t polynomial_size, uint32_t grouping_factor) { + uint32_t total_polynomials = input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * + level_count * (1 << grouping_factor) / + grouping_factor; + size_t buffer_size = + total_polynomials * polynomial_size * sizeof(__uint128_t); + + cuda_memcpy_async_to_gpu((__uint128_t *)dest, (__uint128_t *)src, buffer_size, + static_cast(stream), gpu_index); +} + // We need these lines so the compiler knows how to specialize these functions template __device__ const uint64_t * get_ith_mask_kth_block(const uint64_t *ptr, int i, int k, int level, @@ -80,6 +94,14 @@ template __device__ double2 *get_ith_body_kth_block(double2 *ptr, int i, int k, int glwe_dimension, uint32_t level_count); +template __device__ const __uint128_t * +get_multi_bit_ith_lwe_gth_group_kth_block(const __uint128_t *ptr, int g, int i, + int k, int level, + uint32_t grouping_factor, + uint32_t polynomial_size, + uint32_t glwe_dimension, + uint32_t level_count); + template __device__ const uint64_t *get_multi_bit_ith_lwe_gth_group_kth_block( const uint64_t *ptr, int g, int i, int k, int level, uint32_t grouping_factor, uint32_t polynomial_size, uint32_t glwe_dimension, diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh index 3137b0f87..17235618b 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh @@ -83,6 +83,62 @@ mul_ggsw_glwe_in_fourier_domain(double2 *fft, double2 *join_buffer, __syncthreads(); } +/** Perform the matrix multiplication between the GGSW and the GLWE, + * each block operating on a single level for mask and body. + * Both operands should be at fourier domain + * + * This function assumes: + * - Thread blocks at dimension z relates to the decomposition level. + * - Thread blocks at dimension y relates to the glwe dimension. 
+ * - polynomial_size / params::opt threads are available per block + */ +template +__device__ void mul_ggsw_glwe_in_fourier_domain_128( + double *fft, double *join_buffer, + const double *__restrict__ bootstrapping_key, int iteration, G &group, + bool support_dsm = false) { + const uint32_t polynomial_size = params::degree; + const uint32_t glwe_dimension = gridDim.y - 1; + const uint32_t level_count = gridDim.z; + + // The first product is used to initialize level_join_buffer + auto this_block_rank = get_this_block_rank(group, support_dsm); + + // Continues multiplying fft by every polynomial in that particular bsk level + // Each y-block accumulates in a different polynomial at each iteration + auto bsk_slice = get_ith_mask_kth_block_128( + bootstrapping_key, iteration, blockIdx.y, blockIdx.z, polynomial_size, + glwe_dimension, level_count); + for (int j = 0; j < glwe_dimension + 1; j++) { + int idx = (j + this_block_rank) % (glwe_dimension + 1); + + auto bsk_poly = bsk_slice + idx * polynomial_size / 2 * 4; + auto buffer_slice = get_join_buffer_element_128( + blockIdx.z, idx, group, join_buffer, polynomial_size, glwe_dimension, + support_dsm); + + polynomial_product_accumulate_in_fourier_domain_128( + buffer_slice, fft, bsk_poly, j == 0); + group.sync(); + } + + // ----------------------------------------------------------------- + // All blocks are synchronized here; after this sync, level_join_buffer has + // the values needed from every other block + + // accumulate rest of the products into fft buffer + for (int l = 0; l < level_count; l++) { + auto cur_src_acc = get_join_buffer_element_128( + l, blockIdx.y, group, join_buffer, polynomial_size, glwe_dimension, + support_dsm); + + polynomial_accumulate_in_fourier_domain_128(fft, cur_src_acc, + l == 0); + } + + __syncthreads(); +} + template void execute_pbs_async( cudaStream_t const *streams, uint32_t const *gpu_indexes, diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh index 57e37b424..e9362114b 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh @@ -280,8 +280,9 @@ __host__ uint64_t scratch_cg_multi_bit_programmable_bootstrap( check_cuda_error(cudaGetLastError()); } - auto lwe_chunk_size = get_lwe_chunk_size( - gpu_index, input_lwe_ciphertext_count, polynomial_size); + auto lwe_chunk_size = + get_lwe_chunk_size(gpu_index, input_lwe_ciphertext_count, + polynomial_size, full_sm_keybundle); uint64_t size_tracker = 0; *buffer = new pbs_buffer( stream, gpu_index, glwe_dimension, polynomial_size, level_count, diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic_128.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic_128.cuh index 73770f39e..dc3068cc8 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic_128.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic_128.cuh @@ -18,62 +18,6 @@ #include "programmable_bootstrap.cuh" #include "types/complex/operations.cuh" -/** Perform the matrix multiplication between the GGSW and the GLWE, - * each block operating on a single level for mask and body. - * Both operands should be at fourier domain - * - * This function assumes: - * - Thread blocks at dimension z relates to the decomposition level. 
- * - Thread blocks at dimension y relates to the glwe dimension. - * - polynomial_size / params::opt threads are available per block - */ -template -__device__ void mul_ggsw_glwe_in_fourier_domain_128( - double *fft, double *join_buffer, - const double *__restrict__ bootstrapping_key, int iteration, G &group, - bool support_dsm = false) { - const uint32_t polynomial_size = params::degree; - const uint32_t glwe_dimension = gridDim.y - 1; - const uint32_t level_count = gridDim.z; - - // The first product is used to initialize level_join_buffer - auto this_block_rank = get_this_block_rank(group, support_dsm); - - // Continues multiplying fft by every polynomial in that particular bsk level - // Each y-block accumulates in a different polynomial at each iteration - auto bsk_slice = get_ith_mask_kth_block_128( - bootstrapping_key, iteration, blockIdx.y, blockIdx.z, polynomial_size, - glwe_dimension, level_count); - for (int j = 0; j < glwe_dimension + 1; j++) { - int idx = (j + this_block_rank) % (glwe_dimension + 1); - - auto bsk_poly = bsk_slice + idx * polynomial_size / 2 * 4; - auto buffer_slice = get_join_buffer_element_128( - blockIdx.z, idx, group, join_buffer, polynomial_size, glwe_dimension, - support_dsm); - - polynomial_product_accumulate_in_fourier_domain_128( - buffer_slice, fft, bsk_poly, j == 0); - group.sync(); - } - - // ----------------------------------------------------------------- - // All blocks are synchronized here; after this sync, level_join_buffer has - // the values needed from every other block - - // accumulate rest of the products into fft buffer - for (int l = 0; l < level_count; l++) { - auto cur_src_acc = get_join_buffer_element_128( - l, blockIdx.y, group, join_buffer, polynomial_size, glwe_dimension, - support_dsm); - - polynomial_accumulate_in_fourier_domain_128(fft, cur_src_acc, - l == 0); - } - - __syncthreads(); -} - template __global__ void __launch_bounds__(params::degree / params::opt) @@ -174,9 +118,6 @@ __global__ void __launch_bounds__(params::degree / params::opt) accumulator); gadget_acc.decompose_and_compress_level_128(accumulator_fft, blockIdx.z); - // We are using the same memory space for accumulator_fft and - // accumulator_rotated, so we need to synchronize here to make sure they - // don't modify the same memory space at the same time // Switch to the FFT space auto acc_fft_re_hi = accumulator_fft + 0 * params::degree / 2; auto acc_fft_re_lo = accumulator_fft + 1 * params::degree / 2; diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu index a1d1dc5af..ef4506ed9 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu @@ -455,11 +455,8 @@ void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream, */ template uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs, - uint32_t polynomial_size) { - - uint64_t full_sm_keybundle = - get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle( - polynomial_size); + uint32_t polynomial_size, + uint64_t full_sm_keybundle) { int max_blocks_per_sm; auto max_shared_memory = cuda_get_max_shared_memory(gpu_index); diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh index f80ca5f16..44d31fa84 100644 --- 
a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh @@ -521,8 +521,9 @@ __host__ uint64_t scratch_multi_bit_programmable_bootstrap( check_cuda_error(cudaGetLastError()); } - auto lwe_chunk_size = get_lwe_chunk_size( - gpu_index, input_lwe_ciphertext_count, polynomial_size); + auto lwe_chunk_size = + get_lwe_chunk_size(gpu_index, input_lwe_ciphertext_count, + polynomial_size, full_sm_keybundle); uint64_t size_tracker = 0; *buffer = new pbs_buffer( stream, gpu_index, glwe_dimension, polynomial_size, level_count, diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cu new file mode 100644 index 000000000..3297f0761 --- /dev/null +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cu @@ -0,0 +1,361 @@ +#include "programmable_bootstrap_cg_multibit.cuh" +#include "programmable_bootstrap_multibit_128.cuh" + +template +uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128( + void *stream, uint32_t gpu_index, + pbs_buffer_128 **buffer, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t level_count, + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { + + switch (polynomial_size) { + case 256: + return scratch_multi_bit_programmable_bootstrap_128>( + static_cast(stream), gpu_index, buffer, glwe_dimension, + polynomial_size, level_count, input_lwe_ciphertext_count, + allocate_gpu_memory); + case 512: + return scratch_multi_bit_programmable_bootstrap_128>( + static_cast(stream), gpu_index, buffer, glwe_dimension, + polynomial_size, level_count, input_lwe_ciphertext_count, + allocate_gpu_memory); + case 1024: + return scratch_multi_bit_programmable_bootstrap_128>( + static_cast(stream), gpu_index, buffer, glwe_dimension, + polynomial_size, level_count, input_lwe_ciphertext_count, + allocate_gpu_memory); + case 2048: + return scratch_multi_bit_programmable_bootstrap_128>( + static_cast(stream), gpu_index, buffer, glwe_dimension, + polynomial_size, level_count, input_lwe_ciphertext_count, + allocate_gpu_memory); + case 4096: + return scratch_multi_bit_programmable_bootstrap_128>( + static_cast(stream), gpu_index, buffer, glwe_dimension, + polynomial_size, level_count, input_lwe_ciphertext_count, + allocate_gpu_memory); + default: + PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. 
Supported " + "N's are powers of two" + " in the interval [256..4096].") + } +} + +template +uint64_t scratch_cuda_cg_multi_bit_programmable_bootstrap_128( + void *stream, uint32_t gpu_index, + pbs_buffer_128 **buffer, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t level_count, + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { + + switch (polynomial_size) { + case 256: + return scratch_cg_multi_bit_programmable_bootstrap_128< + InputTorus, AmortizedDegree<256>>( + static_cast(stream), gpu_index, buffer, glwe_dimension, + polynomial_size, level_count, input_lwe_ciphertext_count, + allocate_gpu_memory); + case 512: + return scratch_cg_multi_bit_programmable_bootstrap_128< + InputTorus, AmortizedDegree<512>>( + static_cast(stream), gpu_index, buffer, glwe_dimension, + polynomial_size, level_count, input_lwe_ciphertext_count, + allocate_gpu_memory); + case 1024: + return scratch_cg_multi_bit_programmable_bootstrap_128< + InputTorus, AmortizedDegree<1024>>( + static_cast(stream), gpu_index, buffer, glwe_dimension, + polynomial_size, level_count, input_lwe_ciphertext_count, + allocate_gpu_memory); + case 2048: + return scratch_cg_multi_bit_programmable_bootstrap_128< + InputTorus, AmortizedDegree<2048>>( + static_cast(stream), gpu_index, buffer, glwe_dimension, + polynomial_size, level_count, input_lwe_ciphertext_count, + allocate_gpu_memory); + case 4096: + return scratch_cg_multi_bit_programmable_bootstrap_128< + InputTorus, AmortizedDegree<4096>>( + static_cast(stream), gpu_index, buffer, glwe_dimension, + polynomial_size, level_count, input_lwe_ciphertext_count, + allocate_gpu_memory); + default: + PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported " + "N's are powers of two" + " in the interval [256..4096].") + } +} + +uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64( + void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t level_count, + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { + + bool supports_cg = + supports_cooperative_groups_on_multibit_programmable_bootstrap< + __uint128_t>(glwe_dimension, polynomial_size, level_count, + input_lwe_ciphertext_count, + cuda_get_max_shared_memory(gpu_index)); + + if (supports_cg) + return scratch_cuda_cg_multi_bit_programmable_bootstrap_128( + stream, gpu_index, + reinterpret_cast **>(buffer), + glwe_dimension, polynomial_size, level_count, + input_lwe_ciphertext_count, allocate_gpu_memory); + else + return scratch_cuda_multi_bit_programmable_bootstrap_128( + stream, gpu_index, + reinterpret_cast **>(buffer), + glwe_dimension, polynomial_size, level_count, + input_lwe_ciphertext_count, allocate_gpu_memory); +} + +template +void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128( + void *stream, uint32_t gpu_index, __uint128_t *lwe_array_out, + InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector, + InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in, + InputTorus const *lwe_input_indexes, __uint128_t const *bootstrapping_key, + pbs_buffer_128 *pbs_buffer, uint32_t lwe_dimension, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, + uint32_t base_log, uint32_t level_count, uint32_t num_samples, + uint32_t num_many_lut, uint32_t lut_stride) { + + switch (polynomial_size) { + case 256: + host_multi_bit_programmable_bootstrap_128>( + static_cast(stream), gpu_index, lwe_array_out, + lwe_output_indexes, lut_vector, lut_vector_indexes, 
lwe_array_in, + lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, + lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, + num_samples, num_many_lut, lut_stride); + break; + case 512: + host_multi_bit_programmable_bootstrap_128>( + static_cast(stream), gpu_index, lwe_array_out, + lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, + lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, + lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, + num_samples, num_many_lut, lut_stride); + break; + case 1024: + host_multi_bit_programmable_bootstrap_128>( + static_cast(stream), gpu_index, lwe_array_out, + lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, + lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, + lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, + num_samples, num_many_lut, lut_stride); + break; + case 2048: + host_multi_bit_programmable_bootstrap_128>( + static_cast(stream), gpu_index, lwe_array_out, + lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, + lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, + lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, + num_samples, num_many_lut, lut_stride); + break; + case 4096: + host_multi_bit_programmable_bootstrap_128>( + static_cast(stream), gpu_index, lwe_array_out, + lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, + lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, + lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, + num_samples, num_many_lut, lut_stride); + break; + default: + PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported " + "N's are powers of two" + " in the interval [256..4096].") + } +} + +template +void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128( + void *stream, uint32_t gpu_index, __uint128_t *lwe_array_out, + InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector, + InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in, + InputTorus const *lwe_input_indexes, __uint128_t const *bootstrapping_key, + pbs_buffer_128 *pbs_buffer, uint32_t lwe_dimension, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, + uint32_t base_log, uint32_t level_count, uint32_t num_samples, + uint32_t num_many_lut, uint32_t lut_stride) { + + switch (polynomial_size) { + case 256: + host_cg_multi_bit_programmable_bootstrap_128>( + static_cast(stream), gpu_index, lwe_array_out, + lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, + lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, + lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, + num_samples, num_many_lut, lut_stride); + break; + case 512: + host_cg_multi_bit_programmable_bootstrap_128>( + static_cast(stream), gpu_index, lwe_array_out, + lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, + lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, + lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, + num_samples, num_many_lut, lut_stride); + break; + case 1024: + host_cg_multi_bit_programmable_bootstrap_128>( + static_cast(stream), gpu_index, lwe_array_out, + lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, + lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, + lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, + 
num_samples, num_many_lut, lut_stride); + break; + case 2048: + host_cg_multi_bit_programmable_bootstrap_128>( + static_cast(stream), gpu_index, lwe_array_out, + lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, + lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, + lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, + num_samples, num_many_lut, lut_stride); + break; + case 4096: + host_cg_multi_bit_programmable_bootstrap_128>( + static_cast(stream), gpu_index, lwe_array_out, + lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, + lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, + lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, + num_samples, num_many_lut, lut_stride); + break; + default: + PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported " + "N's are powers of two" + " in the interval [256..4096].") + } +} + +void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128( + void *stream, uint32_t gpu_index, void *lwe_array_out, + void const *lwe_output_indexes, void const *lut_vector, + void const *lut_vector_indexes, void const *lwe_array_in, + void const *lwe_input_indexes, void const *bootstrapping_key, + int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, + uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, + uint32_t lut_stride) { + + if (base_log > 64) + PANIC("Cuda error (multi-bit PBS): base log should be <= 64") + + auto *buffer = + reinterpret_cast *>(mem_ptr); + switch (buffer->pbs_variant) { + case PBS_VARIANT::CG: + cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128< + uint64_t>(stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out), + static_cast(lwe_output_indexes), + static_cast(lut_vector), + static_cast(lut_vector_indexes), + static_cast(lwe_array_in), + static_cast(lwe_input_indexes), + static_cast(bootstrapping_key), buffer, + lwe_dimension, glwe_dimension, polynomial_size, + grouping_factor, base_log, level_count, num_samples, + num_many_lut, lut_stride); + break; + case PBS_VARIANT::DEFAULT: + cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128( + stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out), + static_cast(lwe_output_indexes), + static_cast(lut_vector), + static_cast(lut_vector_indexes), + static_cast(lwe_array_in), + static_cast(lwe_input_indexes), + static_cast(bootstrapping_key), buffer, + lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, + base_log, level_count, num_samples, num_many_lut, lut_stride); + break; + default: + PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.") + } +} + +void cleanup_cuda_multi_bit_programmable_bootstrap_128(void *stream, + const uint32_t gpu_index, + int8_t **buffer) { + const auto x = + reinterpret_cast *>(*buffer); + x->release(static_cast(stream), gpu_index); +} + +/** + * Computes divisors of the product of num_sms (streaming multiprocessors on the + * GPU) and max_blocks_per_sm (maximum active blocks per SM to launch + * device_multi_bit_programmable_bootstrap_keybundle) smaller than its square + * root, based on max_num_pbs. If log2(max_num_pbs) <= 13, selects the first + * suitable divisor. If greater, calculates an offset as max(1,log2(max_num_pbs) + * - 13) for additional logic. 
+ * + * The value 13 was empirically determined based on memory requirements for + * benchmarking on an RTX 4090 GPU, balancing performance and resource use. + */ +template +uint32_t get_lwe_chunk_size_128(uint32_t gpu_index, uint32_t max_num_pbs, + uint32_t polynomial_size, + uint64_t full_sm_keybundle) { + + int max_blocks_per_sm; + auto max_shared_memory = cuda_get_max_shared_memory(gpu_index); + cuda_set_device(gpu_index); + if (max_shared_memory < full_sm_keybundle) + cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_blocks_per_sm, + device_multi_bit_programmable_bootstrap_keybundle_128, + polynomial_size / params::opt, full_sm_keybundle); + else + cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_blocks_per_sm, + device_multi_bit_programmable_bootstrap_keybundle_128, + polynomial_size / params::opt, 0); + + int num_sms = 0; + check_cuda_error(cudaDeviceGetAttribute( + &num_sms, cudaDevAttrMultiProcessorCount, gpu_index)); + + int x = num_sms * max_blocks_per_sm; + int count = 0; + + int divisor = 1; + int ith_divisor = 0; + +#if CUDA_ARCH < 900 + // We pick a smaller divisor on GPUs other than H100, so 256-bit integer + // multiplication can run + int log2_max_num_pbs = log2_int(max_num_pbs); + if (log2_max_num_pbs > 13) + ith_divisor = log2_max_num_pbs - 11; +#endif + + for (int i = sqrt(x); i >= 1; i--) { + if (x % i == 0) { + if (count == ith_divisor) { + divisor = i; + break; + } else { + count++; + } + } + } + + return divisor; +} diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cuh new file mode 100644 index 000000000..ca2f2c604 --- /dev/null +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cuh @@ -0,0 +1,1101 @@ +#ifndef PROGRAMMABLE_BOOTSTRAP_MULTIBIT_128_CUH +#define PROGRAMMABLE_BOOTSTRAP_MULTIBIT_128_CUH + +#ifdef __CDT_PARSER__ +#undef __CUDA_RUNTIME_H__ +#include +#endif + +#include "fft128/fft128.cuh" +#include "pbs/pbs_multibit_utilities.h" +#include "programmable_bootstrap_multibit.cuh" +#include "utils/helper.cuh" + +template +uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_128_keybundle( + uint32_t polynomial_size) { + return sizeof(__uint128_t) * polynomial_size * 2; // accumulator +} + +template +__global__ void device_multi_bit_programmable_bootstrap_keybundle_128( + const InputTorus *__restrict__ lwe_array_in, + const InputTorus *__restrict__ lwe_input_indexes, double *keybundle_array, + const __uint128_t *__restrict__ bootstrapping_key, uint32_t lwe_dimension, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, + uint32_t level_count, uint32_t lwe_offset, uint32_t lwe_chunk_size, + uint32_t keybundle_size_per_input, int8_t *device_mem, + uint64_t device_memory_size_per_block) { + + extern __shared__ int8_t sharedmem[]; + int8_t *selected_memory; + + if constexpr (SMD == FULLSM) { + selected_memory = sharedmem; + } else { + int block_index = blockIdx.x + blockIdx.y * gridDim.x + + blockIdx.z * gridDim.x * gridDim.y; + selected_memory = &device_mem[block_index * device_memory_size_per_block]; + } + + // Ids + uint32_t level_id = blockIdx.z; + uint32_t glwe_id = blockIdx.y / (glwe_dimension + 1); + uint32_t poly_id = blockIdx.y % (glwe_dimension + 1); + uint32_t lwe_iteration = (blockIdx.x % lwe_chunk_size + lwe_offset); + uint32_t input_idx = blockIdx.x / lwe_chunk_size; + + if (lwe_iteration < (lwe_dimension / grouping_factor)) { + + auto block_lwe_array_in = 
+ &lwe_array_in[lwe_input_indexes[input_idx] * (lwe_dimension + 1)]; + + auto keybundle = &keybundle_array[ + // select the input + input_idx * keybundle_size_per_input]; + + //////////////////////////////////////////////////////////// + // Computes all keybundles + uint32_t rev_lwe_iteration = + (lwe_dimension / grouping_factor) - lwe_iteration - 1; + + // //////////////////////////////// + // Keygen guarantees the first term is a constant term of the polynomial, no + // polynomial multiplication required + auto bsk_slice = get_multi_bit_ith_lwe_gth_group_kth_block( + bootstrapping_key, 0, rev_lwe_iteration, glwe_id, level_id, + grouping_factor, 2 * polynomial_size, glwe_dimension, level_count); + auto bsk_poly_ini = bsk_slice + poly_id * params::degree; + + __uint128_t reg_acc[params::opt]; + + copy_polynomial_in_regs<__uint128_t, params::opt, + params::degree / params::opt>(bsk_poly_ini, + reg_acc); + + int offset = + get_start_ith_ggsw_offset(polynomial_size, glwe_dimension, level_count); + + // Precalculate the monomial degrees and store them in shared memory + uint32_t *monomial_degrees = (uint32_t *)selected_memory; + if (threadIdx.x < (1 << grouping_factor)) { + auto lwe_array_group = + block_lwe_array_in + rev_lwe_iteration * grouping_factor; + monomial_degrees[threadIdx.x] = + calculates_monomial_degree( + lwe_array_group, threadIdx.x, grouping_factor); + } + __syncthreads(); + + // Accumulate the other terms + for (int g = 1; g < (1 << grouping_factor); g++) { + uint32_t monomial_degree = monomial_degrees[g]; + + auto bsk_poly = bsk_poly_ini + g * offset; + // Multiply by the bsk element + polynomial_accumulate_monic_monomial_mul_on_regs<__uint128_t, params>( + reg_acc, bsk_poly, monomial_degree); + } + __syncthreads(); // needed because we are going to reuse the + // shared memory for the fft + + // Move from local memory back to shared memory but as complex + double *fft = (double *)selected_memory; + + auto fft_re_hi = &fft[0 * params::degree / 2]; + auto fft_re_lo = &fft[1 * params::degree / 2]; + auto fft_im_hi = &fft[2 * params::degree / 2]; + auto fft_im_lo = &fft[3 * params::degree / 2]; + + convert_u128_on_regs_to_f128_as_torus( + fft_re_hi, fft_re_lo, fft_im_hi, fft_im_lo, ®_acc[0], + ®_acc[params::opt / 2]); + + __syncthreads(); // TODO: Do we need this sync? 
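    // Note: the conversion above writes the four f128 component arrays
    // (re/im split into hi/lo limbs) to shared memory with each thread
    // covering a strided subset of coefficients, and the block-cooperative
    // forward FFT below reads coefficients written by other threads, so a
    // block-wide barrier is most likely still required at this point.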
+ + negacyclic_forward_fft_f128>(fft_re_hi, fft_re_lo, + fft_im_hi, fft_im_lo); + + // lwe iteration + auto keybundle_out = get_ith_mask_kth_block_128( + keybundle, blockIdx.x % lwe_chunk_size, glwe_id, level_id, + polynomial_size, glwe_dimension, level_count); + auto keybundle_poly = keybundle_out + poly_id * (params::degree / 2) * 4; + + copy_polynomial( + fft, keybundle_poly); + } +} + +////////////////////////////////////////////////////////////////// +/////////////////// START DEFAULT /////////////////////////////// +template +__global__ void __launch_bounds__(params::degree / params::opt) + device_multi_bit_programmable_bootstrap_accumulate_step_one_128( + const InputTorus *__restrict__ lwe_array_in, + const InputTorus *__restrict__ lwe_input_indexes, + const __uint128_t *__restrict__ lut_vector, + const InputTorus *__restrict__ lut_vector_indexes, + __uint128_t *global_accumulator, double *global_accumulator_fft, + uint32_t lwe_dimension, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, + int8_t *device_mem, uint64_t device_memory_size_per_block) { + + // We use shared memory for the polynomials that are used often during the + // bootstrap, since shared memory is kept in L1 cache and accessing it is + // much faster than global memory + extern __shared__ int8_t sharedmem[]; + + int8_t *selected_memory = sharedmem; + + if constexpr (SMD == FULLSM) { + selected_memory = sharedmem; + } else { + int block_index = blockIdx.z + blockIdx.y * gridDim.z + + blockIdx.x * gridDim.z * gridDim.y; + selected_memory = &device_mem[block_index * device_memory_size_per_block]; + } + + auto accumulator = reinterpret_cast<__uint128_t *>(selected_memory); + auto accumulator_fft = + reinterpret_cast(accumulator) + + static_cast(sizeof(__uint128_t) * polynomial_size / + sizeof(double)); + + if constexpr (SMD == PARTIALSM) + accumulator_fft = reinterpret_cast(sharedmem); + + auto block_lwe_array_in = + &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)]; + + auto block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.x] * + params::degree * (glwe_dimension + 1)]; + + auto global_slice = + &global_accumulator[(blockIdx.y + blockIdx.x * (glwe_dimension + 1)) * + params::degree]; + + auto global_fft_slice = + &global_accumulator_fft[(blockIdx.y + blockIdx.z * (glwe_dimension + 1) + + blockIdx.x * level_count * + (glwe_dimension + 1)) * + (params::degree / 2) * 4]; + + if constexpr (is_first_iter) { + // First iteration + //////////////////////////////////////////////////////////// + // Initializes the accumulator with the body of LWE + // Put "b" in [0, 2N[ + InputTorus b_hat = 0; + modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, + params::log2_degree + 1); + + divide_by_monomial_negacyclic_inplace<__uint128_t, params::opt, + params::degree / params::opt>( + accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat, + false); + + // Persist + copy_polynomial<__uint128_t, params::opt, params::degree / params::opt>( + accumulator, global_slice); + } else { + // Load the accumulator calculated in previous iterations + copy_polynomial<__uint128_t, params::opt, params::degree / params::opt>( + global_slice, accumulator); + } + + // Perform a rounding to increase the accuracy of the + // bootstrapped ciphertext + init_decomposer_state_inplace<__uint128_t, params::opt, + params::degree / params::opt>( + accumulator, base_log, level_count); + + // Decompose the accumulator. 
Each block gets one level of the + // decomposition, for the mask and the body (so block 0 will have the + // accumulator decomposed at level 0, 1 at 1, etc.) + GadgetMatrix<__uint128_t, params> gadget_acc(base_log, level_count, + accumulator); + gadget_acc.decompose_and_compress_level_128(accumulator_fft, blockIdx.z); + + // Switch to the FFT space + auto acc_fft_re_hi = &accumulator_fft[0 * params::degree / 2]; + auto acc_fft_re_lo = &accumulator_fft[1 * params::degree / 2]; + auto acc_fft_im_hi = &accumulator_fft[2 * params::degree / 2]; + auto acc_fft_im_lo = &accumulator_fft[3 * params::degree / 2]; + + negacyclic_forward_fft_f128>(acc_fft_re_hi, acc_fft_re_lo, + acc_fft_im_hi, acc_fft_im_lo); + + copy_polynomial( + accumulator_fft, global_fft_slice); +} +template +__global__ void __launch_bounds__(params::degree / params::opt) + device_multi_bit_programmable_bootstrap_accumulate_step_two_128( + __uint128_t *lwe_array_out, + const InputTorus *__restrict__ lwe_output_indexes, + const double *__restrict__ keybundle_array, + __uint128_t *global_accumulator, double *global_accumulator_fft, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, + uint32_t iteration, uint32_t lwe_chunk_size, int8_t *device_mem, + uint64_t device_memory_size_per_block, uint32_t num_many_lut, + uint32_t lut_stride) { + // We use shared memory for the polynomials that are used often during the + // bootstrap, since shared memory is kept in L1 cache and accessing it is + // much faster than global memory + extern __shared__ int8_t sharedmem[]; + int8_t *selected_memory; + + selected_memory = sharedmem; + + if constexpr (SMD == FULLSM) { + selected_memory = sharedmem; + } else { + int block_index = blockIdx.x + blockIdx.y * gridDim.x + + blockIdx.z * gridDim.x * gridDim.y; + selected_memory = &device_mem[block_index * device_memory_size_per_block]; + } + + auto accumulator_fft = reinterpret_cast(selected_memory); + + auto keybundle = + &keybundle_array[blockIdx.x * lwe_chunk_size * level_count * + (glwe_dimension + 1) * (glwe_dimension + 1) * + (params::degree / 2) * 4]; + + auto global_accumulator_fft_chunk = + &global_accumulator_fft[blockIdx.x * level_count * (glwe_dimension + 1) * + (params::degree / 2) * 4]; + + for (int level = 0; level < level_count; level++) { + auto global_fft_slice = + &global_accumulator_fft_chunk[level * (glwe_dimension + 1) * + (params::degree / 2) * 4]; + + for (int j = 0; j < (glwe_dimension + 1); j++) { + auto fft = &global_fft_slice[j * (params::degree / 2) * 4]; + + // Get the bootstrapping key piece necessary for the multiplication + // It is already in the Fourier domain + auto bsk_slice = get_ith_mask_kth_block_128(keybundle, iteration, j, + level, params::degree, + glwe_dimension, level_count); + auto bsk_poly = &bsk_slice[blockIdx.y * params::degree / 2 * 4]; + + polynomial_product_accumulate_in_fourier_domain_128( + accumulator_fft, fft, bsk_poly, !level && !j); + } + } + + // Perform the inverse FFT on the result of the GGSW x GLWE and add to the + // accumulator + auto acc_fft_re_hi = &accumulator_fft[0 * params::degree / 2]; + auto acc_fft_re_lo = &accumulator_fft[1 * params::degree / 2]; + auto acc_fft_im_hi = &accumulator_fft[2 * params::degree / 2]; + auto acc_fft_im_lo = &accumulator_fft[3 * params::degree / 2]; + + negacyclic_backward_fft_f128>( + acc_fft_re_hi, acc_fft_re_lo, acc_fft_im_hi, acc_fft_im_lo); + auto global_slice = + &global_accumulator[(blockIdx.y + blockIdx.x * (glwe_dimension + 1)) * + params::degree]; + + 
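  // global_slice addresses sample blockIdx.x's accumulator polynomial for
  // GLWE component blockIdx.y; at this point the inverse-FFT result is still
  // split across the four double arrays (re/im as hi/lo limbs) computed above.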
add_to_torus_128<__uint128_t, params>(acc_fft_re_hi, acc_fft_re_lo, + acc_fft_im_hi, acc_fft_im_lo, + global_slice, true); + __syncthreads(); + + if constexpr (is_last_iter) { + // Last iteration + auto block_lwe_array_out = + &lwe_array_out[lwe_output_indexes[blockIdx.x] * + (glwe_dimension * polynomial_size + 1) + + blockIdx.y * polynomial_size]; + + if (blockIdx.y < glwe_dimension) { + // Perform a sample extract. At this point, all blocks have the result, + // but we do the computation at block 0 to avoid waiting for extra blocks, + // in case they're not synchronized + sample_extract_mask<__uint128_t, params>(block_lwe_array_out, + global_slice); + if (num_many_lut > 1) { + for (int i = 1; i < num_many_lut; i++) { + auto next_lwe_array_out = + lwe_array_out + + (i * gridDim.x * (glwe_dimension * polynomial_size + 1)); + auto next_block_lwe_array_out = + &next_lwe_array_out[lwe_output_indexes[blockIdx.x] * + (glwe_dimension * polynomial_size + 1) + + blockIdx.y * polynomial_size]; + + sample_extract_mask<__uint128_t, params>( + next_block_lwe_array_out, global_slice, 1, i * lut_stride); + } + } + } else if (blockIdx.y == glwe_dimension) { + sample_extract_body<__uint128_t, params>(block_lwe_array_out, + global_slice, 0); + if (num_many_lut > 1) { + for (int i = 1; i < num_many_lut; i++) { + + auto next_lwe_array_out = + lwe_array_out + + (i * gridDim.x * (glwe_dimension * polynomial_size + 1)); + auto next_block_lwe_array_out = + &next_lwe_array_out[lwe_output_indexes[blockIdx.x] * + (glwe_dimension * polynomial_size + 1) + + blockIdx.y * polynomial_size]; + + sample_extract_body<__uint128_t, params>( + next_block_lwe_array_out, global_slice, 0, i * lut_stride); + } + } + } + } +} + +////////////////////////////////////////////////////////////////// +/////////////////// END DEFAULT ////////////////////////////////// + +////////////////////////////////////////////////////////////////// +/////////////////// START CG ///////////////////////////////////// + +template +__global__ void __launch_bounds__(params::degree / params::opt) + device_multi_bit_programmable_bootstrap_cg_accumulate_128( + __uint128_t *lwe_array_out, + const InputTorus *__restrict__ lwe_output_indexes, + const __uint128_t *__restrict__ lut_vector, + const InputTorus *__restrict__ lut_vector_indexes, + const InputTorus *__restrict__ lwe_array_in, + const InputTorus *__restrict__ lwe_input_indexes, + const double *__restrict__ keybundle_array, double *join_buffer, + __uint128_t *global_accumulator, uint32_t lwe_dimension, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, + uint32_t level_count, uint32_t grouping_factor, uint32_t lwe_offset, + uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input, + int8_t *device_mem, uint64_t device_memory_size_per_block, + uint32_t num_many_lut, uint32_t lut_stride) { + + grid_group grid = this_grid(); + + // We use shared memory for the polynomials that are used often during the + // bootstrap, since shared memory is kept in L1 cache and accessing it is + // much faster than global memory + extern __shared__ int8_t sharedmem[]; + int8_t *selected_memory; + + if constexpr (SMD == FULLSM) { + selected_memory = sharedmem; + } else { + int block_index = blockIdx.z + blockIdx.y * gridDim.z + + blockIdx.x * gridDim.z * gridDim.y; + selected_memory = &device_mem[block_index * device_memory_size_per_block]; + } + + auto accumulator_rotated = reinterpret_cast<__uint128_t *>(selected_memory); + auto accumulator_fft = + reinterpret_cast(accumulator_rotated) + + 
static_cast(sizeof(__uint128_t) * polynomial_size / + sizeof(double)); + + if constexpr (SMD == PARTIALSM) + accumulator_fft = reinterpret_cast(sharedmem); + + // The third dimension of the block is used to determine on which ciphertext + // this block is operating, in the case of batch bootstraps + auto block_lwe_array_in = + &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)]; + + auto block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.x] * + params::degree * (glwe_dimension + 1)]; + + auto block_join_buffer = + &join_buffer[blockIdx.x * level_count * (glwe_dimension + 1) * + (params::degree / 2) * 4]; + + auto global_accumulator_slice = + &global_accumulator[(blockIdx.y + blockIdx.x * (glwe_dimension + 1)) * + params::degree]; + + auto keybundle = &keybundle_array[blockIdx.x * keybundle_size_per_input]; + + if (lwe_offset == 0) { + // Put "b" in [0, 2N[ + InputTorus b_hat = 0; + modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, + params::log2_degree + 1); + + divide_by_monomial_negacyclic_inplace<__uint128_t, params::opt, + params::degree / params::opt>( + accumulator_rotated, &block_lut_vector[blockIdx.y * params::degree], + b_hat, false); + } else { + // Load the accumulator_rotated calculated in previous iterations + copy_polynomial<__uint128_t, params::opt, params::degree / params::opt>( + global_accumulator_slice, accumulator_rotated); + } + + for (int i = 0; (i + lwe_offset) < lwe_dimension && i < lwe_chunk_size; i++) { + // Perform a rounding to increase the accuracy of the + // bootstrapped ciphertext + init_decomposer_state_inplace<__uint128_t, params::opt, + params::degree / params::opt>( + accumulator_rotated, base_log, level_count); + + // Decompose the accumulator_rotated. Each block gets one level of the + // decomposition, for the mask and the body (so block 0 will have the + // accumulator_rotated decomposed at level 0, 1 at 1, etc.) + GadgetMatrix<__uint128_t, params> gadget_acc(base_log, level_count, + accumulator_rotated); + gadget_acc.decompose_and_compress_level_128(accumulator_fft, blockIdx.z); + + // Switch to the FFT space + auto acc_fft_re_hi = &accumulator_fft[0 * params::degree / 2]; + auto acc_fft_re_lo = &accumulator_fft[1 * params::degree / 2]; + auto acc_fft_im_hi = &accumulator_fft[2 * params::degree / 2]; + auto acc_fft_im_lo = &accumulator_fft[3 * params::degree / 2]; + + negacyclic_forward_fft_f128>( + acc_fft_re_hi, acc_fft_re_lo, acc_fft_im_hi, acc_fft_im_lo); + + __syncthreads(); + + // Perform G^-1(ACC) * GGSW -> GLWE + mul_ggsw_glwe_in_fourier_domain_128( + accumulator_fft, block_join_buffer, keybundle, i, grid); + + negacyclic_backward_fft_f128>( + acc_fft_re_hi, acc_fft_re_lo, acc_fft_im_hi, acc_fft_im_lo); + __syncthreads(); + + add_to_torus_128<__uint128_t, params>(acc_fft_re_hi, acc_fft_re_lo, + acc_fft_im_hi, acc_fft_im_lo, + accumulator_rotated, true); + } + + auto accumulator = accumulator_rotated; + + if (blockIdx.z == 0) { + if (lwe_offset + lwe_chunk_size >= (lwe_dimension / grouping_factor)) { + auto block_lwe_array_out = + &lwe_array_out[lwe_output_indexes[blockIdx.x] * + (glwe_dimension * polynomial_size + 1) + + blockIdx.y * polynomial_size]; + + if (blockIdx.y < glwe_dimension) { + // Perform a sample extract. 
At this point, all blocks have the result, + // but we do the computation at block 0 to avoid waiting for extra + // blocks, in case they're not synchronized Always extract one by + // default + sample_extract_mask<__uint128_t, params>(block_lwe_array_out, + accumulator); + + if (num_many_lut > 1) { + for (int i = 1; i < num_many_lut; i++) { + auto next_lwe_array_out = + lwe_array_out + + (i * gridDim.x * (glwe_dimension * polynomial_size + 1)); + auto next_block_lwe_array_out = + &next_lwe_array_out[lwe_output_indexes[blockIdx.x] * + (glwe_dimension * polynomial_size + 1) + + blockIdx.y * polynomial_size]; + + sample_extract_mask<__uint128_t, params>( + next_block_lwe_array_out, accumulator, 1, i * lut_stride); + } + } + + } else if (blockIdx.y == glwe_dimension) { + + sample_extract_body<__uint128_t, params>(block_lwe_array_out, + accumulator, 0); + + if (num_many_lut > 1) { + for (int i = 1; i < num_many_lut; i++) { + + auto next_lwe_array_out = + lwe_array_out + + (i * gridDim.x * (glwe_dimension * polynomial_size + 1)); + auto next_block_lwe_array_out = + &next_lwe_array_out[lwe_output_indexes[blockIdx.x] * + (glwe_dimension * polynomial_size + 1) + + blockIdx.y * polynomial_size]; + + sample_extract_body<__uint128_t, params>( + next_block_lwe_array_out, accumulator, 0, i * lut_stride); + } + } + } + } else { + // Load the accumulator calculated in previous iterations + copy_polynomial<__uint128_t, params::opt, params::degree / params::opt>( + accumulator, global_accumulator_slice); + } + } +} + +////////////////////////////////////////////////////////////////// +/////////////////// END CG /////////////////////////////////////// + +template +__host__ void execute_compute_keybundle_128( + cudaStream_t stream, uint32_t gpu_index, InputTorus const *lwe_array_in, + InputTorus const *lwe_input_indexes, __uint128_t const *bootstrapping_key, + pbs_buffer_128 *buffer, uint32_t num_samples, + uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t grouping_factor, uint32_t level_count, uint32_t lwe_offset) { + cuda_set_device(gpu_index); + + auto lwe_chunk_size = buffer->lwe_chunk_size; + uint32_t chunk_size = + std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset); + + uint32_t keybundle_size_per_input = + lwe_chunk_size * level_count * (glwe_dimension + 1) * + (glwe_dimension + 1) * (polynomial_size / 2) * 4; + + uint64_t full_sm_keybundle = + get_buffer_size_full_sm_multibit_programmable_bootstrap_128_keybundle< + __uint128_t>(polynomial_size); + auto max_shared_memory = cuda_get_max_shared_memory(gpu_index); + + auto d_mem = buffer->d_mem_keybundle; + auto keybundle_fft = buffer->keybundle_fft; + + // Compute a keybundle + dim3 grid_keybundle(num_samples * chunk_size, + (glwe_dimension + 1) * (glwe_dimension + 1), level_count); + dim3 thds(polynomial_size / params::opt, 1, 1); + if (max_shared_memory < full_sm_keybundle) + device_multi_bit_programmable_bootstrap_keybundle_128 + <<>>( + lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key, + lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, + level_count, lwe_offset, chunk_size, keybundle_size_per_input, + d_mem, full_sm_keybundle); + else + device_multi_bit_programmable_bootstrap_keybundle_128 + <<>>( + lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key, + lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, + level_count, lwe_offset, chunk_size, keybundle_size_per_input, + d_mem, 0); + check_cuda_error(cudaGetLastError()); +} + 
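For sizing intuition, the two quantities driving execute_compute_keybundle_128 are the per-block shared-memory requirement returned by get_buffer_size_full_sm_multibit_programmable_bootstrap_128_keybundle (two 128-bit polynomials) and the per-input Fourier-domain stride keybundle_size_per_input (counted in doubles). The following standalone back-of-the-envelope check uses purely illustrative parameters, not a shipped TFHE-rs parameter set; whether the FULLSM or NOSM launch is taken then depends on how the first number compares with the device's reported maximum shared memory per block.

    // Back-of-the-envelope footprint check for the keybundle stage.
    // Hypothetical parameter values; compiles as a host program with a
    // compiler that provides __uint128_t (gcc/clang).
    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint64_t polynomial_size = 2048;
      const uint64_t glwe_dimension = 1;
      const uint64_t level_count = 2;
      const uint64_t lwe_chunk_size = 1;
      const uint64_t num_samples = 4;

      // Shared memory per keybundle block: two 128-bit polynomials, mirroring
      // get_buffer_size_full_sm_multibit_programmable_bootstrap_128_keybundle.
      const uint64_t full_sm_keybundle =
          sizeof(__uint128_t) * polynomial_size * 2;

      // Fourier-domain keybundle per input, counted in doubles: one f128
      // polynomial (4 doubles per complex coefficient, N/2 coefficients) for
      // every (chunk element, level, k, k') combination.
      const uint64_t keybundle_doubles_per_input =
          lwe_chunk_size * level_count * (glwe_dimension + 1) *
          (glwe_dimension + 1) * (polynomial_size / 2) * 4;

      std::printf("full_sm_keybundle      : %llu bytes\n",
                  (unsigned long long)full_sm_keybundle);
      std::printf("keybundle_fft per input: %llu bytes\n",
                  (unsigned long long)(keybundle_doubles_per_input *
                                       sizeof(double)));
      std::printf("keybundle_fft total    : %llu bytes\n",
                  (unsigned long long)(num_samples *
                                       keybundle_doubles_per_input *
                                       sizeof(double)));
      return 0;
    }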
+template +__host__ void execute_step_one_128( + cudaStream_t stream, uint32_t gpu_index, __uint128_t const *lut_vector, + InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in, + InputTorus const *lwe_input_indexes, + pbs_buffer_128 *buffer, uint32_t num_samples, + uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t base_log, uint32_t level_count) { + cuda_set_device(gpu_index); + + uint64_t full_sm_accumulate_step_one = + get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one< + __uint128_t>(polynomial_size); + uint64_t partial_sm_accumulate_step_one = + get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one< + __uint128_t>(polynomial_size); + auto max_shared_memory = cuda_get_max_shared_memory(gpu_index); + + // + auto d_mem = buffer->d_mem_acc_step_one; + auto global_accumulator = buffer->global_accumulator; + auto global_accumulator_fft = buffer->global_join_buffer; + + dim3 grid_accumulate_step_one(num_samples, glwe_dimension + 1, level_count); + dim3 thds(polynomial_size / params::opt, 1, 1); + + if (max_shared_memory < partial_sm_accumulate_step_one) + device_multi_bit_programmable_bootstrap_accumulate_step_one_128< + InputTorus, params, NOSM, is_first_iter> + <<>>( + lwe_array_in, lwe_input_indexes, lut_vector, lut_vector_indexes, + global_accumulator, global_accumulator_fft, lwe_dimension, + glwe_dimension, polynomial_size, base_log, level_count, d_mem, + full_sm_accumulate_step_one); + else if (max_shared_memory < full_sm_accumulate_step_one) + device_multi_bit_programmable_bootstrap_accumulate_step_one_128< + InputTorus, params, PARTIALSM, is_first_iter> + <<>>(lwe_array_in, lwe_input_indexes, lut_vector, + lut_vector_indexes, global_accumulator, + global_accumulator_fft, lwe_dimension, glwe_dimension, + polynomial_size, base_log, level_count, d_mem, + partial_sm_accumulate_step_one); + else + device_multi_bit_programmable_bootstrap_accumulate_step_one_128< + InputTorus, params, FULLSM, is_first_iter> + <<>>(lwe_array_in, lwe_input_indexes, lut_vector, + lut_vector_indexes, global_accumulator, + global_accumulator_fft, lwe_dimension, glwe_dimension, + polynomial_size, base_log, level_count, d_mem, 0); + check_cuda_error(cudaGetLastError()); +} + +template +__host__ void execute_step_two_128( + cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out, + InputTorus const *lwe_output_indexes, + pbs_buffer_128 *buffer, uint32_t num_samples, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, + uint32_t j, uint32_t num_many_lut, uint32_t lut_stride) { + cuda_set_device(gpu_index); + + auto lwe_chunk_size = buffer->lwe_chunk_size; + uint64_t full_sm_accumulate_step_two = + get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two< + __uint128_t>(polynomial_size); + auto max_shared_memory = cuda_get_max_shared_memory(gpu_index); + + auto d_mem = buffer->d_mem_acc_step_two; + auto keybundle_fft = buffer->keybundle_fft; + auto global_accumulator = buffer->global_accumulator; + auto global_accumulator_fft = buffer->global_join_buffer; + + dim3 grid_accumulate_step_two(num_samples, glwe_dimension + 1); + dim3 thds(polynomial_size / params::opt, 1, 1); + + if (max_shared_memory < full_sm_accumulate_step_two) + device_multi_bit_programmable_bootstrap_accumulate_step_two_128< + InputTorus, params, NOSM, is_last_iter> + <<>>( + lwe_array_out, lwe_output_indexes, keybundle_fft, + global_accumulator, global_accumulator_fft, glwe_dimension, + polynomial_size, level_count, j, 
lwe_chunk_size, d_mem, + full_sm_accumulate_step_two, num_many_lut, lut_stride); + else + device_multi_bit_programmable_bootstrap_accumulate_step_two_128< + InputTorus, params, FULLSM, is_last_iter> + <<>>(lwe_array_out, lwe_output_indexes, keybundle_fft, + global_accumulator, global_accumulator_fft, glwe_dimension, + polynomial_size, level_count, j, lwe_chunk_size, d_mem, 0, + num_many_lut, lut_stride); + check_cuda_error(cudaGetLastError()); +} + +/* + * Host wrapper to the multi-bit programmable bootstrap 128 + */ +template +__host__ void host_multi_bit_programmable_bootstrap_128( + cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out, + InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector, + InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in, + InputTorus const *lwe_input_indexes, __uint128_t const *bootstrapping_key, + pbs_buffer_128 *buffer, uint32_t glwe_dimension, + uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, + uint32_t base_log, uint32_t level_count, uint32_t num_samples, + uint32_t num_many_lut, uint32_t lut_stride) { + + auto lwe_chunk_size = buffer->lwe_chunk_size; + + for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor); + lwe_offset += lwe_chunk_size) { + + // Compute a keybundle + execute_compute_keybundle_128( + stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key, + buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, + grouping_factor, level_count, lwe_offset); + // Accumulate + uint32_t chunk_size = std::min( + lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset); + for (uint32_t j = 0; j < chunk_size; j++) { + bool is_first_iter = (j + lwe_offset) == 0; + bool is_last_iter = + (j + lwe_offset) + 1 == (lwe_dimension / grouping_factor); + if (is_first_iter) { + execute_step_one_128( + stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in, + lwe_input_indexes, buffer, num_samples, lwe_dimension, + glwe_dimension, polynomial_size, base_log, level_count); + } else { + execute_step_one_128( + stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in, + lwe_input_indexes, buffer, num_samples, lwe_dimension, + glwe_dimension, polynomial_size, base_log, level_count); + } + + if (is_last_iter) { + execute_step_two_128( + stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer, + num_samples, glwe_dimension, polynomial_size, level_count, j, + num_many_lut, lut_stride); + } else { + execute_step_two_128( + stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer, + num_samples, glwe_dimension, polynomial_size, level_count, j, + num_many_lut, lut_stride); + } + } + } +} + +template +__host__ void execute_cg_external_product_loop_128( + cudaStream_t stream, uint32_t gpu_index, __uint128_t const *lut_vector, + InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in, + InputTorus const *lwe_input_indexes, __uint128_t *lwe_array_out, + InputTorus const *lwe_output_indexes, + pbs_buffer_128 *buffer, uint32_t num_samples, + uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, + uint32_t lwe_offset, uint32_t num_many_lut, uint32_t lut_stride) { + cuda_set_device(gpu_index); + + const uint64_t full_sm = + get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<__uint128_t>( + polynomial_size); + const uint64_t partial_sm = + get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap< + 
__uint128_t>(polynomial_size); + + auto full_dm = full_sm; + auto partial_dm = full_sm - partial_sm; + uint64_t no_dm = 0; + + auto lwe_chunk_size = buffer->lwe_chunk_size; + auto max_shared_memory = cuda_get_max_shared_memory(gpu_index); + + uint32_t keybundle_size_per_input = + lwe_chunk_size * level_count * (glwe_dimension + 1) * + (glwe_dimension + 1) * (polynomial_size / 2) * 4; + + uint32_t chunk_size = + std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset); + + auto d_mem = buffer->d_mem_acc_cg; + auto keybundle_fft = buffer->keybundle_fft; + auto global_accumulator = buffer->global_accumulator; + auto join_buffer = buffer->global_join_buffer; + + void *kernel_args[22]; + kernel_args[0] = &lwe_array_out; + kernel_args[1] = &lwe_output_indexes; + kernel_args[2] = &lut_vector; + kernel_args[3] = &lut_vector_indexes; + kernel_args[4] = &lwe_array_in; + kernel_args[5] = &lwe_input_indexes; + kernel_args[6] = &keybundle_fft; + kernel_args[7] = &join_buffer; + kernel_args[8] = &global_accumulator; + kernel_args[9] = &lwe_dimension; + kernel_args[10] = &glwe_dimension; + kernel_args[11] = &polynomial_size; + kernel_args[12] = &base_log; + kernel_args[13] = &level_count; + kernel_args[14] = &grouping_factor; + kernel_args[15] = &lwe_offset; + kernel_args[16] = &chunk_size; + kernel_args[17] = &keybundle_size_per_input; + kernel_args[18] = &d_mem; + kernel_args[20] = &num_many_lut; + kernel_args[21] = &lut_stride; + + dim3 grid_accumulate(num_samples, glwe_dimension + 1, level_count); + dim3 thds(polynomial_size / params::opt, 1, 1); + + if (max_shared_memory < partial_dm) { + kernel_args[19] = &full_dm; + check_cuda_error(cudaLaunchCooperativeKernel( + (void *)device_multi_bit_programmable_bootstrap_cg_accumulate_128< + InputTorus, params, NOSM>, + grid_accumulate, thds, (void **)kernel_args, 0, stream)); + } else if (max_shared_memory < full_dm) { + kernel_args[19] = &partial_dm; + check_cuda_error(cudaLaunchCooperativeKernel( + (void *)device_multi_bit_programmable_bootstrap_cg_accumulate_128< + InputTorus, params, PARTIALSM>, + grid_accumulate, thds, (void **)kernel_args, partial_sm, stream)); + } else { + kernel_args[19] = &no_dm; + check_cuda_error(cudaLaunchCooperativeKernel( + (void *)device_multi_bit_programmable_bootstrap_cg_accumulate_128< + InputTorus, params, FULLSM>, + grid_accumulate, thds, (void **)kernel_args, full_sm, stream)); + } +} + +template +__host__ void host_cg_multi_bit_programmable_bootstrap_128( + cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out, + InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector, + InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in, + InputTorus const *lwe_input_indexes, __uint128_t const *bootstrapping_key, + pbs_buffer_128 *buffer, uint32_t glwe_dimension, + uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, + uint32_t base_log, uint32_t level_count, uint32_t num_samples, + uint32_t num_many_lut, uint32_t lut_stride) { + + auto lwe_chunk_size = buffer->lwe_chunk_size; + + for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor); + lwe_offset += lwe_chunk_size) { + + // Compute a keybundle + execute_compute_keybundle_128( + stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key, + buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, + grouping_factor, level_count, lwe_offset); + + // Accumulate + execute_cg_external_product_loop_128( + stream, gpu_index, lut_vector, lut_vector_indexes, 
lwe_array_in, + lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer, + num_samples, lwe_dimension, glwe_dimension, polynomial_size, + grouping_factor, base_log, level_count, lwe_offset, num_many_lut, + lut_stride); + } +} + +template +__host__ uint64_t scratch_multi_bit_programmable_bootstrap_128( + cudaStream_t stream, uint32_t gpu_index, + pbs_buffer_128 **buffer, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t level_count, + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { + + cuda_set_device(gpu_index); + + uint64_t full_sm_keybundle = + get_buffer_size_full_sm_multibit_programmable_bootstrap_128_keybundle< + __uint128_t>(polynomial_size); + uint64_t full_sm_accumulate_step_one = + get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one< + __uint128_t>(polynomial_size); + uint64_t full_sm_accumulate_step_two = + get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two< + __uint128_t>(polynomial_size); + uint64_t partial_sm_accumulate_step_one = + get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one< + __uint128_t>(polynomial_size); + + auto max_shared_memory = cuda_get_max_shared_memory(gpu_index); + if (max_shared_memory < full_sm_keybundle) { + check_cuda_error(cudaFuncSetAttribute( + device_multi_bit_programmable_bootstrap_keybundle_128, + cudaFuncAttributeMaxDynamicSharedMemorySize, 0)); + check_cuda_error(cudaFuncSetCacheConfig( + device_multi_bit_programmable_bootstrap_keybundle_128, + cudaFuncCachePreferShared)); + } else { + check_cuda_error(cudaFuncSetAttribute( + device_multi_bit_programmable_bootstrap_keybundle_128, + cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle)); + check_cuda_error(cudaFuncSetCacheConfig( + device_multi_bit_programmable_bootstrap_keybundle_128, + cudaFuncCachePreferShared)); + } + + if (max_shared_memory < partial_sm_accumulate_step_one) { + check_cuda_error(cudaFuncSetAttribute( + device_multi_bit_programmable_bootstrap_accumulate_step_one_128< + InputTorus, params, NOSM, false>, + cudaFuncAttributeMaxDynamicSharedMemorySize, 0)); + check_cuda_error(cudaFuncSetCacheConfig( + device_multi_bit_programmable_bootstrap_accumulate_step_one_128< + InputTorus, params, NOSM, false>, + cudaFuncCachePreferShared)); + check_cuda_error(cudaFuncSetAttribute( + device_multi_bit_programmable_bootstrap_accumulate_step_one_128< + InputTorus, params, NOSM, true>, + cudaFuncAttributeMaxDynamicSharedMemorySize, 0)); + check_cuda_error(cudaFuncSetCacheConfig( + device_multi_bit_programmable_bootstrap_accumulate_step_one_128< + InputTorus, params, NOSM, true>, + cudaFuncCachePreferShared)); + check_cuda_error(cudaGetLastError()); + } else if (max_shared_memory < full_sm_accumulate_step_one) { + check_cuda_error(cudaFuncSetAttribute( + device_multi_bit_programmable_bootstrap_accumulate_step_one_128< + InputTorus, params, PARTIALSM, false>, + cudaFuncAttributeMaxDynamicSharedMemorySize, + partial_sm_accumulate_step_one)); + check_cuda_error(cudaFuncSetCacheConfig( + device_multi_bit_programmable_bootstrap_accumulate_step_one_128< + InputTorus, params, PARTIALSM, false>, + cudaFuncCachePreferShared)); + check_cuda_error(cudaFuncSetAttribute( + device_multi_bit_programmable_bootstrap_accumulate_step_one_128< + InputTorus, params, PARTIALSM, true>, + cudaFuncAttributeMaxDynamicSharedMemorySize, + partial_sm_accumulate_step_one)); + check_cuda_error(cudaFuncSetCacheConfig( + device_multi_bit_programmable_bootstrap_accumulate_step_one_128< + InputTorus, params, PARTIALSM, true>, + 
cudaFuncCachePreferShared)); + check_cuda_error(cudaGetLastError()); + } else { + check_cuda_error(cudaFuncSetAttribute( + device_multi_bit_programmable_bootstrap_accumulate_step_one_128< + InputTorus, params, FULLSM, false>, + cudaFuncAttributeMaxDynamicSharedMemorySize, + full_sm_accumulate_step_one)); + check_cuda_error(cudaFuncSetCacheConfig( + device_multi_bit_programmable_bootstrap_accumulate_step_one_128< + InputTorus, params, FULLSM, false>, + cudaFuncCachePreferShared)); + check_cuda_error(cudaFuncSetAttribute( + device_multi_bit_programmable_bootstrap_accumulate_step_one_128< + InputTorus, params, FULLSM, true>, + cudaFuncAttributeMaxDynamicSharedMemorySize, + full_sm_accumulate_step_one)); + check_cuda_error(cudaFuncSetCacheConfig( + device_multi_bit_programmable_bootstrap_accumulate_step_one_128< + InputTorus, params, FULLSM, true>, + cudaFuncCachePreferShared)); + check_cuda_error(cudaGetLastError()); + } + + if (max_shared_memory < full_sm_accumulate_step_two) { + check_cuda_error(cudaFuncSetAttribute( + device_multi_bit_programmable_bootstrap_accumulate_step_two_128< + InputTorus, params, NOSM, false>, + cudaFuncAttributeMaxDynamicSharedMemorySize, 0)); + check_cuda_error(cudaFuncSetCacheConfig( + device_multi_bit_programmable_bootstrap_accumulate_step_two_128< + InputTorus, params, NOSM, false>, + cudaFuncCachePreferShared)); + check_cuda_error(cudaFuncSetAttribute( + device_multi_bit_programmable_bootstrap_accumulate_step_two_128< + InputTorus, params, NOSM, true>, + cudaFuncAttributeMaxDynamicSharedMemorySize, 0)); + check_cuda_error(cudaFuncSetCacheConfig( + device_multi_bit_programmable_bootstrap_accumulate_step_two_128< + InputTorus, params, NOSM, true>, + cudaFuncCachePreferShared)); + check_cuda_error(cudaGetLastError()); + } else { + check_cuda_error(cudaFuncSetAttribute( + device_multi_bit_programmable_bootstrap_accumulate_step_two_128< + InputTorus, params, FULLSM, false>, + cudaFuncAttributeMaxDynamicSharedMemorySize, + full_sm_accumulate_step_two)); + check_cuda_error(cudaFuncSetCacheConfig( + device_multi_bit_programmable_bootstrap_accumulate_step_two_128< + InputTorus, params, FULLSM, false>, + cudaFuncCachePreferShared)); + check_cuda_error(cudaFuncSetAttribute( + device_multi_bit_programmable_bootstrap_accumulate_step_two_128< + InputTorus, params, FULLSM, true>, + cudaFuncAttributeMaxDynamicSharedMemorySize, + full_sm_accumulate_step_two)); + check_cuda_error(cudaFuncSetCacheConfig( + device_multi_bit_programmable_bootstrap_accumulate_step_two_128< + InputTorus, params, FULLSM, true>, + cudaFuncCachePreferShared)); + check_cuda_error(cudaGetLastError()); + } + + auto lwe_chunk_size = get_lwe_chunk_size_128( + gpu_index, input_lwe_ciphertext_count, polynomial_size, + full_sm_keybundle); + uint64_t size_tracker = 0; + *buffer = new pbs_buffer_128( + stream, gpu_index, glwe_dimension, polynomial_size, level_count, + input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::DEFAULT, + allocate_gpu_memory, &size_tracker); + return size_tracker; +} + +template +__host__ uint64_t scratch_cg_multi_bit_programmable_bootstrap_128( + cudaStream_t stream, uint32_t gpu_index, + pbs_buffer_128 **buffer, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t level_count, + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { + + cuda_set_device(gpu_index); + + uint64_t full_sm_keybundle = + get_buffer_size_full_sm_multibit_programmable_bootstrap_128_keybundle< + __uint128_t>(polynomial_size); + uint64_t full_sm_cg_accumulate = + 
get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<__uint128_t>( + polynomial_size); + uint64_t partial_sm_cg_accumulate = + get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap< + __uint128_t>(polynomial_size); + + auto max_shared_memory = cuda_get_max_shared_memory(gpu_index); + if (max_shared_memory < full_sm_keybundle) { + check_cuda_error(cudaFuncSetAttribute( + device_multi_bit_programmable_bootstrap_keybundle_128, + cudaFuncAttributeMaxDynamicSharedMemorySize, 0)); + cudaFuncSetCacheConfig( + device_multi_bit_programmable_bootstrap_keybundle_128, + cudaFuncCachePreferShared); + check_cuda_error(cudaGetLastError()); + } else { + check_cuda_error(cudaFuncSetAttribute( + device_multi_bit_programmable_bootstrap_keybundle_128, + cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle)); + cudaFuncSetCacheConfig( + device_multi_bit_programmable_bootstrap_keybundle_128, + cudaFuncCachePreferShared); + check_cuda_error(cudaGetLastError()); + } + + if (max_shared_memory < partial_sm_cg_accumulate) { + check_cuda_error(cudaFuncSetAttribute( + device_multi_bit_programmable_bootstrap_cg_accumulate_128, + cudaFuncAttributeMaxDynamicSharedMemorySize, 0)); + cudaFuncSetCacheConfig( + device_multi_bit_programmable_bootstrap_cg_accumulate_128, + cudaFuncCachePreferShared); + check_cuda_error(cudaGetLastError()); + } else if (max_shared_memory < full_sm_cg_accumulate) { + check_cuda_error(cudaFuncSetAttribute( + device_multi_bit_programmable_bootstrap_cg_accumulate_128< + InputTorus, params, PARTIALSM>, + cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm_cg_accumulate)); + cudaFuncSetCacheConfig( + device_multi_bit_programmable_bootstrap_cg_accumulate_128< + InputTorus, params, PARTIALSM>, + cudaFuncCachePreferShared); + check_cuda_error(cudaGetLastError()); + } else { + check_cuda_error(cudaFuncSetAttribute( + device_multi_bit_programmable_bootstrap_cg_accumulate_128< + InputTorus, params, FULLSM>, + cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_cg_accumulate)); + cudaFuncSetCacheConfig( + device_multi_bit_programmable_bootstrap_cg_accumulate_128< + InputTorus, params, FULLSM>, + cudaFuncCachePreferShared); + check_cuda_error(cudaGetLastError()); + } + + auto lwe_chunk_size = get_lwe_chunk_size_128( + gpu_index, input_lwe_ciphertext_count, polynomial_size, + full_sm_keybundle); + uint64_t size_tracker = 0; + *buffer = new pbs_buffer_128( + stream, gpu_index, glwe_dimension, polynomial_size, level_count, + input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::CG, + allocate_gpu_memory, &size_tracker); + return size_tracker; +} + +#endif // PROGRAMMABLE_BOOTSTRAP_MULTIBIT_128_CUH diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh index ea64a2889..582a0f1cb 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh @@ -283,8 +283,9 @@ __host__ uint64_t scratch_tbc_multi_bit_programmable_bootstrap( check_cuda_error(cudaGetLastError()); } - auto lwe_chunk_size = get_lwe_chunk_size( - gpu_index, input_lwe_ciphertext_count, polynomial_size); + auto lwe_chunk_size = + get_lwe_chunk_size(gpu_index, input_lwe_ciphertext_count, + polynomial_size, full_sm_keybundle); uint64_t size_tracker = 0; *buffer = new pbs_buffer( stream, gpu_index, glwe_dimension, polynomial_size, level_count, diff --git 
a/backends/tfhe-cuda-backend/cuda/src/utils/helper.cuh b/backends/tfhe-cuda-backend/cuda/src/utils/helper.cuh index 040d8631c..1e45784f7 100644 --- a/backends/tfhe-cuda-backend/cuda/src/utils/helper.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/utils/helper.cuh @@ -5,15 +5,15 @@ #include #include -template inline __device__ const char *get_format(); +template __device__ inline const char *get_format(); -template <> inline __device__ const char *get_format() { return "%d, "; } +template <> __device__ inline const char *get_format() { return "%d, "; } -template <> inline __device__ const char *get_format() { +template <> __device__ inline const char *get_format() { return "%u, "; } -template <> inline __device__ const char *get_format() { +template <> __device__ inline const char *get_format() { return "%lu, "; } @@ -23,6 +23,15 @@ template __global__ void print_debug_kernel(const T *src, int N) { } } +template <> +__global__ inline void print_debug_kernel(const __uint128_t *src, int N) { + for (int i = 0; i < N; i++) { + uint64_t low = static_cast(src[i]); + uint64_t high = static_cast(src[i] >> 64); + printf("(%llu, %llu), ", high, low); + } +} + template <> __global__ inline void print_debug_kernel(const double2 *src, int N) { for (int i = 0; i < N; i++) { diff --git a/backends/tfhe-cuda-backend/src/bindings.rs b/backends/tfhe-cuda-backend/src/bindings.rs index 2041d9235..27816bfa6 100644 --- a/backends/tfhe-cuda-backend/src/bindings.rs +++ b/backends/tfhe-cuda-backend/src/bindings.rs @@ -2188,6 +2188,19 @@ unsafe extern "C" { grouping_factor: u32, ); } +unsafe extern "C" { + pub fn cuda_convert_lwe_multi_bit_programmable_bootstrap_key_128( + stream: *mut ffi::c_void, + gpu_index: u32, + dest: *mut ffi::c_void, + src: *const ffi::c_void, + input_lwe_dim: u32, + glwe_dim: u32, + level_count: u32, + polynomial_size: u32, + grouping_factor: u32, + ); +} unsafe extern "C" { pub fn scratch_cuda_multi_bit_programmable_bootstrap_64( stream: *mut ffi::c_void, @@ -2230,3 +2243,45 @@ unsafe extern "C" { pbs_buffer: *mut *mut i8, ); } +unsafe extern "C" { + pub fn scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64( + stream: *mut ffi::c_void, + gpu_index: u32, + buffer: *mut *mut i8, + glwe_dimension: u32, + polynomial_size: u32, + level_count: u32, + input_lwe_ciphertext_count: u32, + allocate_gpu_memory: bool, + ) -> u64; +} +unsafe extern "C" { + pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128( + stream: *mut ffi::c_void, + gpu_index: u32, + lwe_array_out: *mut ffi::c_void, + lwe_output_indexes: *const ffi::c_void, + lut_vector: *const ffi::c_void, + lut_vector_indexes: *const ffi::c_void, + lwe_array_in: *const ffi::c_void, + lwe_input_indexes: *const ffi::c_void, + bootstrapping_key: *const ffi::c_void, + mem_ptr: *mut i8, + lwe_dimension: u32, + glwe_dimension: u32, + polynomial_size: u32, + grouping_factor: u32, + base_log: u32, + level_count: u32, + num_samples: u32, + num_many_lut: u32, + lut_stride: u32, + ); +} +unsafe extern "C" { + pub fn cleanup_cuda_multi_bit_programmable_bootstrap_128( + stream: *mut ffi::c_void, + gpu_index: u32, + buffer: *mut *mut i8, + ); +} diff --git a/tfhe-benchmark/benches/core_crypto/pbs128_bench.rs b/tfhe-benchmark/benches/core_crypto/pbs128_bench.rs index 1b09ee34c..d55fb3b9c 100644 --- a/tfhe-benchmark/benches/core_crypto/pbs128_bench.rs +++ b/tfhe-benchmark/benches/core_crypto/pbs128_bench.rs @@ -165,7 +165,7 @@ fn pbs_128(c: &mut Criterion) { mod cuda { use benchmark::utilities::{ cuda_local_keys_core, 
cuda_local_streams_core, get_bench_type, throughput_num_threads, - write_to_json, BenchmarkType, CpuKeys, CpuKeysBuilder, CryptoParametersRecord, + write_to_json, BenchmarkType, CpuKeys, CpuKeysBuilder, CryptoParametersRecord, CudaIndexes, CudaLocalKeys, OperatorType, }; use criterion::{black_box, Criterion, Throughput}; @@ -173,12 +173,14 @@ mod cuda { use tfhe::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList; use tfhe::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList; use tfhe::core_crypto::gpu::{ + cuda_multi_bit_programmable_bootstrap_128_lwe_ciphertext, cuda_programmable_bootstrap_128_lwe_ciphertext, get_number_of_gpus, CudaStreams, }; use tfhe::core_crypto::prelude::*; use tfhe::shortint::engine::ShortintEngine; use tfhe::shortint::parameters::{ ModulusSwitchType, NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128, + NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128, PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128, }; use tfhe::shortint::server_key::ModulusSwitchNoiseReductionKey; @@ -441,14 +443,281 @@ mod cuda { ); } + fn cuda_multi_bit_pbs_128(c: &mut Criterion) { + let bench_name = "core_crypto::cuda::multi_bit_pbs128"; + let mut bench_group = c.benchmark_group(bench_name); + bench_group + .sample_size(10) + .measurement_time(std::time::Duration::from_secs(30)); + + type Scalar = u128; + let input_params = PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128; + let squash_params = + NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128; + + let lwe_noise_distribution_u64 = DynamicDistribution::new_t_uniform(46); + let ct_modulus_u64: CiphertextModulus = CiphertextModulus::new_native(); + + let params_name = "PARAMS_SWITCH_SQUASH"; + + let mut boxed_seeder = new_seeder(); + let seeder = boxed_seeder.as_mut(); + + let mut secret_generator = + SecretRandomGenerator::::new(seeder.seed()); + + let mut encryption_generator = + EncryptionRandomGenerator::::new(seeder.seed(), seeder); + + let input_lwe_secret_key = + LweSecretKey::generate_new_binary(input_params.lwe_dimension, &mut secret_generator); + + let output_glwe_secret_key = GlweSecretKey::>::generate_new_binary( + squash_params.glwe_dimension, + squash_params.polynomial_size, + &mut secret_generator, + ); + + let output_lwe_secret_key = output_glwe_secret_key.clone().into_lwe_secret_key(); + + let multi_bit_bsk = LweMultiBitBootstrapKey::new( + Scalar::ZERO, + squash_params.glwe_dimension.to_glwe_size(), + squash_params.polynomial_size, + squash_params.decomp_base_log, + squash_params.decomp_level_count, + input_params.lwe_dimension, + squash_params.grouping_factor, + squash_params.ciphertext_modulus, + ); + + let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new() + .multi_bit_bootstrap_key(multi_bit_bsk) + .build(); + + let message_modulus: u64 = 1 << 4; + let input_message: u64 = 3; + let delta: u64 = (1 << (u64::BITS - 1)) / message_modulus; + let plaintext = Plaintext(input_message * delta); + + let bench_id; + + match get_bench_type() { + BenchmarkType::Latency => { + let streams = CudaStreams::new_multi_gpu(); + let gpu_keys = CudaLocalKeys::from_cpu_keys(&cpu_keys, None, &streams); + + let lwe_ciphertext_in: LweCiphertextOwned = + allocate_and_encrypt_new_lwe_ciphertext( + &input_lwe_secret_key, + plaintext, + lwe_noise_distribution_u64, + ct_modulus_u64, + &mut encryption_generator, + ); + let lwe_ciphertext_in_gpu = + CudaLweCiphertextList::from_lwe_ciphertext(&lwe_ciphertext_in, 
&streams); + + let accumulator: GlweCiphertextOwned = GlweCiphertextOwned::new( + Scalar::ONE, + squash_params.glwe_dimension.to_glwe_size(), + squash_params.polynomial_size, + squash_params.ciphertext_modulus, + ); + let accumulator_gpu = + CudaGlweCiphertextList::from_glwe_ciphertext(&accumulator, &streams); + + let out_pbs_ct = LweCiphertext::new( + Scalar::ZERO, + output_lwe_secret_key.lwe_dimension().to_lwe_size(), + squash_params.ciphertext_modulus, + ); + let mut out_pbs_ct_gpu = + CudaLweCiphertextList::from_lwe_ciphertext(&out_pbs_ct, &streams); + + let h_indexes = [0]; + let cuda_indexes = CudaIndexes::new(&h_indexes, &streams, 0); + + bench_id = format!("{bench_name}::{params_name}"); + { + bench_group.bench_function(&bench_id, |b| { + b.iter(|| { + cuda_multi_bit_programmable_bootstrap_128_lwe_ciphertext( + &lwe_ciphertext_in_gpu, + &mut out_pbs_ct_gpu, + &accumulator_gpu, + &cuda_indexes.d_lut, + &cuda_indexes.d_output, + &cuda_indexes.d_input, + gpu_keys.multi_bit_bsk.as_ref().unwrap(), + &streams, + ); + black_box(&mut out_pbs_ct_gpu); + }) + }); + } + } + BenchmarkType::Throughput => { + let gpu_keys_vec = cuda_local_keys_core(&cpu_keys, None); + let gpu_count = get_number_of_gpus() as usize; + + bench_id = format!("{bench_name}::throughput::{params_name}"); + let blocks: usize = 1; + let elements = throughput_num_threads(blocks, 1); + let elements_per_stream = elements as usize / gpu_count; + bench_group.throughput(Throughput::Elements(elements)); + bench_group.bench_function(&bench_id, |b| { + let setup_encrypted_values = || { + let local_streams = cuda_local_streams_core(); + + let plaintext_list = + PlaintextList::new(u64::ZERO, PlaintextCount(elements_per_stream)); + + let input_cts = (0..gpu_count) + .map(|i| { + let mut input_ct_list = LweCiphertextList::new( + u64::ZERO, + input_lwe_secret_key.lwe_dimension().to_lwe_size(), + LweCiphertextCount(elements_per_stream), + ct_modulus_u64, + ); + + encrypt_lwe_ciphertext_list( + &input_lwe_secret_key, + &mut input_ct_list, + &plaintext_list, + lwe_noise_distribution_u64, + &mut encryption_generator, + ); + + CudaLweCiphertextList::from_lwe_ciphertext_list( + &input_ct_list, + &local_streams[i], + ) + }) + .collect::>(); + + let accumulators = (0..gpu_count) + .map(|i| { + let accumulator = GlweCiphertextOwned::new( + Scalar::ONE, + squash_params.glwe_dimension.to_glwe_size(), + squash_params.polynomial_size, + squash_params.ciphertext_modulus, + ); + CudaGlweCiphertextList::from_glwe_ciphertext( + &accumulator, + &local_streams[i], + ) + }) + .collect::>(); + + // Allocate the LweCiphertext to store the result of the PBS + let output_cts = (0..gpu_count) + .map(|i| { + let output_ct_list = LweCiphertextList::new( + Scalar::ZERO, + output_lwe_secret_key.lwe_dimension().to_lwe_size(), + LweCiphertextCount(elements_per_stream), + squash_params.ciphertext_modulus, + ); + CudaLweCiphertextList::from_lwe_ciphertext_list( + &output_ct_list, + &local_streams[i], + ) + }) + .collect::>(); + + let h_indexes = (0..(elements / gpu_count as u64)) + .map(CastFrom::cast_from) + .collect::>(); + let cuda_indexes_vec = (0..gpu_count) + .map(|i| CudaIndexes::new(&h_indexes, &local_streams[i], 0)) + .collect::>(); + local_streams.iter().for_each(|stream| stream.synchronize()); + + ( + input_cts, + output_cts, + accumulators, + cuda_indexes_vec, + local_streams, + ) + }; + + b.iter_batched( + setup_encrypted_values, + |( + input_cts, + mut output_cts, + accumulators, + cuda_indexes_vec, + local_streams, + )| { + (0..gpu_count) + 
.into_par_iter() + .zip(input_cts.par_iter()) + .zip(output_cts.par_iter_mut()) + .zip(accumulators.par_iter()) + .zip(local_streams.par_iter()) + .for_each( + |((((i, input_ct), output_ct), accumulator), local_stream)| { + cuda_multi_bit_programmable_bootstrap_128_lwe_ciphertext( + input_ct, + output_ct, + accumulator, + &cuda_indexes_vec[i].d_lut, + &cuda_indexes_vec[i].d_output, + &cuda_indexes_vec[i].d_input, + gpu_keys_vec[i].multi_bit_bsk.as_ref().unwrap(), + local_stream, + ); + }, + ) + }, + criterion::BatchSize::SmallInput, + ); + }); + } + }; + + let params_record = CryptoParametersRecord { + lwe_dimension: Some(input_params.lwe_dimension), + glwe_dimension: Some(squash_params.glwe_dimension), + polynomial_size: Some(squash_params.polynomial_size), + lwe_noise_distribution: Some(lwe_noise_distribution_u64), + glwe_noise_distribution: Some(input_params.glwe_noise_distribution), + pbs_base_log: Some(squash_params.decomp_base_log), + pbs_level: Some(squash_params.decomp_level_count), + ciphertext_modulus: Some(input_params.ciphertext_modulus), + ..Default::default() + }; + + let bit_size = (message_modulus as u32).ilog2(); + write_to_json( + &bench_id, + params_record, + params_name, + "pbs", + &OperatorType::Atomic, + bit_size, + vec![bit_size], + ); + } + pub fn cuda_pbs128_group() { let mut criterion: Criterion<_> = Criterion::default().configure_from_args(); cuda_pbs_128(&mut criterion); } + + pub fn cuda_multi_bit_pbs128_group() { + let mut criterion: Criterion<_> = Criterion::default().configure_from_args(); + cuda_multi_bit_pbs_128(&mut criterion); + } } #[cfg(feature = "gpu")] -use cuda::cuda_pbs128_group; +use cuda::{cuda_multi_bit_pbs128_group, cuda_pbs128_group}; pub fn pbs128_group() { let mut criterion: Criterion<_> = Criterion::default().configure_from_args(); @@ -458,6 +727,7 @@ pub fn pbs128_group() { #[cfg(feature = "gpu")] fn go_through_gpu_bench_groups() { cuda_pbs128_group(); + cuda_multi_bit_pbs128_group(); } #[cfg(not(feature = "gpu"))] diff --git a/tfhe-benchmark/src/utilities.rs b/tfhe-benchmark/src/utilities.rs index 4330e07ad..873740311 100644 --- a/tfhe-benchmark/src/utilities.rs +++ b/tfhe-benchmark/src/utilities.rs @@ -521,7 +521,7 @@ mod cuda_utils { pub ksk: Option>, pub pksk: Option>, pub bsk: Option, - pub multi_bit_bsk: Option, + pub multi_bit_bsk: Option>, } #[allow(dead_code)] diff --git a/tfhe/examples/utilities/params_to_file.rs b/tfhe/examples/utilities/params_to_file.rs index 8fa1b9fa9..9cfdd4347 100644 --- a/tfhe/examples/utilities/params_to_file.rs +++ b/tfhe/examples/utilities/params_to_file.rs @@ -13,6 +13,8 @@ use tfhe::shortint::parameters::current_params::{ VEC_ALL_COMPRESSION_PARAMETERS, VEC_ALL_HPU_PARAMETERS, VEC_ALL_KS32_PARAMETERS, VEC_ALL_MULTI_BIT_PBS_PARAMETERS, VEC_ALL_NOISE_SQUASHING_PARAMETERS, }; +use tfhe::shortint::parameters::noise_squashing::NoiseSquashingMultiBitParameters; +use tfhe::shortint::parameters::v1_3::VEC_ALL_NOISE_SQUASHING_MULTI_BIT_PARAMETERS; use tfhe::shortint::parameters::{ CompactPublicKeyEncryptionParameters, CompressionParameters, NoiseSquashingParameters, }; @@ -214,6 +216,36 @@ impl ParamDetails for NoiseSquashingParameters { } } +impl ParamDetails for NoiseSquashingMultiBitParameters { + fn lwe_dimension(&self) -> LweDimension { + panic!("lwe_dimension not applicable for NoiseSquashingMultiBitParameters") + } + + fn glwe_dimension(&self) -> GlweDimension { + self.glwe_dimension + } + + fn lwe_noise_distribution(&self) -> DynamicDistribution { + panic!("lwe_noise_distribution not applicable for 
NoiseSquashingMultiBitParameters") + } + + fn glwe_noise_distribution(&self) -> DynamicDistribution { + self.glwe_noise_distribution + } + + fn polynomial_size(&self) -> PolynomialSize { + self.polynomial_size + } + + fn lwe_ciphertext_modulus(&self) -> ParamModulus { + panic!("lwe_ciphertext_modulus not applicable for NoiseSquashingMultiBitParameters") + } + + fn glwe_ciphertext_modulus(&self) -> ParamModulus { + ParamModulus::from_ciphertext_modulus(self.ciphertext_modulus) + } +} + #[derive(Eq, PartialEq, Hash)] enum ParametersFormat { Lwe, @@ -493,6 +525,16 @@ fn main() { ParametersFormat::Glwe, ); + let noise_squasing_multi_bit_params: Vec<_> = VEC_ALL_NOISE_SQUASHING_MULTI_BIT_PARAMETERS + .into_iter() + .map(|p| (*p.0, Some(p.1))) + .collect(); + write_all_params_in_file( + "shortint_noise_squashing_multi_bit_parameters_lattice_estimator.sage", + &noise_squasing_multi_bit_params, + ParametersFormat::Glwe, + ); + let ks32_params: Vec<_> = VEC_ALL_KS32_PARAMETERS .into_iter() .map(|p| (AtomicPatternParameters::from(*p.0), Some(p.1))) diff --git a/tfhe/src/core_crypto/algorithms/test/mod.rs b/tfhe/src/core_crypto/algorithms/test/mod.rs index 5df7246d6..6df987a47 100644 --- a/tfhe/src/core_crypto/algorithms/test/mod.rs +++ b/tfhe/src/core_crypto/algorithms/test/mod.rs @@ -178,6 +178,36 @@ pub const DUMMY_31_U32: ClassicTestParams = ClassicTestParams { ciphertext_modulus: CiphertextModulus::new(1 << 31), }; +#[cfg(feature = "gpu")] +pub const NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128: + NoiseSquashingMultiBitTestParameters = NoiseSquashingMultiBitTestParameters { + glwe_dimension: GlweDimension(2), + polynomial_size: PolynomialSize(2048), + glwe_noise_distribution: DynamicDistribution::new_t_uniform(30), + decomp_base_log: DecompositionBaseLog(23), + decomp_level_count: DecompositionLevelCount(3), + grouping_factor: LweBskGroupingFactor(4), + message_modulus_log: MessageModulusLog(4), + ciphertext_modulus: CiphertextModulus::::new_native(), +}; + +#[cfg(feature = "gpu")] +pub const PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128: MultiBitTestParams< + u64, +> = MultiBitTestParams { + input_lwe_dimension: LweDimension(920), + lwe_noise_distribution: DynamicDistribution::new_t_uniform(45), + decomp_base_log: DecompositionBaseLog(22), + decomp_level_count: DecompositionLevelCount(1), + glwe_dimension: GlweDimension(1), + polynomial_size: PolynomialSize(2048), + glwe_noise_distribution: DynamicDistribution::new_t_uniform(17), + message_modulus_log: MessageModulusLog(4), + ciphertext_modulus: CiphertextModulus::new_native(), + grouping_factor: LweBskGroupingFactor(4), + thread_count: ThreadCount(5), +}; + pub const MULTI_BIT_2_2_2_PARAMS: MultiBitTestParams = MultiBitTestParams { input_lwe_dimension: LweDimension(818), lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev( diff --git a/tfhe/src/core_crypto/algorithms/test/params.rs b/tfhe/src/core_crypto/algorithms/test/params.rs index cbdb90223..89e18632f 100644 --- a/tfhe/src/core_crypto/algorithms/test/params.rs +++ b/tfhe/src/core_crypto/algorithms/test/params.rs @@ -3,6 +3,8 @@ use crate::core_crypto::entities::*; use crate::core_crypto::prelude::{CastFrom, CastInto, UnsignedInteger}; use crate::keycache::NamedParam; #[cfg(feature = "gpu")] +use crate::shortint::parameters::ModulusSwitchNoiseReductionParams; +#[cfg(feature = "gpu")] use crate::shortint::parameters::ModulusSwitchType; use serde::{Deserialize, Serialize}; @@ -22,6 +24,13 @@ pub 
struct MultiBitBootstrapKeys { pub fbsk: FourierLweMultiBitBootstrapKeyOwned, } +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub struct MultiBitStdBootstrapKeys { + pub small_lwe_sk: LweSecretKey>, + pub big_lwe_sk: LweSecretKey>, + pub bsk: LweMultiBitBootstrapKeyOwned, +} + // Fourier key is generated afterward in order to use generic test function #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct FftBootstrapKeys { @@ -81,6 +90,18 @@ pub struct MultiBitTestParams { pub thread_count: ThreadCount, } +#[derive(Clone, Copy, Debug, Serialize, Deserialize)] +pub struct NoiseSquashingMultiBitTestParameters { + pub glwe_dimension: GlweDimension, + pub polynomial_size: PolynomialSize, + pub glwe_noise_distribution: DynamicDistribution, + pub decomp_base_log: DecompositionBaseLog, + pub decomp_level_count: DecompositionLevelCount, + pub grouping_factor: LweBskGroupingFactor, + pub message_modulus_log: MessageModulusLog, + pub ciphertext_modulus: CiphertextModulus, +} + // PartialEq is implemented manually because thread_count doesn't affect key generation and we want // to change its value in test without the need of regenerating keys in the key cache. impl PartialEq for MultiBitTestParams { @@ -141,6 +162,21 @@ pub struct NoiseSquashingTestParams { pub modulus_switch_noise_reduction_params: ModulusSwitchType, pub ciphertext_modulus: CiphertextModulus, } +// Parameters to test NoiseSquashing implementation +#[cfg(feature = "gpu")] +#[derive(Clone, Copy, Debug, PartialEq, Serialize, Deserialize)] +pub struct NoiseSquashingMultiBitTestParams { + pub lwe_dimension: LweDimension, + pub glwe_dimension: GlweDimension, + pub polynomial_size: PolynomialSize, + pub lwe_noise_distribution: DynamicDistribution, + pub glwe_noise_distribution: DynamicDistribution, + pub pbs_base_log: DecompositionBaseLog, + pub pbs_level: DecompositionLevelCount, + pub grouping_factor: LweBskGroupingFactor, + pub modulus_switch_noise_reduction_params: Option, + pub ciphertext_modulus: CiphertextModulus, +} #[derive(Copy, Clone, Debug, PartialEq, Serialize, Deserialize)] pub struct PackingKeySwitchTestParams { diff --git a/tfhe/src/core_crypto/gpu/algorithms/lwe_multi_bit_programmable_bootstrapping.rs b/tfhe/src/core_crypto/gpu/algorithms/lwe_multi_bit_programmable_bootstrapping.rs index 775cce447..326ad192c 100644 --- a/tfhe/src/core_crypto/gpu/algorithms/lwe_multi_bit_programmable_bootstrapping.rs +++ b/tfhe/src/core_crypto/gpu/algorithms/lwe_multi_bit_programmable_bootstrapping.rs @@ -17,7 +17,7 @@ pub unsafe fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_async lut_indexes: &CudaVec, output_indexes: &CudaVec, input_indexes: &CudaVec, - multi_bit_bsk: &CudaLweMultiBitBootstrapKey, + multi_bit_bsk: &CudaLweMultiBitBootstrapKey, streams: &CudaStreams, ) where // CastInto required for PBS modulus switch which returns a usize @@ -151,7 +151,7 @@ pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext( lut_indexes: &CudaVec, output_indexes: &CudaVec, input_indexes: &CudaVec, - multi_bit_bsk: &CudaLweMultiBitBootstrapKey, + multi_bit_bsk: &CudaLweMultiBitBootstrapKey, streams: &CudaStreams, ) where // CastInto required for PBS modulus switch which returns a usize @@ -171,3 +171,162 @@ pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext( } streams.synchronize(); } + +/// # Safety +/// +/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must not +/// be dropped until streams is synchronised +#[allow(clippy::too_many_arguments)] 
+pub unsafe fn cuda_multi_bit_programmable_bootstrap_128_lwe_ciphertext_async( + input: &CudaLweCiphertextList, + output: &mut CudaLweCiphertextList, + accumulator: &CudaGlweCiphertextList, + lut_indexes: &CudaVec, + output_indexes: &CudaVec, + input_indexes: &CudaVec, + multi_bit_bsk: &CudaLweMultiBitBootstrapKey, + streams: &CudaStreams, +) where + // CastInto required for PBS modulus switch which returns a usize + OutputScalar: UnsignedTorus + CastInto, +{ + assert_eq!( + input.lwe_dimension(), + multi_bit_bsk.input_lwe_dimension(), + "Mismatched input LweDimension. LweCiphertext input LweDimension {:?}. \ + FourierLweMultiBitBootstrapKey input LweDimension {:?}.", + input.lwe_dimension(), + multi_bit_bsk.input_lwe_dimension(), + ); + + assert_eq!( + output.lwe_dimension(), + multi_bit_bsk.output_lwe_dimension(), + "Mismatched output LweDimension. LweCiphertext output LweDimension {:?}. \ + FourierLweMultiBitBootstrapKey output LweDimension {:?}.", + output.lwe_dimension(), + multi_bit_bsk.output_lwe_dimension(), + ); + + assert_eq!( + accumulator.glwe_dimension(), + multi_bit_bsk.glwe_dimension(), + "Mismatched GlweSize. Accumulator GlweSize {:?}. \ + FourierLweMultiBitBootstrapKey GlweSize {:?}.", + accumulator.glwe_dimension(), + multi_bit_bsk.glwe_dimension(), + ); + + assert_eq!( + accumulator.polynomial_size(), + multi_bit_bsk.polynomial_size(), + "Mismatched PolynomialSize. Accumulator PolynomialSize {:?}. \ + FourierLweMultiBitBootstrapKey PolynomialSize {:?}.", + accumulator.polynomial_size(), + multi_bit_bsk.polynomial_size(), + ); + + assert_eq!( + output.ciphertext_modulus(), + accumulator.ciphertext_modulus(), + "Mismatched CiphertextModulus between output ({:?}) and accumulator ({:?})", + input.ciphertext_modulus(), + accumulator.ciphertext_modulus(), + ); + assert_eq!( + streams.gpu_indexes[0], + multi_bit_bsk.d_vec.gpu_index(0), + "GPU error: first stream is on GPU {}, first bsk pointer is on GPU {}", + streams.gpu_indexes[0].get(), + multi_bit_bsk.d_vec.gpu_index(0).get(), + ); + assert_eq!( + streams.gpu_indexes[0], + input.0.d_vec.gpu_index(0), + "GPU error: first stream is on GPU {}, first input pointer is on GPU {}", + streams.gpu_indexes[0].get(), + input.0.d_vec.gpu_index(0).get(), + ); + assert_eq!( + streams.gpu_indexes[0], + output.0.d_vec.gpu_index(0), + "GPU error: first stream is on GPU {}, first output pointer is on GPU {}", + streams.gpu_indexes[0].get(), + output.0.d_vec.gpu_index(0).get(), + ); + assert_eq!( + streams.gpu_indexes[0], + accumulator.0.d_vec.gpu_index(0), + "GPU error: first stream is on GPU {}, first accumulator pointer is on GPU {}", + streams.gpu_indexes[0].get(), + accumulator.0.d_vec.gpu_index(0).get(), + ); + assert_eq!( + streams.gpu_indexes[0], + input_indexes.gpu_index(0), + "GPU error: first stream is on GPU {}, first input indexes pointer is on GPU {}", + streams.gpu_indexes[0].get(), + input_indexes.gpu_index(0).get(), + ); + assert_eq!( + streams.gpu_indexes[0], + output_indexes.gpu_index(0), + "GPU error: first stream is on GPU {}, first output indexes pointer is on GPU {}", + streams.gpu_indexes[0].get(), + output_indexes.gpu_index(0).get(), + ); + assert_eq!( + streams.gpu_indexes[0], + lut_indexes.gpu_index(0), + "GPU error: first stream is on GPU {}, first lut indexes pointer is on GPU {}", + streams.gpu_indexes[0].get(), + lut_indexes.gpu_index(0).get(), + ); + + programmable_bootstrap_multi_bit_async( + streams, + &mut output.0.d_vec, + output_indexes, + &accumulator.0.d_vec, + lut_indexes, + &input.0.d_vec, + 
input_indexes, + &multi_bit_bsk.d_vec, + input.lwe_dimension(), + multi_bit_bsk.glwe_dimension(), + multi_bit_bsk.polynomial_size(), + multi_bit_bsk.decomp_base_log(), + multi_bit_bsk.decomp_level_count(), + multi_bit_bsk.grouping_factor(), + input.lwe_ciphertext_count().0 as u32, + ); +} + +#[allow(clippy::too_many_arguments)] +pub fn cuda_multi_bit_programmable_bootstrap_128_lwe_ciphertext( + input: &CudaLweCiphertextList, + output: &mut CudaLweCiphertextList, + accumulator: &CudaGlweCiphertextList, + lut_indexes: &CudaVec, + output_indexes: &CudaVec, + input_indexes: &CudaVec, + multi_bit_bsk: &CudaLweMultiBitBootstrapKey, + streams: &CudaStreams, +) where + // CastInto required for PBS modulus switch which returns a usize + Scalar: UnsignedTorus + CastInto, +{ + unsafe { + cuda_multi_bit_programmable_bootstrap_128_lwe_ciphertext_async( + input, + output, + accumulator, + lut_indexes, + output_indexes, + input_indexes, + multi_bit_bsk, + streams, + ); + } + streams.synchronize(); +} diff --git a/tfhe/src/core_crypto/gpu/algorithms/test/lwe_multi_bit_programmable_bootstrapping_128.rs b/tfhe/src/core_crypto/gpu/algorithms/test/lwe_multi_bit_programmable_bootstrapping_128.rs new file mode 100644 index 000000000..e329d8d5c --- /dev/null +++ b/tfhe/src/core_crypto/gpu/algorithms/test/lwe_multi_bit_programmable_bootstrapping_128.rs @@ -0,0 +1,187 @@ +use super::*; +use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList; +use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList; +use crate::core_crypto::gpu::lwe_multi_bit_bootstrap_key::CudaLweMultiBitBootstrapKey; +use crate::core_crypto::gpu::vec::{CudaVec, GpuIndex}; +use crate::core_crypto::gpu::{ + cuda_multi_bit_programmable_bootstrap_128_lwe_ciphertext, CudaStreams, +}; +use crate::core_crypto::prelude::misc::check_encrypted_content_respects_mod; +use itertools::Itertools; + +fn execute_multibit_bootstrap_u128( + squash_params: NoiseSquashingMultiBitTestParameters, + input_params: MultiBitTestParams, +) { + let input_lwe_dimension = input_params.input_lwe_dimension; + let lwe_noise_distribution = input_params.lwe_noise_distribution; + let glwe_noise_distribution = squash_params.glwe_noise_distribution; + let ciphertext_modulus = squash_params.ciphertext_modulus; + let ciphertext_modulus_64 = CiphertextModulus::new_native(); + let msg_modulus = input_params.message_modulus_log; + let encoding_with_padding = get_encoding_with_padding(ciphertext_modulus); + let encoding_with_padding_64: u64 = get_encoding_with_padding(ciphertext_modulus_64); + let glwe_dimension = squash_params.glwe_dimension; + let polynomial_size = squash_params.polynomial_size; + let decomp_base_log = squash_params.decomp_base_log; + let decomp_level_count = squash_params.decomp_level_count; + let grouping_factor = squash_params.grouping_factor; + + let gpu_index = 0; + let stream = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index)); + + let mut rsc = TestResources::new(); + + let f = |x: u128| x % msg_modulus.0 as u128; + + let delta = encoding_with_padding / msg_modulus.0 as u128; + let delta_64 = encoding_with_padding_64 / msg_modulus.0 as u64; + let mut msg = msg_modulus.0 as u64; + const NB_TESTS: usize = 10; + let number_of_messages = 1; + + let accumulator = generate_programmable_bootstrap_glwe_lut( + polynomial_size, + glwe_dimension.to_glwe_size(), + msg_modulus.0.cast_into(), + ciphertext_modulus, + delta, + f, + ); + + assert!(check_encrypted_content_respects_mod( + &accumulator, + ciphertext_modulus + )); + + // Create 
the LweSecretKey + let small_lwe_sk: LweSecretKeyOwned = allocate_and_generate_new_binary_lwe_secret_key( + input_lwe_dimension, + &mut rsc.secret_random_generator, + ); + let input_lwe_secret_key = LweSecretKey::from_container( + small_lwe_sk + .clone() + .into_container() + .iter() + .copied() + .map(|x| x as u64) + .collect::>(), + ); + let output_glwe_secret_key: GlweSecretKeyOwned = + allocate_and_generate_new_binary_glwe_secret_key( + glwe_dimension, + polynomial_size, + &mut rsc.secret_random_generator, + ); + let output_lwe_secret_key = output_glwe_secret_key.clone().into_lwe_secret_key(); + let output_lwe_dimension = output_lwe_secret_key.lwe_dimension(); + + let mut bsk = LweMultiBitBootstrapKey::new( + 0u128, + glwe_dimension.to_glwe_size(), + polynomial_size, + decomp_base_log, + decomp_level_count, + input_lwe_dimension, + grouping_factor, + ciphertext_modulus, + ); + + par_generate_lwe_multi_bit_bootstrap_key( + &small_lwe_sk, + &output_glwe_secret_key, + &mut bsk, + glwe_noise_distribution, + &mut rsc.encryption_random_generator, + ); + + assert!(check_encrypted_content_respects_mod( + &*bsk, + ciphertext_modulus + )); + + let d_bsk = CudaLweMultiBitBootstrapKey::from_lwe_multi_bit_bootstrap_key(&bsk, &stream); + + while msg != 0 { + msg -= 1; + for _ in 0..NB_TESTS { + let plaintext = Plaintext(msg * delta_64); + + let lwe_ciphertext_in = allocate_and_encrypt_new_lwe_ciphertext( + &input_lwe_secret_key, + plaintext, + lwe_noise_distribution, + ciphertext_modulus_64, + &mut rsc.encryption_random_generator, + ); + + assert!(check_encrypted_content_respects_mod( + &lwe_ciphertext_in, + ciphertext_modulus_64 + )); + + let d_lwe_ciphertext_in = + CudaLweCiphertextList::from_lwe_ciphertext(&lwe_ciphertext_in, &stream); + let mut d_out_pbs_ct = CudaLweCiphertextList::new( + output_lwe_dimension, + LweCiphertextCount(1), + ciphertext_modulus, + &stream, + ); + let d_accumulator = CudaGlweCiphertextList::from_glwe_ciphertext(&accumulator, &stream); + + let mut test_vector_indexes: Vec = vec![0; number_of_messages]; + for (i, ind) in test_vector_indexes.iter_mut().enumerate() { + *ind = >::cast_into(i); + } + + let mut d_test_vector_indexes = + unsafe { CudaVec::::new_async(number_of_messages, &stream, 0) }; + unsafe { d_test_vector_indexes.copy_from_cpu_async(&test_vector_indexes, &stream, 0) }; + + let num_blocks = d_lwe_ciphertext_in.0.lwe_ciphertext_count.0; + let lwe_indexes_usize: Vec = (0..num_blocks).collect_vec(); + let lwe_indexes = lwe_indexes_usize + .iter() + .map(|&x| >::cast_into(x)) + .collect_vec(); + let mut d_output_indexes = unsafe { CudaVec::::new_async(num_blocks, &stream, 0) }; + let mut d_input_indexes = unsafe { CudaVec::::new_async(num_blocks, &stream, 0) }; + unsafe { + d_input_indexes.copy_from_cpu_async(&lwe_indexes, &stream, 0); + d_output_indexes.copy_from_cpu_async(&lwe_indexes, &stream, 0); + } + + cuda_multi_bit_programmable_bootstrap_128_lwe_ciphertext( + &d_lwe_ciphertext_in, + &mut d_out_pbs_ct, + &d_accumulator, + &d_test_vector_indexes, + &d_output_indexes, + &d_input_indexes, + &d_bsk, + &stream, + ); + + let out_pbs_ct = d_out_pbs_ct.into_lwe_ciphertext(&stream); + assert!(check_encrypted_content_respects_mod( + &out_pbs_ct, + ciphertext_modulus + )); + + let decrypted = decrypt_lwe_ciphertext(&output_lwe_secret_key, &out_pbs_ct); + + let decoded = round_decode(decrypted.0, delta) % msg_modulus.0 as u128; + assert_eq!(decoded, f(msg as u128)); + } + } +} + +#[test] +fn test_multibit_bootstrap_u128_with_squashing() { + 
execute_multibit_bootstrap_u128( + NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128, + PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128, + ); +} diff --git a/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs b/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs index 9847661ad..d8d986245 100644 --- a/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs +++ b/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs @@ -7,11 +7,13 @@ mod glwe_sample_extraction; mod lwe_keyswitch; mod lwe_linear_algebra; mod lwe_multi_bit_programmable_bootstrapping; +mod lwe_multi_bit_programmable_bootstrapping_128; mod lwe_packing_keyswitch; mod lwe_programmable_bootstrapping; mod lwe_programmable_bootstrapping_128; mod modulus_switch_noise_reduction; mod noise_distribution; + pub struct CudaPackingKeySwitchKeys { pub lwe_sk: LweSecretKey>, pub glwe_sk: GlweSecretKey>, diff --git a/tfhe/src/core_crypto/gpu/entities/lwe_multi_bit_bootstrap_key.rs b/tfhe/src/core_crypto/gpu/entities/lwe_multi_bit_bootstrap_key.rs index 3d23c68ea..0ad4bc183 100644 --- a/tfhe/src/core_crypto/gpu/entities/lwe_multi_bit_bootstrap_key.rs +++ b/tfhe/src/core_crypto/gpu/entities/lwe_multi_bit_bootstrap_key.rs @@ -10,9 +10,9 @@ use crate::core_crypto::prelude::{ /// A structure representing a vector of GLWE ciphertexts with 64 bits of precision on the GPU. #[derive(Debug)] -pub struct CudaLweMultiBitBootstrapKey { +pub struct CudaLweMultiBitBootstrapKey { // Pointers to GPU data - pub(crate) d_vec: CudaVec, + pub(crate) d_vec: CudaVec, // Lwe dimension pub(crate) input_lwe_dimension: LweDimension, // Glwe dimension @@ -27,14 +27,11 @@ pub struct CudaLweMultiBitBootstrapKey { pub(crate) grouping_factor: LweBskGroupingFactor, } -impl CudaLweMultiBitBootstrapKey { - pub fn from_lwe_multi_bit_bootstrap_key( +impl CudaLweMultiBitBootstrapKey { + pub fn from_lwe_multi_bit_bootstrap_key>( bsk: &LweMultiBitBootstrapKey, streams: &CudaStreams, - ) -> Self - where - InputBskCont::Element: UnsignedInteger, - { + ) -> Self { let input_lwe_dimension = bsk.input_lwe_dimension(); let polynomial_size = bsk.polynomial_size(); let decomp_level_count = bsk.decomposition_level_count(); @@ -43,7 +40,7 @@ impl CudaLweMultiBitBootstrapKey { let grouping_factor = bsk.grouping_factor(); // Allocate memory - let mut d_vec = CudaVec::::new_multi_gpu( + let mut d_vec = CudaVec::::new_multi_gpu( lwe_multi_bit_bootstrap_key_size( input_lwe_dimension, glwe_dimension.to_glwe_size(), diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs index ff119c24b..1803a11d8 100644 --- a/tfhe/src/core_crypto/gpu/mod.rs +++ b/tfhe/src/core_crypto/gpu/mod.rs @@ -14,9 +14,11 @@ use crate::core_crypto::prelude::{ }; pub use algorithms::*; pub use entities::*; +use std::any::{Any, TypeId}; use std::ffi::c_void; use tfhe_cuda_backend::bindings::*; use tfhe_cuda_backend::cuda_bind::*; + pub struct CudaStreams { pub ptr: Vec<*mut c_void>, pub gpu_indexes: Vec, @@ -311,15 +313,18 @@ pub unsafe fn programmable_bootstrap_128_async( /// [CudaStreams::synchronize] __must__ be called as soon as synchronization is /// required #[allow(clippy::too_many_arguments)] -pub unsafe fn programmable_bootstrap_multi_bit_async( +pub unsafe fn programmable_bootstrap_multi_bit_async< + T: UnsignedInteger, + B: Any + UnsignedInteger, +>( streams: &CudaStreams, - lwe_array_out: &mut CudaVec, + lwe_array_out: &mut CudaVec, output_indexes: &CudaVec, - test_vector: &CudaVec, + test_vector: &CudaVec, test_vector_indexes: &CudaVec, lwe_array_in: &CudaVec, 
input_indexes: &CudaVec, - bootstrapping_key: &CudaVec, + bootstrapping_key: &CudaVec, lwe_dimension: LweDimension, glwe_dimension: GlweDimension, polynomial_size: PolynomialSize, @@ -331,42 +336,83 @@ pub unsafe fn programmable_bootstrap_multi_bit_async( let num_many_lut = 1u32; let lut_stride = 0u32; let mut pbs_buffer: *mut i8 = std::ptr::null_mut(); - scratch_cuda_multi_bit_programmable_bootstrap_64( - streams.ptr[0], - streams.gpu_indexes[0].get(), - std::ptr::addr_of_mut!(pbs_buffer), - glwe_dimension.0 as u32, - polynomial_size.0 as u32, - level.0 as u32, - num_samples, - true, - ); - cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( - streams.ptr[0], - streams.gpu_indexes[0].get(), - lwe_array_out.as_mut_c_ptr(0), - output_indexes.as_c_ptr(0), - test_vector.as_c_ptr(0), - test_vector_indexes.as_c_ptr(0), - lwe_array_in.as_c_ptr(0), - input_indexes.as_c_ptr(0), - bootstrapping_key.as_c_ptr(0), - pbs_buffer, - lwe_dimension.0 as u32, - glwe_dimension.0 as u32, - polynomial_size.0 as u32, - grouping_factor.0 as u32, - base_log.0 as u32, - level.0 as u32, - num_samples, - num_many_lut, - lut_stride, - ); - cleanup_cuda_multi_bit_programmable_bootstrap( - streams.ptr[0], - streams.gpu_indexes[0].get(), - std::ptr::addr_of_mut!(pbs_buffer), - ); + if TypeId::of::() == TypeId::of::() { + scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64( + streams.ptr[0], + streams.gpu_indexes[0].get(), + std::ptr::addr_of_mut!(pbs_buffer), + glwe_dimension.0 as u32, + polynomial_size.0 as u32, + level.0 as u32, + num_samples, + true, + ); + cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128( + streams.ptr[0], + streams.gpu_indexes[0].get(), + lwe_array_out.as_mut_c_ptr(0), + output_indexes.as_c_ptr(0), + test_vector.as_c_ptr(0), + test_vector_indexes.as_c_ptr(0), + lwe_array_in.as_c_ptr(0), + input_indexes.as_c_ptr(0), + bootstrapping_key.as_c_ptr(0), + pbs_buffer, + lwe_dimension.0 as u32, + glwe_dimension.0 as u32, + polynomial_size.0 as u32, + grouping_factor.0 as u32, + base_log.0 as u32, + level.0 as u32, + num_samples, + num_many_lut, + lut_stride, + ); + cleanup_cuda_multi_bit_programmable_bootstrap_128( + streams.ptr[0], + streams.gpu_indexes[0].get(), + std::ptr::addr_of_mut!(pbs_buffer), + ); + } else if TypeId::of::() == TypeId::of::() { + scratch_cuda_multi_bit_programmable_bootstrap_64( + streams.ptr[0], + streams.gpu_indexes[0].get(), + std::ptr::addr_of_mut!(pbs_buffer), + glwe_dimension.0 as u32, + polynomial_size.0 as u32, + level.0 as u32, + num_samples, + true, + ); + cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( + streams.ptr[0], + streams.gpu_indexes[0].get(), + lwe_array_out.as_mut_c_ptr(0), + output_indexes.as_c_ptr(0), + test_vector.as_c_ptr(0), + test_vector_indexes.as_c_ptr(0), + lwe_array_in.as_c_ptr(0), + input_indexes.as_c_ptr(0), + bootstrapping_key.as_c_ptr(0), + pbs_buffer, + lwe_dimension.0 as u32, + glwe_dimension.0 as u32, + polynomial_size.0 as u32, + grouping_factor.0 as u32, + base_log.0 as u32, + level.0 as u32, + num_samples, + num_many_lut, + lut_stride, + ); + cleanup_cuda_multi_bit_programmable_bootstrap( + streams.ptr[0], + streams.gpu_indexes[0].get(), + std::ptr::addr_of_mut!(pbs_buffer), + ); + } else { + panic!("Unsupported torus size") + } } #[allow(clippy::too_many_arguments)] @@ -607,9 +653,9 @@ pub unsafe fn convert_lwe_programmable_bootstrap_key_async( /// [CudaStreams::synchronize] __must__ be called as soon as synchronization is /// required #[allow(clippy::too_many_arguments)] -pub unsafe fn 
+pub unsafe fn convert_lwe_multi_bit_programmable_bootstrap_key_async<T: UnsignedInteger + Any>(
     streams: &CudaStreams,
-    dest: &mut CudaVec<u64>,
+    dest: &mut CudaVec<T>,
     src: &[T],
     input_lwe_dim: LweDimension,
     glwe_dim: GlweDimension,
@@ -620,17 +666,34 @@ pub unsafe fn convert_lwe_multi_bit_programmable_bootstrap_key_async(
         (), size);
-        cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
-            stream_ptr,
-            streams.gpu_indexes[i].get(),
-            dest.as_mut_c_ptr(i as u32),
-            src.as_ptr().cast(),
-            input_lwe_dim.0 as u32,
-            glwe_dim.0 as u32,
-            l_gadget.0 as u32,
-            polynomial_size.0 as u32,
-            grouping_factor.0 as u32,
-        );
+
+        if TypeId::of::<T>() == TypeId::of::<u128>() {
+            cuda_convert_lwe_multi_bit_programmable_bootstrap_key_128(
+                stream_ptr,
+                streams.gpu_indexes[i].get(),
+                dest.as_mut_c_ptr(i as u32),
+                src.as_ptr().cast(),
+                input_lwe_dim.0 as u32,
+                glwe_dim.0 as u32,
+                l_gadget.0 as u32,
+                polynomial_size.0 as u32,
+                grouping_factor.0 as u32,
+            );
+        } else if TypeId::of::<T>() == TypeId::of::<u64>() {
+            cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
+                stream_ptr,
+                streams.gpu_indexes[i].get(),
+                dest.as_mut_c_ptr(i as u32),
+                src.as_ptr().cast(),
+                input_lwe_dim.0 as u32,
+                glwe_dim.0 as u32,
+                l_gadget.0 as u32,
+                polynomial_size.0 as u32,
+                grouping_factor.0 as u32,
+            );
+        } else {
+            panic!("Unsupported torus size for bsk conversion")
+        }
     }
 }
diff --git a/tfhe/src/integer/gpu/list_compression/server_keys.rs b/tfhe/src/integer/gpu/list_compression/server_keys.rs
index 18d1acfd5..244d12706 100644
--- a/tfhe/src/integer/gpu/list_compression/server_keys.rs
+++ b/tfhe/src/integer/gpu/list_compression/server_keys.rs
@@ -30,7 +30,7 @@ pub struct CudaCompressionKey {
 }
 
 pub struct CudaDecompressionKey {
-    pub blind_rotate_key: CudaBootstrappingKey,
+    pub blind_rotate_key: CudaBootstrappingKey<u64>,
     pub lwe_per_glwe: LweCiphertextCount,
     pub glwe_dimension: GlweDimension,
     pub polynomial_size: PolynomialSize,
diff --git a/tfhe/src/integer/gpu/server_key/mod.rs b/tfhe/src/integer/gpu/server_key/mod.rs
index 6d37feac0..4920e2ed1 100644
--- a/tfhe/src/integer/gpu/server_key/mod.rs
+++ b/tfhe/src/integer/gpu/server_key/mod.rs
@@ -22,9 +22,9 @@ use crate::shortint::{CarryModulus, CiphertextModulus, MessageModulus, PBSOrder}
 
 mod radix;
 
-pub enum CudaBootstrappingKey {
+pub enum CudaBootstrappingKey<T: UnsignedInteger> {
     Classic(CudaLweBootstrapKey),
-    MultiBit(CudaLweMultiBitBootstrapKey),
+    MultiBit(CudaLweMultiBitBootstrapKey<T>),
 }
 
 /// A structure containing the server public key.
@@ -34,7 +34,7 @@ pub enum CudaBootstrappingKey {
 // #[derive(PartialEq, Serialize, Deserialize)]
 pub struct CudaServerKey {
     pub key_switching_key: CudaLweKeyswitchKey<u64>,
-    pub bootstrapping_key: CudaBootstrappingKey,
+    pub bootstrapping_key: CudaBootstrappingKey<u64>,
     // Size of the message buffer
     pub message_modulus: MessageModulus,
     // Size of the carry buffer
diff --git a/tfhe/src/shortint/backward_compatibility/parameters/noise_squashing.rs b/tfhe/src/shortint/backward_compatibility/parameters/noise_squashing.rs
index 7a05295e7..3cd79fa62 100644
--- a/tfhe/src/shortint/backward_compatibility/parameters/noise_squashing.rs
+++ b/tfhe/src/shortint/backward_compatibility/parameters/noise_squashing.rs
@@ -1,6 +1,6 @@
 use crate::core_crypto::prelude::*;
 use crate::shortint::parameters::noise_squashing::{
-    NoiseSquashingCompressionParameters, NoiseSquashingParameters,
+    NoiseSquashingCompressionParameters, NoiseSquashingMultiBitParameters, NoiseSquashingParameters,
 };
 use crate::shortint::parameters::{
     CoreCiphertextModulus, ModulusSwitchNoiseReductionParams, ModulusSwitchType,
@@ -69,3 +69,8 @@ pub enum NoiseSquashingParametersVersions {
 pub enum NoiseSquashingCompressionParametersVersions {
     V0(NoiseSquashingCompressionParameters),
 }
+
+#[derive(VersionsDispatch)]
+pub enum NoiseSquashingMultiBitParametersVersions {
+    V0(NoiseSquashingMultiBitParameters),
+}
diff --git a/tfhe/src/shortint/keycache.rs b/tfhe/src/shortint/keycache.rs
index 770de5e5d..7513a7288 100644
--- a/tfhe/src/shortint/keycache.rs
+++ b/tfhe/src/shortint/keycache.rs
@@ -1,18 +1,19 @@
 use std::sync::LazyLock;
 
+use super::atomic_pattern::AtomicPatternParameters;
 use crate::keycache::utils::named_params_impl;
 use crate::keycache::*;
 #[cfg(tarpaulin)]
 use crate::shortint::parameters::coverage_parameters::*;
 use crate::shortint::parameters::current_params::*;
+use crate::shortint::parameters::noise_squashing::NoiseSquashingMultiBitParameters;
 use crate::shortint::parameters::parameters_wopbs::*;
+use crate::shortint::parameters::v1_3::V1_3_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
 use crate::shortint::parameters::*;
 use crate::shortint::wopbs::WopbsKey;
 use crate::shortint::{ClientKey, KeySwitchingKey, ServerKey};
 use serde::{Deserialize, Serialize};
 
-use super::atomic_pattern::AtomicPatternParameters;
-
 named_params_impl!( ShortintParameterSet =>
     V1_3_PARAM_MESSAGE_1_CARRY_0_KS_PBS_GAUSSIAN_2M128,
     V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
@@ -492,6 +493,10 @@ named_params_impl!( NoiseSquashingParameters =>
     V1_3_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
 );
 
+named_params_impl!( NoiseSquashingMultiBitParameters =>
+    V1_3_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
+);
+
 impl From<AtomicPatternParameters> for (ClientKey, ServerKey) {
     fn from(param: AtomicPatternParameters) -> Self {
         let param_set = ShortintParameterSet::from(param);
diff --git a/tfhe/src/shortint/parameters/aliases.rs b/tfhe/src/shortint/parameters/aliases.rs
index 42740f460..802ade433 100644
--- a/tfhe/src/shortint/parameters/aliases.rs
+++ b/tfhe/src/shortint/parameters/aliases.rs
@@ -46,8 +46,13 @@ use current_params::multi_bit::tuniform::p_fail_2_minus_64::ks_pbs_gpu::{
 };
 use current_params::noise_squashing::p_fail_2_minus_128::V1_3_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
 
-use super::current_params::V1_3_NOISE_SQUASHING_COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
+use super::current_params::{
+    V1_3_NOISE_SQUASHING_COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
+    V1_3_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
+};
 use super::NoiseSquashingCompressionParameters;
+use crate::shortint::parameters::noise_squashing::NoiseSquashingMultiBitParameters;
+
 // Aliases
 
 // Compute Gaussian
@@ -125,6 +130,10 @@ pub const NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128:
     NoiseSquashingParameters = V1_3_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
 
+pub const NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128:
+    NoiseSquashingMultiBitParameters =
+    V1_3_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
+
 // GPU 2^-64
 // GPU TUniform
 pub const PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_1_CARRY_1_KS_PBS_TUNIFORM_2M64:
diff --git a/tfhe/src/shortint/parameters/noise_squashing.rs b/tfhe/src/shortint/parameters/noise_squashing.rs
index d736de54c..5f79451ce 100644
--- a/tfhe/src/shortint/parameters/noise_squashing.rs
+++ b/tfhe/src/shortint/parameters/noise_squashing.rs
@@ -1,3 +1,4 @@
+use crate::core_crypto::prelude::LweBskGroupingFactor;
 use crate::shortint::backward_compatibility::parameters::noise_squashing::*;
 use crate::shortint::parameters::{
     CarryModulus, CoreCiphertextModulus, DecompositionBaseLog, DecompositionLevelCount,
@@ -34,3 +35,17 @@ pub struct NoiseSquashingCompressionParameters {
     pub carry_modulus: CarryModulus,
     pub ciphertext_modulus: CoreCiphertextModulus<u128>,
 }
+
+#[derive(Copy, Clone, Debug, PartialEq, Serialize, Deserialize, Versionize)]
+#[versionize(NoiseSquashingMultiBitParametersVersions)]
+pub struct NoiseSquashingMultiBitParameters {
+    pub glwe_dimension: GlweDimension,
+    pub polynomial_size: PolynomialSize,
+    pub glwe_noise_distribution: DynamicDistribution<u128>,
+    pub decomp_base_log: DecompositionBaseLog,
+    pub decomp_level_count: DecompositionLevelCount,
+    pub grouping_factor: LweBskGroupingFactor,
+    pub message_modulus: MessageModulus,
+    pub carry_modulus: CarryModulus,
+    pub ciphertext_modulus: CoreCiphertextModulus<u128>,
+}
diff --git a/tfhe/src/shortint/parameters/v1_3/mod.rs b/tfhe/src/shortint/parameters/v1_3/mod.rs
index 4c4810a73..91958138b 100644
--- a/tfhe/src/shortint/parameters/v1_3/mod.rs
+++ b/tfhe/src/shortint/parameters/v1_3/mod.rs
@@ -43,6 +43,7 @@ pub use noise_squashing::p_fail_2_minus_128::*;
 #[cfg(feature = "hpu")]
 pub use hpu::*;
 
+use crate::shortint::parameters::noise_squashing::NoiseSquashingMultiBitParameters;
 use crate::shortint::parameters::{
     ClassicPBSParameters, CompactPublicKeyEncryptionParameters, CompressionParameters,
     KeySwitch32PBSParameters, MultiBitPBSParameters, NoiseSquashingCompressionParameters,
@@ -1700,6 +1701,14 @@ pub const VEC_ALL_NOISE_SQUASHING_PARAMETERS: [(&NoiseSquashingParameters, &str)
     ),
 ];
 
+pub const VEC_ALL_NOISE_SQUASHING_MULTI_BIT_PARAMETERS: [(
+    &NoiseSquashingMultiBitParameters,
+    &str,
+); 1] = [(
+    &V1_3_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
+    "V1_3_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128",
+)];
+
 /// All [`NoiseSquashingCompressionParameters`] in this module.
 pub const VEC_ALL_NOISE_SQUASHING_COMPRESSION_PARAMETERS: [(
     &NoiseSquashingCompressionParameters,
diff --git a/tfhe/src/shortint/parameters/v1_3/noise_squashing/p_fail_2_minus_128/mod.rs b/tfhe/src/shortint/parameters/v1_3/noise_squashing/p_fail_2_minus_128/mod.rs
index 12a5ca832..d78e5ae52 100644
--- a/tfhe/src/shortint/parameters/v1_3/noise_squashing/p_fail_2_minus_128/mod.rs
+++ b/tfhe/src/shortint/parameters/v1_3/noise_squashing/p_fail_2_minus_128/mod.rs
@@ -1,6 +1,7 @@
+use crate::shortint::parameters::noise_squashing::NoiseSquashingMultiBitParameters;
 use crate::shortint::parameters::{
     CarryModulus, CoreCiphertextModulus, DecompositionBaseLog, DecompositionLevelCount,
-    DynamicDistribution, GlweDimension, LweCiphertextCount, MessageModulus,
+    DynamicDistribution, GlweDimension, LweBskGroupingFactor, LweCiphertextCount, MessageModulus,
     ModulusSwitchNoiseReductionParams, ModulusSwitchType, NoiseEstimationMeasureBound,
     NoiseSquashingCompressionParameters, NoiseSquashingParameters, PolynomialSize, RSigmaFactor,
     Variance,
@@ -58,3 +59,16 @@ pub const V1_3_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128
     carry_modulus: CarryModulus(4),
     ciphertext_modulus: CoreCiphertextModulus::<u128>::new_native(),
 };
+
+pub const V1_3_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128:
+    NoiseSquashingMultiBitParameters = NoiseSquashingMultiBitParameters {
+    glwe_dimension: GlweDimension(2),
+    polynomial_size: PolynomialSize(2048),
+    glwe_noise_distribution: DynamicDistribution::new_t_uniform(30),
+    decomp_base_log: DecompositionBaseLog(23),
+    decomp_level_count: DecompositionLevelCount(3),
+    grouping_factor: LweBskGroupingFactor(4),
+    message_modulus: MessageModulus(4),
+    carry_modulus: CarryModulus(4),
+    ciphertext_modulus: CoreCiphertextModulus::<u128>::new_native(),
+};
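
Note on the dispatch style used above (not part of the diff): both programmable_bootstrap_multi_bit_async and the bootstrap-key conversion helper pick the 64-bit or 128-bit CUDA entry point at runtime by comparing TypeIds of the scalar type, with an explicit panic for any other width. The self-contained sketch below uses illustrative stand-in names (dispatch_on_torus_width, launch_multi_bit_pbs_64/128 are not tfhe-rs APIs) and only shows the shape of that dispatch and why the trailing else arm is needed.

use std::any::{Any, TypeId};

// Stand-ins for the 64-bit and 128-bit CUDA launch functions; the real code
// calls into tfhe-cuda-backend instead.
fn launch_multi_bit_pbs_64() {
    println!("64-bit multi-bit PBS path");
}
fn launch_multi_bit_pbs_128() {
    println!("128-bit multi-bit PBS path");
}

// Same shape as the new dispatch: the scalar type parameter only selects which
// foreign function family is called, so unsupported widths must be rejected at
// runtime rather than by the type system.
fn dispatch_on_torus_width<B: Any>() {
    if TypeId::of::<B>() == TypeId::of::<u128>() {
        launch_multi_bit_pbs_128();
    } else if TypeId::of::<B>() == TypeId::of::<u64>() {
        launch_multi_bit_pbs_64();
    } else {
        panic!("Unsupported torus size");
    }
}

fn main() {
    dispatch_on_torus_width::<u64>(); // takes the 64-bit branch
    dispatch_on_torus_width::<u128>(); // takes the 128-bit branch
}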