mirror of
https://github.com/zama-ai/tfhe-rs.git
synced 2026-01-09 14:47:56 -05:00
feat(gpu): add modulus switch noise reduction gpu
This commit is contained in:
committed by
Agnès Leroy
parent
ac4d36d6f6
commit
9eb6d5afd1
@@ -4,6 +4,7 @@
|
||||
#include "stdint.h"
|
||||
|
||||
extern "C" {
|
||||
|
||||
void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *stream,
|
||||
uint32_t gpu_index,
|
||||
void *dest, void const *src,
|
||||
@@ -20,5 +21,16 @@ void cuda_glwe_sample_extract_64(void *stream, uint32_t gpu_index,
|
||||
uint32_t const *nth_array, uint32_t num_nths,
|
||||
uint32_t lwe_per_glwe, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size);
|
||||
|
||||
// Declaration only; the definition is not visible in this excerpt.
// Modulus-switch entry point for 64-bit data that takes a single buffer
// (lwe_array_out) and no separate input pointer, i.e. it works in place.
//   stream        : opaque stream handle (passed as void *).
//   gpu_index     : index of the GPU the call targets.
//   lwe_array_out : buffer that is read and overwritten
//                   -- presumably device memory; TODO confirm.
//   size          : NOTE(review): unit is not visible here (number of
//                   elements vs. bytes) -- confirm against the definition.
//   log_modulus   : presumably log2 of the modulus being switched to;
//                   verify against the definition.
void cuda_modulus_switch_inplace_64(void *stream, uint32_t gpu_index,
|
||||
void *lwe_array_out, uint32_t size,
|
||||
uint32_t log_modulus);
|
||||
|
||||
// Declaration only; the definition is not visible in this excerpt.
// Entry point for the modulus-switch noise-reduction step on 64-bit LWE
// ciphertexts: reads lwe_array_in (const) plus a set of encrypted zeros
// (const) and writes lwe_array_out.
//   stream, gpu_index : opaque stream handle and index of the target GPU.
//   lwe_array_out     : output buffer.
//   lwe_array_in      : read-only input ciphertexts.
//   encrypted_zeros   : read-only buffer; presumably num_zeros encryptions
//                       of zero -- TODO confirm layout.
//   lwe_size          : NOTE(review): presumably lwe_dimension + 1 -- confirm.
//   num_lwes          : number of ciphertexts (by name) in the in/out arrays.
//   num_zeros         : number of entries (by name) in encrypted_zeros.
//   input_variance, r_sigma, bound :
//                       double-precision tuning values; they appear to
//                       correspond to ms_input_variance, ms_r_sigma and
//                       ms_bound of CudaModulusSwitchNoiseReductionKeyFFI
//                       -- verify against the caller.
//   log_modulus       : presumably log2 of the target modulus; verify.
void cuda_improve_noise_modulus_switch_64(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
void const *lwe_array_in, void const *encrypted_zeros, uint32_t lwe_size,
|
||||
uint32_t num_lwes, uint32_t num_zeros, double input_variance,
|
||||
double r_sigma, double bound, uint32_t log_modulus);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -58,7 +58,7 @@ void cuda_memcpy_async_to_gpu(void *dest, const void *src, uint64_t size,
|
||||
void cuda_memcpy_async_gpu_to_gpu(void *dest, void const *src, uint64_t size,
|
||||
cudaStream_t stream, uint32_t gpu_index);
|
||||
|
||||
void cuda_memcpy_gpu_to_gpu(void *dest, void *src, uint64_t size,
|
||||
void cuda_memcpy_gpu_to_gpu(void *dest, void const *src, uint64_t size,
|
||||
uint32_t gpu_index);
|
||||
|
||||
void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
|
||||
|
||||
@@ -20,8 +20,8 @@ void scratch_cuda_integer_decompress_radix_ciphertext_64(
|
||||
uint32_t compression_polynomial_size, uint32_t lwe_dimension,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t num_radix_blocks,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
uint32_t storage_log_modulus, uint32_t body_count,
|
||||
bool allocate_gpu_memory);
|
||||
uint32_t storage_log_modulus, uint32_t body_count, bool allocate_gpu_memory,
|
||||
bool allocate_ms_array);
|
||||
|
||||
void cuda_integer_compress_radix_ciphertext_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
|
||||
@@ -55,7 +55,7 @@ void scratch_cuda_apply_univariate_lut_kb_64(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
uint64_t lut_degree, bool allocate_gpu_memory);
|
||||
uint64_t lut_degree, bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
void scratch_cuda_apply_many_univariate_lut_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
|
||||
@@ -63,12 +63,15 @@ void scratch_cuda_apply_many_univariate_lut_kb_64(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_radix_blocks,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
uint32_t num_many_lut, uint64_t lut_degree, bool allocate_gpu_memory);
|
||||
uint32_t num_many_lut, uint64_t lut_degree, bool allocate_gpu_memory,
|
||||
bool allocate_ms_array);
|
||||
void cuda_apply_univariate_lut_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *output_radix_lwe,
|
||||
CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
|
||||
void *const *ksks, void *const *bsks);
|
||||
void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
void *const *bsks);
|
||||
|
||||
void cleanup_cuda_apply_univariate_lut_kb_64(void *const *streams,
|
||||
uint32_t const *gpu_indexes,
|
||||
@@ -82,15 +85,16 @@ void scratch_cuda_apply_bivariate_lut_kb_64(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
uint64_t lut_degree, bool allocate_gpu_memory);
|
||||
uint64_t lut_degree, bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
|
||||
void cuda_apply_bivariate_lut_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *output_radix_lwe,
|
||||
CudaRadixCiphertextFFI const *input_radix_lwe_1,
|
||||
CudaRadixCiphertextFFI const *input_radix_lwe_2, int8_t *mem_ptr,
|
||||
void *const *ksks, void *const *bsks, uint32_t num_radix_blocks,
|
||||
uint32_t shift);
|
||||
void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
void *const *bsks, uint32_t num_radix_blocks, uint32_t shift);
|
||||
|
||||
void cleanup_cuda_apply_bivariate_lut_kb_64(void *const *streams,
|
||||
uint32_t const *gpu_indexes,
|
||||
@@ -101,8 +105,9 @@ void cuda_apply_many_univariate_lut_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *output_radix_lwe,
|
||||
CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
|
||||
void *const *ksks, void *const *bsks, uint32_t num_luts,
|
||||
uint32_t lut_stride);
|
||||
void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
void *const *bsks, uint32_t num_luts, uint32_t lut_stride);
|
||||
|
||||
void scratch_cuda_full_propagation_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -110,14 +115,13 @@ void scratch_cuda_full_propagation_64(
|
||||
uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory);
|
||||
bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
|
||||
void cuda_full_propagation_64_inplace(void *const *streams,
|
||||
uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *input_blocks,
|
||||
int8_t *mem_ptr, void *const *ksks,
|
||||
void *const *bsks, uint32_t num_blocks);
|
||||
void cuda_full_propagation_64_inplace(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *input_blocks, int8_t *mem_ptr, void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
void *const *bsks, uint32_t num_blocks);
|
||||
|
||||
void cleanup_cuda_full_propagation(void *const *streams,
|
||||
uint32_t const *gpu_indexes,
|
||||
@@ -130,15 +134,16 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log,
|
||||
uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
|
||||
uint32_t grouping_factor, uint32_t num_blocks, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory);
|
||||
bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
|
||||
void cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
CudaRadixCiphertextFFI const *radix_lwe_left, bool const is_bool_left,
|
||||
CudaRadixCiphertextFFI const *radix_lwe_right, bool const is_bool_right,
|
||||
void *const *bsks, void *const *ksks, int8_t *mem_ptr,
|
||||
uint32_t polynomial_size, uint32_t num_blocks);
|
||||
void *const *bsks, void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
int8_t *mem_ptr, uint32_t polynomial_size, uint32_t num_blocks);
|
||||
|
||||
void cleanup_cuda_integer_mult(void *const *streams,
|
||||
uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -163,12 +168,13 @@ void scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
|
||||
bool allocate_gpu_memory);
|
||||
bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
|
||||
void cuda_integer_radix_logical_scalar_shift_kb_64_inplace(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *lwe_array, uint32_t shift, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks);
|
||||
void *const *bsks, void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
|
||||
|
||||
void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -177,12 +183,13 @@ void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
|
||||
bool allocate_gpu_memory);
|
||||
bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
|
||||
void cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *lwe_array, uint32_t shift, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks);
|
||||
void *const *bsks, void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
|
||||
|
||||
void cleanup_cuda_integer_radix_logical_scalar_shift(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -199,12 +206,13 @@ void scratch_cuda_integer_radix_shift_and_rotate_kb_64(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
|
||||
bool is_signed, bool allocate_gpu_memory);
|
||||
bool is_signed, bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
|
||||
void cuda_integer_radix_shift_and_rotate_kb_64_inplace(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *lwe_array, CudaRadixCiphertextFFI const *lwe_shift,
|
||||
int8_t *mem_ptr, void *const *bsks, void *const *ksks);
|
||||
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
|
||||
|
||||
void cleanup_cuda_integer_radix_shift_and_rotate(void *const *streams,
|
||||
uint32_t const *gpu_indexes,
|
||||
@@ -218,21 +226,25 @@ void scratch_cuda_integer_radix_comparison_kb_64(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
COMPARISON_TYPE op_type, bool is_signed, bool allocate_gpu_memory);
|
||||
COMPARISON_TYPE op_type, bool is_signed, bool allocate_gpu_memory,
|
||||
bool allocate_ms_array);
|
||||
|
||||
void cuda_comparison_integer_radix_ciphertext_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_1,
|
||||
CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks);
|
||||
void *const *bsks, void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
|
||||
|
||||
void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in, void const *scalar_blocks,
|
||||
void const *h_scalar_blocks, int8_t *mem_ptr, void *const *bsks,
|
||||
void *const *ksks, uint32_t num_scalar_blocks);
|
||||
void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t num_scalar_blocks);
|
||||
|
||||
void cleanup_cuda_integer_comparison(void *const *streams,
|
||||
uint32_t const *gpu_indexes,
|
||||
@@ -245,21 +257,23 @@ void scratch_cuda_integer_radix_bitop_kb_64(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
BITOP_TYPE op_type, bool allocate_gpu_memory);
|
||||
BITOP_TYPE op_type, bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
|
||||
void cuda_bitop_integer_radix_ciphertext_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_1,
|
||||
CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks);
|
||||
void *const *bsks, void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
|
||||
|
||||
void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_input, void const *clear_blocks,
|
||||
void const *h_clear_blocks, uint32_t num_clear_blocks, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks);
|
||||
void *const *bsks, void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
|
||||
|
||||
void cleanup_cuda_integer_bitop(void *const *streams,
|
||||
uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -272,7 +286,7 @@ void scratch_cuda_integer_radix_cmux_kb_64(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory);
|
||||
bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
|
||||
void cuda_cmux_integer_radix_ciphertext_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -280,7 +294,8 @@ void cuda_cmux_integer_radix_ciphertext_kb_64(
|
||||
CudaRadixCiphertextFFI const *lwe_condition,
|
||||
CudaRadixCiphertextFFI const *lwe_array_true,
|
||||
CudaRadixCiphertextFFI const *lwe_array_false, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks);
|
||||
void *const *bsks, void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
|
||||
|
||||
void cleanup_cuda_integer_radix_cmux(void *const *streams,
|
||||
uint32_t const *gpu_indexes,
|
||||
@@ -293,12 +308,13 @@ void scratch_cuda_integer_radix_scalar_rotate_kb_64(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
|
||||
bool allocate_gpu_memory);
|
||||
bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
|
||||
void cuda_integer_radix_scalar_rotate_kb_64_inplace(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *lwe_array, uint32_t n, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks);
|
||||
void *const *bsks, void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
|
||||
|
||||
void cleanup_cuda_integer_radix_scalar_rotate(void *const *streams,
|
||||
uint32_t const *gpu_indexes,
|
||||
@@ -312,7 +328,7 @@ void scratch_cuda_propagate_single_carry_kb_64_inplace(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
|
||||
uint32_t uses_carry, bool allocate_gpu_memory);
|
||||
uint32_t uses_carry, bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
|
||||
void scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -321,19 +337,22 @@ void scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
|
||||
uint32_t uses_carry, bool allocate_gpu_memory);
|
||||
uint32_t uses_carry, bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
|
||||
void cuda_propagate_single_carry_kb_64_inplace(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *lwe_array, CudaRadixCiphertextFFI *carry_out,
|
||||
const CudaRadixCiphertextFFI *carry_in, int8_t *mem_ptr, void *const *bsks,
|
||||
void *const *ksks, uint32_t requested_flag, uint32_t uses_carry);
|
||||
void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t requested_flag, uint32_t uses_carry);
|
||||
|
||||
// Declaration only; the definition is not visible in this excerpt.
// Multi-GPU entry point (arrays of streams / gpu_indexes of length
// gpu_count, by name) that, per its name, adds rhs_array into lhs_array and
// propagates a single carry in place.
//   lhs_array  : mutable radix ciphertext; receives the result.
//   rhs_array  : read-only radix ciphertext.
//   carry_out  : mutable radix ciphertext for the outgoing carry.
//   carry_in   : read-only incoming carry.
//   mem_ptr    : opaque scratch memory -- presumably the buffer produced by
//                scratch_cuda_add_and_propagate_single_carry_kb_64_inplace;
//                TODO confirm.
//   bsks, ksks : arrays of opaque key pointers (bootstrapping / keyswitching
//                keys, by name).
//   ms_noise_reduction_key :
//                read-only modulus-switch noise-reduction key descriptor.
//   requested_flag, uses_carry :
//                NOTE(review): accepted values are not visible here --
//                confirm against the definition.
void cuda_add_and_propagate_single_carry_kb_64_inplace(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *lhs_array, const CudaRadixCiphertextFFI *rhs_array,
|
||||
CudaRadixCiphertextFFI *carry_out, const CudaRadixCiphertextFFI *carry_in,
|
||||
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t requested_flag, uint32_t uses_carry);
|
||||
|
||||
void cleanup_cuda_propagate_single_carry(void *const *streams,
|
||||
@@ -353,15 +372,16 @@ void scratch_cuda_integer_overflowing_sub_kb_64_inplace(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t compute_overflow,
|
||||
bool allocate_gpu_memory);
|
||||
bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
|
||||
void cuda_integer_overflowing_sub_kb_64_inplace(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *lhs_array, const CudaRadixCiphertextFFI *rhs_array,
|
||||
CudaRadixCiphertextFFI *overflow_block,
|
||||
const CudaRadixCiphertextFFI *input_borrow, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks, uint32_t compute_overflow,
|
||||
uint32_t uses_input_borrow);
|
||||
void *const *bsks, void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t compute_overflow, uint32_t uses_input_borrow);
|
||||
|
||||
void cleanup_cuda_integer_overflowing_sub(void *const *streams,
|
||||
uint32_t const *gpu_indexes,
|
||||
@@ -375,13 +395,14 @@ void scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory);
|
||||
bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
|
||||
void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
CudaRadixCiphertextFFI *radix_lwe_vec, int8_t *mem_ptr, void *const *bsks,
|
||||
void *const *ksks);
|
||||
void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
|
||||
|
||||
void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -393,14 +414,15 @@ void scratch_cuda_integer_scalar_mul_kb_64(
|
||||
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
PBS_TYPE pbs_type, bool allocate_gpu_memory);
|
||||
PBS_TYPE pbs_type, bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
|
||||
void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *lwe_array, uint64_t const *decomposed_scalar,
|
||||
uint64_t const *has_at_least_one_set, int8_t *mem_ptr, void *const *bsks,
|
||||
void *const *ksks, uint32_t polynomial_size, uint32_t message_modulus,
|
||||
uint32_t num_scalars);
|
||||
void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t polynomial_size, uint32_t message_modulus, uint32_t num_scalars);
|
||||
|
||||
void cleanup_cuda_integer_radix_scalar_mul(void *const *streams,
|
||||
uint32_t const *gpu_indexes,
|
||||
@@ -414,14 +436,15 @@ void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
PBS_TYPE pbs_type, bool allocate_gpu_memory);
|
||||
PBS_TYPE pbs_type, bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
|
||||
void cuda_integer_div_rem_radix_ciphertext_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *quotient, CudaRadixCiphertextFFI *remainder,
|
||||
CudaRadixCiphertextFFI const *numerator,
|
||||
CudaRadixCiphertextFFI const *divisor, bool is_signed, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks);
|
||||
void *const *bsks, void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
|
||||
|
||||
void cleanup_cuda_integer_div_rem(void *const *streams,
|
||||
uint32_t const *gpu_indexes,
|
||||
@@ -434,13 +457,15 @@ void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_radix_blocks,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
uint64_t lut_degree, bool allocate_gpu_memory);
|
||||
uint64_t lut_degree, bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
|
||||
void cuda_integer_compute_prefix_sum_hillis_steele_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *output_radix_lwe,
|
||||
CudaRadixCiphertextFFI *generates_or_propagates, int8_t *mem_ptr,
|
||||
void *const *ksks, void *const *bsks, uint32_t num_blocks);
|
||||
void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
void *const *bsks, uint32_t num_blocks);
|
||||
|
||||
void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -458,12 +483,13 @@ void scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
PBS_TYPE pbs_type, bool allocate_gpu_memory);
|
||||
PBS_TYPE pbs_type, bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
|
||||
void cuda_integer_abs_inplace_radix_ciphertext_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *ct, int8_t *mem_ptr, bool is_signed,
|
||||
void *const *bsks, void *const *ksks);
|
||||
void *const *bsks, void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
|
||||
|
||||
void cleanup_cuda_integer_abs_inplace(void *const *streams,
|
||||
uint32_t const *gpu_indexes,
|
||||
@@ -477,13 +503,15 @@ void scratch_cuda_integer_are_all_comparisons_block_true_kb_64(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_radix_blocks,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory);
|
||||
bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
|
||||
void cuda_integer_are_all_comparisons_block_true_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks, uint32_t num_radix_blocks);
|
||||
void *const *bsks, void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t num_radix_blocks);
|
||||
|
||||
void cleanup_cuda_integer_are_all_comparisons_block_true(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -496,13 +524,15 @@ void scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_radix_blocks,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory);
|
||||
bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
|
||||
void cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks, uint32_t num_radix_blocks);
|
||||
void *const *bsks, void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t num_radix_blocks);
|
||||
|
||||
void cleanup_cuda_integer_is_at_least_one_comparisons_block_true(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
|
||||
@@ -94,6 +94,7 @@ struct int_radix_params {
|
||||
uint32_t grouping_factor;
|
||||
uint32_t message_modulus;
|
||||
uint32_t carry_modulus;
|
||||
bool allocate_ms_array;
|
||||
|
||||
int_radix_params(){};
|
||||
|
||||
@@ -102,13 +103,16 @@ struct int_radix_params {
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level,
|
||||
uint32_t ks_base_log, uint32_t pbs_level,
|
||||
uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t message_modulus, uint32_t carry_modulus)
|
||||
uint32_t message_modulus, uint32_t carry_modulus,
|
||||
bool allocate_ms_array)
|
||||
|
||||
: pbs_type(pbs_type), glwe_dimension(glwe_dimension),
|
||||
polynomial_size(polynomial_size), big_lwe_dimension(big_lwe_dimension),
|
||||
small_lwe_dimension(small_lwe_dimension), ks_level(ks_level),
|
||||
ks_base_log(ks_base_log), pbs_level(pbs_level),
|
||||
pbs_base_log(pbs_base_log), grouping_factor(grouping_factor),
|
||||
message_modulus(message_modulus), carry_modulus(carry_modulus){};
|
||||
message_modulus(message_modulus), carry_modulus(carry_modulus),
|
||||
allocate_ms_array(allocate_ms_array){};
|
||||
|
||||
void print() {
|
||||
printf("pbs_type: %u, glwe_dimension: %u, "
|
||||
@@ -198,7 +202,7 @@ template <typename Torus> struct int_radix_lut {
|
||||
streams[i], gpu_indexes[i], &gpu_pbs_buffer, params.glwe_dimension,
|
||||
params.small_lwe_dimension, params.polynomial_size, params.pbs_level,
|
||||
params.grouping_factor, num_blocks_on_gpu, params.pbs_type,
|
||||
allocate_gpu_memory);
|
||||
allocate_gpu_memory, params.allocate_ms_array);
|
||||
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
|
||||
buffer.push_back(gpu_pbs_buffer);
|
||||
}
|
||||
@@ -394,7 +398,7 @@ template <typename Torus> struct int_radix_lut {
|
||||
streams[i], gpu_indexes[i], &gpu_pbs_buffer, params.glwe_dimension,
|
||||
params.small_lwe_dimension, params.polynomial_size, params.pbs_level,
|
||||
params.grouping_factor, num_blocks_on_gpu, params.pbs_type,
|
||||
allocate_gpu_memory);
|
||||
allocate_gpu_memory, params.allocate_ms_array);
|
||||
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
|
||||
buffer.push_back(gpu_pbs_buffer);
|
||||
}
|
||||
@@ -1279,7 +1283,6 @@ template <typename Torus> struct int_seq_group_prop_memory {
|
||||
int_radix_params params, uint32_t group_size,
|
||||
uint32_t big_lwe_size_bytes,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
auto polynomial_size = params.polynomial_size;
|
||||
auto message_modulus = params.message_modulus;
|
||||
|
||||
@@ -1,7 +1,17 @@
|
||||
#ifndef CUDA_PBS_ENUMS_H
|
||||
#define CUDA_PBS_ENUMS_H
|
||||
|
||||
#include <stdint.h>
|
||||
// Selects the programmable-bootstrap (PBS) flavour. The explicit numeric
// values are part of the declaration; presumably they must stay in sync
// with the non-C++ side of the FFI -- TODO confirm before renumbering.
enum PBS_TYPE { MULTI_BIT = 0, CLASSICAL = 1 };
|
||||
// Selects which PBS implementation a buffer is set up for. CG and TBC match
// the suffixes of scratch_cuda_programmable_bootstrap_cg / _tbc; DEFAULT is
// the variant for which pbs_buffer releases a global_accumulator.
enum PBS_VARIANT { DEFAULT = 0, CG = 1, TBC = 2 };
|
||||
|
||||
extern "C" {
|
||||
// Plain C descriptor (declared inside extern "C") for the modulus-switch
// noise-reduction key that is passed by const pointer to the integer entry
// points as `ms_noise_reduction_key`.
typedef struct {
|
||||
// Array of opaque pointers to the key data -- presumably one entry per GPU,
// like the ksks / bsks arrays; TODO confirm.
void *const *ptr;
|
||||
// Number of encrypted zeros in the key (by name; compare the num_zeros
// parameter of cuda_improve_noise_modulus_switch_64).
uint32_t num_zeros;
|
||||
// NOTE(review): appears to feed the `bound` parameter of
// cuda_improve_noise_modulus_switch_64 -- confirm.
double ms_bound;
|
||||
// NOTE(review): appears to feed the `r_sigma` parameter of
// cuda_improve_noise_modulus_switch_64 -- confirm.
double ms_r_sigma;
|
||||
// NOTE(review): appears to feed the `input_variance` parameter of
// cuda_improve_noise_modulus_switch_64 -- confirm.
double ms_input_variance;
|
||||
} CudaModulusSwitchNoiseReductionKeyFFI;
|
||||
}
|
||||
|
||||
#endif // CUDA_PBS_ENUMS_H
|
||||
|
||||
@@ -76,18 +76,26 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
|
||||
|
||||
Torus *global_accumulator;
|
||||
double2 *global_join_buffer;
|
||||
Torus *temp_lwe_array_in;
|
||||
|
||||
PBS_VARIANT pbs_variant;
|
||||
bool uses_noise_reduction;
|
||||
|
||||
pbs_buffer(cudaStream_t stream, uint32_t gpu_index, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, PBS_VARIANT pbs_variant,
|
||||
bool allocate_gpu_memory) {
|
||||
pbs_buffer(cudaStream_t stream, uint32_t gpu_index, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
PBS_VARIANT pbs_variant, bool allocate_gpu_memory,
|
||||
bool allocate_ms_array) {
|
||||
cuda_set_device(gpu_index);
|
||||
this->uses_noise_reduction = allocate_ms_array;
|
||||
this->pbs_variant = pbs_variant;
|
||||
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
|
||||
if (allocate_ms_array) {
|
||||
this->temp_lwe_array_in = (Torus *)cuda_malloc_async(
|
||||
(lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(Torus),
|
||||
stream, gpu_index);
|
||||
}
|
||||
if (allocate_gpu_memory) {
|
||||
switch (pbs_variant) {
|
||||
case PBS_VARIANT::DEFAULT: {
|
||||
@@ -218,6 +226,9 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
|
||||
|
||||
if (pbs_variant == DEFAULT)
|
||||
cuda_drop_async(global_accumulator, stream, gpu_index);
|
||||
|
||||
if (uses_noise_reduction)
|
||||
cuda_drop_async(temp_lwe_array_in, stream, gpu_index);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -228,16 +239,25 @@ template <> struct pbs_buffer_128<PBS_TYPE::CLASSICAL> {
|
||||
|
||||
__uint128_t *global_accumulator;
|
||||
double *global_join_buffer;
|
||||
__uint128_t *temp_lwe_array_in;
|
||||
|
||||
PBS_VARIANT pbs_variant;
|
||||
bool uses_noise_reduction;
|
||||
|
||||
pbs_buffer_128(cudaStream_t stream, uint32_t gpu_index,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
PBS_VARIANT pbs_variant, bool allocate_gpu_memory) {
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, PBS_VARIANT pbs_variant,
|
||||
bool allocate_gpu_memory, bool allocate_ms_array) {
|
||||
cuda_set_device(gpu_index);
|
||||
this->pbs_variant = pbs_variant;
|
||||
|
||||
this->uses_noise_reduction = allocate_ms_array;
|
||||
if (allocate_ms_array) {
|
||||
this->temp_lwe_array_in = (__uint128_t *)cuda_malloc_async(
|
||||
(lwe_dimension + 1) * input_lwe_ciphertext_count *
|
||||
sizeof(__uint128_t),
|
||||
stream, gpu_index);
|
||||
}
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
size_t global_join_buffer_size = (glwe_dimension + 1) * level_count *
|
||||
input_lwe_ciphertext_count *
|
||||
@@ -367,6 +387,9 @@ template <> struct pbs_buffer_128<PBS_TYPE::CLASSICAL> {
|
||||
|
||||
if (pbs_variant == DEFAULT)
|
||||
cuda_drop_async(global_accumulator, stream, gpu_index);
|
||||
|
||||
if (uses_noise_reduction)
|
||||
cuda_drop_async(temp_lwe_array_in, stream, gpu_index);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -439,21 +462,24 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
|
||||
template <typename Torus>
|
||||
void scratch_cuda_programmable_bootstrap_tbc(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
#endif
|
||||
|
||||
template <typename Torus>
|
||||
void scratch_cuda_programmable_bootstrap_cg(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
|
||||
template <typename Torus>
|
||||
void scratch_cuda_programmable_bootstrap(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
|
||||
template <typename Torus>
|
||||
bool has_support_to_cuda_programmable_bootstrap_tbc(uint32_t num_samples,
|
||||
|
||||
@@ -58,19 +58,22 @@ void cleanup_cuda_programmable_bootstrap_amortized(void *stream,
|
||||
int8_t **pbs_buffer);
|
||||
|
||||
void scratch_cuda_programmable_bootstrap_32(
|
||||
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
|
||||
bool allocate_ms_array);
|
||||
|
||||
void scratch_cuda_programmable_bootstrap_64(
|
||||
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
|
||||
bool allocate_ms_array);
|
||||
|
||||
void scratch_cuda_programmable_bootstrap_128(
|
||||
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
|
||||
bool allocate_ms_array);
|
||||
|
||||
void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
@@ -86,6 +89,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
void const *lwe_output_indexes, void const *lut_vector,
|
||||
void const *lut_vector_indexes, void const *lwe_array_in,
|
||||
void const *lwe_input_indexes, void const *bootstrapping_key,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride);
|
||||
@@ -93,9 +97,11 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
void cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
void const *lut_vector, void const *lwe_array_in,
|
||||
void const *bootstrapping_key, int8_t *buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples);
|
||||
void const *bootstrapping_key,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples);
|
||||
|
||||
void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
|
||||
int8_t **pbs_buffer);
|
||||
|
||||
@@ -75,3 +75,24 @@ void cuda_glwe_sample_extract_64(void *stream, uint32_t gpu_index,
|
||||
"N's are powers of two in the interval [256..16384].")
|
||||
}
|
||||
}
|
||||
|
||||
/// C entry point for the 64-bit in-place modulus switch.
///
/// Reinterprets the opaque `stream` handle as a `cudaStream_t` and
/// `lwe_array_out` as an array of `uint64_t`, then forwards every argument
/// unchanged to `host_modulus_switch_inplace<uint64_t>`.
void cuda_modulus_switch_inplace_64(void *stream, uint32_t gpu_index,
                                    void *lwe_array_out, uint32_t size,
                                    uint32_t log_modulus) {
  auto cuda_stream = static_cast<cudaStream_t>(stream);
  auto *torus_array = static_cast<uint64_t *>(lwe_array_out);

  host_modulus_switch_inplace<uint64_t>(cuda_stream, gpu_index, torus_array,
                                        size, log_modulus);
}
|
||||
|
||||
/// C entry point for the 64-bit modulus-switch noise improvement.
///
/// Casts the opaque handles to their typed counterparts (`cudaStream_t`,
/// `uint64_t *`, `uint64_t const *`) and forwards all arguments, in the same
/// order, to `host_improve_noise_modulus_switch<uint64_t>`.
void cuda_improve_noise_modulus_switch_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_array_in, void const *encrypted_zeros, uint32_t lwe_size,
    uint32_t num_lwes, uint32_t num_zeros, double input_variance,
    double r_sigma, double bound, uint32_t log_modulus) {
  auto cuda_stream = static_cast<cudaStream_t>(stream);
  auto *typed_out = static_cast<uint64_t *>(lwe_array_out);
  auto const *typed_in = static_cast<uint64_t const *>(lwe_array_in);
  auto const *typed_zeros = static_cast<const uint64_t *>(encrypted_zeros);

  host_improve_noise_modulus_switch<uint64_t>(
      cuda_stream, gpu_index, typed_out, typed_in, typed_zeros, lwe_size,
      num_lwes, num_zeros, input_variance, r_sigma, bound, log_modulus);
}
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
#ifndef CNCRT_TORUS_CUH
|
||||
#define CNCRT_TORUS_CUH
|
||||
|
||||
#include "ciphertext.h"
|
||||
#include "device.h"
|
||||
#include "helper_multi_gpu.h"
|
||||
#include "polynomial/parameters.cuh"
|
||||
#include "types/int128.cuh"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
@@ -115,8 +117,180 @@ __host__ void host_modulus_switch_inplace(cudaStream_t stream,
|
||||
int num_threads = 0, num_blocks = 0;
|
||||
getNumBlocksAndThreads(size, 1024, num_blocks, num_threads);
|
||||
|
||||
modulus_switch_inplace<<<num_blocks, num_threads, 0, stream>>>(array, size,
|
||||
log_modulus);
|
||||
modulus_switch_inplace<Torus>
|
||||
<<<num_blocks, num_threads, 0, stream>>>(array, size, log_modulus);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
/// Returns, as a double, the difference between the modulus-switched value of
/// `input` (shifted back up by `bits(T) - log_modulus`) and `input` itself.
///
/// The subtraction wraps in `T`; the result is then read through `int64_t`
/// before the conversion to double, so it is interpreted as a signed value.
template <typename T>
__device__ __forceinline__ double round_error_double(T input,
                                                     uint32_t log_modulus) {
  constexpr uint32_t BITS = sizeof(T) * 8;

  T switched;
  modulus_switch<T>(input, switched, log_modulus);

  // Bring the switched value back to the scale of `input`.
  T error = static_cast<T>(switched << (BITS - log_modulus));
  // Wrapping difference between the rescaled value and the original.
  error = static_cast<T>(error - input);

  return __ll2double_rn(static_cast<int64_t>(error));
}
|
||||
|
||||
/// Block-wide estimate of the modulus-switch noise measure of one LWE
/// ciphertext. Must be called by every thread of the block (it contains
/// `__syncthreads()` barriers).
///
/// Each thread contributes up to two coefficients: `input1` is the
/// coefficient at index `threadIdx.x` and `input2` the one at index
/// `threadIdx.x + blockDim.x`. The coefficient at index `lwe_size - 1` is
/// treated as the body; every coefficient below that index is treated as a
/// mask element. Coefficients beyond the body index are ignored.
///
/// \param sum_mask_errors         shared scratch, at least blockDim.x doubles
/// \param sum_squared_mask_errors shared scratch, at least blockDim.x doubles
/// \param body                    shared scratch, at least one double
/// \param bound                   unused here; kept for interface stability
/// \return the measure `|expectancy| + r_sigma * std_dev` on thread 0 and 0
///         on every other thread.
///
/// Preconditions: `blockDim.x` is a power of two (required by the tree
/// reduction) and `lwe_size <= 2 * blockDim.x`.
template <typename T>
__device__ __forceinline__ double measure_modulus_switch_noise(
    T input1, T input2, uint32_t log_modulus, uint32_t lwe_size,
    double *sum_mask_errors, double *sum_squared_mask_errors, double *body,
    double input_variance, double r_sigma, double bound) {

  const uint32_t body_index = lwe_size - 1;
  const uint32_t index1 = threadIdx.x;
  const uint32_t index2 = threadIdx.x + blockDim.x;

  const double error1 = round_error_double<T>(input1, log_modulus);
  const double error2 = round_error_double<T>(input2, log_modulus);

  // Accumulate this thread's mask contributions; whichever of the two
  // coefficients is the body is stored separately instead. Checking both
  // slots keeps the body out of the mask statistics (and guarantees body[0]
  // is written) even when lwe_size - 1 < blockDim.x.
  double partial_sum = 0.0;
  double partial_sum_squares = 0.0;

  if (index1 < body_index) {
    partial_sum = error1;
    partial_sum_squares = error1 * error1;
  } else if (index1 == body_index) {
    body[0] = error1;
  }

  if (index2 < body_index) {
    partial_sum += error2;
    partial_sum_squares += error2 * error2;
  } else if (index2 == body_index) {
    body[0] = error2;
  }

  sum_mask_errors[threadIdx.x] = partial_sum;
  sum_squared_mask_errors[threadIdx.x] = partial_sum_squares;

  // Tree reduction in shared memory; the barrier at the top of each step
  // also publishes the writes above (including body[0]) to the whole block.
  for (int offset = blockDim.x / 2; offset > 0; offset /= 2) {
    __syncthreads();
    if (threadIdx.x < offset) {
      sum_mask_errors[threadIdx.x] += sum_mask_errors[threadIdx.x + offset];
      sum_squared_mask_errors[threadIdx.x] +=
          sum_squared_mask_errors[threadIdx.x + offset];
    }
  }

  // Thread 0 now owns the reduced sums and derives the noise measure.
  double noise = 0;
  if (threadIdx.x == 0) {
    const double expectancy = body[0] - sum_mask_errors[0] / 2.0;
    const double variance = sum_squared_mask_errors[0] / 4.0;
    const double std_dev = sqrt(variance + input_variance);
    noise = fabs(expectancy) + std_dev * r_sigma;
  }
  // Keep the shared buffers intact until thread 0 has finished reading them,
  // so the caller may immediately reuse them for the next candidate.
  __syncthreads();
  return noise; // only thread 0 will return the correct noise
}
|
||||
|
||||
/// Selects, for each input LWE ciphertext, a candidate whose modulus-switch
/// noise measure is acceptable and writes it to `array_out`.
///
/// Launch layout: one block per LWE ciphertext (`blockIdx.x` selects the
/// ciphertext), each thread handles up to two coefficients (indices
/// `threadIdx.x` and `threadIdx.x + blockDim.x`).
///
/// Preconditions: `blockDim.x` is a power of two and at most 512 (size of the
/// static shared buffers), and `lwe_size <= 2 * blockDim.x`.
///
/// Candidates are tried in order: the input itself, then `input + zeros[i]`
/// for `i = 0 .. num_zeros - 1`. The first candidate whose measure is
/// `<= bound` is written out. If no candidate meets the bound, the candidate
/// with the smallest measure is written instead, so `array_out` is always
/// fully defined when the kernel returns.
template <typename Torus>
__global__ void
improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
                             const Torus *zeros, int lwe_size, int num_zeros,
                             double input_variance, double r_sigma,
                             double bound, uint32_t log_modulus) {

  // Static shared scratch used by the block-wide reduction.
  __shared__ double sum_mask_errors[512];
  __shared__ double sum_squared_mask_errors[512];
  __shared__ double body[1];
  // Block-wide decision flags, written by thread 0 only.
  __shared__ bool found;
  __shared__ int best_index; // -1 means "the unmodified input"

  const uint32_t first = threadIdx.x;
  const uint32_t second = threadIdx.x + blockDim.x;
  const bool has_first = first < static_cast<uint32_t>(lwe_size);
  const bool has_second = second < static_cast<uint32_t>(lwe_size);
  // 64-bit offset so that blockIdx.x * lwe_size cannot wrap.
  const size_t lwe_offset =
      static_cast<size_t>(blockIdx.x) * static_cast<size_t>(lwe_size);

  // Stores one candidate (two coefficients per thread) into the output slot
  // of this block's ciphertext.
  auto store_candidate = [&](Torus element1, Torus element2) {
    if (has_first)
      array_out[lwe_offset + first] = element1;
    if (has_second)
      array_out[lwe_offset + second] = element2;
  };

  const Torus input_element1 =
      has_first ? array_in[lwe_offset + first] : static_cast<Torus>(0);
  const Torus input_element2 =
      has_second ? array_in[lwe_offset + second] : static_cast<Torus>(0);

  // The measure is only meaningful on thread 0, which is the only thread
  // that ever inspects best_noise.
  double best_noise = measure_modulus_switch_noise<Torus>(
      input_element1, input_element2, log_modulus, lwe_size, sum_mask_errors,
      sum_squared_mask_errors, body, input_variance, r_sigma, bound);

  if (threadIdx.x == 0) {
    found = best_noise <= bound;
    best_index = -1;
  }
  __syncthreads();

  // If the noise is already within the bound we just copy the input.
  // `found` is uniform across the block, so this early exit is uniform too.
  if (found) {
    store_candidate(input_element1, input_element2);
    return;
  }

  // Otherwise test input + zeros[index] until one candidate meets the bound.
  for (int index = 0; index < num_zeros; index++) {
    const size_t zero_offset =
        static_cast<size_t>(index) * static_cast<size_t>(lwe_size);

    const Torus zero_element1 =
        has_first ? zeros[zero_offset + first] + input_element1
                  : static_cast<Torus>(0);
    const Torus zero_element2 =
        has_second ? zeros[zero_offset + second] + input_element2
                   : static_cast<Torus>(0);

    // Measuring a candidate is costly because it requires a block reduction.
    const double index_noise = measure_modulus_switch_noise<Torus>(
        zero_element1, zero_element2, log_modulus, lwe_size, sum_mask_errors,
        sum_squared_mask_errors, body, input_variance, r_sigma, bound);

    if (threadIdx.x == 0) {
      if (index_noise <= bound) {
        found = true;
      } else if (index_noise < best_noise) {
        // Remember the best candidate in case none satisfies the bound.
        best_noise = index_noise;
        best_index = index;
      }
    }
    __syncthreads();

    // A satisfying candidate replaces the lwe by lwe + zero and ends the
    // search for this block.
    if (found) {
      store_candidate(zero_element1, zero_element2);
      return;
    }
  }

  // No candidate met the bound: fall back to the one with the lowest
  // measure. best_index was last written before a __syncthreads(), so every
  // thread reads the same value here.
  Torus output_element1 = input_element1;
  Torus output_element2 = input_element2;
  if (best_index >= 0) {
    const size_t zero_offset =
        static_cast<size_t>(best_index) * static_cast<size_t>(lwe_size);
    if (has_first)
      output_element1 += zeros[zero_offset + first];
    if (has_second)
      output_element2 += zeros[zero_offset + second];
  }
  store_candidate(output_element1, output_element2);
}
|
||||
|
||||
/// Host launcher for `improve_noise_modulus_switch`.
///
/// Launches one block of 512 threads per LWE ciphertext on `stream`; the
/// launch is asynchronous and only launch-configuration errors are checked
/// here.
///
/// \param array_out  device output, `num_lwes * lwe_size` elements
/// \param array_in   device input, `num_lwes * lwe_size` elements
/// \param zeros      device candidates, `num_zeros * lwe_size` elements
/// \param lwe_size   number of coefficients per ciphertext; must be in
///                   [512, 1024] because each of the 512 threads handles at
///                   most two coefficients
///
/// Panics when `lwe_size` is outside the supported range. Does nothing when
/// `num_lwes` is 0, since a grid with zero blocks is not a valid launch
/// configuration.
template <typename Torus>
__host__ void host_improve_noise_modulus_switch(
    cudaStream_t stream, uint32_t gpu_index, Torus *array_out,
    Torus const *array_in, const Torus *zeros, uint32_t lwe_size,
    uint32_t num_lwes, const uint32_t num_zeros, const double input_variance,
    const double r_sigma, const double bound, uint32_t log_modulus) {

  if (lwe_size < 512) {
    PANIC("The lwe_size is less than 512, this is not supported\n");
    return;
  }

  if (lwe_size > 1024) {
    PANIC("The lwe_size is greater than 1024, this is not supported\n");
    return;
  }
  cuda_set_device(gpu_index);

  // Nothing to process: avoid an invalid zero-block launch.
  if (num_lwes == 0)
    return;

  // The in-kernel reduction requires a power-of-two number of threads.
  constexpr int num_threads = 512;
  const int num_blocks = num_lwes;

  improve_noise_modulus_switch<Torus><<<num_blocks, num_threads, 0, stream>>>(
      array_out, array_in, zeros, lwe_size, num_zeros, input_variance, r_sigma,
      bound, log_modulus);
  check_cuda_error(cudaGetLastError());
}
|
||||
|
||||
|
||||
@@ -176,7 +176,7 @@ void cuda_memcpy_async_gpu_to_gpu(void *dest, void const *src, uint64_t size,
|
||||
}
|
||||
|
||||
/// Copy memory within a GPU
|
||||
void cuda_memcpy_gpu_to_gpu(void *dest, void *src, uint64_t size,
|
||||
void cuda_memcpy_gpu_to_gpu(void *dest, void const *src, uint64_t size,
|
||||
uint32_t gpu_index) {
|
||||
if (size == 0)
|
||||
return;
|
||||
|
||||
@@ -7,12 +7,12 @@ void scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
PBS_TYPE pbs_type, bool allocate_gpu_memory) {
|
||||
PBS_TYPE pbs_type, bool allocate_gpu_memory, bool allocate_ms_array) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
big_lwe_dimension, small_lwe_dimension, ks_level,
|
||||
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
|
||||
message_modulus, carry_modulus);
|
||||
message_modulus, carry_modulus, allocate_ms_array);
|
||||
|
||||
scratch_cuda_integer_abs_kb<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
@@ -23,13 +23,14 @@ void scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
|
||||
void cuda_integer_abs_inplace_radix_ciphertext_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *ct, int8_t *mem_ptr, bool is_signed,
|
||||
void *const *bsks, void *const *ksks) {
|
||||
void *const *bsks, void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
|
||||
|
||||
auto mem = (int_abs_buffer<uint64_t> *)mem_ptr;
|
||||
|
||||
host_integer_abs_kb<uint64_t>((cudaStream_t *)(streams), gpu_indexes,
|
||||
gpu_count, ct, bsks, (uint64_t **)(ksks), mem,
|
||||
is_signed);
|
||||
gpu_count, ct, bsks, (uint64_t **)(ksks),
|
||||
ms_noise_reduction_key, mem, is_signed);
|
||||
}
|
||||
|
||||
void cleanup_cuda_integer_abs_inplace(void *const *streams,
|
||||
|
||||
@@ -29,11 +29,12 @@ __host__ void scratch_cuda_integer_abs_kb(
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void
|
||||
host_integer_abs_kb(cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *ct,
|
||||
void *const *bsks, uint64_t *const *ksks,
|
||||
int_abs_buffer<uint64_t> *mem_ptr, bool is_signed) {
|
||||
__host__ void host_integer_abs_kb(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *ct, void *const *bsks,
|
||||
uint64_t *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
int_abs_buffer<uint64_t> *mem_ptr, bool is_signed) {
|
||||
if (!is_signed)
|
||||
return;
|
||||
|
||||
@@ -47,18 +48,19 @@ host_integer_abs_kb(cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
|
||||
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
|
||||
streams, gpu_indexes, gpu_count, mask, num_bits_in_ciphertext - 1,
|
||||
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks);
|
||||
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key);
|
||||
host_addition<Torus>(streams[0], gpu_indexes[0], ct, mask, ct,
|
||||
ct->num_radix_blocks);
|
||||
|
||||
uint32_t requested_flag = outputFlag::FLAG_NONE;
|
||||
uint32_t uses_carry = 0;
|
||||
host_propagate_single_carry<Torus>(streams, gpu_indexes, gpu_count, ct,
|
||||
nullptr, nullptr, mem_ptr->scp_mem, bsks,
|
||||
ksks, requested_flag, uses_carry);
|
||||
host_propagate_single_carry<Torus>(
|
||||
streams, gpu_indexes, gpu_count, ct, nullptr, nullptr, mem_ptr->scp_mem,
|
||||
bsks, ksks, ms_noise_reduction_key, requested_flag, uses_carry);
|
||||
|
||||
host_integer_radix_bitop_kb<Torus>(streams, gpu_indexes, gpu_count, ct, mask,
|
||||
ct, mem_ptr->bitxor_mem, bsks, ksks);
|
||||
ct, mem_ptr->bitxor_mem, bsks, ksks,
|
||||
ms_noise_reduction_key);
|
||||
}
|
||||
|
||||
#endif // TFHE_RS_ABS_CUH
|
||||
|
||||
@@ -7,12 +7,12 @@ void scratch_cuda_integer_radix_bitop_kb_64(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
BITOP_TYPE op_type, bool allocate_gpu_memory) {
|
||||
BITOP_TYPE op_type, bool allocate_gpu_memory, bool allocate_ms_array) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
big_lwe_dimension, small_lwe_dimension, ks_level,
|
||||
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
|
||||
message_modulus, carry_modulus);
|
||||
message_modulus, carry_modulus, allocate_ms_array);
|
||||
|
||||
scratch_cuda_integer_radix_bitop_kb<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
@@ -25,12 +25,13 @@ void cuda_bitop_integer_radix_ciphertext_kb_64(
|
||||
CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_1,
|
||||
CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks) {
|
||||
void *const *bsks, void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
|
||||
|
||||
host_integer_radix_bitop_kb<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
|
||||
lwe_array_1, lwe_array_2, (int_bitop_buffer<uint64_t> *)mem_ptr, bsks,
|
||||
(uint64_t **)(ksks));
|
||||
(uint64_t **)(ksks), ms_noise_reduction_key);
|
||||
}
|
||||
|
||||
void cleanup_cuda_integer_bitop(void *const *streams,
|
||||
|
||||
@@ -17,7 +17,8 @@ __host__ void host_integer_radix_bitop_kb(
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_1,
|
||||
CudaRadixCiphertextFFI const *lwe_array_2, int_bitop_buffer<Torus> *mem_ptr,
|
||||
void *const *bsks, Torus *const *ksks) {
|
||||
void *const *bsks, Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
|
||||
|
||||
if (lwe_array_out->num_radix_blocks != lwe_array_1->num_radix_blocks ||
|
||||
lwe_array_out->num_radix_blocks != lwe_array_2->num_radix_blocks)
|
||||
@@ -43,7 +44,7 @@ __host__ void host_integer_radix_bitop_kb(
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_1, lwe_array_2,
|
||||
bsks, ksks, lut, lwe_array_out->num_radix_blocks,
|
||||
bsks, ksks, ms_noise_reduction_key, lut, lwe_array_out->num_radix_blocks,
|
||||
lut->params.message_modulus);
|
||||
|
||||
memcpy(lwe_array_out->degrees, degrees,
|
||||
|
||||
@@ -7,12 +7,12 @@ void scratch_cuda_integer_radix_cmux_kb_64(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory) {
|
||||
bool allocate_gpu_memory, bool allocate_ms_array) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
big_lwe_dimension, small_lwe_dimension, ks_level,
|
||||
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
|
||||
message_modulus, carry_modulus);
|
||||
message_modulus, carry_modulus, allocate_ms_array);
|
||||
|
||||
std::function<uint64_t(uint64_t)> predicate_lut_f =
|
||||
[](uint64_t x) -> uint64_t { return x == 1; };
|
||||
@@ -29,12 +29,14 @@ void cuda_cmux_integer_radix_ciphertext_kb_64(
|
||||
CudaRadixCiphertextFFI const *lwe_condition,
|
||||
CudaRadixCiphertextFFI const *lwe_array_true,
|
||||
CudaRadixCiphertextFFI const *lwe_array_false, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks) {
|
||||
void *const *bsks, void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
|
||||
|
||||
host_integer_radix_cmux_kb<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
|
||||
lwe_condition, lwe_array_true, lwe_array_false,
|
||||
(int_cmux_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks));
|
||||
(int_cmux_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
|
||||
ms_noise_reduction_key);
|
||||
}
|
||||
|
||||
void cleanup_cuda_integer_radix_cmux(void *const *streams,
|
||||
|
||||
@@ -5,14 +5,16 @@
|
||||
#include "radix_ciphertext.cuh"
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void zero_out_if(cudaStream_t const *streams,
|
||||
uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_input,
|
||||
CudaRadixCiphertextFFI const *lwe_condition,
|
||||
int_zero_out_if_buffer<Torus> *mem_ptr,
|
||||
int_radix_lut<Torus> *predicate, void *const *bsks,
|
||||
Torus *const *ksks, uint32_t num_radix_blocks) {
|
||||
__host__ void
|
||||
zero_out_if(cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_input,
|
||||
CudaRadixCiphertextFFI const *lwe_condition,
|
||||
int_zero_out_if_buffer<Torus> *mem_ptr,
|
||||
int_radix_lut<Torus> *predicate, void *const *bsks,
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t num_radix_blocks) {
|
||||
if (lwe_array_out->num_radix_blocks < num_radix_blocks ||
|
||||
lwe_array_input->num_radix_blocks < num_radix_blocks)
|
||||
PANIC("Cuda error: input or output radix ciphertexts does not have enough "
|
||||
@@ -34,7 +36,7 @@ __host__ void zero_out_if(cudaStream_t const *streams,
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out, tmp_lwe_array_input, bsks,
|
||||
ksks, predicate, num_radix_blocks);
|
||||
ksks, ms_noise_reduction_key, predicate, num_radix_blocks);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
@@ -44,7 +46,8 @@ __host__ void host_integer_radix_cmux_kb(
|
||||
CudaRadixCiphertextFFI const *lwe_condition,
|
||||
CudaRadixCiphertextFFI const *lwe_array_true,
|
||||
CudaRadixCiphertextFFI const *lwe_array_false,
|
||||
int_cmux_buffer<Torus> *mem_ptr, void *const *bsks, Torus *const *ksks) {
|
||||
int_cmux_buffer<Torus> *mem_ptr, void *const *bsks, Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
|
||||
|
||||
if (lwe_array_out->num_radix_blocks != lwe_array_true->num_radix_blocks)
|
||||
PANIC("Cuda error: input and output num radix blocks must be the same")
|
||||
@@ -67,8 +70,8 @@ __host__ void host_integer_radix_cmux_kb(
|
||||
}
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, mem_ptr->buffer_out, mem_ptr->buffer_in,
|
||||
mem_ptr->condition_array, bsks, ksks, mem_ptr->predicate_lut,
|
||||
2 * num_radix_blocks, params.message_modulus);
|
||||
mem_ptr->condition_array, bsks, ksks, ms_noise_reduction_key,
|
||||
mem_ptr->predicate_lut, 2 * num_radix_blocks, params.message_modulus);
|
||||
|
||||
// If the condition was true, true_ct will have kept its value and false_ct
|
||||
// will be 0 If the condition was false, true_ct will be 0 and false_ct will
|
||||
@@ -85,7 +88,7 @@ __host__ void host_integer_radix_cmux_kb(
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out, &mem_true, bsks, ksks,
|
||||
mem_ptr->message_extract_lut, num_radix_blocks);
|
||||
ms_noise_reduction_key, mem_ptr->message_extract_lut, num_radix_blocks);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
|
||||
@@ -7,12 +7,13 @@ void scratch_cuda_integer_radix_comparison_kb_64(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_radix_blocks,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
COMPARISON_TYPE op_type, bool is_signed, bool allocate_gpu_memory) {
|
||||
COMPARISON_TYPE op_type, bool is_signed, bool allocate_gpu_memory,
|
||||
bool allocate_ms_array) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
big_lwe_dimension, small_lwe_dimension, ks_level,
|
||||
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
|
||||
message_modulus, carry_modulus);
|
||||
message_modulus, carry_modulus, allocate_ms_array);
|
||||
|
||||
switch (op_type) {
|
||||
case EQ:
|
||||
@@ -41,7 +42,8 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
|
||||
CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_1,
|
||||
CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks) {
|
||||
void *const *bsks, void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
|
||||
|
||||
if (lwe_array_1->num_radix_blocks != lwe_array_1->num_radix_blocks)
|
||||
PANIC("Cuda error: input num radix blocks must be the same")
|
||||
@@ -57,7 +59,7 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
|
||||
host_integer_radix_equality_check_kb<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
|
||||
lwe_array_1, lwe_array_2, buffer, bsks, (uint64_t **)(ksks),
|
||||
num_radix_blocks);
|
||||
ms_noise_reduction_key, num_radix_blocks);
|
||||
break;
|
||||
case GT:
|
||||
case GE:
|
||||
@@ -69,7 +71,7 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
|
||||
host_integer_radix_difference_check_kb<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
|
||||
lwe_array_1, lwe_array_2, buffer, buffer->diff_buffer->operator_f, bsks,
|
||||
(uint64_t **)(ksks), num_radix_blocks);
|
||||
(uint64_t **)(ksks), ms_noise_reduction_key, num_radix_blocks);
|
||||
break;
|
||||
case MAX:
|
||||
case MIN:
|
||||
@@ -78,7 +80,7 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
|
||||
host_integer_radix_maxmin_kb<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
|
||||
lwe_array_1, lwe_array_2, buffer, bsks, (uint64_t **)(ksks),
|
||||
num_radix_blocks);
|
||||
ms_noise_reduction_key, num_radix_blocks);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error: integer operation not supported")
|
||||
@@ -102,12 +104,12 @@ void scratch_cuda_integer_are_all_comparisons_block_true_kb_64(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_radix_blocks,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory) {
|
||||
bool allocate_gpu_memory, bool allocate_ms_array) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
big_lwe_dimension, small_lwe_dimension, ks_level,
|
||||
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
|
||||
message_modulus, carry_modulus);
|
||||
message_modulus, carry_modulus, allocate_ms_array);
|
||||
|
||||
scratch_cuda_integer_radix_comparison_check_kb<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
@@ -119,14 +121,17 @@ void cuda_integer_are_all_comparisons_block_true_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks, uint32_t num_radix_blocks) {
|
||||
void *const *bsks, void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t num_radix_blocks) {
|
||||
|
||||
int_comparison_buffer<uint64_t> *buffer =
|
||||
(int_comparison_buffer<uint64_t> *)mem_ptr;
|
||||
|
||||
host_integer_are_all_comparisons_block_true_kb<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
|
||||
lwe_array_in, buffer, bsks, (uint64_t **)(ksks), num_radix_blocks);
|
||||
lwe_array_in, buffer, bsks, (uint64_t **)(ksks), ms_noise_reduction_key,
|
||||
num_radix_blocks);
|
||||
}
|
||||
|
||||
void cleanup_cuda_integer_are_all_comparisons_block_true(
|
||||
@@ -145,12 +150,12 @@ void scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_radix_blocks,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory) {
|
||||
bool allocate_gpu_memory, bool allocate_ms_array) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
big_lwe_dimension, small_lwe_dimension, ks_level,
|
||||
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
|
||||
message_modulus, carry_modulus);
|
||||
message_modulus, carry_modulus, allocate_ms_array);
|
||||
|
||||
scratch_cuda_integer_radix_comparison_check_kb<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
@@ -162,14 +167,17 @@ void cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks, uint32_t num_radix_blocks) {
|
||||
void *const *bsks, void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t num_radix_blocks) {
|
||||
|
||||
int_comparison_buffer<uint64_t> *buffer =
|
||||
(int_comparison_buffer<uint64_t> *)mem_ptr;
|
||||
|
||||
host_integer_is_at_least_one_comparisons_block_true_kb<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
|
||||
lwe_array_in, buffer, bsks, (uint64_t **)(ksks), num_radix_blocks);
|
||||
lwe_array_in, buffer, bsks, (uint64_t **)(ksks), ms_noise_reduction_key,
|
||||
num_radix_blocks);
|
||||
}
|
||||
|
||||
void cleanup_cuda_integer_is_at_least_one_comparisons_block_true(
|
||||
|
||||
@@ -62,7 +62,9 @@ __host__ void are_all_comparisons_block_true(
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in,
|
||||
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
|
||||
Torus *const *ksks, uint32_t num_radix_blocks) {
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t num_radix_blocks) {
|
||||
|
||||
if (lwe_array_out->lwe_dimension != lwe_array_in->lwe_dimension)
|
||||
PANIC("Cuda error: input and output lwe dimensions must be the same")
|
||||
@@ -156,7 +158,7 @@ __host__ void are_all_comparisons_block_true(
|
||||
// In the last iteration we copy the output to the final address
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out, accumulator, bsks,
|
||||
ksks, lut, 1);
|
||||
ksks, ms_noise_reduction_key, lut, 1);
|
||||
// Reset max_value_lut_indexes before returning, otherwise if the lut is
|
||||
// reused the lut indexes will be wrong
|
||||
memset(is_max_value_lut->h_lut_indexes, 0,
|
||||
@@ -171,7 +173,7 @@ __host__ void are_all_comparisons_block_true(
|
||||
} else {
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, tmp_out, accumulator, bsks, ksks,
|
||||
lut, num_chunks);
|
||||
ms_noise_reduction_key, lut, num_chunks);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -188,7 +190,9 @@ __host__ void is_at_least_one_comparisons_block_true(
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in,
|
||||
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
|
||||
Torus *const *ksks, uint32_t num_radix_blocks) {
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t num_radix_blocks) {
|
||||
|
||||
if (lwe_array_out->lwe_dimension != lwe_array_in->lwe_dimension)
|
||||
PANIC("Cuda error: input lwe dimensions must be the same")
|
||||
@@ -242,12 +246,14 @@ __host__ void is_at_least_one_comparisons_block_true(
|
||||
// In the last iteration we copy the output to the final address
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out,
|
||||
buffer->tmp_block_accumulated, bsks, ksks, lut, 1);
|
||||
buffer->tmp_block_accumulated, bsks, ksks, ms_noise_reduction_key,
|
||||
lut, 1);
|
||||
return;
|
||||
} else {
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
|
||||
buffer->tmp_block_accumulated, bsks, ksks, lut, num_chunks);
|
||||
buffer->tmp_block_accumulated, bsks, ksks, ms_noise_reduction_key,
|
||||
lut, num_chunks);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -258,8 +264,9 @@ __host__ void host_compare_blocks_with_zero(
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in,
|
||||
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
|
||||
Torus *const *ksks, int32_t num_radix_blocks,
|
||||
int_radix_lut<Torus> *zero_comparison) {
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
int32_t num_radix_blocks, int_radix_lut<Torus> *zero_comparison) {
|
||||
|
||||
if (num_radix_blocks == 0)
|
||||
return;
|
||||
@@ -316,7 +323,8 @@ __host__ void host_compare_blocks_with_zero(
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out, sum, bsks, ksks,
|
||||
zero_comparison, num_sum_blocks);
|
||||
ms_noise_reduction_key, zero_comparison, num_sum_blocks);
|
||||
|
||||
reset_radix_ciphertext_blocks(lwe_array_out, num_sum_blocks);
|
||||
}
|
||||
|
||||
@@ -327,7 +335,9 @@ __host__ void host_integer_radix_equality_check_kb(
|
||||
CudaRadixCiphertextFFI const *lwe_array_1,
|
||||
CudaRadixCiphertextFFI const *lwe_array_2,
|
||||
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
|
||||
Torus *const *ksks, uint32_t num_radix_blocks) {
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t num_radix_blocks) {
|
||||
|
||||
if (lwe_array_out->lwe_dimension != lwe_array_1->lwe_dimension ||
|
||||
lwe_array_out->lwe_dimension != lwe_array_2->lwe_dimension)
|
||||
@@ -338,16 +348,16 @@ __host__ void host_integer_radix_equality_check_kb(
|
||||
auto comparisons = mem_ptr->tmp_block_comparisons;
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, comparisons, lwe_array_1, lwe_array_2,
|
||||
bsks, ksks, eq_buffer->operator_lut, num_radix_blocks,
|
||||
eq_buffer->operator_lut->params.message_modulus);
|
||||
bsks, ksks, ms_noise_reduction_key, eq_buffer->operator_lut,
|
||||
num_radix_blocks, eq_buffer->operator_lut->params.message_modulus);
|
||||
|
||||
// This takes a Vec of blocks, where each block is either 0 or 1.
|
||||
//
|
||||
// It returns a block encrypting 1 if all input blocks are 1
|
||||
// otherwise the block encrypts 0
|
||||
are_all_comparisons_block_true<Torus>(streams, gpu_indexes, gpu_count,
|
||||
lwe_array_out, comparisons, mem_ptr,
|
||||
bsks, ksks, num_radix_blocks);
|
||||
are_all_comparisons_block_true<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out, comparisons, mem_ptr,
|
||||
bsks, ksks, ms_noise_reduction_key, num_radix_blocks);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
@@ -357,7 +367,9 @@ __host__ void compare_radix_blocks_kb(
|
||||
CudaRadixCiphertextFFI const *lwe_array_left,
|
||||
CudaRadixCiphertextFFI const *lwe_array_right,
|
||||
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
|
||||
Torus *const *ksks, uint32_t num_radix_blocks) {
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t num_radix_blocks) {
|
||||
|
||||
if (lwe_array_out->lwe_dimension != lwe_array_left->lwe_dimension ||
|
||||
lwe_array_out->lwe_dimension != lwe_array_right->lwe_dimension)
|
||||
@@ -391,7 +403,7 @@ __host__ void compare_radix_blocks_kb(
|
||||
auto is_non_zero_lut = mem_ptr->eq_buffer->is_non_zero_lut;
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_out, bsks, ksks,
|
||||
is_non_zero_lut, num_radix_blocks);
|
||||
ms_noise_reduction_key, is_non_zero_lut, num_radix_blocks);
|
||||
|
||||
// Add one
|
||||
// Here Lhs can have the following values: (-1) % (message modulus * carry
|
||||
@@ -405,14 +417,15 @@ __host__ void compare_radix_blocks_kb(
|
||||
// (inferior, equal, superior) to one single shortint block containing the
|
||||
// final sign
|
||||
template <typename Torus>
|
||||
__host__ void
|
||||
tree_sign_reduction(cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI *lwe_block_comparisons,
|
||||
int_tree_sign_reduction_buffer<Torus> *tree_buffer,
|
||||
std::function<Torus(Torus)> sign_handler_f,
|
||||
void *const *bsks, Torus *const *ksks,
|
||||
uint32_t num_radix_blocks) {
|
||||
__host__ void tree_sign_reduction(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI *lwe_block_comparisons,
|
||||
int_tree_sign_reduction_buffer<Torus> *tree_buffer,
|
||||
std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t num_radix_blocks) {
|
||||
|
||||
if (lwe_array_out->lwe_dimension != lwe_block_comparisons->lwe_dimension)
|
||||
PANIC("Cuda error: input lwe dimensions must be the same")
|
||||
@@ -446,8 +459,8 @@ tree_sign_reduction(cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
4);
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, x, y, bsks, ksks, inner_tree_leaf,
|
||||
partial_block_count >> 1);
|
||||
streams, gpu_indexes, gpu_count, x, y, bsks, ksks,
|
||||
ms_noise_reduction_key, inner_tree_leaf, partial_block_count >> 1);
|
||||
|
||||
if ((partial_block_count % 2) != 0) {
|
||||
partial_block_count >>= 1;
|
||||
@@ -489,8 +502,8 @@ tree_sign_reduction(cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
|
||||
// Last leaf
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out, y, bsks, ksks, last_lut,
|
||||
1);
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out, y, bsks, ksks,
|
||||
ms_noise_reduction_key, last_lut, 1);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
@@ -501,7 +514,9 @@ __host__ void host_integer_radix_difference_check_kb(
|
||||
CudaRadixCiphertextFFI const *lwe_array_right,
|
||||
int_comparison_buffer<Torus> *mem_ptr,
|
||||
std::function<Torus(Torus)> reduction_lut_f, void *const *bsks,
|
||||
Torus *const *ksks, uint32_t num_radix_blocks) {
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t num_radix_blocks) {
|
||||
|
||||
if (lwe_array_out->lwe_dimension != lwe_array_left->lwe_dimension ||
|
||||
lwe_array_out->lwe_dimension != lwe_array_right->lwe_dimension)
|
||||
@@ -539,8 +554,8 @@ __host__ void host_integer_radix_difference_check_kb(
|
||||
auto identity_lut = mem_ptr->identity_lut;
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, diff_buffer->tmp_packed,
|
||||
diff_buffer->tmp_packed, bsks, ksks, identity_lut,
|
||||
2 * packed_num_radix_blocks);
|
||||
diff_buffer->tmp_packed, bsks, ksks, ms_noise_reduction_key,
|
||||
identity_lut, 2 * packed_num_radix_blocks);
|
||||
} else {
|
||||
as_radix_ciphertext_slice<Torus>(&lhs, lwe_array_left, 0,
|
||||
lwe_array_left->num_radix_blocks);
|
||||
@@ -557,17 +572,17 @@ __host__ void host_integer_radix_difference_check_kb(
|
||||
if (!mem_ptr->is_signed) {
|
||||
// Compare packed blocks, or simply the total number of radix blocks in the
|
||||
// inputs
|
||||
compare_radix_blocks_kb<Torus>(streams, gpu_indexes, gpu_count, comparisons,
|
||||
&lhs, &rhs, mem_ptr, bsks, ksks,
|
||||
packed_num_radix_blocks);
|
||||
compare_radix_blocks_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, comparisons, &lhs, &rhs, mem_ptr, bsks,
|
||||
ksks, ms_noise_reduction_key, packed_num_radix_blocks);
|
||||
num_comparisons = packed_num_radix_blocks;
|
||||
} else {
|
||||
// Packing is possible
|
||||
if (carry_modulus >= message_modulus) {
|
||||
// Compare (num_radix_blocks - 2) / 2 packed blocks
|
||||
compare_radix_blocks_kb<Torus>(streams, gpu_indexes, gpu_count,
|
||||
comparisons, &lhs, &rhs, mem_ptr, bsks,
|
||||
ksks, packed_num_radix_blocks);
|
||||
compare_radix_blocks_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, comparisons, &lhs, &rhs, mem_ptr,
|
||||
bsks, ksks, ms_noise_reduction_key, packed_num_radix_blocks);
|
||||
|
||||
// Compare the last block before the sign block separately
|
||||
auto identity_lut = mem_ptr->identity_lut;
|
||||
@@ -581,7 +596,8 @@ __host__ void host_integer_radix_difference_check_kb(
|
||||
num_radix_blocks - 1);
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, &last_left_block_before_sign_block,
|
||||
&shifted_lwe_array_left, bsks, ksks, identity_lut, 1);
|
||||
&shifted_lwe_array_left, bsks, ksks, ms_noise_reduction_key,
|
||||
identity_lut, 1);
|
||||
|
||||
CudaRadixCiphertextFFI last_right_block_before_sign_block;
|
||||
as_radix_ciphertext_slice<Torus>(
|
||||
@@ -594,7 +610,8 @@ __host__ void host_integer_radix_difference_check_kb(
|
||||
num_radix_blocks - 1);
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, &last_right_block_before_sign_block,
|
||||
&shifted_lwe_array_right, bsks, ksks, identity_lut, 1);
|
||||
&shifted_lwe_array_right, bsks, ksks, ms_noise_reduction_key,
|
||||
identity_lut, 1);
|
||||
|
||||
CudaRadixCiphertextFFI shifted_comparisons;
|
||||
as_radix_ciphertext_slice<Torus>(&shifted_comparisons, comparisons,
|
||||
@@ -603,7 +620,8 @@ __host__ void host_integer_radix_difference_check_kb(
|
||||
compare_radix_blocks_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, &shifted_comparisons,
|
||||
&last_left_block_before_sign_block,
|
||||
&last_right_block_before_sign_block, mem_ptr, bsks, ksks, 1);
|
||||
&last_right_block_before_sign_block, mem_ptr, bsks, ksks,
|
||||
ms_noise_reduction_key, 1);
|
||||
|
||||
// Compare the sign block separately
|
||||
as_radix_ciphertext_slice<Torus>(&shifted_comparisons, comparisons,
|
||||
@@ -617,14 +635,16 @@ __host__ void host_integer_radix_difference_check_kb(
|
||||
num_radix_blocks - 1, num_radix_blocks);
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, &shifted_comparisons,
|
||||
&last_left_block, &last_right_block, bsks, ksks, mem_ptr->signed_lut,
|
||||
1, mem_ptr->signed_lut->params.message_modulus);
|
||||
&last_left_block, &last_right_block, bsks, ksks,
|
||||
ms_noise_reduction_key, mem_ptr->signed_lut, 1,
|
||||
mem_ptr->signed_lut->params.message_modulus);
|
||||
num_comparisons = packed_num_radix_blocks + 2;
|
||||
|
||||
} else {
|
||||
compare_radix_blocks_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, comparisons, lwe_array_left,
|
||||
lwe_array_right, mem_ptr, bsks, ksks, num_radix_blocks - 1);
|
||||
lwe_array_right, mem_ptr, bsks, ksks, ms_noise_reduction_key,
|
||||
num_radix_blocks - 1);
|
||||
// Compare the sign block separately
|
||||
CudaRadixCiphertextFFI shifted_comparisons;
|
||||
as_radix_ciphertext_slice<Torus>(&shifted_comparisons, comparisons,
|
||||
@@ -637,8 +657,9 @@ __host__ void host_integer_radix_difference_check_kb(
|
||||
num_radix_blocks - 1, num_radix_blocks);
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, &shifted_comparisons,
|
||||
&last_left_block, &last_right_block, bsks, ksks, mem_ptr->signed_lut,
|
||||
1, mem_ptr->signed_lut->params.message_modulus);
|
||||
&last_left_block, &last_right_block, bsks, ksks,
|
||||
ms_noise_reduction_key, mem_ptr->signed_lut, 1,
|
||||
mem_ptr->signed_lut->params.message_modulus);
|
||||
num_comparisons = num_radix_blocks;
|
||||
}
|
||||
}
|
||||
@@ -648,7 +669,8 @@ __host__ void host_integer_radix_difference_check_kb(
|
||||
// final sign
|
||||
tree_sign_reduction<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
|
||||
comparisons, mem_ptr->diff_buffer->tree_buffer,
|
||||
reduction_lut_f, bsks, ksks, num_comparisons);
|
||||
reduction_lut_f, bsks, ksks,
|
||||
ms_noise_reduction_key, num_comparisons);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
@@ -670,7 +692,9 @@ __host__ void host_integer_radix_maxmin_kb(
|
||||
CudaRadixCiphertextFFI const *lwe_array_left,
|
||||
CudaRadixCiphertextFFI const *lwe_array_right,
|
||||
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
|
||||
Torus *const *ksks, uint32_t num_radix_blocks) {
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t num_radix_blocks) {
|
||||
|
||||
if (lwe_array_out->lwe_dimension != lwe_array_left->lwe_dimension ||
|
||||
lwe_array_out->lwe_dimension != lwe_array_right->lwe_dimension)
|
||||
@@ -685,13 +709,13 @@ __host__ void host_integer_radix_maxmin_kb(
|
||||
host_integer_radix_difference_check_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
|
||||
lwe_array_left, lwe_array_right, mem_ptr, mem_ptr->identity_lut_f, bsks,
|
||||
ksks, num_radix_blocks);
|
||||
ksks, ms_noise_reduction_key, num_radix_blocks);
|
||||
|
||||
// Selector
|
||||
host_integer_radix_cmux_kb<Torus>(streams, gpu_indexes, gpu_count,
|
||||
lwe_array_out, mem_ptr->tmp_lwe_array_out,
|
||||
lwe_array_left, lwe_array_right,
|
||||
mem_ptr->cmux_buffer, bsks, ksks);
|
||||
host_integer_radix_cmux_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out,
|
||||
mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right,
|
||||
mem_ptr->cmux_buffer, bsks, ksks, ms_noise_reduction_key);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
@@ -700,13 +724,15 @@ __host__ void host_integer_are_all_comparisons_block_true_kb(
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in,
|
||||
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
|
||||
Torus *const *ksks, uint32_t num_radix_blocks) {
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t num_radix_blocks) {
|
||||
|
||||
// It returns a block encrypting 1 if all input blocks are 1
|
||||
// otherwise the block encrypts 0
|
||||
are_all_comparisons_block_true<Torus>(streams, gpu_indexes, gpu_count,
|
||||
lwe_array_out, lwe_array_in, mem_ptr,
|
||||
bsks, ksks, num_radix_blocks);
|
||||
are_all_comparisons_block_true<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, mem_ptr,
|
||||
bsks, ksks, ms_noise_reduction_key, num_radix_blocks);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
@@ -715,12 +741,14 @@ __host__ void host_integer_is_at_least_one_comparisons_block_true_kb(
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in,
|
||||
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
|
||||
Torus *const *ksks, uint32_t num_radix_blocks) {
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t num_radix_blocks) {
|
||||
|
||||
// It returns a block encrypting 1 if all input blocks are 1
|
||||
// otherwise the block encrypts 0
|
||||
is_at_least_one_comparisons_block_true<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, mem_ptr,
|
||||
bsks, ksks, num_radix_blocks);
|
||||
bsks, ksks, ms_noise_reduction_key, num_radix_blocks);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -13,7 +13,7 @@ void scratch_cuda_integer_compress_radix_ciphertext_64(
|
||||
pbs_type, compression_glwe_dimension, compression_polynomial_size,
|
||||
(compression_glwe_dimension + 1) * compression_polynomial_size,
|
||||
lwe_dimension, ks_level, ks_base_log, 0, 0, 0, message_modulus,
|
||||
carry_modulus);
|
||||
carry_modulus, allocate_gpu_memory);
|
||||
|
||||
scratch_cuda_compress_integer_radix_ciphertext<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
@@ -28,19 +28,20 @@ void scratch_cuda_integer_decompress_radix_ciphertext_64(
|
||||
uint32_t compression_polynomial_size, uint32_t lwe_dimension,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t num_radix_blocks,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
uint32_t storage_log_modulus, uint32_t body_count,
|
||||
bool allocate_gpu_memory) {
|
||||
uint32_t storage_log_modulus, uint32_t body_count, bool allocate_gpu_memory,
|
||||
bool allocate_ms_array) {
|
||||
|
||||
// Decompression doesn't keyswitch, so big and small dimensions are the same
|
||||
int_radix_params encryption_params(
|
||||
pbs_type, encryption_glwe_dimension, encryption_polynomial_size,
|
||||
lwe_dimension, lwe_dimension, 0, 0, pbs_level, pbs_base_log, 0,
|
||||
message_modulus, carry_modulus);
|
||||
message_modulus, carry_modulus, allocate_ms_array);
|
||||
|
||||
int_radix_params compression_params(
|
||||
pbs_type, compression_glwe_dimension, compression_polynomial_size,
|
||||
lwe_dimension, compression_glwe_dimension * compression_polynomial_size,
|
||||
0, 0, pbs_level, pbs_base_log, 0, message_modulus, carry_modulus);
|
||||
0, 0, pbs_level, pbs_base_log, 0, message_modulus, carry_modulus,
|
||||
allocate_ms_array);
|
||||
|
||||
scratch_cuda_integer_decompress_radix_ciphertext<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
|
||||
@@ -313,7 +313,7 @@ __host__ void host_integer_decompress(
|
||||
execute_pbs_async<Torus>(
|
||||
streams, gpu_indexes, active_gpu_count, d_lwe_array_out,
|
||||
lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec, extracted_lwe,
|
||||
lut->lwe_indexes_in, d_bsks, lut->buffer,
|
||||
lut->lwe_indexes_in, d_bsks, nullptr, lut->buffer,
|
||||
encryption_params.glwe_dimension,
|
||||
compression_params.small_lwe_dimension,
|
||||
encryption_params.polynomial_size, encryption_params.pbs_base_log,
|
||||
@@ -340,7 +340,7 @@ __host__ void host_integer_decompress(
|
||||
execute_pbs_async<Torus>(
|
||||
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
|
||||
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
|
||||
lwe_array_in_vec, lwe_trivial_indexes_vec, d_bsks, lut->buffer,
|
||||
lwe_array_in_vec, lwe_trivial_indexes_vec, d_bsks, nullptr, lut->buffer,
|
||||
encryption_params.glwe_dimension,
|
||||
compression_params.small_lwe_dimension,
|
||||
encryption_params.polynomial_size, encryption_params.pbs_base_log,
|
||||
|
||||
@@ -7,12 +7,12 @@ void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
PBS_TYPE pbs_type, bool allocate_gpu_memory) {
|
||||
PBS_TYPE pbs_type, bool allocate_gpu_memory, bool allocate_ms_array) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
big_lwe_dimension, small_lwe_dimension, ks_level,
|
||||
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
|
||||
message_modulus, carry_modulus);
|
||||
message_modulus, carry_modulus, allocate_ms_array);
|
||||
|
||||
scratch_cuda_integer_div_rem_kb<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, is_signed,
|
||||
@@ -25,13 +25,15 @@ void cuda_integer_div_rem_radix_ciphertext_kb_64(
|
||||
CudaRadixCiphertextFFI *quotient, CudaRadixCiphertextFFI *remainder,
|
||||
CudaRadixCiphertextFFI const *numerator,
|
||||
CudaRadixCiphertextFFI const *divisor, bool is_signed, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks) {
|
||||
void *const *bsks, void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
|
||||
|
||||
auto mem = (int_div_rem_memory<uint64_t> *)mem_ptr;
|
||||
|
||||
host_integer_div_rem_kb<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, quotient, remainder,
|
||||
numerator, divisor, is_signed, bsks, (uint64_t **)(ksks), mem);
|
||||
numerator, divisor, is_signed, bsks, (uint64_t **)(ksks),
|
||||
ms_noise_reduction_key, mem);
|
||||
}
|
||||
|
||||
void cleanup_cuda_integer_div_rem(void *const *streams,
|
||||
|
||||
@@ -36,7 +36,9 @@ __host__ void host_unsigned_integer_div_rem_kb(
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *quotient,
|
||||
CudaRadixCiphertextFFI *remainder, CudaRadixCiphertextFFI const *numerator,
|
||||
CudaRadixCiphertextFFI const *divisor, void *const *bsks,
|
||||
uint64_t *const *ksks, unsigned_int_div_rem_memory<uint64_t> *mem_ptr) {
|
||||
uint64_t *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
unsigned_int_div_rem_memory<uint64_t> *mem_ptr) {
|
||||
|
||||
if (remainder->num_radix_blocks != numerator->num_radix_blocks ||
|
||||
remainder->num_radix_blocks != divisor->num_radix_blocks ||
|
||||
@@ -146,7 +148,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
|
||||
interesting_divisor->num_radix_blocks);
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, &last_interesting_divisor_block,
|
||||
&last_interesting_divisor_block, bsks, ksks,
|
||||
&last_interesting_divisor_block, bsks, ksks, ms_noise_reduction_key,
|
||||
mem_ptr->masking_luts_1[shifted_mask], 1);
|
||||
}; // trim_last_interesting_divisor_bits
|
||||
|
||||
@@ -175,7 +177,8 @@ __host__ void host_unsigned_integer_div_rem_kb(
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, divisor_ms_blocks, divisor_ms_blocks,
|
||||
bsks, ksks, mem_ptr->masking_luts_2[shifted_mask], 1);
|
||||
bsks, ksks, ms_noise_reduction_key,
|
||||
mem_ptr->masking_luts_2[shifted_mask], 1);
|
||||
}; // trim_first_divisor_ms_bits
|
||||
|
||||
// This does
|
||||
@@ -199,7 +202,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
|
||||
|
||||
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
|
||||
streams, gpu_indexes, gpu_count, interesting_remainder1, 1,
|
||||
mem_ptr->shift_mem_1, bsks, ksks,
|
||||
mem_ptr->shift_mem_1, bsks, ksks, ms_noise_reduction_key,
|
||||
interesting_remainder1->num_radix_blocks);
|
||||
|
||||
reset_radix_ciphertext_blocks(mem_ptr->tmp_radix,
|
||||
@@ -231,7 +234,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
|
||||
uint32_t gpu_count) {
|
||||
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
|
||||
streams, gpu_indexes, gpu_count, interesting_remainder2, 1,
|
||||
mem_ptr->shift_mem_2, bsks, ksks,
|
||||
mem_ptr->shift_mem_2, bsks, ksks, ms_noise_reduction_key,
|
||||
interesting_remainder2->num_radix_blocks);
|
||||
}; // left_shift_interesting_remainder2
|
||||
|
||||
@@ -311,8 +314,8 @@ __host__ void host_unsigned_integer_div_rem_kb(
|
||||
streams, gpu_indexes, gpu_count, new_remainder,
|
||||
merged_interesting_remainder, interesting_divisor,
|
||||
subtraction_overflowed, (const CudaRadixCiphertextFFI *)nullptr,
|
||||
mem_ptr->overflow_sub_mem, bsks, ksks, compute_borrow,
|
||||
uses_input_borrow);
|
||||
mem_ptr->overflow_sub_mem, bsks, ksks, ms_noise_reduction_key,
|
||||
compute_borrow, uses_input_borrow);
|
||||
};
|
||||
|
||||
// fills:
|
||||
@@ -332,14 +335,14 @@ __host__ void host_unsigned_integer_div_rem_kb(
|
||||
// So we can skip some stuff
|
||||
host_compare_blocks_with_zero<Torus>(
|
||||
streams, gpu_indexes, gpu_count, mem_ptr->tmp_1, trivial_blocks,
|
||||
mem_ptr->comparison_buffer, bsks, ksks,
|
||||
mem_ptr->comparison_buffer, bsks, ksks, ms_noise_reduction_key,
|
||||
trivial_blocks->num_radix_blocks,
|
||||
mem_ptr->comparison_buffer->eq_buffer->is_non_zero_lut);
|
||||
|
||||
is_at_least_one_comparisons_block_true<Torus>(
|
||||
streams, gpu_indexes, gpu_count,
|
||||
at_least_one_upper_block_is_non_zero, mem_ptr->tmp_1,
|
||||
mem_ptr->comparison_buffer, bsks, ksks,
|
||||
mem_ptr->comparison_buffer, bsks, ksks, ms_noise_reduction_key,
|
||||
mem_ptr->tmp_1->num_radix_blocks);
|
||||
}
|
||||
};
|
||||
@@ -355,7 +358,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
|
||||
streams, gpu_indexes, gpu_count,
|
||||
cleaned_merged_interesting_remainder,
|
||||
cleaned_merged_interesting_remainder, bsks, ksks,
|
||||
mem_ptr->message_extract_lut_1,
|
||||
ms_noise_reduction_key, mem_ptr->message_extract_lut_1,
|
||||
cleaned_merged_interesting_remainder->num_radix_blocks);
|
||||
};
|
||||
|
||||
@@ -383,10 +386,10 @@ __host__ void host_unsigned_integer_div_rem_kb(
|
||||
|
||||
int factor = (i) ? 3 : 2;
|
||||
int factor_lut_id = factor - 2;
|
||||
for (size_t i = 0;
|
||||
i < cleaned_merged_interesting_remainder->num_radix_blocks; i++) {
|
||||
for (size_t k = 0;
|
||||
k < cleaned_merged_interesting_remainder->num_radix_blocks; k++) {
|
||||
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
|
||||
overflow_sum_radix, i, i + 1,
|
||||
overflow_sum_radix, k, k + 1,
|
||||
overflow_sum, 0, 1);
|
||||
}
|
||||
|
||||
@@ -397,7 +400,8 @@ __host__ void host_unsigned_integer_div_rem_kb(
|
||||
streams, gpu_indexes, gpu_count,
|
||||
cleaned_merged_interesting_remainder,
|
||||
cleaned_merged_interesting_remainder, overflow_sum_radix, bsks,
|
||||
ksks, mem_ptr->zero_out_if_overflow_did_not_happen[factor_lut_id],
|
||||
ksks, ms_noise_reduction_key,
|
||||
mem_ptr->zero_out_if_overflow_did_not_happen[factor_lut_id],
|
||||
cleaned_merged_interesting_remainder->num_radix_blocks, factor);
|
||||
};
|
||||
|
||||
@@ -406,7 +410,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
|
||||
uint32_t gpu_count) {
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, new_remainder, new_remainder,
|
||||
overflow_sum_radix, bsks, ksks,
|
||||
overflow_sum_radix, bsks, ksks, ms_noise_reduction_key,
|
||||
mem_ptr->zero_out_if_overflow_happened[factor_lut_id],
|
||||
new_remainder->num_radix_blocks, factor);
|
||||
};
|
||||
@@ -418,7 +422,8 @@ __host__ void host_unsigned_integer_div_rem_kb(
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, mem_ptr->did_not_overflow,
|
||||
subtraction_overflowed, at_least_one_upper_block_is_non_zero, bsks,
|
||||
ksks, mem_ptr->merge_overflow_flags_luts[pos_in_block], 1,
|
||||
ksks, ms_noise_reduction_key,
|
||||
mem_ptr->merge_overflow_flags_luts[pos_in_block], 1,
|
||||
mem_ptr->merge_overflow_flags_luts[pos_in_block]
|
||||
->params.message_modulus);
|
||||
|
||||
@@ -476,10 +481,11 @@ __host__ void host_unsigned_integer_div_rem_kb(
|
||||
}
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
mem_ptr->sub_streams_1, gpu_indexes, gpu_count, remainder, remainder,
|
||||
bsks, ksks, mem_ptr->message_extract_lut_1, num_blocks);
|
||||
bsks, ksks, ms_noise_reduction_key, mem_ptr->message_extract_lut_1,
|
||||
num_blocks);
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
mem_ptr->sub_streams_2, gpu_indexes, gpu_count, quotient, quotient, bsks,
|
||||
ksks, mem_ptr->message_extract_lut_2, num_blocks);
|
||||
ksks, ms_noise_reduction_key, mem_ptr->message_extract_lut_2, num_blocks);
|
||||
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
|
||||
@@ -492,7 +498,9 @@ __host__ void host_integer_div_rem_kb(
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *quotient,
|
||||
CudaRadixCiphertextFFI *remainder, CudaRadixCiphertextFFI const *numerator,
|
||||
CudaRadixCiphertextFFI const *divisor, bool is_signed, void *const *bsks,
|
||||
uint64_t *const *ksks, int_div_rem_memory<uint64_t> *int_mem_ptr) {
|
||||
uint64_t *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
int_div_rem_memory<uint64_t> *int_mem_ptr) {
|
||||
if (remainder->num_radix_blocks != numerator->num_radix_blocks ||
|
||||
remainder->num_radix_blocks != divisor->num_radix_blocks ||
|
||||
remainder->num_radix_blocks != quotient->num_radix_blocks)
|
||||
@@ -518,12 +526,12 @@ __host__ void host_integer_div_rem_kb(
|
||||
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
|
||||
}
|
||||
|
||||
host_integer_abs_kb<Torus>(int_mem_ptr->sub_streams_1, gpu_indexes,
|
||||
gpu_count, positive_numerator, bsks, ksks,
|
||||
int_mem_ptr->abs_mem_1, true);
|
||||
host_integer_abs_kb<Torus>(int_mem_ptr->sub_streams_2, gpu_indexes,
|
||||
gpu_count, positive_divisor, bsks, ksks,
|
||||
int_mem_ptr->abs_mem_2, true);
|
||||
host_integer_abs_kb<Torus>(
|
||||
int_mem_ptr->sub_streams_1, gpu_indexes, gpu_count, positive_numerator,
|
||||
bsks, ksks, ms_noise_reduction_key, int_mem_ptr->abs_mem_1, true);
|
||||
host_integer_abs_kb<Torus>(
|
||||
int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count, positive_divisor,
|
||||
bsks, ksks, ms_noise_reduction_key, int_mem_ptr->abs_mem_2, true);
|
||||
for (uint j = 0; j < int_mem_ptr->active_gpu_count; j++) {
|
||||
cuda_synchronize_stream(int_mem_ptr->sub_streams_1[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(int_mem_ptr->sub_streams_2[j], gpu_indexes[j]);
|
||||
@@ -532,7 +540,7 @@ __host__ void host_integer_div_rem_kb(
|
||||
host_unsigned_integer_div_rem_kb<Torus>(
|
||||
int_mem_ptr->sub_streams_1, gpu_indexes, gpu_count, quotient, remainder,
|
||||
positive_numerator, positive_divisor, bsks, ksks,
|
||||
int_mem_ptr->unsigned_mem);
|
||||
ms_noise_reduction_key, int_mem_ptr->unsigned_mem);
|
||||
|
||||
CudaRadixCiphertextFFI numerator_sign;
|
||||
as_radix_ciphertext_slice<Torus>(&numerator_sign, numerator, num_blocks - 1,
|
||||
@@ -543,7 +551,8 @@ __host__ void host_integer_div_rem_kb(
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
|
||||
int_mem_ptr->sign_bits_are_different, &numerator_sign, &divisor_sign,
|
||||
bsks, ksks, int_mem_ptr->compare_signed_bits_lut, 1,
|
||||
bsks, ksks, ms_noise_reduction_key,
|
||||
int_mem_ptr->compare_signed_bits_lut, 1,
|
||||
int_mem_ptr->compare_signed_bits_lut->params.message_modulus);
|
||||
|
||||
for (uint j = 0; j < int_mem_ptr->active_gpu_count; j++) {
|
||||
@@ -558,10 +567,10 @@ __host__ void host_integer_div_rem_kb(
|
||||
|
||||
uint32_t requested_flag = outputFlag::FLAG_NONE;
|
||||
uint32_t uses_carry = 0;
|
||||
host_propagate_single_carry<Torus>(int_mem_ptr->sub_streams_1, gpu_indexes,
|
||||
gpu_count, int_mem_ptr->negated_quotient,
|
||||
nullptr, nullptr, int_mem_ptr->scp_mem_1,
|
||||
bsks, ksks, requested_flag, uses_carry);
|
||||
host_propagate_single_carry<Torus>(
|
||||
int_mem_ptr->sub_streams_1, gpu_indexes, gpu_count,
|
||||
int_mem_ptr->negated_quotient, nullptr, nullptr, int_mem_ptr->scp_mem_1,
|
||||
bsks, ksks, ms_noise_reduction_key, requested_flag, uses_carry);
|
||||
|
||||
host_integer_radix_negation<Torus>(
|
||||
int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
|
||||
@@ -571,17 +580,19 @@ __host__ void host_integer_div_rem_kb(
|
||||
host_propagate_single_carry<Torus>(
|
||||
int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
|
||||
int_mem_ptr->negated_remainder, nullptr, nullptr,
|
||||
int_mem_ptr->scp_mem_2, bsks, ksks, requested_flag, uses_carry);
|
||||
int_mem_ptr->scp_mem_2, bsks, ksks, ms_noise_reduction_key,
|
||||
requested_flag, uses_carry);
|
||||
|
||||
host_integer_radix_cmux_kb<Torus>(
|
||||
int_mem_ptr->sub_streams_1, gpu_indexes, gpu_count, quotient,
|
||||
int_mem_ptr->sign_bits_are_different, int_mem_ptr->negated_quotient,
|
||||
quotient, int_mem_ptr->cmux_quotient_mem, bsks, ksks);
|
||||
quotient, int_mem_ptr->cmux_quotient_mem, bsks, ksks,
|
||||
ms_noise_reduction_key);
|
||||
|
||||
host_integer_radix_cmux_kb<Torus>(
|
||||
int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count, remainder,
|
||||
&numerator_sign, int_mem_ptr->negated_remainder, remainder,
|
||||
int_mem_ptr->cmux_remainder_mem, bsks, ksks);
|
||||
int_mem_ptr->cmux_remainder_mem, bsks, ksks, ms_noise_reduction_key);
|
||||
|
||||
for (uint j = 0; j < int_mem_ptr->active_gpu_count; j++) {
|
||||
cuda_synchronize_stream(int_mem_ptr->sub_streams_1[j], gpu_indexes[j]);
|
||||
@@ -590,7 +601,7 @@ __host__ void host_integer_div_rem_kb(
|
||||
} else {
|
||||
host_unsigned_integer_div_rem_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, quotient, remainder, numerator,
|
||||
divisor, bsks, ksks, int_mem_ptr->unsigned_mem);
|
||||
divisor, bsks, ksks, ms_noise_reduction_key, int_mem_ptr->unsigned_mem);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -2,19 +2,18 @@
|
||||
#include "integer/negation.cuh"
|
||||
#include <linear_algebra.h>
|
||||
|
||||
void cuda_full_propagation_64_inplace(void *const *streams,
|
||||
uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *input_blocks,
|
||||
int8_t *mem_ptr, void *const *ksks,
|
||||
void *const *bsks, uint32_t num_blocks) {
|
||||
void cuda_full_propagation_64_inplace(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *input_blocks, int8_t *mem_ptr, void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
void *const *bsks, uint32_t num_blocks) {
|
||||
|
||||
int_fullprop_buffer<uint64_t> *buffer =
|
||||
(int_fullprop_buffer<uint64_t> *)mem_ptr;
|
||||
|
||||
host_full_propagate_inplace<uint64_t>((cudaStream_t *)(streams), gpu_indexes,
|
||||
gpu_count, input_blocks, buffer,
|
||||
(uint64_t **)(ksks), bsks, num_blocks);
|
||||
host_full_propagate_inplace<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, input_blocks, buffer,
|
||||
(uint64_t **)(ksks), ms_noise_reduction_key, bsks, num_blocks);
|
||||
}
|
||||
|
||||
void scratch_cuda_full_propagation_64(
|
||||
@@ -23,11 +22,12 @@ void scratch_cuda_full_propagation_64(
|
||||
uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory) {
|
||||
bool allocate_gpu_memory, bool allocate_ms_array) {
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
glwe_dimension * polynomial_size, lwe_dimension,
|
||||
ks_level, ks_base_log, pbs_level, pbs_base_log,
|
||||
grouping_factor, message_modulus, carry_modulus);
|
||||
grouping_factor, message_modulus, carry_modulus,
|
||||
allocate_ms_array);
|
||||
|
||||
scratch_cuda_full_propagation<uint64_t>(
|
||||
(cudaStream_t *)streams, gpu_indexes, gpu_count,
|
||||
@@ -51,12 +51,12 @@ void scratch_cuda_propagate_single_carry_kb_64_inplace(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
|
||||
uint32_t uses_carry, bool allocate_gpu_memory) {
|
||||
uint32_t uses_carry, bool allocate_gpu_memory, bool allocate_ms_array) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
big_lwe_dimension, small_lwe_dimension, ks_level,
|
||||
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
|
||||
message_modulus, carry_modulus);
|
||||
message_modulus, carry_modulus, allocate_ms_array);
|
||||
|
||||
scratch_cuda_propagate_single_carry_kb_inplace<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
@@ -71,12 +71,12 @@ void scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
|
||||
uint32_t uses_carry, bool allocate_gpu_memory) {
|
||||
uint32_t uses_carry, bool allocate_gpu_memory, bool allocate_ms_array) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
big_lwe_dimension, small_lwe_dimension, ks_level,
|
||||
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
|
||||
message_modulus, carry_modulus);
|
||||
message_modulus, carry_modulus, allocate_ms_array);
|
||||
|
||||
scratch_cuda_propagate_single_carry_kb_inplace<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
@@ -91,12 +91,12 @@ void scratch_cuda_integer_overflowing_sub_kb_64_inplace(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t compute_overflow,
|
||||
bool allocate_gpu_memory) {
|
||||
bool allocate_gpu_memory, bool allocate_ms_array) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
big_lwe_dimension, small_lwe_dimension, ks_level,
|
||||
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
|
||||
message_modulus, carry_modulus);
|
||||
message_modulus, carry_modulus, allocate_ms_array);
|
||||
|
||||
scratch_cuda_integer_overflowing_sub<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
@@ -108,12 +108,14 @@ void cuda_propagate_single_carry_kb_64_inplace(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *lwe_array, CudaRadixCiphertextFFI *carry_out,
|
||||
const CudaRadixCiphertextFFI *carry_in, int8_t *mem_ptr, void *const *bsks,
|
||||
void *const *ksks, uint32_t requested_flag, uint32_t uses_carry) {
|
||||
void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t requested_flag, uint32_t uses_carry) {
|
||||
|
||||
host_propagate_single_carry<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array, carry_out,
|
||||
carry_in, (int_sc_prop_memory<uint64_t> *)mem_ptr, bsks,
|
||||
(uint64_t **)(ksks), requested_flag, uses_carry);
|
||||
(uint64_t **)(ksks), ms_noise_reduction_key, requested_flag, uses_carry);
|
||||
}
|
||||
|
||||
void cuda_add_and_propagate_single_carry_kb_64_inplace(
|
||||
@@ -121,12 +123,13 @@ void cuda_add_and_propagate_single_carry_kb_64_inplace(
|
||||
CudaRadixCiphertextFFI *lhs_array, const CudaRadixCiphertextFFI *rhs_array,
|
||||
CudaRadixCiphertextFFI *carry_out, const CudaRadixCiphertextFFI *carry_in,
|
||||
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t requested_flag, uint32_t uses_carry) {
|
||||
|
||||
host_add_and_propagate_single_carry<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lhs_array, rhs_array,
|
||||
carry_out, carry_in, (int_sc_prop_memory<uint64_t> *)mem_ptr, bsks,
|
||||
(uint64_t **)(ksks), requested_flag, uses_carry);
|
||||
(uint64_t **)(ksks), ms_noise_reduction_key, requested_flag, uses_carry);
|
||||
}
|
||||
|
||||
void cuda_integer_overflowing_sub_kb_64_inplace(
|
||||
@@ -134,14 +137,15 @@ void cuda_integer_overflowing_sub_kb_64_inplace(
|
||||
CudaRadixCiphertextFFI *lhs_array, const CudaRadixCiphertextFFI *rhs_array,
|
||||
CudaRadixCiphertextFFI *overflow_block,
|
||||
const CudaRadixCiphertextFFI *input_borrow, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks, uint32_t compute_overflow,
|
||||
uint32_t uses_input_borrow) {
|
||||
void *const *bsks, void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t compute_overflow, uint32_t uses_input_borrow) {
|
||||
|
||||
host_integer_overflowing_sub<uint64_t>(
|
||||
(cudaStream_t const *)streams, gpu_indexes, gpu_count, lhs_array,
|
||||
lhs_array, rhs_array, overflow_block, input_borrow,
|
||||
(int_borrow_prop_memory<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks,
|
||||
compute_overflow, uses_input_borrow);
|
||||
ms_noise_reduction_key, compute_overflow, uses_input_borrow);
|
||||
}
|
||||
|
||||
void cleanup_cuda_propagate_single_carry(void *const *streams,
|
||||
@@ -177,12 +181,13 @@ void scratch_cuda_apply_univariate_lut_kb_64(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_radix_blocks,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
uint64_t lut_degree, bool allocate_gpu_memory) {
|
||||
uint64_t lut_degree, bool allocate_gpu_memory, bool allocate_ms_array) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
glwe_dimension * polynomial_size, lwe_dimension,
|
||||
ks_level, ks_base_log, pbs_level, pbs_base_log,
|
||||
grouping_factor, message_modulus, carry_modulus);
|
||||
grouping_factor, message_modulus, carry_modulus,
|
||||
allocate_ms_array);
|
||||
|
||||
scratch_cuda_apply_univariate_lut_kb<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
@@ -198,12 +203,14 @@ void scratch_cuda_apply_many_univariate_lut_kb_64(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_radix_blocks,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
uint32_t num_many_lut, uint64_t lut_degree, bool allocate_gpu_memory) {
|
||||
uint32_t num_many_lut, uint64_t lut_degree, bool allocate_gpu_memory,
|
||||
bool allocate_ms_array) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
glwe_dimension * polynomial_size, lwe_dimension,
|
||||
ks_level, ks_base_log, pbs_level, pbs_base_log,
|
||||
grouping_factor, message_modulus, carry_modulus);
|
||||
grouping_factor, message_modulus, carry_modulus,
|
||||
allocate_ms_array);
|
||||
|
||||
scratch_cuda_apply_many_univariate_lut_kb<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
@@ -216,12 +223,14 @@ void cuda_apply_univariate_lut_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *output_radix_lwe,
|
||||
CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
|
||||
void *const *ksks, void *const *bsks) {
|
||||
void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
void *const *bsks) {
|
||||
|
||||
host_apply_univariate_lut_kb<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, output_radix_lwe,
|
||||
input_radix_lwe, (int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks),
|
||||
bsks);
|
||||
ms_noise_reduction_key, bsks);
|
||||
}
|
||||
|
||||
void cleanup_cuda_apply_univariate_lut_kb_64(void *const *streams,
|
||||
@@ -236,13 +245,14 @@ void cuda_apply_many_univariate_lut_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *output_radix_lwe,
|
||||
CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
|
||||
void *const *ksks, void *const *bsks, uint32_t num_many_lut,
|
||||
uint32_t lut_stride) {
|
||||
void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
void *const *bsks, uint32_t num_many_lut, uint32_t lut_stride) {
|
||||
|
||||
host_apply_many_univariate_lut_kb<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, output_radix_lwe,
|
||||
input_radix_lwe, (int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks),
|
||||
bsks, num_many_lut, lut_stride);
|
||||
ms_noise_reduction_key, bsks, num_many_lut, lut_stride);
|
||||
}
|
||||
|
||||
void scratch_cuda_apply_bivariate_lut_kb_64(
|
||||
@@ -252,12 +262,13 @@ void scratch_cuda_apply_bivariate_lut_kb_64(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_radix_blocks,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
uint64_t lut_degree, bool allocate_gpu_memory) {
|
||||
uint64_t lut_degree, bool allocate_gpu_memory, bool allocate_ms_array) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
glwe_dimension * polynomial_size, lwe_dimension,
|
||||
ks_level, ks_base_log, pbs_level, pbs_base_log,
|
||||
grouping_factor, message_modulus, carry_modulus);
|
||||
grouping_factor, message_modulus, carry_modulus,
|
||||
allocate_ms_array);
|
||||
|
||||
scratch_cuda_apply_bivariate_lut_kb<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
@@ -271,13 +282,15 @@ void cuda_apply_bivariate_lut_kb_64(
|
||||
CudaRadixCiphertextFFI *output_radix_lwe,
|
||||
CudaRadixCiphertextFFI const *input_radix_lwe_1,
|
||||
CudaRadixCiphertextFFI const *input_radix_lwe_2, int8_t *mem_ptr,
|
||||
void *const *ksks, void *const *bsks, uint32_t num_radix_blocks,
|
||||
uint32_t shift) {
|
||||
void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
void *const *bsks, uint32_t num_radix_blocks, uint32_t shift) {
|
||||
|
||||
host_apply_bivariate_lut_kb<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, output_radix_lwe,
|
||||
input_radix_lwe_1, input_radix_lwe_2, (int_radix_lut<uint64_t> *)mem_ptr,
|
||||
(uint64_t **)(ksks), bsks, num_radix_blocks, shift);
|
||||
(uint64_t **)(ksks), ms_noise_reduction_key, bsks, num_radix_blocks,
|
||||
shift);
|
||||
}
|
||||
|
||||
void cleanup_cuda_apply_bivariate_lut_kb_64(void *const *streams,
|
||||
@@ -295,12 +308,13 @@ void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_radix_blocks,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
uint64_t lut_degree, bool allocate_gpu_memory) {
|
||||
uint64_t lut_degree, bool allocate_gpu_memory, bool allocate_ms_array) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
glwe_dimension * polynomial_size, lwe_dimension,
|
||||
ks_level, ks_base_log, pbs_level, pbs_base_log,
|
||||
grouping_factor, message_modulus, carry_modulus);
|
||||
grouping_factor, message_modulus, carry_modulus,
|
||||
allocate_ms_array);
|
||||
|
||||
scratch_cuda_apply_bivariate_lut_kb<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
@@ -313,12 +327,14 @@ void cuda_integer_compute_prefix_sum_hillis_steele_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *output_radix_lwe,
|
||||
CudaRadixCiphertextFFI *generates_or_propagates, int8_t *mem_ptr,
|
||||
void *const *ksks, void *const *bsks, uint32_t num_radix_blocks) {
|
||||
void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
void *const *bsks, uint32_t num_radix_blocks) {
|
||||
|
||||
host_compute_prefix_sum_hillis_steele<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, output_radix_lwe,
|
||||
generates_or_propagates, (int_radix_lut<uint64_t> *)mem_ptr, bsks,
|
||||
(uint64_t **)(ksks), num_radix_blocks);
|
||||
(uint64_t **)(ksks), ms_noise_reduction_key, num_radix_blocks);
|
||||
}
|
||||
|
||||
void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
|
||||
|
||||
@@ -496,7 +496,9 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in, void *const *bsks,
|
||||
Torus *const *ksks, int_radix_lut<Torus> *lut, uint32_t num_radix_blocks) {
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
int_radix_lut<Torus> *lut, uint32_t num_radix_blocks) {
|
||||
// apply_lookup_table
|
||||
auto params = lut->params;
|
||||
auto pbs_type = params.pbs_type;
|
||||
@@ -544,10 +546,10 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
|
||||
execute_pbs_async<Torus>(
|
||||
streams, gpu_indexes, 1, (Torus *)lwe_array_out->ptr,
|
||||
lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec,
|
||||
lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks, lut->buffer,
|
||||
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
|
||||
pbs_level, grouping_factor, num_radix_blocks, pbs_type, num_many_lut,
|
||||
lut_stride);
|
||||
lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks,
|
||||
ms_noise_reduction_key, lut->buffer, glwe_dimension,
|
||||
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
|
||||
grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride);
|
||||
} else {
|
||||
/// Make sure all data that should be on GPU 0 is indeed there
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
@@ -572,10 +574,10 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
|
||||
execute_pbs_async<Torus>(
|
||||
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
|
||||
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
|
||||
lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
|
||||
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
|
||||
pbs_level, grouping_factor, num_radix_blocks, pbs_type, num_many_lut,
|
||||
lut_stride);
|
||||
lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, ms_noise_reduction_key,
|
||||
lut->buffer, glwe_dimension, small_lwe_dimension, polynomial_size,
|
||||
pbs_base_log, pbs_level, grouping_factor, num_radix_blocks, pbs_type,
|
||||
num_many_lut, lut_stride);
|
||||
|
||||
/// Copy data back to GPU 0 and release vecs
|
||||
multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
|
||||
@@ -601,8 +603,9 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in, void *const *bsks,
|
||||
Torus *const *ksks, int_radix_lut<Torus> *lut, uint32_t num_many_lut,
|
||||
uint32_t lut_stride) {
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
int_radix_lut<Torus> *lut, uint32_t num_many_lut, uint32_t lut_stride) {
|
||||
// apply_lookup_table
|
||||
auto params = lut->params;
|
||||
auto pbs_type = params.pbs_type;
|
||||
@@ -646,10 +649,10 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
|
||||
execute_pbs_async<Torus>(
|
||||
streams, gpu_indexes, 1, (Torus *)lwe_array_out->ptr,
|
||||
lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec,
|
||||
lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks, lut->buffer,
|
||||
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
|
||||
pbs_level, grouping_factor, num_radix_blocks, pbs_type, num_many_lut,
|
||||
lut_stride);
|
||||
lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks,
|
||||
ms_noise_reduction_key, lut->buffer, glwe_dimension,
|
||||
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
|
||||
grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride);
|
||||
} else {
|
||||
/// Make sure all data that should be on GPU 0 is indeed there
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
@@ -674,10 +677,10 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
|
||||
execute_pbs_async<Torus>(
|
||||
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
|
||||
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
|
||||
lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
|
||||
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
|
||||
pbs_level, grouping_factor, num_radix_blocks, pbs_type, num_many_lut,
|
||||
lut_stride);
|
||||
lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, ms_noise_reduction_key,
|
||||
lut->buffer, glwe_dimension, small_lwe_dimension, polynomial_size,
|
||||
pbs_base_log, pbs_level, grouping_factor, num_radix_blocks, pbs_type,
|
||||
num_many_lut, lut_stride);
|
||||
|
||||
/// Copy data back to GPU 0 and release vecs
|
||||
multi_gpu_gather_many_lut_lwe_async<Torus>(
|
||||
@@ -704,8 +707,9 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_1,
|
||||
CudaRadixCiphertextFFI const *lwe_array_2, void *const *bsks,
|
||||
Torus *const *ksks, int_radix_lut<Torus> *lut, uint32_t num_radix_blocks,
|
||||
uint32_t shift) {
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
int_radix_lut<Torus> *lut, uint32_t num_radix_blocks, uint32_t shift) {
|
||||
|
||||
if (lwe_array_out->lwe_dimension != lwe_array_1->lwe_dimension ||
|
||||
lwe_array_out->lwe_dimension != lwe_array_2->lwe_dimension)
|
||||
@@ -764,10 +768,10 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
|
||||
execute_pbs_async<Torus>(
|
||||
streams, gpu_indexes, 1, (Torus *)(lwe_array_out->ptr),
|
||||
lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec,
|
||||
lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks, lut->buffer,
|
||||
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
|
||||
pbs_level, grouping_factor, num_radix_blocks, pbs_type, num_many_lut,
|
||||
lut_stride);
|
||||
lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks,
|
||||
ms_noise_reduction_key, lut->buffer, glwe_dimension,
|
||||
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
|
||||
grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride);
|
||||
} else {
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
multi_gpu_scatter_lwe_async<Torus>(
|
||||
@@ -788,10 +792,10 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
|
||||
execute_pbs_async<Torus>(
|
||||
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
|
||||
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
|
||||
lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
|
||||
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
|
||||
pbs_level, grouping_factor, num_radix_blocks, pbs_type, num_many_lut,
|
||||
lut_stride);
|
||||
lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, ms_noise_reduction_key,
|
||||
lut->buffer, glwe_dimension, small_lwe_dimension, polynomial_size,
|
||||
pbs_base_log, pbs_level, grouping_factor, num_radix_blocks, pbs_type,
|
||||
num_many_lut, lut_stride);
|
||||
|
||||
/// Copy data back to GPU 0 and release vecs
|
||||
multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
|
||||
@@ -1138,7 +1142,9 @@ void host_compute_shifted_blocks_and_states(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array,
|
||||
int_shifted_blocks_and_states_memory<Torus> *mem, void *const *bsks,
|
||||
Torus *const *ksks, uint32_t lut_stride, uint32_t num_many_lut) {
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t lut_stride, uint32_t num_many_lut) {
|
||||
|
||||
auto num_radix_blocks = lwe_array->num_radix_blocks;
|
||||
|
||||
@@ -1147,7 +1153,8 @@ void host_compute_shifted_blocks_and_states(
|
||||
|
||||
integer_radix_apply_many_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, shifted_blocks_and_states, lwe_array,
|
||||
bsks, ksks, luts_array_first_step, num_many_lut, lut_stride);
|
||||
bsks, ksks, ms_noise_reduction_key, luts_array_first_step, num_many_lut,
|
||||
lut_stride);
|
||||
|
||||
auto shifted_blocks = mem->shifted_blocks;
|
||||
auto block_states = mem->block_states;
|
||||
@@ -1166,7 +1173,9 @@ void host_resolve_group_carries_sequentially(
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *resolved_carries,
|
||||
CudaRadixCiphertextFFI *grouping_pgns, int_radix_params params,
|
||||
int_seq_group_prop_memory<Torus> *mem, void *const *bsks,
|
||||
Torus *const *ksks, uint32_t num_groups) {
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t num_groups) {
|
||||
|
||||
auto group_resolved_carries = mem->group_resolved_carries;
|
||||
if (num_groups > 1) {
|
||||
@@ -1215,8 +1224,8 @@ void host_resolve_group_carries_sequentially(
|
||||
blocks_to_solve + 1);
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, &shifted_group_resolved_carries,
|
||||
&shifted_group_resolved_carries, bsks, ksks, luts_sequential,
|
||||
blocks_to_solve);
|
||||
&shifted_group_resolved_carries, bsks, ksks, ms_noise_reduction_key,
|
||||
luts_sequential, blocks_to_solve);
|
||||
|
||||
// Copy the result to the resolved carries array
|
||||
copy_radix_ciphertext_slice_async<Torus>(
|
||||
@@ -1234,7 +1243,9 @@ void host_compute_prefix_sum_hillis_steele(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *step_output,
|
||||
CudaRadixCiphertextFFI *generates_or_propagates, int_radix_lut<Torus> *luts,
|
||||
void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks) {
|
||||
void *const *bsks, Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t num_radix_blocks) {
|
||||
|
||||
if (step_output->lwe_dimension != generates_or_propagates->lwe_dimension)
|
||||
PANIC("Cuda error: input lwe dimensions must be the same")
|
||||
@@ -1257,7 +1268,8 @@ void host_compute_prefix_sum_hillis_steele(
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, &cur_blocks, &cur_blocks, prev_blocks,
|
||||
bsks, ksks, luts, cur_total_blocks, luts->params.message_modulus);
|
||||
bsks, ksks, ms_noise_reduction_key, luts, cur_total_blocks,
|
||||
luts->params.message_modulus);
|
||||
|
||||
copy_radix_ciphertext_slice_async<Torus>(
|
||||
streams[0], gpu_indexes[0], generates_or_propagates, space,
|
||||
@@ -1278,8 +1290,9 @@ void host_compute_propagation_simulators_and_group_carries(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *block_states,
|
||||
int_radix_params params, int_prop_simu_group_carries_memory<Torus> *mem,
|
||||
void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks,
|
||||
uint32_t num_groups) {
|
||||
void *const *bsks, Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t num_radix_blocks, uint32_t num_groups) {
|
||||
|
||||
if (num_radix_blocks > block_states->num_radix_blocks)
|
||||
PANIC("Cuda error: input does not have enough radix blocks")
|
||||
@@ -1296,8 +1309,8 @@ void host_compute_propagation_simulators_and_group_carries(
|
||||
auto luts_array_second_step = mem->luts_array_second_step;
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, propagation_cum_sums,
|
||||
propagation_cum_sums, bsks, ksks, luts_array_second_step,
|
||||
num_radix_blocks);
|
||||
propagation_cum_sums, bsks, ksks, ms_noise_reduction_key,
|
||||
luts_array_second_step, num_radix_blocks);
|
||||
|
||||
host_integer_radix_scalar_addition_inplace<Torus>(
|
||||
streams, gpu_indexes, gpu_count, propagation_cum_sums,
|
||||
@@ -1318,7 +1331,8 @@ void host_compute_propagation_simulators_and_group_carries(
|
||||
// Resolve group carries sequentially
|
||||
host_resolve_group_carries_sequentially(
|
||||
streams, gpu_indexes, gpu_count, resolved_carries, grouping_pgns,
|
||||
params, mem->seq_group_prop_mem, bsks, ksks, num_groups);
|
||||
params, mem->seq_group_prop_mem, bsks, ksks, ms_noise_reduction_key,
|
||||
num_groups);
|
||||
} else {
|
||||
// Resolve group carries with hillis steele
|
||||
auto luts_carry_propagation_sum = mem->hs_group_prop_mem->lut_hillis_steele;
|
||||
@@ -1327,7 +1341,8 @@ void host_compute_propagation_simulators_and_group_carries(
|
||||
resolved_carries, 1, num_groups);
|
||||
host_compute_prefix_sum_hillis_steele<Torus>(
|
||||
streams, gpu_indexes, gpu_count, &shifted_resolved_carries,
|
||||
grouping_pgns, luts_carry_propagation_sum, bsks, ksks, num_groups - 1);
|
||||
grouping_pgns, luts_carry_propagation_sum, bsks, ksks,
|
||||
ms_noise_reduction_key, num_groups - 1);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1342,7 +1357,9 @@ void host_compute_shifted_blocks_and_borrow_states(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array,
|
||||
int_shifted_blocks_and_borrow_states_memory<Torus> *mem, void *const *bsks,
|
||||
Torus *const *ksks, uint32_t lut_stride, uint32_t num_many_lut) {
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t lut_stride, uint32_t num_many_lut) {
|
||||
auto num_radix_blocks = lwe_array->num_radix_blocks;
|
||||
|
||||
auto shifted_blocks_and_borrow_states = mem->shifted_blocks_and_borrow_states;
|
||||
@@ -1350,7 +1367,8 @@ void host_compute_shifted_blocks_and_borrow_states(
|
||||
|
||||
integer_radix_apply_many_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, shifted_blocks_and_borrow_states,
|
||||
lwe_array, bsks, ksks, luts_array_first_step, num_many_lut, lut_stride);
|
||||
lwe_array, bsks, ksks, ms_noise_reduction_key, luts_array_first_step,
|
||||
num_many_lut, lut_stride);
|
||||
|
||||
auto shifted_blocks = mem->shifted_blocks;
|
||||
auto borrow_states = mem->borrow_states;
|
||||
@@ -1371,13 +1389,12 @@ void host_compute_shifted_blocks_and_borrow_states(
|
||||
* have size = 2 * (glwe_dimension * polynomial_size + 1) * sizeof(Torus)
|
||||
*/
|
||||
template <typename Torus>
|
||||
void host_full_propagate_inplace(cudaStream_t const *streams,
|
||||
uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *input_blocks,
|
||||
int_fullprop_buffer<Torus> *mem_ptr,
|
||||
Torus *const *ksks, void *const *bsks,
|
||||
uint32_t num_blocks) {
|
||||
void host_full_propagate_inplace(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *input_blocks,
|
||||
int_fullprop_buffer<Torus> *mem_ptr, Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
void *const *bsks, uint32_t num_blocks) {
|
||||
auto params = mem_ptr->lut->params;
|
||||
|
||||
int big_lwe_size = (params.glwe_dimension * params.polynomial_size + 1);
|
||||
@@ -1406,8 +1423,8 @@ void host_full_propagate_inplace(cudaStream_t const *streams,
|
||||
mem_ptr->lut->lwe_trivial_indexes, mem_ptr->lut->lut_vec,
|
||||
mem_ptr->lut->lut_indexes_vec,
|
||||
(Torus *)mem_ptr->tmp_small_lwe_vector->ptr,
|
||||
mem_ptr->lut->lwe_trivial_indexes, bsks, mem_ptr->lut->buffer,
|
||||
params.glwe_dimension, params.small_lwe_dimension,
|
||||
mem_ptr->lut->lwe_trivial_indexes, bsks, ms_noise_reduction_key,
|
||||
mem_ptr->lut->buffer, params.glwe_dimension, params.small_lwe_dimension,
|
||||
params.polynomial_size, params.pbs_base_log, params.pbs_level,
|
||||
params.grouping_factor, 2, params.pbs_type, num_many_lut, lut_stride);
|
||||
|
||||
@@ -1535,13 +1552,14 @@ __host__ void scalar_pack_blocks(cudaStream_t stream, uint32_t gpu_index,
|
||||
* * (lwe_dimension+1) * sizeeof(Torus) bytes
|
||||
*/
|
||||
template <typename Torus>
|
||||
__host__ void
|
||||
extract_n_bits(cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
const CudaRadixCiphertextFFI *lwe_array_in, void *const *bsks,
|
||||
Torus *const *ksks, uint32_t effective_num_radix_blocks,
|
||||
uint32_t num_radix_blocks,
|
||||
int_bit_extract_luts_buffer<Torus> *bit_extract) {
|
||||
__host__ void extract_n_bits(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
const CudaRadixCiphertextFFI *lwe_array_in, void *const *bsks,
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t effective_num_radix_blocks, uint32_t num_radix_blocks,
|
||||
int_bit_extract_luts_buffer<Torus> *bit_extract) {
|
||||
|
||||
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
|
||||
lwe_array_out, 0, num_radix_blocks,
|
||||
@@ -1555,17 +1573,19 @@ extract_n_bits(cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
}
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_out, bsks, ksks,
|
||||
bit_extract->lut, effective_num_radix_blocks);
|
||||
ms_noise_reduction_key, bit_extract->lut, effective_num_radix_blocks);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void
|
||||
reduce_signs(cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *signs_array_out,
|
||||
CudaRadixCiphertextFFI *signs_array_in,
|
||||
int_comparison_buffer<Torus> *mem_ptr,
|
||||
std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
|
||||
Torus *const *ksks, uint32_t num_sign_blocks) {
|
||||
__host__ void reduce_signs(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *signs_array_out,
|
||||
CudaRadixCiphertextFFI *signs_array_in,
|
||||
int_comparison_buffer<Torus> *mem_ptr,
|
||||
std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t num_sign_blocks) {
|
||||
|
||||
if (signs_array_out->lwe_dimension != signs_array_in->lwe_dimension)
|
||||
PANIC("Cuda error: input lwe dimensions must be the same")
|
||||
@@ -1607,8 +1627,8 @@ reduce_signs(cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
pack_blocks<Torus>(streams[0], gpu_indexes[0], signs_b, signs_a,
|
||||
num_sign_blocks, 4);
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, signs_a, signs_b, bsks, ksks, lut,
|
||||
num_sign_blocks / 2);
|
||||
streams, gpu_indexes, gpu_count, signs_a, signs_b, bsks, ksks,
|
||||
ms_noise_reduction_key, lut, num_sign_blocks / 2);
|
||||
|
||||
if (num_sign_blocks % 2 == 1)
|
||||
copy_radix_ciphertext_slice_async<Torus>(
|
||||
@@ -1637,7 +1657,7 @@ reduce_signs(cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
pack_blocks<Torus>(streams[0], gpu_indexes[0], signs_b, signs_a, 2, 4);
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, signs_array_out, signs_b, bsks, ksks,
|
||||
lut, 1);
|
||||
ms_noise_reduction_key, lut, 1);
|
||||
|
||||
} else {
|
||||
|
||||
@@ -1655,7 +1675,7 @@ reduce_signs(cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, signs_array_out, signs_a, bsks, ksks,
|
||||
lut, 1);
|
||||
ms_noise_reduction_key, lut, 1);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1679,17 +1699,17 @@ void scratch_cuda_apply_univariate_lut_kb(
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
void host_apply_univariate_lut_kb(cudaStream_t const *streams,
|
||||
uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
CudaRadixCiphertextFFI const *radix_lwe_in,
|
||||
int_radix_lut<Torus> *mem, Torus *const *ksks,
|
||||
void *const *bsks) {
|
||||
void host_apply_univariate_lut_kb(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
CudaRadixCiphertextFFI const *radix_lwe_in, int_radix_lut<Torus> *mem,
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
void *const *bsks) {
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, radix_lwe_out, radix_lwe_in, bsks, ksks,
|
||||
mem, radix_lwe_out->num_radix_blocks);
|
||||
ms_noise_reduction_key, mem, radix_lwe_out->num_radix_blocks);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
@@ -1717,12 +1737,13 @@ void host_apply_many_univariate_lut_kb(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
CudaRadixCiphertextFFI const *radix_lwe_in, int_radix_lut<Torus> *mem,
|
||||
Torus *const *ksks, void *const *bsks, uint32_t num_many_lut,
|
||||
uint32_t lut_stride) {
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
void *const *bsks, uint32_t num_many_lut, uint32_t lut_stride) {
|
||||
|
||||
integer_radix_apply_many_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, radix_lwe_out, radix_lwe_in, bsks, ksks,
|
||||
mem, num_many_lut, lut_stride);
|
||||
ms_noise_reduction_key, mem, num_many_lut, lut_stride);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
@@ -1745,19 +1766,19 @@ void scratch_cuda_apply_bivariate_lut_kb(
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
void host_apply_bivariate_lut_kb(cudaStream_t const *streams,
|
||||
uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
CudaRadixCiphertextFFI const *radix_lwe_in_1,
|
||||
CudaRadixCiphertextFFI const *radix_lwe_in_2,
|
||||
int_radix_lut<Torus> *mem, Torus *const *ksks,
|
||||
void *const *bsks, uint32_t num_radix_blocks,
|
||||
uint32_t shift) {
|
||||
void host_apply_bivariate_lut_kb(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
CudaRadixCiphertextFFI const *radix_lwe_in_1,
|
||||
CudaRadixCiphertextFFI const *radix_lwe_in_2, int_radix_lut<Torus> *mem,
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
void *const *bsks, uint32_t num_radix_blocks, uint32_t shift) {
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, radix_lwe_out, radix_lwe_in_1,
|
||||
radix_lwe_in_2, bsks, ksks, mem, num_radix_blocks, shift);
|
||||
radix_lwe_in_2, bsks, ksks, ms_noise_reduction_key, mem, num_radix_blocks,
|
||||
shift);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
@@ -1774,15 +1795,14 @@ void scratch_cuda_propagate_single_carry_kb_inplace(
|
||||
// This function perform the three steps of Thomas' new carry propagation
|
||||
// includes the logic to extract overflow when requested
|
||||
template <typename Torus>
|
||||
void host_propagate_single_carry(cudaStream_t const *streams,
|
||||
uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *lwe_array,
|
||||
CudaRadixCiphertextFFI *carry_out,
|
||||
const CudaRadixCiphertextFFI *input_carries,
|
||||
int_sc_prop_memory<Torus> *mem,
|
||||
void *const *bsks, Torus *const *ksks,
|
||||
uint32_t requested_flag, uint32_t uses_carry) {
|
||||
void host_propagate_single_carry(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array,
|
||||
CudaRadixCiphertextFFI *carry_out,
|
||||
const CudaRadixCiphertextFFI *input_carries, int_sc_prop_memory<Torus> *mem,
|
||||
void *const *bsks, Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t requested_flag, uint32_t uses_carry) {
|
||||
auto num_radix_blocks = lwe_array->num_radix_blocks;
|
||||
auto params = mem->params;
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
@@ -1806,7 +1826,7 @@ void host_propagate_single_carry(cudaStream_t const *streams,
|
||||
// Step 1
|
||||
host_compute_shifted_blocks_and_states<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array, mem->shifted_blocks_state_mem,
|
||||
bsks, ksks, lut_stride, num_many_lut);
|
||||
bsks, ksks, ms_noise_reduction_key, lut_stride, num_many_lut);
|
||||
auto block_states = mem->shifted_blocks_state_mem->block_states;
|
||||
|
||||
if (requested_flag == outputFlag::FLAG_CARRY) {
|
||||
@@ -1817,8 +1837,8 @@ void host_propagate_single_carry(cudaStream_t const *streams,
|
||||
// Step 2
|
||||
host_compute_propagation_simulators_and_group_carries<Torus>(
|
||||
streams, gpu_indexes, gpu_count, block_states, params,
|
||||
mem->prop_simu_group_carries_mem, bsks, ksks, num_radix_blocks,
|
||||
mem->num_groups);
|
||||
mem->prop_simu_group_carries_mem, bsks, ksks, ms_noise_reduction_key,
|
||||
num_radix_blocks, mem->num_groups);
|
||||
|
||||
auto group_size = mem->prop_simu_group_carries_mem->group_size;
|
||||
|
||||
@@ -1856,7 +1876,8 @@ void host_propagate_single_carry(cudaStream_t const *streams,
|
||||
num_radix_blocks + 1, &output_flag, 0, 1);
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, mem->output_flag, prepared_blocks,
|
||||
bsks, ksks, mem->lut_message_extract, num_radix_blocks + 1);
|
||||
bsks, ksks, ms_noise_reduction_key, mem->lut_message_extract,
|
||||
num_radix_blocks + 1);
|
||||
|
||||
copy_radix_ciphertext_slice_async<Torus>(
|
||||
streams[0], gpu_indexes[0], lwe_array, 0, num_radix_blocks,
|
||||
@@ -1868,7 +1889,7 @@ void host_propagate_single_carry(cudaStream_t const *streams,
|
||||
auto message_extract = mem->lut_message_extract;
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array, prepared_blocks, bsks, ksks,
|
||||
message_extract, num_radix_blocks);
|
||||
ms_noise_reduction_key, message_extract, num_radix_blocks);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1880,8 +1901,9 @@ void host_add_and_propagate_single_carry(
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *lhs_array,
|
||||
const CudaRadixCiphertextFFI *rhs_array, CudaRadixCiphertextFFI *carry_out,
|
||||
const CudaRadixCiphertextFFI *input_carries, int_sc_prop_memory<Torus> *mem,
|
||||
void *const *bsks, Torus *const *ksks, uint32_t requested_flag,
|
||||
uint32_t uses_carry) {
|
||||
void *const *bsks, Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t requested_flag, uint32_t uses_carry) {
|
||||
if (lhs_array->num_radix_blocks != rhs_array->num_radix_blocks)
|
||||
PANIC("Cuda error: input and output num radix blocks must be the same")
|
||||
if (lhs_array->lwe_dimension != rhs_array->lwe_dimension ||
|
||||
@@ -1924,13 +1946,13 @@ void host_add_and_propagate_single_carry(
|
||||
// Step 1
|
||||
host_compute_shifted_blocks_and_states<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lhs_array, mem->shifted_blocks_state_mem,
|
||||
bsks, ksks, lut_stride, num_many_lut);
|
||||
bsks, ksks, ms_noise_reduction_key, lut_stride, num_many_lut);
|
||||
auto block_states = mem->shifted_blocks_state_mem->block_states;
|
||||
if (requested_flag == outputFlag::FLAG_OVERFLOW) {
|
||||
auto lut_overflow_prep = mem->lut_overflow_flag_prep;
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, &output_flag, mem->last_lhs,
|
||||
mem->last_rhs, bsks, ksks, lut_overflow_prep, 1,
|
||||
mem->last_rhs, bsks, ksks, ms_noise_reduction_key, lut_overflow_prep, 1,
|
||||
lut_overflow_prep->params.message_modulus);
|
||||
} else if (requested_flag == outputFlag::FLAG_CARRY) {
|
||||
copy_radix_ciphertext_slice_async<Torus>(
|
||||
@@ -1941,8 +1963,8 @@ void host_add_and_propagate_single_carry(
|
||||
// Step 2
|
||||
host_compute_propagation_simulators_and_group_carries<Torus>(
|
||||
streams, gpu_indexes, gpu_count, block_states, params,
|
||||
mem->prop_simu_group_carries_mem, bsks, ksks, num_radix_blocks,
|
||||
mem->num_groups);
|
||||
mem->prop_simu_group_carries_mem, bsks, ksks, ms_noise_reduction_key,
|
||||
num_radix_blocks, mem->num_groups);
|
||||
|
||||
auto group_size = mem->prop_simu_group_carries_mem->group_size;
|
||||
|
||||
@@ -1990,7 +2012,8 @@ void host_add_and_propagate_single_carry(
|
||||
num_radix_blocks + 1, &output_flag, 0, 1);
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, mem->output_flag, prepared_blocks,
|
||||
bsks, ksks, mem->lut_message_extract, num_radix_blocks + 1);
|
||||
bsks, ksks, ms_noise_reduction_key, mem->lut_message_extract,
|
||||
num_radix_blocks + 1);
|
||||
|
||||
copy_radix_ciphertext_slice_async<Torus>(
|
||||
streams[0], gpu_indexes[0], lhs_array, 0, num_radix_blocks,
|
||||
@@ -2001,7 +2024,7 @@ void host_add_and_propagate_single_carry(
|
||||
} else {
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lhs_array, prepared_blocks, bsks, ksks,
|
||||
mem->lut_message_extract, num_radix_blocks);
|
||||
ms_noise_reduction_key, mem->lut_message_extract, num_radix_blocks);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2026,6 +2049,7 @@ void host_single_borrow_propagate(
|
||||
CudaRadixCiphertextFFI *overflow_block,
|
||||
const CudaRadixCiphertextFFI *input_borrow,
|
||||
int_borrow_prop_memory<Torus> *mem, void *const *bsks, Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t num_groups, uint32_t compute_overflow,
|
||||
uint32_t uses_input_borrow) {
|
||||
auto num_radix_blocks = lwe_array->num_radix_blocks;
|
||||
@@ -2049,8 +2073,8 @@ void host_single_borrow_propagate(
|
||||
// Step 1
|
||||
host_compute_shifted_blocks_and_borrow_states<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array,
|
||||
mem->shifted_blocks_borrow_state_mem, bsks, ksks, lut_stride,
|
||||
num_many_lut);
|
||||
mem->shifted_blocks_borrow_state_mem, bsks, ksks, ms_noise_reduction_key,
|
||||
lut_stride, num_many_lut);
|
||||
|
||||
auto borrow_states = mem->shifted_blocks_borrow_state_mem->borrow_states;
|
||||
copy_radix_ciphertext_slice_async<Torus>(
|
||||
@@ -2060,8 +2084,8 @@ void host_single_borrow_propagate(
|
||||
// Step 2
|
||||
host_compute_propagation_simulators_and_group_carries<Torus>(
|
||||
streams, gpu_indexes, gpu_count, borrow_states, params,
|
||||
mem->prop_simu_group_carries_mem, bsks, ksks, num_radix_blocks,
|
||||
num_groups);
|
||||
mem->prop_simu_group_carries_mem, bsks, ksks, ms_noise_reduction_key,
|
||||
num_radix_blocks, num_groups);
|
||||
|
||||
auto shifted_blocks =
|
||||
(Torus *)mem->shifted_blocks_borrow_state_mem->shifted_blocks->ptr;
|
||||
@@ -2109,7 +2133,8 @@ void host_single_borrow_propagate(
|
||||
auto borrow_flag = mem->lut_borrow_flag;
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
mem->sub_streams_1, gpu_indexes, gpu_count, overflow_block,
|
||||
mem->overflow_block, bsks, ksks, borrow_flag, 1);
|
||||
mem->overflow_block, bsks, ksks, ms_noise_reduction_key, borrow_flag,
|
||||
1);
|
||||
}
|
||||
for (int j = 0; j < mem->active_gpu_count; j++) {
|
||||
cuda_event_record(mem->outgoing_events1[j], mem->sub_streams_1[j],
|
||||
@@ -2129,7 +2154,7 @@ void host_single_borrow_propagate(
|
||||
auto message_extract = mem->lut_message_extract;
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
mem->sub_streams_2, gpu_indexes, gpu_count, lwe_array, prepared_blocks,
|
||||
bsks, ksks, message_extract, num_radix_blocks);
|
||||
bsks, ksks, ms_noise_reduction_key, message_extract, num_radix_blocks);
|
||||
|
||||
for (int j = 0; j < mem->active_gpu_count; j++) {
|
||||
cuda_event_record(mem->outgoing_events2[j], mem->sub_streams_2[j],
|
||||
|
||||
@@ -72,12 +72,13 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log,
|
||||
uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
|
||||
uint32_t grouping_factor, uint32_t num_radix_blocks, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory) {
|
||||
bool allocate_gpu_memory, bool allocate_ms_array) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
polynomial_size * glwe_dimension, lwe_dimension,
|
||||
ks_level, ks_base_log, pbs_level, pbs_base_log,
|
||||
grouping_factor, message_modulus, carry_modulus);
|
||||
grouping_factor, message_modulus, carry_modulus,
|
||||
allocate_ms_array);
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
@@ -130,51 +131,59 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
CudaRadixCiphertextFFI const *radix_lwe_left, bool const is_bool_left,
|
||||
CudaRadixCiphertextFFI const *radix_lwe_right, bool const is_bool_right,
|
||||
void *const *bsks, void *const *ksks, int8_t *mem_ptr,
|
||||
uint32_t polynomial_size, uint32_t num_blocks) {
|
||||
void *const *bsks, void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
int8_t *mem_ptr, uint32_t polynomial_size, uint32_t num_blocks) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<256>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
|
||||
radix_lwe_left, is_bool_left, radix_lwe_right, is_bool_right, bsks,
|
||||
(uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
(uint64_t **)(ksks), ms_noise_reduction_key,
|
||||
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
break;
|
||||
case 512:
|
||||
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<512>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
|
||||
radix_lwe_left, is_bool_left, radix_lwe_right, is_bool_right, bsks,
|
||||
(uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
(uint64_t **)(ksks), ms_noise_reduction_key,
|
||||
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
break;
|
||||
case 1024:
|
||||
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<1024>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
|
||||
radix_lwe_left, is_bool_left, radix_lwe_right, is_bool_right, bsks,
|
||||
(uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
(uint64_t **)(ksks), ms_noise_reduction_key,
|
||||
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
break;
|
||||
case 2048:
|
||||
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<2048>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
|
||||
radix_lwe_left, is_bool_left, radix_lwe_right, is_bool_right, bsks,
|
||||
(uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
(uint64_t **)(ksks), ms_noise_reduction_key,
|
||||
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
break;
|
||||
case 4096:
|
||||
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<4096>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
|
||||
radix_lwe_left, is_bool_left, radix_lwe_right, is_bool_right, bsks,
|
||||
(uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
(uint64_t **)(ksks), ms_noise_reduction_key,
|
||||
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
break;
|
||||
case 8192:
|
||||
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<8192>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
|
||||
radix_lwe_left, is_bool_left, radix_lwe_right, is_bool_right, bsks,
|
||||
(uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
(uint64_t **)(ksks), ms_noise_reduction_key,
|
||||
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
break;
|
||||
case 16384:
|
||||
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<16384>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
|
||||
radix_lwe_left, is_bool_left, radix_lwe_right, is_bool_right, bsks,
|
||||
(uint64_t **)(ksks), (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
(uint64_t **)(ksks), ms_noise_reduction_key,
|
||||
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
|
||||
@@ -199,12 +208,13 @@ void scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory) {
|
||||
bool allocate_gpu_memory, bool allocate_ms_array) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
glwe_dimension * polynomial_size, lwe_dimension,
|
||||
ks_level, ks_base_log, pbs_level, pbs_base_log,
|
||||
grouping_factor, message_modulus, carry_modulus);
|
||||
grouping_factor, message_modulus, carry_modulus,
|
||||
allocate_ms_array);
|
||||
scratch_cuda_integer_partial_sum_ciphertexts_vec_kb<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
(int_sum_ciphertexts_vec_memory<uint64_t> **)mem_ptr, num_blocks_in_radix,
|
||||
@@ -215,7 +225,8 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
CudaRadixCiphertextFFI *radix_lwe_vec, int8_t *mem_ptr, void *const *bsks,
|
||||
void *const *ksks) {
|
||||
void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
|
||||
|
||||
auto mem = (int_sum_ciphertexts_vec_memory<uint64_t> *)mem_ptr;
|
||||
if (radix_lwe_vec->num_radix_blocks % radix_lwe_out->num_radix_blocks != 0)
|
||||
@@ -230,7 +241,7 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
case 512:
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<512>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
|
||||
radix_lwe_vec, bsks, (uint64_t **)(ksks), mem,
|
||||
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
|
||||
radix_lwe_out->num_radix_blocks,
|
||||
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
|
||||
nullptr);
|
||||
@@ -239,7 +250,7 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
|
||||
AmortizedDegree<1024>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
|
||||
radix_lwe_vec, bsks, (uint64_t **)(ksks), mem,
|
||||
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
|
||||
radix_lwe_out->num_radix_blocks,
|
||||
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
|
||||
nullptr);
|
||||
@@ -248,7 +259,7 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
|
||||
AmortizedDegree<2048>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
|
||||
radix_lwe_vec, bsks, (uint64_t **)(ksks), mem,
|
||||
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
|
||||
radix_lwe_out->num_radix_blocks,
|
||||
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
|
||||
nullptr);
|
||||
@@ -257,7 +268,7 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
|
||||
AmortizedDegree<4096>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
|
||||
radix_lwe_vec, bsks, (uint64_t **)(ksks), mem,
|
||||
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
|
||||
radix_lwe_out->num_radix_blocks,
|
||||
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
|
||||
nullptr);
|
||||
@@ -266,7 +277,7 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
|
||||
AmortizedDegree<8192>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
|
||||
radix_lwe_vec, bsks, (uint64_t **)(ksks), mem,
|
||||
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
|
||||
radix_lwe_out->num_radix_blocks,
|
||||
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
|
||||
nullptr);
|
||||
@@ -275,7 +286,7 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
|
||||
AmortizedDegree<16384>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
|
||||
radix_lwe_vec, bsks, (uint64_t **)(ksks), mem,
|
||||
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
|
||||
radix_lwe_out->num_radix_blocks,
|
||||
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
|
||||
nullptr);
|
||||
|
||||
@@ -184,6 +184,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
CudaRadixCiphertextFFI *terms, void *const *bsks, uint64_t *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr,
|
||||
uint32_t num_radix_blocks, uint32_t num_radix_in_vec,
|
||||
int_radix_lut<Torus> *reused_lut) {
|
||||
@@ -375,8 +376,8 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
|
||||
streams, gpu_indexes, 1, (Torus *)new_blocks->ptr, lwe_indexes_out,
|
||||
luts_message_carry->lut_vec, luts_message_carry->lut_indexes_vec,
|
||||
(Torus *)small_lwe_vector->ptr, lwe_indexes_in, bsks,
|
||||
luts_message_carry->buffer, glwe_dimension, small_lwe_dimension,
|
||||
polynomial_size, mem_ptr->params.pbs_base_log,
|
||||
ms_noise_reduction_key, luts_message_carry->buffer, glwe_dimension,
|
||||
small_lwe_dimension, polynomial_size, mem_ptr->params.pbs_base_log,
|
||||
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
|
||||
total_count, mem_ptr->params.pbs_type, num_many_lut, lut_stride);
|
||||
} else {
|
||||
@@ -422,11 +423,11 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
|
||||
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
|
||||
lwe_trivial_indexes_vec, luts_message_carry->lut_vec,
|
||||
luts_message_carry->lut_indexes_vec, small_lwe_vector_vec,
|
||||
lwe_trivial_indexes_vec, bsks, luts_message_carry->buffer,
|
||||
glwe_dimension, small_lwe_dimension, polynomial_size,
|
||||
mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
|
||||
mem_ptr->params.grouping_factor, total_count,
|
||||
mem_ptr->params.pbs_type, num_many_lut, lut_stride);
|
||||
lwe_trivial_indexes_vec, bsks, ms_noise_reduction_key,
|
||||
luts_message_carry->buffer, glwe_dimension, small_lwe_dimension,
|
||||
polynomial_size, mem_ptr->params.pbs_base_log,
|
||||
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
|
||||
total_count, mem_ptr->params.pbs_type, num_many_lut, lut_stride);
|
||||
|
||||
multi_gpu_gather_lwe_async<Torus>(
|
||||
streams, gpu_indexes, active_gpu_count, (Torus *)new_blocks->ptr,
|
||||
@@ -471,8 +472,9 @@ __host__ void host_integer_mult_radix_kb(
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
CudaRadixCiphertextFFI const *radix_lwe_left, bool const is_bool_left,
|
||||
CudaRadixCiphertextFFI const *radix_lwe_right, bool const is_bool_right,
|
||||
void *const *bsks, uint64_t *const *ksks, int_mul_memory<Torus> *mem_ptr,
|
||||
uint32_t num_blocks) {
|
||||
void *const *bsks, uint64_t *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
int_mul_memory<Torus> *mem_ptr, uint32_t num_blocks) {
|
||||
|
||||
if (radix_lwe_out->lwe_dimension != radix_lwe_left->lwe_dimension ||
|
||||
radix_lwe_right->lwe_dimension != radix_lwe_left->lwe_dimension)
|
||||
@@ -492,14 +494,16 @@ __host__ void host_integer_mult_radix_kb(
|
||||
if (is_bool_right) {
|
||||
zero_out_if<Torus>(streams, gpu_indexes, gpu_count, radix_lwe_out,
|
||||
radix_lwe_left, radix_lwe_right, mem_ptr->zero_out_mem,
|
||||
mem_ptr->zero_out_predicate_lut, bsks, ksks, num_blocks);
|
||||
mem_ptr->zero_out_predicate_lut, bsks, ksks,
|
||||
ms_noise_reduction_key, num_blocks);
|
||||
return;
|
||||
}
|
||||
|
||||
if (is_bool_left) {
|
||||
zero_out_if<Torus>(streams, gpu_indexes, gpu_count, radix_lwe_out,
|
||||
radix_lwe_right, radix_lwe_left, mem_ptr->zero_out_mem,
|
||||
mem_ptr->zero_out_predicate_lut, bsks, ksks, num_blocks);
|
||||
mem_ptr->zero_out_predicate_lut, bsks, ksks,
|
||||
ms_noise_reduction_key, num_blocks);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -573,8 +577,8 @@ __host__ void host_integer_mult_radix_kb(
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, block_mul_res, block_mul_res,
|
||||
vector_result_sb, bsks, ksks, luts_array, total_block_count,
|
||||
luts_array->params.message_modulus);
|
||||
vector_result_sb, bsks, ksks, ms_noise_reduction_key, luts_array,
|
||||
total_block_count, luts_array->params.message_modulus);
|
||||
|
||||
vector_result_lsb = block_mul_res;
|
||||
as_radix_ciphertext_slice<Torus>(&vector_result_msb, block_mul_res,
|
||||
@@ -602,8 +606,8 @@ __host__ void host_integer_mult_radix_kb(
|
||||
}
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<Torus, params>(
|
||||
streams, gpu_indexes, gpu_count, radix_lwe_out, vector_result_sb, bsks,
|
||||
ksks, mem_ptr->sum_ciphertexts_mem, num_blocks, 2 * num_blocks,
|
||||
mem_ptr->luts_array);
|
||||
ksks, ms_noise_reduction_key, mem_ptr->sum_ciphertexts_mem, num_blocks,
|
||||
2 * num_blocks, mem_ptr->luts_array);
|
||||
|
||||
uint32_t block_modulus = message_modulus * carry_modulus;
|
||||
uint32_t num_bits_in_block = log2_int(block_modulus);
|
||||
@@ -613,7 +617,8 @@ __host__ void host_integer_mult_radix_kb(
|
||||
uint32_t uses_carry = 0;
|
||||
host_propagate_single_carry<Torus>(
|
||||
streams, gpu_indexes, gpu_count, radix_lwe_out, nullptr, nullptr,
|
||||
scp_mem_ptr, bsks, ksks, requested_flag, uses_carry);
|
||||
scp_mem_ptr, bsks, ksks, ms_noise_reduction_key, requested_flag,
|
||||
uses_carry);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
|
||||
@@ -114,10 +114,12 @@ template <typename Torus>
|
||||
__host__ void scratch_cuda_integer_overflowing_sub_kb(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, int_overflowing_sub_memory<Torus> **mem_ptr,
|
||||
uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory) {
|
||||
uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory,
|
||||
bool allocate_ms_array) {
|
||||
|
||||
*mem_ptr = new int_overflowing_sub_memory<Torus>(
|
||||
streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory);
|
||||
streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory,
|
||||
allocate_ms_array);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
@@ -129,7 +131,9 @@ __host__ void host_integer_overflowing_sub(
|
||||
CudaRadixCiphertextFFI *overflow_block,
|
||||
const CudaRadixCiphertextFFI *input_borrow,
|
||||
int_borrow_prop_memory<uint64_t> *mem_ptr, void *const *bsks,
|
||||
Torus *const *ksks, uint32_t compute_overflow, uint32_t uses_input_borrow) {
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t compute_overflow, uint32_t uses_input_borrow) {
|
||||
|
||||
if (output->num_radix_blocks != input_left->num_radix_blocks ||
|
||||
output->num_radix_blocks != input_right->num_radix_blocks)
|
||||
@@ -160,7 +164,7 @@ __host__ void host_integer_overflowing_sub(
|
||||
host_single_borrow_propagate<Torus>(
|
||||
streams, gpu_indexes, gpu_count, output, overflow_block, input_borrow,
|
||||
(int_borrow_prop_memory<Torus> *)mem_ptr, bsks, (Torus **)(ksks),
|
||||
num_groups, compute_overflow, uses_input_borrow);
|
||||
ms_noise_reduction_key, num_groups, compute_overflow, uses_input_borrow);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -5,13 +5,15 @@ void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
|
||||
CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_input, void const *clear_blocks,
|
||||
void const *h_clear_blocks, uint32_t num_clear_blocks, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks) {
|
||||
void *const *bsks, void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
|
||||
|
||||
host_integer_radix_scalar_bitop_kb<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
|
||||
lwe_array_input, static_cast<const uint64_t *>(clear_blocks),
|
||||
static_cast<const uint64_t *>(h_clear_blocks), num_clear_blocks,
|
||||
(int_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks));
|
||||
(int_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
|
||||
ms_noise_reduction_key);
|
||||
}
|
||||
|
||||
void update_degrees_after_scalar_bitand(uint64_t *output_degrees,
|
||||
|
||||
@@ -10,7 +10,8 @@ __host__ void host_integer_radix_scalar_bitop_kb(
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *output,
|
||||
CudaRadixCiphertextFFI const *input, Torus const *clear_blocks,
|
||||
Torus const *h_clear_blocks, uint32_t num_clear_blocks,
|
||||
int_bitop_buffer<Torus> *mem_ptr, void *const *bsks, Torus *const *ksks) {
|
||||
int_bitop_buffer<Torus> *mem_ptr, void *const *bsks, Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
|
||||
|
||||
if (output->num_radix_blocks != input->num_radix_blocks)
|
||||
PANIC("Cuda error: input and output num radix blocks must be equal")
|
||||
@@ -49,8 +50,8 @@ __host__ void host_integer_radix_scalar_bitop_kb(
|
||||
lut->broadcast_lut(streams, gpu_indexes, 0);
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, output, input, bsks, ksks, lut,
|
||||
num_clear_blocks);
|
||||
streams, gpu_indexes, gpu_count, output, input, bsks, ksks,
|
||||
ms_noise_reduction_key, lut, num_clear_blocks);
|
||||
memcpy(output->degrees, degrees, num_clear_blocks * sizeof(uint64_t));
|
||||
|
||||
if (op == SCALAR_BITAND && num_clear_blocks < num_radix_blocks) {
|
||||
|
||||
@@ -36,7 +36,9 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
|
||||
CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in, void const *scalar_blocks,
|
||||
void const *h_scalar_blocks, int8_t *mem_ptr, void *const *bsks,
|
||||
void *const *ksks, uint32_t num_scalar_blocks) {
|
||||
void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t num_scalar_blocks) {
|
||||
|
||||
// The output ciphertext might be a boolean block or a radix ciphertext
|
||||
// depending on the case (eq/gt vs max/min) so the amount of blocks to
|
||||
@@ -50,7 +52,8 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
|
||||
host_integer_radix_scalar_equality_check_kb<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
|
||||
lwe_array_in, static_cast<const uint64_t *>(scalar_blocks), buffer,
|
||||
bsks, (uint64_t **)(ksks), num_radix_blocks, num_scalar_blocks);
|
||||
bsks, (uint64_t **)(ksks), ms_noise_reduction_key, num_radix_blocks,
|
||||
num_scalar_blocks);
|
||||
break;
|
||||
case GT:
|
||||
case GE:
|
||||
@@ -64,7 +67,7 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
|
||||
lwe_array_in, static_cast<const uint64_t *>(scalar_blocks),
|
||||
static_cast<const uint64_t *>(h_scalar_blocks), buffer,
|
||||
buffer->diff_buffer->operator_f, bsks, (uint64_t **)(ksks),
|
||||
num_radix_blocks, num_scalar_blocks);
|
||||
ms_noise_reduction_key, num_radix_blocks, num_scalar_blocks);
|
||||
break;
|
||||
case MAX:
|
||||
case MIN:
|
||||
@@ -75,7 +78,8 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
|
||||
lwe_array_in, static_cast<const uint64_t *>(scalar_blocks),
|
||||
static_cast<const uint64_t *>(h_scalar_blocks), buffer, bsks,
|
||||
(uint64_t **)(ksks), num_radix_blocks, num_scalar_blocks);
|
||||
(uint64_t **)(ksks), ms_noise_reduction_key, num_radix_blocks,
|
||||
num_scalar_blocks);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error: integer operation not supported")
|
||||
|
||||
@@ -30,7 +30,9 @@ __host__ void scalar_compare_radix_blocks_kb(
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI *lwe_array_in, Torus *scalar_blocks,
|
||||
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
|
||||
Torus *const *ksks, uint32_t num_radix_blocks) {
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t num_radix_blocks) {
|
||||
|
||||
if (num_radix_blocks == 0)
|
||||
return;
|
||||
@@ -72,7 +74,7 @@ __host__ void scalar_compare_radix_blocks_kb(
|
||||
auto sign_lut = mem_ptr->eq_buffer->is_non_zero_lut;
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out, subtracted_blocks, bsks,
|
||||
ksks, sign_lut, num_radix_blocks);
|
||||
ksks, ms_noise_reduction_key, sign_lut, num_radix_blocks);
|
||||
|
||||
// FIXME: without this sync signed scalar eq tests fail, I don't understand
|
||||
// the reason
|
||||
@@ -92,14 +94,16 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
|
||||
CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
|
||||
Torus const *h_scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
|
||||
std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
|
||||
Torus *const *ksks, uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {
|
||||
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {
|
||||
if (lwe_array_out->lwe_dimension != lwe_array_in->lwe_dimension)
|
||||
PANIC("Cuda error: input lwe dimensions must be the same")
|
||||
if (lwe_array_in->num_radix_blocks < num_radix_blocks)
|
||||
PANIC("Cuda error: input num radix blocks should not be lower "
|
||||
"than the number of blocks to operate on")
|
||||
|
||||
bool allocate_ms_array = ms_noise_reduction_key->num_zeros != 0;
|
||||
auto params = mem_ptr->params;
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
auto polynomial_size = params.polynomial_size;
|
||||
@@ -133,11 +137,11 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
|
||||
// means scalar is zero
|
||||
host_compare_blocks_with_zero<Torus>(
|
||||
streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
|
||||
lwe_array_in, mem_ptr, bsks, ksks, num_radix_blocks,
|
||||
mem_ptr->is_zero_lut);
|
||||
lwe_array_in, mem_ptr, bsks, ksks, ms_noise_reduction_key,
|
||||
num_radix_blocks, mem_ptr->is_zero_lut);
|
||||
are_all_comparisons_block_true<Torus>(
|
||||
streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
|
||||
mem_ptr->tmp_lwe_array_out, mem_ptr, bsks, ksks,
|
||||
mem_ptr->tmp_lwe_array_out, mem_ptr, bsks, ksks, ms_noise_reduction_key,
|
||||
mem_ptr->tmp_lwe_array_out->num_radix_blocks);
|
||||
|
||||
auto scalar_last_leaf_lut_f = [sign_handler_f](Torus x) -> Torus {
|
||||
@@ -155,7 +159,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out,
|
||||
mem_ptr->tmp_lwe_array_out, bsks, ksks, lut, 1);
|
||||
mem_ptr->tmp_lwe_array_out, bsks, ksks, ms_noise_reduction_key, lut, 1);
|
||||
|
||||
} else if (num_scalar_blocks < num_radix_blocks) {
|
||||
// We have to handle both part of the work described above
|
||||
@@ -207,10 +211,10 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
|
||||
// - 2 if lhs > rhs
|
||||
|
||||
auto comparisons = mem_ptr->tmp_block_comparisons;
|
||||
scalar_compare_radix_blocks_kb<Torus>(lsb_streams, gpu_indexes, gpu_count,
|
||||
comparisons, diff_buffer->tmp_packed,
|
||||
(Torus *)rhs.ptr, mem_ptr, bsks, ksks,
|
||||
num_lsb_radix_blocks);
|
||||
scalar_compare_radix_blocks_kb<Torus>(
|
||||
lsb_streams, gpu_indexes, gpu_count, comparisons,
|
||||
diff_buffer->tmp_packed, (Torus *)rhs.ptr, mem_ptr, bsks, ksks,
|
||||
ms_noise_reduction_key, num_lsb_radix_blocks);
|
||||
|
||||
// Reduces a vec containing radix blocks that encrypts a sign
|
||||
// (inferior, equal, superior) to one single radix block containing the
|
||||
@@ -218,15 +222,16 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
|
||||
tree_sign_reduction<Torus>(
|
||||
lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, comparisons,
|
||||
mem_ptr->diff_buffer->tree_buffer, mem_ptr->identity_lut_f, bsks, ksks,
|
||||
num_lsb_radix_blocks);
|
||||
ms_noise_reduction_key, num_lsb_radix_blocks);
|
||||
//////////////
|
||||
// msb
|
||||
host_compare_blocks_with_zero<Torus>(
|
||||
msb_streams, gpu_indexes, gpu_count, &lwe_array_msb_out, &msb, mem_ptr,
|
||||
bsks, ksks, num_msb_radix_blocks, mem_ptr->is_zero_lut);
|
||||
bsks, ksks, ms_noise_reduction_key, num_msb_radix_blocks,
|
||||
mem_ptr->is_zero_lut);
|
||||
are_all_comparisons_block_true<Torus>(
|
||||
msb_streams, gpu_indexes, gpu_count, &lwe_array_msb_out,
|
||||
&lwe_array_msb_out, mem_ptr, bsks, ksks,
|
||||
&lwe_array_msb_out, mem_ptr, bsks, ksks, ms_noise_reduction_key,
|
||||
lwe_array_msb_out.num_radix_blocks);
|
||||
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
|
||||
cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
|
||||
@@ -253,7 +258,8 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out,
|
||||
&lwe_array_msb_out, bsks, ksks, lut, 1, lut->params.message_modulus);
|
||||
&lwe_array_msb_out, bsks, ksks, ms_noise_reduction_key, lut, 1,
|
||||
lut->params.message_modulus);
|
||||
|
||||
} else {
|
||||
if (num_radix_blocks == 1) {
|
||||
@@ -285,7 +291,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, bsks,
|
||||
ksks, one_block_lut, 1);
|
||||
ksks, ms_noise_reduction_key, one_block_lut, 1);
|
||||
one_block_lut->release(streams, gpu_indexes, gpu_count);
|
||||
delete one_block_lut;
|
||||
} else {
|
||||
@@ -314,7 +320,8 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
|
||||
auto comparisons = mem_ptr->tmp_lwe_array_out;
|
||||
scalar_compare_radix_blocks_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, comparisons, diff_buffer->tmp_packed,
|
||||
(Torus *)rhs.ptr, mem_ptr, bsks, ksks, num_lsb_radix_blocks);
|
||||
(Torus *)rhs.ptr, mem_ptr, bsks, ksks, ms_noise_reduction_key,
|
||||
num_lsb_radix_blocks);
|
||||
|
||||
// Reduces a vec containing radix blocks that encrypts a sign
|
||||
// (inferior, equal, superior) to one single radix block containing the
|
||||
@@ -322,7 +329,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
|
||||
tree_sign_reduction<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
|
||||
comparisons, mem_ptr->diff_buffer->tree_buffer,
|
||||
sign_handler_f, bsks, ksks,
|
||||
num_lsb_radix_blocks);
|
||||
ms_noise_reduction_key, num_lsb_radix_blocks);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -334,14 +341,16 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
|
||||
CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
|
||||
Torus const *h_scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
|
||||
std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
|
||||
Torus *const *ksks, uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {
|
||||
|
||||
if (lwe_array_out->lwe_dimension != lwe_array_in->lwe_dimension)
|
||||
PANIC("Cuda error: input lwe dimensions must be the same")
|
||||
if (lwe_array_in->num_radix_blocks < num_radix_blocks)
|
||||
PANIC("Cuda error: input num radix blocks should not be lower "
|
||||
"than the number of blocks to operate on")
|
||||
|
||||
bool allocate_ms_array = ms_noise_reduction_key->num_zeros != 0;
|
||||
auto params = mem_ptr->params;
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
auto polynomial_size = params.polynomial_size;
|
||||
@@ -376,10 +385,12 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
|
||||
auto are_all_msb_zeros = mem_ptr->tmp_lwe_array_out;
|
||||
host_compare_blocks_with_zero<Torus>(
|
||||
streams, gpu_indexes, gpu_count, are_all_msb_zeros, lwe_array_in,
|
||||
mem_ptr, bsks, ksks, num_radix_blocks, mem_ptr->is_zero_lut);
|
||||
mem_ptr, bsks, ksks, ms_noise_reduction_key, num_radix_blocks,
|
||||
mem_ptr->is_zero_lut);
|
||||
are_all_comparisons_block_true<Torus>(
|
||||
streams, gpu_indexes, gpu_count, are_all_msb_zeros, are_all_msb_zeros,
|
||||
mem_ptr, bsks, ksks, are_all_msb_zeros->num_radix_blocks);
|
||||
mem_ptr, bsks, ksks, ms_noise_reduction_key,
|
||||
are_all_msb_zeros->num_radix_blocks);
|
||||
CudaRadixCiphertextFFI sign_block;
|
||||
as_radix_ciphertext_slice<Torus>(&sign_block, lwe_array_in,
|
||||
num_radix_blocks - 1, num_radix_blocks);
|
||||
@@ -428,7 +439,8 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out, are_all_msb_zeros,
|
||||
&sign_block, bsks, ksks, lut, 1, lut->params.message_modulus);
|
||||
&sign_block, bsks, ksks, ms_noise_reduction_key, lut, 1,
|
||||
lut->params.message_modulus);
|
||||
|
||||
} else if (num_scalar_blocks < num_radix_blocks) {
|
||||
// We have to handle both part of the work described above
|
||||
@@ -474,10 +486,10 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
|
||||
// - 2 if lhs > rhs
|
||||
|
||||
auto comparisons = mem_ptr->tmp_block_comparisons;
|
||||
scalar_compare_radix_blocks_kb<Torus>(lsb_streams, gpu_indexes, gpu_count,
|
||||
comparisons, diff_buffer->tmp_packed,
|
||||
(Torus *)rhs.ptr, mem_ptr, bsks, ksks,
|
||||
num_lsb_radix_blocks);
|
||||
scalar_compare_radix_blocks_kb<Torus>(
|
||||
lsb_streams, gpu_indexes, gpu_count, comparisons,
|
||||
diff_buffer->tmp_packed, (Torus *)rhs.ptr, mem_ptr, bsks, ksks,
|
||||
ms_noise_reduction_key, num_lsb_radix_blocks);
|
||||
|
||||
// Reduces a vec containing radix blocks that encrypts a sign
|
||||
// (inferior, equal, superior) to one single radix block containing the
|
||||
@@ -485,17 +497,18 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
|
||||
tree_sign_reduction<Torus>(
|
||||
lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, comparisons,
|
||||
mem_ptr->diff_buffer->tree_buffer, mem_ptr->identity_lut_f, bsks, ksks,
|
||||
num_lsb_radix_blocks);
|
||||
ms_noise_reduction_key, num_lsb_radix_blocks);
|
||||
//////////////
|
||||
// msb
|
||||
// We remove the last block (which is the sign)
|
||||
auto are_all_msb_zeros = lwe_array_msb_out;
|
||||
host_compare_blocks_with_zero<Torus>(
|
||||
msb_streams, gpu_indexes, gpu_count, &are_all_msb_zeros, &msb, mem_ptr,
|
||||
bsks, ksks, num_msb_radix_blocks, mem_ptr->is_zero_lut);
|
||||
bsks, ksks, ms_noise_reduction_key, num_msb_radix_blocks,
|
||||
mem_ptr->is_zero_lut);
|
||||
are_all_comparisons_block_true<Torus>(
|
||||
msb_streams, gpu_indexes, gpu_count, &are_all_msb_zeros,
|
||||
&are_all_msb_zeros, mem_ptr, bsks, ksks,
|
||||
&are_all_msb_zeros, mem_ptr, bsks, ksks, ms_noise_reduction_key,
|
||||
are_all_msb_zeros.num_radix_blocks);
|
||||
|
||||
auto sign_bit_pos = (int)log2(message_modulus) - 1;
|
||||
@@ -535,8 +548,8 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
|
||||
&sign_block, &msb, num_msb_radix_blocks - 1, num_msb_radix_blocks);
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
msb_streams, gpu_indexes, gpu_count, &lwe_array_msb_out, &sign_block,
|
||||
&are_all_msb_zeros, bsks, ksks, signed_msb_lut, 1,
|
||||
signed_msb_lut->params.message_modulus);
|
||||
&are_all_msb_zeros, bsks, ksks, ms_noise_reduction_key, signed_msb_lut,
|
||||
1, signed_msb_lut->params.message_modulus);
|
||||
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
|
||||
cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
|
||||
@@ -546,7 +559,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
|
||||
// Reduce the two blocks into one final
|
||||
reduce_signs<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
|
||||
lwe_array_lsb_out, mem_ptr, sign_handler_f, bsks, ksks,
|
||||
2);
|
||||
ms_noise_reduction_key, 2);
|
||||
|
||||
} else {
|
||||
if (num_radix_blocks == 1) {
|
||||
@@ -580,7 +593,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, bsks,
|
||||
ksks, one_block_lut, 1);
|
||||
ksks, ms_noise_reduction_key, one_block_lut, 1);
|
||||
one_block_lut->release(streams, gpu_indexes, gpu_count);
|
||||
delete one_block_lut;
|
||||
} else {
|
||||
@@ -621,7 +634,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
|
||||
scalar_compare_radix_blocks_kb<Torus>(
|
||||
lsb_streams, gpu_indexes, gpu_count, lwe_array_ct_out,
|
||||
diff_buffer->tmp_packed, (Torus *)rhs.ptr, mem_ptr, bsks, ksks,
|
||||
num_lsb_radix_blocks);
|
||||
ms_noise_reduction_key, num_lsb_radix_blocks);
|
||||
CudaRadixCiphertextFFI encrypted_sign_block;
|
||||
as_radix_ciphertext_slice<Torus>(&encrypted_sign_block, lwe_array_in,
|
||||
num_radix_blocks - 1, num_radix_blocks);
|
||||
@@ -637,7 +650,8 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
msb_streams, gpu_indexes, gpu_count, &lwe_array_sign_out,
|
||||
&encrypted_sign_block, trivial_sign_block, bsks, ksks,
|
||||
mem_ptr->signed_lut, 1, mem_ptr->signed_lut->params.message_modulus);
|
||||
ms_noise_reduction_key, mem_ptr->signed_lut, 1,
|
||||
mem_ptr->signed_lut->params.message_modulus);
|
||||
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
|
||||
cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
|
||||
@@ -648,7 +662,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
|
||||
// final sign
|
||||
reduce_signs<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
|
||||
lwe_array_ct_out, mem_ptr, sign_handler_f, bsks, ksks,
|
||||
num_lsb_radix_blocks + 1);
|
||||
ms_noise_reduction_key, num_lsb_radix_blocks + 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -660,7 +674,9 @@ __host__ void host_integer_radix_scalar_difference_check_kb(
|
||||
CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
|
||||
Torus const *h_scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
|
||||
std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
|
||||
Torus *const *ksks, uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {
|
||||
|
||||
if (lwe_array_out->lwe_dimension != lwe_array_in->lwe_dimension)
|
||||
PANIC("Cuda error: input lwe dimensions must be the same")
|
||||
@@ -673,12 +689,12 @@ __host__ void host_integer_radix_scalar_difference_check_kb(
|
||||
integer_radix_signed_scalar_difference_check_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in,
|
||||
scalar_blocks, h_scalar_blocks, mem_ptr, sign_handler_f, bsks, ksks,
|
||||
num_radix_blocks, num_scalar_blocks);
|
||||
ms_noise_reduction_key, num_radix_blocks, num_scalar_blocks);
|
||||
} else {
|
||||
integer_radix_unsigned_scalar_difference_check_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in,
|
||||
scalar_blocks, h_scalar_blocks, mem_ptr, sign_handler_f, bsks, ksks,
|
||||
num_radix_blocks, num_scalar_blocks);
|
||||
ms_noise_reduction_key, num_radix_blocks, num_scalar_blocks);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -688,8 +704,9 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
|
||||
Torus const *h_scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
|
||||
void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks,
|
||||
uint32_t num_scalar_blocks) {
|
||||
void *const *bsks, Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {
|
||||
|
||||
if (lwe_array_out->lwe_dimension != lwe_array_in->lwe_dimension)
|
||||
PANIC("Cuda error: input and output lwe dimensions must be the same")
|
||||
@@ -708,7 +725,7 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
|
||||
host_integer_radix_scalar_difference_check_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, sign, lwe_array_in, scalar_blocks,
|
||||
h_scalar_blocks, mem_ptr, mem_ptr->identity_lut_f, bsks, ksks,
|
||||
num_radix_blocks, num_scalar_blocks);
|
||||
ms_noise_reduction_key, num_radix_blocks, num_scalar_blocks);
|
||||
|
||||
// There is no optimized CMUX for scalars, so we convert to a trivial
|
||||
// ciphertext
|
||||
@@ -722,10 +739,10 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
|
||||
|
||||
// Selector
|
||||
// CMUX for Max or Min
|
||||
host_integer_radix_cmux_kb<Torus>(streams, gpu_indexes, gpu_count,
|
||||
lwe_array_out, mem_ptr->tmp_lwe_array_out,
|
||||
lwe_array_left, lwe_array_right,
|
||||
mem_ptr->cmux_buffer, bsks, ksks);
|
||||
host_integer_radix_cmux_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out,
|
||||
mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right,
|
||||
mem_ptr->cmux_buffer, bsks, ksks, ms_noise_reduction_key);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
@@ -734,7 +751,9 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
|
||||
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
|
||||
Torus *const *ksks, uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {
|
||||
|
||||
if (lwe_array_out->lwe_dimension != lwe_array_in->lwe_dimension)
|
||||
PANIC("Cuda error: input and output lwe dimensions must be the same")
|
||||
@@ -804,8 +823,8 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
lsb_streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
|
||||
mem_ptr->tmp_packed_input, bsks, ksks, scalar_comparison_luts,
|
||||
num_halved_lsb_radix_blocks);
|
||||
mem_ptr->tmp_packed_input, bsks, ksks, ms_noise_reduction_key,
|
||||
scalar_comparison_luts, num_halved_lsb_radix_blocks);
|
||||
}
|
||||
//////////////
|
||||
// msb_in
|
||||
@@ -822,12 +841,12 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
|
||||
PANIC("Cuda error: integer operation not supported")
|
||||
}
|
||||
|
||||
host_compare_blocks_with_zero<Torus>(msb_streams, gpu_indexes, gpu_count,
|
||||
&msb_out, &msb_in, mem_ptr, bsks, ksks,
|
||||
num_msb_radix_blocks, msb_lut);
|
||||
are_all_comparisons_block_true<Torus>(msb_streams, gpu_indexes, gpu_count,
|
||||
&msb_out, &msb_out, mem_ptr, bsks,
|
||||
ksks, msb_out.num_radix_blocks);
|
||||
host_compare_blocks_with_zero<Torus>(
|
||||
msb_streams, gpu_indexes, gpu_count, &msb_out, &msb_in, mem_ptr, bsks,
|
||||
ksks, ms_noise_reduction_key, num_msb_radix_blocks, msb_lut);
|
||||
are_all_comparisons_block_true<Torus>(
|
||||
msb_streams, gpu_indexes, gpu_count, &msb_out, &msb_out, mem_ptr, bsks,
|
||||
ksks, ms_noise_reduction_key, msb_out.num_radix_blocks);
|
||||
}
|
||||
|
||||
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
|
||||
@@ -839,13 +858,13 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
|
||||
case COMPARISON_TYPE::EQ:
|
||||
are_all_comparisons_block_true<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out,
|
||||
mem_ptr->tmp_lwe_array_out, mem_ptr, bsks, ksks,
|
||||
mem_ptr->tmp_lwe_array_out, mem_ptr, bsks, ksks, ms_noise_reduction_key,
|
||||
num_halved_scalar_blocks + (num_msb_radix_blocks > 0));
|
||||
break;
|
||||
case COMPARISON_TYPE::NE:
|
||||
is_at_least_one_comparisons_block_true<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out,
|
||||
mem_ptr->tmp_lwe_array_out, mem_ptr, bsks, ksks,
|
||||
mem_ptr->tmp_lwe_array_out, mem_ptr, bsks, ksks, ms_noise_reduction_key,
|
||||
num_halved_scalar_blocks + (num_msb_radix_blocks > 0));
|
||||
break;
|
||||
default:
|
||||
|
||||
@@ -6,12 +6,13 @@ void scratch_cuda_integer_scalar_mul_kb_64(
|
||||
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
PBS_TYPE pbs_type, bool allocate_gpu_memory) {
|
||||
PBS_TYPE pbs_type, bool allocate_gpu_memory, bool allocate_ms_array) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
glwe_dimension * polynomial_size, lwe_dimension,
|
||||
ks_level, ks_base_log, pbs_level, pbs_base_log,
|
||||
grouping_factor, message_modulus, carry_modulus);
|
||||
grouping_factor, message_modulus, carry_modulus,
|
||||
allocate_ms_array);
|
||||
|
||||
scratch_cuda_integer_radix_scalar_mul_kb<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
@@ -23,8 +24,9 @@ void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *lwe_array, uint64_t const *decomposed_scalar,
|
||||
uint64_t const *has_at_least_one_set, int8_t *mem, void *const *bsks,
|
||||
void *const *ksks, uint32_t polynomial_size, uint32_t message_modulus,
|
||||
uint32_t num_scalars) {
|
||||
void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t polynomial_size, uint32_t message_modulus, uint32_t num_scalars) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 512:
|
||||
@@ -32,42 +34,48 @@ void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
|
||||
decomposed_scalar, has_at_least_one_set,
|
||||
reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
|
||||
(uint64_t **)(ksks), message_modulus, num_scalars);
|
||||
(uint64_t **)(ksks), ms_noise_reduction_key, message_modulus,
|
||||
num_scalars);
|
||||
break;
|
||||
case 1024:
|
||||
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<1024>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
|
||||
decomposed_scalar, has_at_least_one_set,
|
||||
reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
|
||||
(uint64_t **)(ksks), message_modulus, num_scalars);
|
||||
(uint64_t **)(ksks), ms_noise_reduction_key, message_modulus,
|
||||
num_scalars);
|
||||
break;
|
||||
case 2048:
|
||||
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<2048>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
|
||||
decomposed_scalar, has_at_least_one_set,
|
||||
reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
|
||||
(uint64_t **)(ksks), message_modulus, num_scalars);
|
||||
(uint64_t **)(ksks), ms_noise_reduction_key, message_modulus,
|
||||
num_scalars);
|
||||
break;
|
||||
case 4096:
|
||||
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<4096>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
|
||||
decomposed_scalar, has_at_least_one_set,
|
||||
reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
|
||||
(uint64_t **)(ksks), message_modulus, num_scalars);
|
||||
(uint64_t **)(ksks), ms_noise_reduction_key, message_modulus,
|
||||
num_scalars);
|
||||
break;
|
||||
case 8192:
|
||||
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<8192>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
|
||||
decomposed_scalar, has_at_least_one_set,
|
||||
reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
|
||||
(uint64_t **)(ksks), message_modulus, num_scalars);
|
||||
(uint64_t **)(ksks), ms_noise_reduction_key, message_modulus,
|
||||
num_scalars);
|
||||
break;
|
||||
case 16384:
|
||||
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<16384>>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
|
||||
decomposed_scalar, has_at_least_one_set,
|
||||
reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
|
||||
(uint64_t **)(ksks), message_modulus, num_scalars);
|
||||
(uint64_t **)(ksks), ms_noise_reduction_key, message_modulus,
|
||||
num_scalars);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (scalar multiplication): unsupported polynomial size. "
|
||||
|
||||
@@ -45,6 +45,7 @@ __host__ void host_integer_scalar_mul_radix(
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array,
|
||||
T const *decomposed_scalar, T const *has_at_least_one_set,
|
||||
int_scalar_mul_buffer<T> *mem, void *const *bsks, T *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t message_modulus, uint32_t num_scalars) {
|
||||
|
||||
auto num_radix_blocks = lwe_array->num_radix_blocks;
|
||||
@@ -67,7 +68,8 @@ __host__ void host_integer_scalar_mul_radix(
|
||||
lwe_array, 0, num_radix_blocks);
|
||||
host_integer_radix_logical_scalar_shift_kb_inplace<T>(
|
||||
streams, gpu_indexes, gpu_count, &shift_input, shift_amount,
|
||||
mem->logical_scalar_shift_buffer, bsks, ksks, num_radix_blocks);
|
||||
mem->logical_scalar_shift_buffer, bsks, ksks, ms_noise_reduction_key,
|
||||
num_radix_blocks);
|
||||
} else {
|
||||
// create trivial assign for value = 0
|
||||
set_zero_radix_ciphertext_slice_async<T>(
|
||||
@@ -115,14 +117,16 @@ __host__ void host_integer_scalar_mul_radix(
|
||||
}
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<T, params>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array, all_shifted_buffer, bsks,
|
||||
ksks, mem->sum_ciphertexts_vec_mem, num_radix_blocks, j, nullptr);
|
||||
ksks, ms_noise_reduction_key, mem->sum_ciphertexts_vec_mem,
|
||||
num_radix_blocks, j, nullptr);
|
||||
|
||||
auto scp_mem_ptr = mem->sc_prop_mem;
|
||||
uint32_t requested_flag = outputFlag::FLAG_NONE;
|
||||
uint32_t uses_carry = 0;
|
||||
host_propagate_single_carry<T>(streams, gpu_indexes, gpu_count, lwe_array,
|
||||
nullptr, nullptr, scp_mem_ptr, bsks, ksks,
|
||||
requested_flag, uses_carry);
|
||||
ms_noise_reduction_key, requested_flag,
|
||||
uses_carry);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -7,12 +7,12 @@ void scratch_cuda_integer_radix_scalar_rotate_kb_64(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
|
||||
bool allocate_gpu_memory) {
|
||||
bool allocate_gpu_memory, bool allocate_ms_array) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
big_lwe_dimension, small_lwe_dimension, ks_level,
|
||||
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
|
||||
message_modulus, carry_modulus);
|
||||
message_modulus, carry_modulus, allocate_ms_array);
|
||||
|
||||
scratch_cuda_integer_radix_scalar_rotate_kb<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
@@ -23,12 +23,13 @@ void scratch_cuda_integer_radix_scalar_rotate_kb_64(
|
||||
void cuda_integer_radix_scalar_rotate_kb_64_inplace(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *lwe_array, uint32_t n, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks) {
|
||||
void *const *bsks, void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
|
||||
|
||||
host_integer_radix_scalar_rotate_kb_inplace<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array, n,
|
||||
(int_logical_scalar_shift_buffer<uint64_t> *)mem_ptr, bsks,
|
||||
(uint64_t **)(ksks));
|
||||
(uint64_t **)(ksks), ms_noise_reduction_key);
|
||||
}
|
||||
|
||||
void cleanup_cuda_integer_radix_scalar_rotate(void *const *streams,
|
||||
|
||||
@@ -28,7 +28,8 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array, uint32_t n,
|
||||
int_logical_scalar_shift_buffer<Torus> *mem, void *const *bsks,
|
||||
Torus *const *ksks) {
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
|
||||
|
||||
auto num_blocks = lwe_array->num_radix_blocks;
|
||||
auto params = mem->params;
|
||||
@@ -75,8 +76,8 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array, receiver_blocks,
|
||||
giver_blocks, bsks, ksks, lut_bivariate, num_blocks,
|
||||
lut_bivariate->params.message_modulus);
|
||||
giver_blocks, bsks, ksks, ms_noise_reduction_key, lut_bivariate,
|
||||
num_blocks, lut_bivariate->params.message_modulus);
|
||||
|
||||
} else {
|
||||
// rotate left as the blocks are from LSB to MSB
|
||||
@@ -102,8 +103,8 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array, receiver_blocks,
|
||||
giver_blocks, bsks, ksks, lut_bivariate, num_blocks,
|
||||
lut_bivariate->params.message_modulus);
|
||||
giver_blocks, bsks, ksks, ms_noise_reduction_key, lut_bivariate,
|
||||
num_blocks, lut_bivariate->params.message_modulus);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -7,12 +7,12 @@ void scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
|
||||
bool allocate_gpu_memory) {
|
||||
bool allocate_gpu_memory, bool allocate_ms_array) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
big_lwe_dimension, small_lwe_dimension, ks_level,
|
||||
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
|
||||
message_modulus, carry_modulus);
|
||||
message_modulus, carry_modulus, allocate_ms_array);
|
||||
|
||||
scratch_cuda_integer_radix_logical_scalar_shift_kb<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
@@ -27,12 +27,13 @@ void scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
|
||||
void cuda_integer_radix_logical_scalar_shift_kb_64_inplace(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *lwe_array, uint32_t shift, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks) {
|
||||
void *const *bsks, void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
|
||||
|
||||
host_integer_radix_logical_scalar_shift_kb_inplace<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array, shift,
|
||||
(int_logical_scalar_shift_buffer<uint64_t> *)mem_ptr, bsks,
|
||||
(uint64_t **)(ksks), lwe_array->num_radix_blocks);
|
||||
(uint64_t **)(ksks), ms_noise_reduction_key, lwe_array->num_radix_blocks);
|
||||
}
|
||||
|
||||
void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
|
||||
@@ -42,12 +43,12 @@ void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
|
||||
bool allocate_gpu_memory) {
|
||||
bool allocate_gpu_memory, bool allocate_ms_array) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
big_lwe_dimension, small_lwe_dimension, ks_level,
|
||||
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
|
||||
message_modulus, carry_modulus);
|
||||
message_modulus, carry_modulus, allocate_ms_array);
|
||||
|
||||
scratch_cuda_integer_radix_arithmetic_scalar_shift_kb<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
@@ -65,12 +66,13 @@ void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
|
||||
void cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *lwe_array, uint32_t shift, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks) {
|
||||
void *const *bsks, void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
|
||||
|
||||
host_integer_radix_arithmetic_scalar_shift_kb_inplace<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array, shift,
|
||||
(int_arithmetic_scalar_shift_buffer<uint64_t> *)mem_ptr, bsks,
|
||||
(uint64_t **)(ksks));
|
||||
(uint64_t **)(ksks), ms_noise_reduction_key);
|
||||
}
|
||||
|
||||
void cleanup_cuda_integer_radix_logical_scalar_shift(
|
||||
|
||||
@@ -28,7 +28,9 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
|
||||
int_logical_scalar_shift_buffer<Torus> *mem, void *const *bsks,
|
||||
Torus *const *ksks, uint32_t num_blocks) {
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t num_blocks) {
|
||||
|
||||
if (lwe_array->num_radix_blocks < num_blocks)
|
||||
PANIC("Cuda error: input does not have enough blocks")
|
||||
@@ -81,7 +83,7 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, &partial_current_blocks,
|
||||
&partial_current_blocks, &partial_previous_blocks, bsks, ksks,
|
||||
lut_bivariate, partial_block_count,
|
||||
ms_noise_reduction_key, lut_bivariate, partial_block_count,
|
||||
lut_bivariate->params.message_modulus);
|
||||
|
||||
} else {
|
||||
@@ -113,8 +115,9 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, partial_current_blocks,
|
||||
partial_current_blocks, &partial_next_blocks, bsks, ksks, lut_bivariate,
|
||||
partial_block_count, lut_bivariate->params.message_modulus);
|
||||
partial_current_blocks, &partial_next_blocks, bsks, ksks,
|
||||
ms_noise_reduction_key, lut_bivariate, partial_block_count,
|
||||
lut_bivariate->params.message_modulus);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -135,7 +138,8 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
|
||||
int_arithmetic_scalar_shift_buffer<Torus> *mem, void *const *bsks,
|
||||
Torus *const *ksks) {
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
|
||||
|
||||
auto num_blocks = lwe_array->num_radix_blocks;
|
||||
auto params = mem->params;
|
||||
@@ -206,7 +210,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, partial_current_blocks,
|
||||
partial_current_blocks, &partial_next_blocks, bsks, ksks,
|
||||
lut_bivariate, partial_block_count,
|
||||
ms_noise_reduction_key, lut_bivariate, partial_block_count,
|
||||
lut_bivariate->params.message_modulus);
|
||||
}
|
||||
// Since our CPU threads will be working on different streams we shall
|
||||
@@ -218,7 +222,8 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
|
||||
mem->lut_buffers_univariate[num_bits_in_block - 1];
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
mem->local_streams_1, gpu_indexes, gpu_count, &padding_block,
|
||||
&last_block_copy, bsks, ksks, lut_univariate_padding_block, 1);
|
||||
&last_block_copy, bsks, ksks, ms_noise_reduction_key,
|
||||
lut_univariate_padding_block, 1);
|
||||
// Replace blocks 'pulled' from the left with the correct padding
|
||||
// block
|
||||
for (uint i = 0; i < rotations; i++) {
|
||||
@@ -232,7 +237,8 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
|
||||
mem->lut_buffers_univariate[shift_within_block - 1];
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
mem->local_streams_2, gpu_indexes, gpu_count, &last_block,
|
||||
&last_block_copy, bsks, ksks, lut_univariate_shift_last_block, 1);
|
||||
&last_block_copy, bsks, ksks, ms_noise_reduction_key,
|
||||
lut_univariate_shift_last_block, 1);
|
||||
}
|
||||
for (uint j = 0; j < mem->active_gpu_count; j++) {
|
||||
cuda_synchronize_stream(mem->local_streams_1[j], gpu_indexes[j]);
|
||||
|
||||
@@ -7,12 +7,12 @@ void scratch_cuda_integer_radix_shift_and_rotate_kb_64(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
|
||||
bool is_signed, bool allocate_gpu_memory) {
|
||||
bool is_signed, bool allocate_gpu_memory, bool allocate_ms_array) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
big_lwe_dimension, small_lwe_dimension, ks_level,
|
||||
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
|
||||
message_modulus, carry_modulus);
|
||||
message_modulus, carry_modulus, allocate_ms_array);
|
||||
|
||||
scratch_cuda_integer_radix_shift_and_rotate_kb<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
@@ -23,12 +23,13 @@ void scratch_cuda_integer_radix_shift_and_rotate_kb_64(
|
||||
void cuda_integer_radix_shift_and_rotate_kb_64_inplace(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *lwe_array, CudaRadixCiphertextFFI const *lwe_shift,
|
||||
int8_t *mem_ptr, void *const *bsks, void *const *ksks) {
|
||||
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
|
||||
|
||||
host_integer_radix_shift_and_rotate_kb_inplace<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array, lwe_shift,
|
||||
(int_shift_and_rotate_buffer<uint64_t> *)mem_ptr, bsks,
|
||||
(uint64_t **)(ksks));
|
||||
(uint64_t **)(ksks), ms_noise_reduction_key);
|
||||
}
|
||||
|
||||
void cleanup_cuda_integer_radix_shift_and_rotate(void *const *streams,
|
||||
|
||||
@@ -29,7 +29,8 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array,
|
||||
CudaRadixCiphertextFFI const *lwe_shift,
|
||||
int_shift_and_rotate_buffer<Torus> *mem, void *const *bsks,
|
||||
Torus *const *ksks) {
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
|
||||
cuda_set_device(gpu_indexes[0]);
|
||||
|
||||
if (lwe_array->num_radix_blocks != lwe_shift->num_radix_blocks)
|
||||
@@ -56,8 +57,9 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
|
||||
// Extract all bits
|
||||
auto bits = mem->tmp_bits;
|
||||
extract_n_bits<Torus>(streams, gpu_indexes, gpu_count, bits, lwe_array, bsks,
|
||||
ksks, num_radix_blocks * bits_per_block,
|
||||
num_radix_blocks, mem->bit_extract_luts);
|
||||
ksks, ms_noise_reduction_key,
|
||||
num_radix_blocks * bits_per_block, num_radix_blocks,
|
||||
mem->bit_extract_luts);
|
||||
|
||||
// Extract shift bits
|
||||
auto shift_bits = mem->tmp_shift_bits;
|
||||
@@ -77,8 +79,9 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
|
||||
// so that it is already aligned to the correct position of the cmux input
|
||||
// and we reduce noise growth
|
||||
extract_n_bits<Torus>(streams, gpu_indexes, gpu_count, shift_bits, lwe_shift,
|
||||
bsks, ksks, max_num_bits_that_tell_shift,
|
||||
num_radix_blocks, mem->bit_extract_luts_with_offset_2);
|
||||
bsks, ksks, ms_noise_reduction_key,
|
||||
max_num_bits_that_tell_shift, num_radix_blocks,
|
||||
mem->bit_extract_luts_with_offset_2);
|
||||
|
||||
// If signed, do an "arithmetic shift" by padding with the sign bit
|
||||
CudaRadixCiphertextFFI last_bit;
|
||||
@@ -164,7 +167,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
|
||||
// control_bit|b|a
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, input_bits_a, mux_inputs, bsks, ksks,
|
||||
mux_lut, total_nb_bits);
|
||||
ms_noise_reduction_key, mux_lut, total_nb_bits);
|
||||
}
|
||||
|
||||
// Initializes the output
|
||||
@@ -195,7 +198,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
|
||||
auto cleaning_lut = mem->cleaning_lut;
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array, lwe_array, bsks, ksks,
|
||||
cleaning_lut, num_radix_blocks);
|
||||
ms_noise_reduction_key, cleaning_lut, num_radix_blocks);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
#define CUDA_PROGRAMMABLE_BOOTSTRAP_CUH
|
||||
|
||||
#include "bootstrapping_key.cuh"
|
||||
#include "ciphertext.h"
|
||||
#include "cooperative_groups.h"
|
||||
#include "device.h"
|
||||
#include "fft/bnsmfft.cuh"
|
||||
@@ -78,21 +79,21 @@ mul_ggsw_glwe_in_fourier_domain(double2 *fft, double2 *join_buffer,
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
void execute_pbs_async(cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count,
|
||||
const LweArrayVariant<Torus> &lwe_array_out,
|
||||
const LweArrayVariant<Torus> &lwe_output_indexes,
|
||||
const std::vector<Torus *> lut_vec,
|
||||
const std::vector<Torus *> lut_indexes_vec,
|
||||
const LweArrayVariant<Torus> &lwe_array_in,
|
||||
const LweArrayVariant<Torus> &lwe_input_indexes,
|
||||
void *const *bootstrapping_keys,
|
||||
std::vector<int8_t *> pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t lwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t grouping_factor,
|
||||
uint32_t input_lwe_ciphertext_count, PBS_TYPE pbs_type,
|
||||
uint32_t num_many_lut, uint32_t lut_stride) {
|
||||
void execute_pbs_async(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, const LweArrayVariant<Torus> &lwe_array_out,
|
||||
const LweArrayVariant<Torus> &lwe_output_indexes,
|
||||
const std::vector<Torus *> lut_vec,
|
||||
const std::vector<Torus *> lut_indexes_vec,
|
||||
const LweArrayVariant<Torus> &lwe_array_in,
|
||||
const LweArrayVariant<Torus> &lwe_input_indexes,
|
||||
void *const *bootstrapping_keys,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
std::vector<int8_t *> pbs_buffer, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t grouping_factor,
|
||||
uint32_t input_lwe_ciphertext_count, PBS_TYPE pbs_type,
|
||||
uint32_t num_many_lut, uint32_t lut_stride) {
|
||||
|
||||
switch (sizeof(Torus)) {
|
||||
case sizeof(uint32_t):
|
||||
@@ -192,9 +193,9 @@ void execute_pbs_async(cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
streams[i], gpu_indexes[i], current_lwe_array_out,
|
||||
current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
|
||||
current_lwe_array_in, current_lwe_input_indexes,
|
||||
bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_inputs_on_gpu,
|
||||
num_many_lut, lut_stride);
|
||||
bootstrapping_keys[i], ms_noise_reduction_key, pbs_buffer[i],
|
||||
lwe_dimension, glwe_dimension, polynomial_size, base_log,
|
||||
level_count, num_inputs_on_gpu, num_many_lut, lut_stride);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
@@ -213,7 +214,7 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t grouping_factor,
|
||||
uint32_t input_lwe_ciphertext_count, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory) {
|
||||
bool allocate_gpu_memory, bool allocate_ms_array) {
|
||||
switch (sizeof(Torus)) {
|
||||
case sizeof(uint32_t):
|
||||
// 32 bits
|
||||
@@ -222,8 +223,9 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index,
|
||||
PANIC("Error: 32-bit multibit PBS is not supported.\n")
|
||||
case CLASSICAL:
|
||||
scratch_cuda_programmable_bootstrap_32(
|
||||
stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory, allocate_ms_array);
|
||||
break;
|
||||
default:
|
||||
PANIC("Error: unsupported cuda PBS type.")
|
||||
@@ -241,8 +243,9 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index,
|
||||
break;
|
||||
case CLASSICAL:
|
||||
scratch_cuda_programmable_bootstrap_64(
|
||||
stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory, allocate_ms_array);
|
||||
break;
|
||||
default:
|
||||
PANIC("Error: unsupported cuda PBS type.")
|
||||
|
||||
@@ -190,9 +190,10 @@ __global__ void device_programmable_bootstrap_cg(
|
||||
template <typename Torus, typename params>
|
||||
__host__ void scratch_programmable_bootstrap_cg(
|
||||
cudaStream_t stream, uint32_t gpu_index,
|
||||
pbs_buffer<Torus, CLASSICAL> **buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
|
||||
pbs_buffer<Torus, CLASSICAL> **buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
|
||||
bool allocate_ms_array) {
|
||||
|
||||
uint64_t full_sm =
|
||||
get_buffer_size_full_sm_programmable_bootstrap_cg<Torus>(polynomial_size);
|
||||
@@ -219,8 +220,9 @@ __host__ void scratch_programmable_bootstrap_cg(
|
||||
}
|
||||
|
||||
*buffer = new pbs_buffer<Torus, CLASSICAL>(
|
||||
stream, gpu_index, glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, PBS_VARIANT::CG, allocate_gpu_memory);
|
||||
stream, gpu_index, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, PBS_VARIANT::CG,
|
||||
allocate_gpu_memory, allocate_ms_array);
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
#if (CUDA_ARCH >= 900)
|
||||
#include "programmable_bootstrap_tbc_classic.cuh"
|
||||
#endif
|
||||
#include "ciphertext.h"
|
||||
|
||||
template <typename Torus>
|
||||
bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
|
||||
@@ -70,51 +71,52 @@ bool has_support_to_cuda_programmable_bootstrap_tbc(
|
||||
template <typename Torus>
|
||||
void scratch_cuda_programmable_bootstrap_tbc(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
bool allocate_gpu_memory, bool allocate_ms_array) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
scratch_programmable_bootstrap_tbc<Torus, AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
break;
|
||||
case 512:
|
||||
scratch_programmable_bootstrap_tbc<Torus, AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
break;
|
||||
case 1024:
|
||||
scratch_programmable_bootstrap_tbc<Torus, AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
break;
|
||||
case 2048:
|
||||
scratch_programmable_bootstrap_tbc<Torus, AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
break;
|
||||
case 4096:
|
||||
scratch_programmable_bootstrap_tbc<Torus, AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
break;
|
||||
case 8192:
|
||||
scratch_programmable_bootstrap_tbc<Torus, AmortizedDegree<8192>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
break;
|
||||
case 16384:
|
||||
scratch_programmable_bootstrap_tbc<Torus, AmortizedDegree<16384>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
|
||||
@@ -202,51 +204,52 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
|
||||
template <typename Torus>
|
||||
void scratch_cuda_programmable_bootstrap_cg(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
bool allocate_gpu_memory, bool allocate_ms_array) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
break;
|
||||
case 512:
|
||||
scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
break;
|
||||
case 1024:
|
||||
scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
break;
|
||||
case 2048:
|
||||
scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
break;
|
||||
case 4096:
|
||||
scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
break;
|
||||
case 8192:
|
||||
scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<8192>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
break;
|
||||
case 16384:
|
||||
scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<16384>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
|
||||
@@ -258,51 +261,52 @@ void scratch_cuda_programmable_bootstrap_cg(
|
||||
template <typename Torus>
|
||||
void scratch_cuda_programmable_bootstrap(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
bool allocate_gpu_memory, bool allocate_ms_array) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
scratch_programmable_bootstrap<Torus, AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory);
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
break;
|
||||
case 512:
|
||||
scratch_programmable_bootstrap<Torus, AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory);
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
break;
|
||||
case 1024:
|
||||
scratch_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory);
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
break;
|
||||
case 2048:
|
||||
scratch_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory);
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
break;
|
||||
case 4096:
|
||||
scratch_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory);
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
break;
|
||||
case 8192:
|
||||
scratch_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory);
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
break;
|
||||
case 16384:
|
||||
scratch_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory);
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
|
||||
@@ -318,9 +322,10 @@ void scratch_cuda_programmable_bootstrap(
|
||||
* be used.
|
||||
*/
|
||||
void scratch_cuda_programmable_bootstrap_32(
|
||||
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
|
||||
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
|
||||
bool allocate_ms_array) {
|
||||
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
#if (CUDA_ARCH >= 900)
|
||||
@@ -329,8 +334,8 @@ void scratch_cuda_programmable_bootstrap_32(
|
||||
level_count, max_shared_memory))
|
||||
scratch_cuda_programmable_bootstrap_tbc<uint32_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint32_t, CLASSICAL> **)buffer,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
lwe_dimension, glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
else
|
||||
#endif
|
||||
if (has_support_to_cuda_programmable_bootstrap_cg<uint32_t>(
|
||||
@@ -338,13 +343,13 @@ void scratch_cuda_programmable_bootstrap_32(
|
||||
input_lwe_ciphertext_count, max_shared_memory))
|
||||
scratch_cuda_programmable_bootstrap_cg<uint32_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint32_t, CLASSICAL> **)buffer,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
lwe_dimension, glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
else
|
||||
scratch_cuda_programmable_bootstrap<uint32_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint32_t, CLASSICAL> **)buffer,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
lwe_dimension, glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -353,9 +358,10 @@ void scratch_cuda_programmable_bootstrap_32(
|
||||
* the GPU in case FULLSM or PARTIALSM mode is going to be used.
|
||||
*/
|
||||
void scratch_cuda_programmable_bootstrap_64(
|
||||
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
|
||||
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
|
||||
bool allocate_ms_array) {
|
||||
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
#if (CUDA_ARCH >= 900)
|
||||
@@ -364,8 +370,8 @@ void scratch_cuda_programmable_bootstrap_64(
|
||||
level_count, max_shared_memory))
|
||||
scratch_cuda_programmable_bootstrap_tbc<uint64_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)buffer,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
lwe_dimension, glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
else
|
||||
#endif
|
||||
if (has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
|
||||
@@ -373,13 +379,13 @@ void scratch_cuda_programmable_bootstrap_64(
|
||||
input_lwe_ciphertext_count, max_shared_memory))
|
||||
scratch_cuda_programmable_bootstrap_cg<uint64_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)buffer,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
lwe_dimension, glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
else
|
||||
scratch_cuda_programmable_bootstrap<uint64_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)buffer,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
lwe_dimension, glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
@@ -661,6 +667,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
void const *lwe_output_indexes, void const *lut_vector,
|
||||
void const *lut_vector_indexes, void const *lwe_array_in,
|
||||
void const *lwe_input_indexes, void const *bootstrapping_key,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride) {
|
||||
@@ -672,6 +679,29 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
pbs_buffer<uint64_t, CLASSICAL> *buffer =
|
||||
(pbs_buffer<uint64_t, CLASSICAL> *)mem_ptr;
|
||||
|
||||
// If the parameters contain noise reduction key, then apply it
|
||||
if (ms_noise_reduction_key != nullptr) {
|
||||
if (ms_noise_reduction_key->num_zeros != 0) {
|
||||
uint32_t log_modulus = log2(polynomial_size) + 1;
|
||||
host_improve_noise_modulus_switch<uint64_t>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
buffer->temp_lwe_array_in,
|
||||
static_cast<uint64_t const *>(lwe_array_in),
|
||||
static_cast<const uint64_t *>(ms_noise_reduction_key->ptr[gpu_index]),
|
||||
lwe_dimension + 1, num_samples, ms_noise_reduction_key->num_zeros,
|
||||
ms_noise_reduction_key->ms_input_variance,
|
||||
ms_noise_reduction_key->ms_r_sigma, ms_noise_reduction_key->ms_bound,
|
||||
log_modulus);
|
||||
} else {
|
||||
buffer->temp_lwe_array_in =
|
||||
const_cast<uint64_t *>(static_cast<const uint64_t *>(lwe_array_in));
|
||||
}
|
||||
} else {
|
||||
buffer->temp_lwe_array_in =
|
||||
const_cast<uint64_t *>(static_cast<const uint64_t *>(lwe_array_in));
|
||||
}
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
switch (buffer->pbs_variant) {
|
||||
case PBS_VARIANT::TBC:
|
||||
#if (CUDA_ARCH >= 900)
|
||||
@@ -680,7 +710,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
static_cast<const uint64_t *>(lwe_output_indexes),
|
||||
static_cast<const uint64_t *>(lut_vector),
|
||||
static_cast<const uint64_t *>(lut_vector_indexes),
|
||||
static_cast<const uint64_t *>(lwe_array_in),
|
||||
static_cast<const uint64_t *>(buffer->temp_lwe_array_in),
|
||||
static_cast<const uint64_t *>(lwe_input_indexes),
|
||||
static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
@@ -695,7 +725,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
static_cast<const uint64_t *>(lwe_output_indexes),
|
||||
static_cast<const uint64_t *>(lut_vector),
|
||||
static_cast<const uint64_t *>(lut_vector_indexes),
|
||||
static_cast<const uint64_t *>(lwe_array_in),
|
||||
static_cast<const uint64_t *>(buffer->temp_lwe_array_in),
|
||||
static_cast<const uint64_t *>(lwe_input_indexes),
|
||||
static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
@@ -707,7 +737,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
static_cast<const uint64_t *>(lwe_output_indexes),
|
||||
static_cast<const uint64_t *>(lut_vector),
|
||||
static_cast<const uint64_t *>(lut_vector_indexes),
|
||||
static_cast<const uint64_t *>(lwe_array_in),
|
||||
static_cast<const uint64_t *>(buffer->temp_lwe_array_in),
|
||||
static_cast<const uint64_t *>(lwe_input_indexes),
|
||||
static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
@@ -754,14 +784,16 @@ template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
|
||||
|
||||
template void scratch_cuda_programmable_bootstrap_cg<uint64_t>(
|
||||
void *stream, uint32_t gpu_index,
|
||||
pbs_buffer<uint64_t, CLASSICAL> **pbs_buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
pbs_buffer<uint64_t, CLASSICAL> **pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
|
||||
bool allocate_ms_array);
|
||||
|
||||
template void scratch_cuda_programmable_bootstrap<uint64_t>(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<uint64_t, CLASSICAL> **buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
|
||||
template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint32_t>(
|
||||
void *stream, uint32_t gpu_index, uint32_t *lwe_array_out,
|
||||
@@ -785,14 +817,16 @@ template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
|
||||
|
||||
template void scratch_cuda_programmable_bootstrap_cg<uint32_t>(
|
||||
void *stream, uint32_t gpu_index,
|
||||
pbs_buffer<uint32_t, CLASSICAL> **pbs_buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
pbs_buffer<uint32_t, CLASSICAL> **pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
|
||||
bool allocate_ms_array);
|
||||
|
||||
template void scratch_cuda_programmable_bootstrap<uint32_t>(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<uint32_t, CLASSICAL> **buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
|
||||
template bool has_support_to_cuda_programmable_bootstrap_tbc<uint32_t>(
|
||||
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
@@ -822,12 +856,14 @@ template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
|
||||
uint32_t lut_stride);
|
||||
template void scratch_cuda_programmable_bootstrap_tbc<uint32_t>(
|
||||
void *stream, uint32_t gpu_index,
|
||||
pbs_buffer<uint32_t, CLASSICAL> **pbs_buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
pbs_buffer<uint32_t, CLASSICAL> **pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
|
||||
bool allocate_ms_array);
|
||||
template void scratch_cuda_programmable_bootstrap_tbc<uint64_t>(
|
||||
void *stream, uint32_t gpu_index,
|
||||
pbs_buffer<uint64_t, CLASSICAL> **pbs_buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
pbs_buffer<uint64_t, CLASSICAL> **pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
|
||||
bool allocate_ms_array);
|
||||
#endif
|
||||
|
||||
@@ -302,9 +302,10 @@ uint64_t get_buffer_size_programmable_bootstrap(
|
||||
template <typename Torus, typename params>
|
||||
__host__ void scratch_programmable_bootstrap(
|
||||
cudaStream_t stream, uint32_t gpu_index,
|
||||
pbs_buffer<Torus, CLASSICAL> **buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
|
||||
pbs_buffer<Torus, CLASSICAL> **buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
|
||||
bool allocate_ms_array) {
|
||||
|
||||
uint64_t full_sm_step_one =
|
||||
get_buffer_size_full_sm_programmable_bootstrap_step_one<Torus>(
|
||||
@@ -380,8 +381,9 @@ __host__ void scratch_programmable_bootstrap(
|
||||
}
|
||||
|
||||
*buffer = new pbs_buffer<Torus, CLASSICAL>(
|
||||
stream, gpu_index, glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, PBS_VARIANT::DEFAULT, allocate_gpu_memory);
|
||||
stream, gpu_index, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, PBS_VARIANT::DEFAULT,
|
||||
allocate_gpu_memory, allocate_ms_array);
|
||||
}
|
||||
|
||||
template <typename Torus, class params, bool first_iter>
|
||||
|
||||
@@ -7,40 +7,41 @@
|
||||
*/
|
||||
void scratch_cuda_programmable_bootstrap_128(
|
||||
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
bool allocate_gpu_memory, bool allocate_ms_array) {
|
||||
|
||||
auto buffer = (pbs_buffer_128<CLASSICAL> **)pbs_buffer;
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
scratch_programmable_bootstrap_128<AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory);
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
break;
|
||||
case 512:
|
||||
scratch_programmable_bootstrap_128<AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory);
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
break;
|
||||
case 1024:
|
||||
scratch_programmable_bootstrap_128<AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory);
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
break;
|
||||
case 2048:
|
||||
scratch_programmable_bootstrap_128<AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory);
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
break;
|
||||
case 4096:
|
||||
scratch_programmable_bootstrap_128<AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory);
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
|
||||
@@ -52,7 +53,7 @@ void scratch_cuda_programmable_bootstrap_128(
|
||||
template <typename Torus>
|
||||
void executor_cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
|
||||
Torus const *lut_vector, Torus const *lwe_array_in,
|
||||
Torus const *lut_vector, Torus *lwe_array_in,
|
||||
double const *bootstrapping_key, pbs_buffer_128<CLASSICAL> *buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
|
||||
@@ -151,18 +152,38 @@ void executor_cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
void cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
void const *lut_vector, void const *lwe_array_in,
|
||||
void const *bootstrapping_key, int8_t *mem_ptr, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples) {
|
||||
void const *bootstrapping_key,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples) {
|
||||
if (base_log > 64)
|
||||
PANIC("Cuda error (classical PBS): base log should be <= 64")
|
||||
|
||||
pbs_buffer_128<CLASSICAL> *buffer = (pbs_buffer_128<CLASSICAL> *)mem_ptr;
|
||||
|
||||
// If the parameters contain noise reduction key, then apply it
|
||||
if (ms_noise_reduction_key->num_zeros != 0) {
|
||||
uint32_t log_modulus = log2(polynomial_size) + 1;
|
||||
host_improve_noise_modulus_switch<__uint128_t>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<__uint128_t *>(buffer->temp_lwe_array_in),
|
||||
static_cast<__uint128_t const *>(lwe_array_in),
|
||||
static_cast<const __uint128_t *>(
|
||||
ms_noise_reduction_key->ptr[gpu_index]),
|
||||
lwe_dimension + 1, num_samples, ms_noise_reduction_key->num_zeros,
|
||||
ms_noise_reduction_key->ms_input_variance,
|
||||
ms_noise_reduction_key->ms_r_sigma, ms_noise_reduction_key->ms_bound,
|
||||
log_modulus);
|
||||
} else {
|
||||
buffer->temp_lwe_array_in = const_cast<__uint128_t *>(
|
||||
static_cast<const __uint128_t *>(lwe_array_in));
|
||||
}
|
||||
|
||||
executor_cuda_programmable_bootstrap_lwe_ciphertext_vector_128<__uint128_t>(
|
||||
stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
|
||||
static_cast<const __uint128_t *>(lut_vector),
|
||||
static_cast<const __uint128_t *>(lwe_array_in),
|
||||
static_cast<__uint128_t *>(buffer->temp_lwe_array_in),
|
||||
static_cast<const double *>(bootstrapping_key), buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, num_samples);
|
||||
}
|
||||
|
||||
@@ -247,8 +247,9 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
template <typename params>
|
||||
__host__ void scratch_programmable_bootstrap_128(
|
||||
cudaStream_t stream, uint32_t gpu_index, pbs_buffer_128<CLASSICAL> **buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
bool allocate_gpu_memory, bool allocate_ms_array) {
|
||||
|
||||
cuda_set_device(gpu_index);
|
||||
uint64_t full_sm_step_one =
|
||||
@@ -342,14 +343,15 @@ __host__ void scratch_programmable_bootstrap_128(
|
||||
}
|
||||
|
||||
*buffer = new pbs_buffer_128<CLASSICAL>(
|
||||
stream, gpu_index, glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, PBS_VARIANT::DEFAULT, allocate_gpu_memory);
|
||||
stream, gpu_index, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, PBS_VARIANT::DEFAULT,
|
||||
allocate_gpu_memory, allocate_ms_array);
|
||||
}
|
||||
|
||||
template <class params, bool first_iter>
|
||||
__host__ void execute_step_one_128(
|
||||
cudaStream_t stream, uint32_t gpu_index, __uint128_t const *lut_vector,
|
||||
__uint128_t const *lwe_array_in, double const *bootstrapping_key,
|
||||
__uint128_t *lwe_array_in, double const *bootstrapping_key,
|
||||
__uint128_t *global_accumulator, double *global_join_buffer,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
@@ -432,7 +434,7 @@ __host__ void execute_step_two_128(
|
||||
template <class params>
|
||||
__host__ void host_programmable_bootstrap_128(
|
||||
cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
|
||||
__uint128_t const *lut_vector, __uint128_t const *lwe_array_in,
|
||||
__uint128_t const *lut_vector, __uint128_t *lwe_array_in,
|
||||
double const *bootstrapping_key, pbs_buffer_128<CLASSICAL> *pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count,
|
||||
|
||||
@@ -197,9 +197,10 @@ __global__ void device_programmable_bootstrap_tbc(
|
||||
template <typename Torus, typename params>
|
||||
__host__ void scratch_programmable_bootstrap_tbc(
|
||||
cudaStream_t stream, uint32_t gpu_index,
|
||||
pbs_buffer<Torus, CLASSICAL> **buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
|
||||
pbs_buffer<Torus, CLASSICAL> **buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
|
||||
bool allocate_ms_array) {
|
||||
|
||||
cuda_set_device(gpu_index);
|
||||
|
||||
@@ -247,8 +248,9 @@ __host__ void scratch_programmable_bootstrap_tbc(
|
||||
}
|
||||
|
||||
*buffer = new pbs_buffer<Torus, CLASSICAL>(
|
||||
stream, gpu_index, glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, PBS_VARIANT::TBC, allocate_gpu_memory);
|
||||
stream, gpu_index, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, PBS_VARIANT::TBC,
|
||||
allocate_gpu_memory, allocate_ms_array);
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
@@ -263,8 +263,8 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, TbcPBC)
|
||||
|
||||
scratch_cuda_programmable_bootstrap_tbc<uint64_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)&buffer,
|
||||
glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count,
|
||||
true);
|
||||
lwe_dimension, glwe_dimension, polynomial_size, pbs_level,
|
||||
input_lwe_ciphertext_count, true, false);
|
||||
uint32_t num_many_lut = 1;
|
||||
uint32_t lut_stride = 0;
|
||||
for (auto _ : st) {
|
||||
@@ -295,8 +295,8 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, CgPBS)
|
||||
|
||||
scratch_cuda_programmable_bootstrap_cg<uint64_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)&buffer,
|
||||
glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count,
|
||||
true);
|
||||
lwe_dimension, glwe_dimension, polynomial_size, pbs_level,
|
||||
input_lwe_ciphertext_count, true, false);
|
||||
uint32_t num_many_lut = 1;
|
||||
uint32_t lut_stride = 0;
|
||||
for (auto _ : st) {
|
||||
@@ -320,8 +320,8 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, DefaultPBS)
|
||||
|
||||
scratch_cuda_programmable_bootstrap<uint64_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)&buffer,
|
||||
glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count,
|
||||
true);
|
||||
lwe_dimension, glwe_dimension, polynomial_size, pbs_level,
|
||||
input_lwe_ciphertext_count, true, false);
|
||||
uint32_t num_many_lut = 1;
|
||||
uint32_t lut_stride = 0;
|
||||
for (auto _ : st) {
|
||||
|
||||
@@ -165,9 +165,9 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, amortized_bootstrap) {
|
||||
|
||||
TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, bootstrap) {
|
||||
int8_t *pbs_buffer;
|
||||
scratch_cuda_programmable_bootstrap_64(stream, gpu_index, &pbs_buffer,
|
||||
glwe_dimension, polynomial_size,
|
||||
pbs_level, number_of_inputs, true);
|
||||
scratch_cuda_programmable_bootstrap_64(
|
||||
stream, gpu_index, &pbs_buffer, lwe_dimension, glwe_dimension,
|
||||
polynomial_size, pbs_level, number_of_inputs, true, false);
|
||||
|
||||
int number_of_sm = 0;
|
||||
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
|
||||
@@ -190,9 +190,9 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, bootstrap) {
|
||||
stream, gpu_index, (void *)d_lwe_ct_out_array,
|
||||
(void *)d_lwe_output_indexes, (void *)d_lut_pbs_identity,
|
||||
(void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in,
|
||||
(void *)d_lwe_input_indexes, (void *)d_fourier_bsk, pbs_buffer,
|
||||
lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
|
||||
pbs_level, number_of_inputs, num_many_lut, lut_stride);
|
||||
(void *)d_lwe_input_indexes, (void *)d_fourier_bsk, nullptr,
|
||||
pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
pbs_base_log, pbs_level, number_of_inputs, num_many_lut, lut_stride);
|
||||
// Copy result back
|
||||
cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
|
||||
(glwe_dimension * polynomial_size + 1) *
|
||||
|
||||
@@ -35,9 +35,62 @@ unsafe extern "C" {
|
||||
polynomial_size: u32,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_modulus_switch_inplace_64(
|
||||
stream: *mut ffi::c_void,
|
||||
gpu_index: u32,
|
||||
lwe_array_out: *mut ffi::c_void,
|
||||
size: u32,
|
||||
log_modulus: u32,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_improve_noise_modulus_switch_64(
|
||||
stream: *mut ffi::c_void,
|
||||
gpu_index: u32,
|
||||
lwe_array_out: *mut ffi::c_void,
|
||||
lwe_array_in: *const ffi::c_void,
|
||||
encrypted_zeros: *const ffi::c_void,
|
||||
lwe_size: u32,
|
||||
num_lwes: u32,
|
||||
num_zeros: u32,
|
||||
input_variance: f64,
|
||||
r_sigma: f64,
|
||||
bound: f64,
|
||||
log_modulus: u32,
|
||||
);
|
||||
}
|
||||
pub const PBS_TYPE_MULTI_BIT: PBS_TYPE = 0;
|
||||
pub const PBS_TYPE_CLASSICAL: PBS_TYPE = 1;
|
||||
pub type PBS_TYPE = ffi::c_uint;
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Copy, Clone)]
|
||||
pub struct CudaModulusSwitchNoiseReductionKeyFFI {
|
||||
pub ptr: *const *mut ffi::c_void,
|
||||
pub num_zeros: u32,
|
||||
pub ms_bound: f64,
|
||||
pub ms_r_sigma: f64,
|
||||
pub ms_input_variance: f64,
|
||||
}
|
||||
#[allow(clippy::unnecessary_operation, clippy::identity_op)]
|
||||
const _: () = {
|
||||
["Size of CudaModulusSwitchNoiseReductionKeyFFI"]
|
||||
[::std::mem::size_of::<CudaModulusSwitchNoiseReductionKeyFFI>() - 40usize];
|
||||
["Alignment of CudaModulusSwitchNoiseReductionKeyFFI"]
|
||||
[::std::mem::align_of::<CudaModulusSwitchNoiseReductionKeyFFI>() - 8usize];
|
||||
["Offset of field: CudaModulusSwitchNoiseReductionKeyFFI::ptr"]
|
||||
[::std::mem::offset_of!(CudaModulusSwitchNoiseReductionKeyFFI, ptr) - 0usize];
|
||||
["Offset of field: CudaModulusSwitchNoiseReductionKeyFFI::num_zeros"]
|
||||
[::std::mem::offset_of!(CudaModulusSwitchNoiseReductionKeyFFI, num_zeros) - 8usize];
|
||||
["Offset of field: CudaModulusSwitchNoiseReductionKeyFFI::ms_bound"]
|
||||
[::std::mem::offset_of!(CudaModulusSwitchNoiseReductionKeyFFI, ms_bound) - 16usize];
|
||||
["Offset of field: CudaModulusSwitchNoiseReductionKeyFFI::ms_r_sigma"]
|
||||
[::std::mem::offset_of!(CudaModulusSwitchNoiseReductionKeyFFI, ms_r_sigma) - 24usize];
|
||||
["Offset of field: CudaModulusSwitchNoiseReductionKeyFFI::ms_input_variance"][::std::mem::offset_of!(
|
||||
CudaModulusSwitchNoiseReductionKeyFFI,
|
||||
ms_input_variance
|
||||
) - 32usize];
|
||||
};
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_integer_compress_radix_ciphertext_64(
|
||||
streams: *const *mut ffi::c_void,
|
||||
@@ -78,6 +131,7 @@ unsafe extern "C" {
|
||||
storage_log_modulus: u32,
|
||||
body_count: u32,
|
||||
allocate_gpu_memory: bool,
|
||||
allocate_ms_array: bool,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
@@ -191,6 +245,7 @@ unsafe extern "C" {
|
||||
pbs_type: PBS_TYPE,
|
||||
lut_degree: u64,
|
||||
allocate_gpu_memory: bool,
|
||||
allocate_ms_array: bool,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
@@ -215,6 +270,7 @@ unsafe extern "C" {
|
||||
num_many_lut: u32,
|
||||
lut_degree: u64,
|
||||
allocate_gpu_memory: bool,
|
||||
allocate_ms_array: bool,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
@@ -226,6 +282,7 @@ unsafe extern "C" {
|
||||
input_radix_lwe: *const CudaRadixCiphertextFFI,
|
||||
mem_ptr: *mut i8,
|
||||
ksks: *const *mut ffi::c_void,
|
||||
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
|
||||
bsks: *const *mut ffi::c_void,
|
||||
);
|
||||
}
|
||||
@@ -258,6 +315,7 @@ unsafe extern "C" {
|
||||
pbs_type: PBS_TYPE,
|
||||
lut_degree: u64,
|
||||
allocate_gpu_memory: bool,
|
||||
allocate_ms_array: bool,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
@@ -270,6 +328,7 @@ unsafe extern "C" {
|
||||
input_radix_lwe_2: *const CudaRadixCiphertextFFI,
|
||||
mem_ptr: *mut i8,
|
||||
ksks: *const *mut ffi::c_void,
|
||||
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
|
||||
bsks: *const *mut ffi::c_void,
|
||||
num_radix_blocks: u32,
|
||||
shift: u32,
|
||||
@@ -292,6 +351,7 @@ unsafe extern "C" {
|
||||
input_radix_lwe: *const CudaRadixCiphertextFFI,
|
||||
mem_ptr: *mut i8,
|
||||
ksks: *const *mut ffi::c_void,
|
||||
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
|
||||
bsks: *const *mut ffi::c_void,
|
||||
num_luts: u32,
|
||||
lut_stride: u32,
|
||||
@@ -315,6 +375,7 @@ unsafe extern "C" {
|
||||
carry_modulus: u32,
|
||||
pbs_type: PBS_TYPE,
|
||||
allocate_gpu_memory: bool,
|
||||
allocate_ms_array: bool,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
@@ -325,6 +386,7 @@ unsafe extern "C" {
|
||||
input_blocks: *mut CudaRadixCiphertextFFI,
|
||||
mem_ptr: *mut i8,
|
||||
ksks: *const *mut ffi::c_void,
|
||||
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
|
||||
bsks: *const *mut ffi::c_void,
|
||||
num_blocks: u32,
|
||||
);
|
||||
@@ -358,6 +420,7 @@ unsafe extern "C" {
|
||||
num_blocks: u32,
|
||||
pbs_type: PBS_TYPE,
|
||||
allocate_gpu_memory: bool,
|
||||
allocate_ms_array: bool,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
@@ -372,6 +435,7 @@ unsafe extern "C" {
|
||||
is_bool_right: bool,
|
||||
bsks: *const *mut ffi::c_void,
|
||||
ksks: *const *mut ffi::c_void,
|
||||
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
|
||||
mem_ptr: *mut i8,
|
||||
polynomial_size: u32,
|
||||
num_blocks: u32,
|
||||
@@ -431,6 +495,7 @@ unsafe extern "C" {
|
||||
pbs_type: PBS_TYPE,
|
||||
shift_type: SHIFT_OR_ROTATE_TYPE,
|
||||
allocate_gpu_memory: bool,
|
||||
allocate_ms_array: bool,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
@@ -443,6 +508,7 @@ unsafe extern "C" {
|
||||
mem_ptr: *mut i8,
|
||||
bsks: *const *mut ffi::c_void,
|
||||
ksks: *const *mut ffi::c_void,
|
||||
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
@@ -466,6 +532,7 @@ unsafe extern "C" {
|
||||
pbs_type: PBS_TYPE,
|
||||
shift_type: SHIFT_OR_ROTATE_TYPE,
|
||||
allocate_gpu_memory: bool,
|
||||
allocate_ms_array: bool,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
@@ -478,6 +545,7 @@ unsafe extern "C" {
|
||||
mem_ptr: *mut i8,
|
||||
bsks: *const *mut ffi::c_void,
|
||||
ksks: *const *mut ffi::c_void,
|
||||
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
@@ -518,6 +586,7 @@ unsafe extern "C" {
|
||||
shift_type: SHIFT_OR_ROTATE_TYPE,
|
||||
is_signed: bool,
|
||||
allocate_gpu_memory: bool,
|
||||
allocate_ms_array: bool,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
@@ -530,6 +599,7 @@ unsafe extern "C" {
|
||||
mem_ptr: *mut i8,
|
||||
bsks: *const *mut ffi::c_void,
|
||||
ksks: *const *mut ffi::c_void,
|
||||
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
@@ -562,6 +632,7 @@ unsafe extern "C" {
|
||||
op_type: COMPARISON_TYPE,
|
||||
is_signed: bool,
|
||||
allocate_gpu_memory: bool,
|
||||
allocate_ms_array: bool,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
@@ -575,6 +646,7 @@ unsafe extern "C" {
|
||||
mem_ptr: *mut i8,
|
||||
bsks: *const *mut ffi::c_void,
|
||||
ksks: *const *mut ffi::c_void,
|
||||
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
@@ -589,6 +661,7 @@ unsafe extern "C" {
|
||||
mem_ptr: *mut i8,
|
||||
bsks: *const *mut ffi::c_void,
|
||||
ksks: *const *mut ffi::c_void,
|
||||
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
|
||||
num_scalar_blocks: u32,
|
||||
);
|
||||
}
|
||||
@@ -621,6 +694,7 @@ unsafe extern "C" {
|
||||
pbs_type: PBS_TYPE,
|
||||
op_type: BITOP_TYPE,
|
||||
allocate_gpu_memory: bool,
|
||||
allocate_ms_array: bool,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
@@ -634,6 +708,7 @@ unsafe extern "C" {
|
||||
mem_ptr: *mut i8,
|
||||
bsks: *const *mut ffi::c_void,
|
||||
ksks: *const *mut ffi::c_void,
|
||||
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
@@ -649,6 +724,7 @@ unsafe extern "C" {
|
||||
mem_ptr: *mut i8,
|
||||
bsks: *const *mut ffi::c_void,
|
||||
ksks: *const *mut ffi::c_void,
|
||||
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
@@ -679,6 +755,7 @@ unsafe extern "C" {
|
||||
carry_modulus: u32,
|
||||
pbs_type: PBS_TYPE,
|
||||
allocate_gpu_memory: bool,
|
||||
allocate_ms_array: bool,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
@@ -693,6 +770,7 @@ unsafe extern "C" {
|
||||
mem_ptr: *mut i8,
|
||||
bsks: *const *mut ffi::c_void,
|
||||
ksks: *const *mut ffi::c_void,
|
||||
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
@@ -724,6 +802,7 @@ unsafe extern "C" {
|
||||
pbs_type: PBS_TYPE,
|
||||
shift_type: SHIFT_OR_ROTATE_TYPE,
|
||||
allocate_gpu_memory: bool,
|
||||
allocate_ms_array: bool,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
@@ -736,6 +815,7 @@ unsafe extern "C" {
|
||||
mem_ptr: *mut i8,
|
||||
bsks: *const *mut ffi::c_void,
|
||||
ksks: *const *mut ffi::c_void,
|
||||
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
@@ -768,6 +848,7 @@ unsafe extern "C" {
|
||||
requested_flag: u32,
|
||||
uses_carry: u32,
|
||||
allocate_gpu_memory: bool,
|
||||
allocate_ms_array: bool,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
@@ -792,6 +873,7 @@ unsafe extern "C" {
|
||||
requested_flag: u32,
|
||||
uses_carry: u32,
|
||||
allocate_gpu_memory: bool,
|
||||
allocate_ms_array: bool,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
@@ -805,6 +887,7 @@ unsafe extern "C" {
|
||||
mem_ptr: *mut i8,
|
||||
bsks: *const *mut ffi::c_void,
|
||||
ksks: *const *mut ffi::c_void,
|
||||
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
|
||||
requested_flag: u32,
|
||||
uses_carry: u32,
|
||||
);
|
||||
@@ -821,6 +904,7 @@ unsafe extern "C" {
|
||||
mem_ptr: *mut i8,
|
||||
bsks: *const *mut ffi::c_void,
|
||||
ksks: *const *mut ffi::c_void,
|
||||
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
|
||||
requested_flag: u32,
|
||||
uses_carry: u32,
|
||||
);
|
||||
@@ -862,6 +946,7 @@ unsafe extern "C" {
|
||||
pbs_type: PBS_TYPE,
|
||||
compute_overflow: u32,
|
||||
allocate_gpu_memory: bool,
|
||||
allocate_ms_array: bool,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
@@ -876,6 +961,7 @@ unsafe extern "C" {
|
||||
mem_ptr: *mut i8,
|
||||
bsks: *const *mut ffi::c_void,
|
||||
ksks: *const *mut ffi::c_void,
|
||||
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
|
||||
compute_overflow: u32,
|
||||
uses_input_borrow: u32,
|
||||
);
|
||||
@@ -908,6 +994,7 @@ unsafe extern "C" {
|
||||
carry_modulus: u32,
|
||||
pbs_type: PBS_TYPE,
|
||||
allocate_gpu_memory: bool,
|
||||
allocate_ms_array: bool,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
@@ -920,6 +1007,7 @@ unsafe extern "C" {
|
||||
mem_ptr: *mut i8,
|
||||
bsks: *const *mut ffi::c_void,
|
||||
ksks: *const *mut ffi::c_void,
|
||||
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
@@ -949,6 +1037,7 @@ unsafe extern "C" {
|
||||
carry_modulus: u32,
|
||||
pbs_type: PBS_TYPE,
|
||||
allocate_gpu_memory: bool,
|
||||
allocate_ms_array: bool,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
@@ -962,6 +1051,7 @@ unsafe extern "C" {
|
||||
mem_ptr: *mut i8,
|
||||
bsks: *const *mut ffi::c_void,
|
||||
ksks: *const *mut ffi::c_void,
|
||||
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
|
||||
polynomial_size: u32,
|
||||
message_modulus: u32,
|
||||
num_scalars: u32,
|
||||
@@ -996,6 +1086,7 @@ unsafe extern "C" {
|
||||
carry_modulus: u32,
|
||||
pbs_type: PBS_TYPE,
|
||||
allocate_gpu_memory: bool,
|
||||
allocate_ms_array: bool,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
@@ -1011,6 +1102,7 @@ unsafe extern "C" {
|
||||
mem_ptr: *mut i8,
|
||||
bsks: *const *mut ffi::c_void,
|
||||
ksks: *const *mut ffi::c_void,
|
||||
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
@@ -1042,6 +1134,7 @@ unsafe extern "C" {
|
||||
pbs_type: PBS_TYPE,
|
||||
lut_degree: u64,
|
||||
allocate_gpu_memory: bool,
|
||||
allocate_ms_array: bool,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
@@ -1053,6 +1146,7 @@ unsafe extern "C" {
|
||||
generates_or_propagates: *mut CudaRadixCiphertextFFI,
|
||||
mem_ptr: *mut i8,
|
||||
ksks: *const *mut ffi::c_void,
|
||||
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
|
||||
bsks: *const *mut ffi::c_void,
|
||||
num_blocks: u32,
|
||||
);
|
||||
@@ -1094,6 +1188,7 @@ unsafe extern "C" {
|
||||
carry_modulus: u32,
|
||||
pbs_type: PBS_TYPE,
|
||||
allocate_gpu_memory: bool,
|
||||
allocate_ms_array: bool,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
@@ -1106,6 +1201,7 @@ unsafe extern "C" {
|
||||
is_signed: bool,
|
||||
bsks: *const *mut ffi::c_void,
|
||||
ksks: *const *mut ffi::c_void,
|
||||
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
@@ -1136,6 +1232,7 @@ unsafe extern "C" {
|
||||
carry_modulus: u32,
|
||||
pbs_type: PBS_TYPE,
|
||||
allocate_gpu_memory: bool,
|
||||
allocate_ms_array: bool,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
@@ -1148,6 +1245,7 @@ unsafe extern "C" {
|
||||
mem_ptr: *mut i8,
|
||||
bsks: *const *mut ffi::c_void,
|
||||
ksks: *const *mut ffi::c_void,
|
||||
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
|
||||
num_radix_blocks: u32,
|
||||
);
|
||||
}
|
||||
@@ -1179,6 +1277,7 @@ unsafe extern "C" {
|
||||
carry_modulus: u32,
|
||||
pbs_type: PBS_TYPE,
|
||||
allocate_gpu_memory: bool,
|
||||
allocate_ms_array: bool,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
@@ -1191,6 +1290,7 @@ unsafe extern "C" {
|
||||
mem_ptr: *mut i8,
|
||||
bsks: *const *mut ffi::c_void,
|
||||
ksks: *const *mut ffi::c_void,
|
||||
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
|
||||
num_radix_blocks: u32,
|
||||
);
|
||||
}
|
||||
@@ -1522,11 +1622,13 @@ unsafe extern "C" {
|
||||
stream: *mut ffi::c_void,
|
||||
gpu_index: u32,
|
||||
buffer: *mut *mut i8,
|
||||
lwe_dimension: u32,
|
||||
glwe_dimension: u32,
|
||||
polynomial_size: u32,
|
||||
level_count: u32,
|
||||
input_lwe_ciphertext_count: u32,
|
||||
allocate_gpu_memory: bool,
|
||||
allocate_ms_array: bool,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
@@ -1534,11 +1636,13 @@ unsafe extern "C" {
|
||||
stream: *mut ffi::c_void,
|
||||
gpu_index: u32,
|
||||
buffer: *mut *mut i8,
|
||||
lwe_dimension: u32,
|
||||
glwe_dimension: u32,
|
||||
polynomial_size: u32,
|
||||
level_count: u32,
|
||||
input_lwe_ciphertext_count: u32,
|
||||
allocate_gpu_memory: bool,
|
||||
allocate_ms_array: bool,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
@@ -1546,11 +1650,13 @@ unsafe extern "C" {
|
||||
stream: *mut ffi::c_void,
|
||||
gpu_index: u32,
|
||||
buffer: *mut *mut i8,
|
||||
lwe_dimension: u32,
|
||||
glwe_dimension: u32,
|
||||
polynomial_size: u32,
|
||||
level_count: u32,
|
||||
input_lwe_ciphertext_count: u32,
|
||||
allocate_gpu_memory: bool,
|
||||
allocate_ms_array: bool,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
@@ -1586,6 +1692,7 @@ unsafe extern "C" {
|
||||
lwe_array_in: *const ffi::c_void,
|
||||
lwe_input_indexes: *const ffi::c_void,
|
||||
bootstrapping_key: *const ffi::c_void,
|
||||
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
|
||||
buffer: *mut i8,
|
||||
lwe_dimension: u32,
|
||||
glwe_dimension: u32,
|
||||
@@ -1605,6 +1712,7 @@ unsafe extern "C" {
|
||||
lut_vector: *const ffi::c_void,
|
||||
lwe_array_in: *const ffi::c_void,
|
||||
bootstrapping_key: *const ffi::c_void,
|
||||
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
|
||||
buffer: *mut i8,
|
||||
lwe_dimension: u32,
|
||||
glwe_dimension: u32,
|
||||
|
||||
@@ -89,7 +89,7 @@ shortint = ["dep:sha3"]
|
||||
integer = ["shortint"]
|
||||
strings = ["integer"]
|
||||
internal-keycache = ["dep:fs2"]
|
||||
gpu = ["dep:tfhe-cuda-backend"]
|
||||
gpu = ["dep:tfhe-cuda-backend","shortint"]
|
||||
zk-pok = ["dep:tfhe-zk-pok"]
|
||||
|
||||
# Adds more FheUint/FheInt types to the HL
|
||||
|
||||
@@ -451,7 +451,8 @@ mod cuda {
|
||||
params.lwe_dimension.unwrap(),
|
||||
params.ciphertext_modulus.unwrap(),
|
||||
);
|
||||
let bsk_gpu = CudaLweBootstrapKey::from_lwe_bootstrap_key(&bsk, &stream);
|
||||
|
||||
let bsk_gpu = CudaLweBootstrapKey::from_lwe_bootstrap_key(&bsk, None, &stream);
|
||||
|
||||
// Allocate a new LweCiphertext and encrypt our plaintext
|
||||
let input_ks_ct = allocate_and_encrypt_new_lwe_ciphertext(
|
||||
|
||||
@@ -224,7 +224,7 @@ mod cuda {
|
||||
lwe_dimension,
|
||||
ciphertext_modulus,
|
||||
);
|
||||
let bsk_gpu = CudaLweBootstrapKey::from_lwe_bootstrap_key(&bsk, &stream);
|
||||
let bsk_gpu = CudaLweBootstrapKey::from_lwe_bootstrap_key(&bsk, None, &stream);
|
||||
|
||||
let message_modulus: Scalar = 1 << 4;
|
||||
|
||||
|
||||
@@ -858,7 +858,8 @@ mod cuda {
|
||||
params.lwe_dimension.unwrap(),
|
||||
params.ciphertext_modulus.unwrap(),
|
||||
);
|
||||
let bsk_gpu = CudaLweBootstrapKey::from_lwe_bootstrap_key(&bsk, &stream);
|
||||
|
||||
let bsk_gpu = CudaLweBootstrapKey::from_lwe_bootstrap_key(&bsk, None, &stream);
|
||||
|
||||
// Allocate a new LweCiphertext and encrypt our plaintext
|
||||
let lwe_ciphertext_in = allocate_and_encrypt_new_lwe_ciphertext(
|
||||
@@ -1109,7 +1110,8 @@ mod cuda {
|
||||
params.lwe_dimension.unwrap(),
|
||||
params.ciphertext_modulus.unwrap(),
|
||||
);
|
||||
let bsk_gpu = CudaLweBootstrapKey::from_lwe_bootstrap_key(&bsk, &stream);
|
||||
|
||||
let bsk_gpu = CudaLweBootstrapKey::from_lwe_bootstrap_key(&bsk, None, &stream);
|
||||
|
||||
const NUM_CTS: usize = 8192;
|
||||
let plaintext_list = PlaintextList::new(Scalar::ZERO, PlaintextCount(NUM_CTS));
|
||||
|
||||
@@ -127,6 +127,9 @@ pub unsafe fn cuda_programmable_bootstrap_lwe_ciphertext_async<Scalar>(
|
||||
lut_indexes.gpu_index(0).get(),
|
||||
);
|
||||
|
||||
let lwe_dimension = input.lwe_dimension();
|
||||
let ct_modulus = input.ciphertext_modulus().raw_modulus_float();
|
||||
|
||||
programmable_bootstrap_async(
|
||||
streams,
|
||||
&mut output.0.d_vec,
|
||||
@@ -136,12 +139,14 @@ pub unsafe fn cuda_programmable_bootstrap_lwe_ciphertext_async<Scalar>(
|
||||
&input.0.d_vec,
|
||||
input_indexes,
|
||||
&bsk.d_vec,
|
||||
input.lwe_dimension(),
|
||||
lwe_dimension,
|
||||
bsk.glwe_dimension(),
|
||||
bsk.polynomial_size(),
|
||||
bsk.decomp_base_log(),
|
||||
bsk.decomp_level_count(),
|
||||
num_samples.0 as u32,
|
||||
bsk.d_ms_noise_reduction_key.as_ref(),
|
||||
ct_modulus,
|
||||
);
|
||||
}
|
||||
|
||||
@@ -256,19 +261,22 @@ pub unsafe fn cuda_programmable_bootstrap_128_lwe_ciphertext_async<Scalar>(
|
||||
streams.gpu_indexes[0].get(),
|
||||
accumulator.0.d_vec.gpu_index(0).get(),
|
||||
);
|
||||
|
||||
let lwe_dimension = input.lwe_dimension();
|
||||
let ct_modulus = input.ciphertext_modulus().raw_modulus_float();
|
||||
programmable_bootstrap_128_async(
|
||||
streams,
|
||||
&mut output.0.d_vec,
|
||||
&accumulator.0.d_vec,
|
||||
&input.0.d_vec,
|
||||
&bsk.d_vec,
|
||||
input.lwe_dimension(),
|
||||
lwe_dimension,
|
||||
bsk.glwe_dimension(),
|
||||
bsk.polynomial_size(),
|
||||
bsk.decomp_base_log(),
|
||||
bsk.decomp_level_count(),
|
||||
num_samples.0 as u32,
|
||||
bsk.d_ms_noise_reduction_key.as_ref(),
|
||||
ct_modulus,
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
@@ -91,7 +91,7 @@ fn lwe_encrypt_pbs_decrypt<
|
||||
ciphertext_modulus
|
||||
));
|
||||
|
||||
let d_bsk = CudaLweBootstrapKey::from_lwe_bootstrap_key(&bsk, &stream);
|
||||
let d_bsk = CudaLweBootstrapKey::from_lwe_bootstrap_key(&bsk, None, &stream);
|
||||
|
||||
while msg != Scalar::ZERO {
|
||||
msg = msg.wrapping_sub(Scalar::ONE);
|
||||
|
||||
@@ -78,7 +78,7 @@ where
|
||||
|
||||
let gpu_index = 0;
|
||||
let stream = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
|
||||
let d_bsk = CudaLweBootstrapKey::from_lwe_bootstrap_key(&std_bootstrapping_key, &stream);
|
||||
let d_bsk = CudaLweBootstrapKey::from_lwe_bootstrap_key(&std_bootstrapping_key, None, &stream);
|
||||
|
||||
// Our 4 bits message space
|
||||
let message_modulus: Scalar = Scalar::ONE << 4;
|
||||
|
||||
@@ -8,8 +8,8 @@ mod lwe_multi_bit_programmable_bootstrapping;
|
||||
mod lwe_packing_keyswitch;
|
||||
mod lwe_programmable_bootstrapping;
|
||||
mod lwe_programmable_bootstrapping_128;
|
||||
mod modulus_switch_noise_reduction;
|
||||
mod noise_distribution;
|
||||
|
||||
pub struct CudaPackingKeySwitchKeys<Scalar: UnsignedInteger> {
|
||||
pub lwe_sk: LweSecretKey<Vec<Scalar>>,
|
||||
pub glwe_sk: GlweSecretKey<Vec<Scalar>>,
|
||||
|
||||
@@ -0,0 +1,265 @@
|
||||
use super::super::test::TestResources;
|
||||
use super::*;
|
||||
use crate::core_crypto::commons::test_tools::{check_both_ratio_under, mean, variance};
|
||||
use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
|
||||
use crate::core_crypto::gpu::CudaStreams;
|
||||
use crate::core_crypto::prelude::{
|
||||
allocate_and_encrypt_new_lwe_ciphertext, decrypt_lwe_ciphertext, encrypt_lwe_ciphertext_list,
|
||||
LweCiphertextCount, LweCiphertextList, LweSecretKey, LweSecretKeyOwned, Plaintext,
|
||||
PlaintextCount, PlaintextList, Variance,
|
||||
};
|
||||
|
||||
use crate::core_crypto::gpu::GpuIndex;
|
||||
use rayon::iter::{IntoParallelIterator, ParallelIterator};
|
||||
use std::cell::RefCell;
|
||||
use tfhe_cuda_backend::bindings::{
|
||||
cuda_improve_noise_modulus_switch_64, cuda_modulus_switch_inplace_64,
|
||||
};
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
struct MsNoiseReductionTestParams {
|
||||
pub lwe_dimension: LweDimension,
|
||||
pub lwe_noise_distribution: DynamicDistribution<u64>,
|
||||
pub ciphertext_modulus: CiphertextModulus<u64>,
|
||||
pub modulus_switch_zeros_count: LweCiphertextCount,
|
||||
pub bound: NoiseEstimationMeasureBound,
|
||||
pub r_sigma_factor: RSigmaFactor,
|
||||
pub input_variance: Variance,
|
||||
pub log_modulus: CiphertextModulusLog,
|
||||
pub expected_variance_improved: Variance,
|
||||
}
|
||||
|
||||
const TEST_PARAM: MsNoiseReductionTestParams = MsNoiseReductionTestParams {
|
||||
lwe_dimension: LweDimension(918),
|
||||
lwe_noise_distribution: DynamicDistribution::new_t_uniform(45),
|
||||
ciphertext_modulus: CiphertextModulus::new_native(),
|
||||
modulus_switch_zeros_count: LweCiphertextCount(1449),
|
||||
bound: NoiseEstimationMeasureBound(288230376151711744_f64),
|
||||
r_sigma_factor: RSigmaFactor(13.179852282053789f64),
|
||||
log_modulus: PolynomialSize(2048).to_blind_rotation_input_modulus_log(),
|
||||
expected_variance_improved: Variance(1.40546154228955e-6),
|
||||
input_variance: Variance(2.63039184094559e-7f64),
|
||||
};
|
||||
|
||||
thread_local! {
|
||||
static TEST_RESOURCES: RefCell<TestResources> = {
|
||||
RefCell::new(TestResources::new())
|
||||
}
|
||||
}
|
||||
|
||||
fn round_mask_gpu(
|
||||
ct: &mut LweCiphertext<Vec<u64>>,
|
||||
d_ct: &mut CudaLweCiphertextList<u64>,
|
||||
log_modulus: CiphertextModulusLog,
|
||||
lwe_dimension: LweDimension,
|
||||
|
||||
streams: &CudaStreams,
|
||||
) {
|
||||
let shift_to_map_to_native = u64::BITS - log_modulus.0 as u32;
|
||||
|
||||
unsafe {
|
||||
//Here i call it with lwe_dimension cause i don't want to change the body
|
||||
cuda_modulus_switch_inplace_64(
|
||||
streams.ptr[0],
|
||||
streams.gpu_indexes[0].get(),
|
||||
d_ct.0.d_vec.as_mut_c_ptr(0),
|
||||
lwe_dimension.0 as u32,
|
||||
log_modulus.0 as u32,
|
||||
);
|
||||
}
|
||||
streams.synchronize();
|
||||
let cpu_lwe_list = d_ct.to_lwe_ciphertext_list(streams);
|
||||
|
||||
let mut ct_after_ms =
|
||||
LweCiphertext::from_container(cpu_lwe_list.into_container(), ct.ciphertext_modulus());
|
||||
|
||||
for val in ct_after_ms.get_mut_mask().as_mut() {
|
||||
*val <<= shift_to_map_to_native;
|
||||
}
|
||||
|
||||
*ct = ct_after_ms;
|
||||
}
|
||||
|
||||
fn measure_noise_added_by_message_preserving_operation<C1, C2>(
|
||||
sk: &LweSecretKey<C1>,
|
||||
mut ct: LweCiphertext<C2>,
|
||||
message_preserving_operation: impl Fn(&mut LweCiphertext<C2>),
|
||||
) -> f64
|
||||
where
|
||||
C1: Container<Element = u64>,
|
||||
C2: ContainerMut<Element = u64>,
|
||||
{
|
||||
let decrypted_before = decrypt_lwe_ciphertext(sk, &ct);
|
||||
|
||||
message_preserving_operation(&mut ct);
|
||||
|
||||
let decrypted_after = decrypt_lwe_ciphertext(sk, &ct);
|
||||
|
||||
decrypted_after.0.wrapping_sub(decrypted_before.0) as i64 as f64
|
||||
}
|
||||
|
||||
/// Runs the modulus switch noise reduction check with `TEST_PARAM`.
#[test]
fn check_noise_improve_modulus_switch_noise_test_param() {
    let params = TEST_PARAM;
    check_noise_improve_modulus_switch_noise(params);
}
|
||||
|
||||
fn check_noise_improve_modulus_switch_noise(
|
||||
ms_noise_reduction_test_params: MsNoiseReductionTestParams,
|
||||
) {
|
||||
let MsNoiseReductionTestParams {
|
||||
lwe_dimension,
|
||||
lwe_noise_distribution,
|
||||
ciphertext_modulus,
|
||||
modulus_switch_zeros_count,
|
||||
bound,
|
||||
r_sigma_factor,
|
||||
log_modulus,
|
||||
expected_variance_improved,
|
||||
input_variance,
|
||||
} = ms_noise_reduction_test_params;
|
||||
|
||||
let number_loops = 100_000;
|
||||
|
||||
let mut rsc = TestResources::new();
|
||||
|
||||
let mut sk = LweSecretKeyOwned::new_empty_key(0, lwe_dimension);
|
||||
|
||||
for sk_bit in sk.as_mut().iter_mut().step_by(2) {
|
||||
*sk_bit = 1;
|
||||
}
|
||||
|
||||
let sk_average_bit: f64 =
|
||||
sk.as_view().into_container().iter().sum::<u64>() as f64 / sk.lwe_dimension().0 as f64;
|
||||
|
||||
println!("sk_average_bit {sk_average_bit:.3}");
|
||||
|
||||
let plaintext_list = PlaintextList::new(0, PlaintextCount(modulus_switch_zeros_count.0));
|
||||
|
||||
let mut encryptions_of_zero = LweCiphertextList::new(
|
||||
0,
|
||||
lwe_dimension.to_lwe_size(),
|
||||
modulus_switch_zeros_count,
|
||||
ciphertext_modulus,
|
||||
);
|
||||
|
||||
encrypt_lwe_ciphertext_list(
|
||||
&sk,
|
||||
&mut encryptions_of_zero,
|
||||
&plaintext_list,
|
||||
lwe_noise_distribution,
|
||||
&mut rsc.encryption_random_generator,
|
||||
);
|
||||
|
||||
let gpu_index = 0;
|
||||
let streams = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
|
||||
|
||||
let d_encryptions_of_zero = CudaLweCiphertextList::from_lwe_ciphertext_list(
|
||||
&encryptions_of_zero,
|
||||
&CudaStreams::new_single_gpu(GpuIndex::new(0)),
|
||||
);
|
||||
let num_streams = 16;
|
||||
let vec_streams = (0..num_streams)
|
||||
.map(|_| CudaStreams::new_single_gpu(GpuIndex::new(gpu_index)))
|
||||
.collect::<Vec<_>>();
|
||||
let (ms_errors, ms_errors_improved): (Vec<_>, Vec<_>) = (0..number_loops)
|
||||
.into_par_iter()
|
||||
.map(|index| {
|
||||
let stream_index = index % num_streams as usize;
|
||||
let local_stream = &vec_streams[stream_index];
|
||||
let lwe = TEST_RESOURCES.with(|rsc| {
|
||||
allocate_and_encrypt_new_lwe_ciphertext(
|
||||
&sk,
|
||||
Plaintext(0),
|
||||
lwe_noise_distribution,
|
||||
ciphertext_modulus,
|
||||
&mut rsc.borrow_mut().encryption_random_generator,
|
||||
)
|
||||
});
|
||||
|
||||
(
|
||||
measure_noise_added_by_message_preserving_operation(&sk, lwe.clone(), |ct| {
|
||||
let mut d_ct = CudaLweCiphertextList::from_lwe_ciphertext(ct, local_stream);
|
||||
round_mask_gpu(ct, &mut d_ct, log_modulus, lwe_dimension, local_stream);
|
||||
}),
|
||||
measure_noise_added_by_message_preserving_operation(&sk, lwe.clone(), |ct| {
|
||||
let mut d_ct = CudaLweCiphertextList::from_lwe_ciphertext(ct, local_stream);
|
||||
let d_ct_in = CudaLweCiphertextList::from_lwe_ciphertext(ct, local_stream);
|
||||
let modulus = lwe.ciphertext_modulus().raw_modulus_float();
|
||||
unsafe {
|
||||
cuda_improve_noise_modulus_switch_64(
|
||||
local_stream.ptr[0],
|
||||
streams.gpu_indexes[0].get(),
|
||||
d_ct.0.d_vec.as_mut_c_ptr(0),
|
||||
d_ct_in.0.d_vec.as_c_ptr(0),
|
||||
d_encryptions_of_zero.0.d_vec.as_c_ptr(0),
|
||||
lwe_dimension.to_lwe_size().0 as u32,
|
||||
d_ct.lwe_ciphertext_count().0 as u32,
|
||||
d_encryptions_of_zero.lwe_ciphertext_count().0 as u32,
|
||||
input_variance.get_modular_variance(modulus).value,
|
||||
r_sigma_factor.0,
|
||||
bound.0,
|
||||
log_modulus.0 as u32,
|
||||
);
|
||||
}
|
||||
|
||||
round_mask_gpu(ct, &mut d_ct, log_modulus, lwe_dimension, local_stream);
|
||||
}),
|
||||
)
|
||||
})
|
||||
.unzip();
|
||||
|
||||
println!(
|
||||
"mean(&ms_errors) {}2^{:.2}",
|
||||
if mean(&ms_errors) > 0_f64 { "+" } else { "-" },
|
||||
mean(&ms_errors).abs().log2()
|
||||
);
|
||||
|
||||
println!(
|
||||
"mean(&ms_errors_improved) {}2^{:.2}",
|
||||
if mean(&ms_errors_improved) > 0_f64 {
|
||||
"+"
|
||||
} else {
|
||||
"-"
|
||||
},
|
||||
mean(&ms_errors_improved).abs().log2()
|
||||
);
|
||||
|
||||
let base_variance = variance(&ms_errors).0;
|
||||
|
||||
println!(
|
||||
"variance(&ms_errors), 2^{:.2}",
|
||||
base_variance.log2(),
|
||||
);
|
||||
|
||||
let variance_improved = variance(&ms_errors_improved).0;
|
||||
|
||||
println!(
|
||||
"variance(&ms_errors_improved) 2^{:.2}, ratio: {:.3}",
|
||||
variance_improved.log2(),
|
||||
variance_improved / base_variance,
|
||||
);
|
||||
|
||||
let modulus = ciphertext_modulus.raw_modulus_float();
|
||||
|
||||
let expected_base_variance = {
|
||||
let lwe_dim = lwe_dimension.0 as f64;
|
||||
|
||||
let poly_size = 2_f64.powi((log_modulus.0 - 1) as i32);
|
||||
|
||||
(lwe_dim + 2.) * modulus * modulus / (96. * poly_size * poly_size) + (lwe_dim - 4.) / 48.
|
||||
};
|
||||
|
||||
assert!(
|
||||
check_both_ratio_under(base_variance, expected_base_variance, 1.03_f64),
|
||||
"Expected {expected_base_variance}, got {base_variance}",
|
||||
);
|
||||
|
||||
let expected_variance_improved = Variance(expected_variance_improved.0 - input_variance.0)
|
||||
.get_modular_variance(modulus)
|
||||
.value;
|
||||
|
||||
assert!(
|
||||
check_both_ratio_under(variance_improved, expected_variance_improved, 1.03_f64),
|
||||
"Expected {expected_variance_improved}, got {variance_improved}",
|
||||
);
|
||||
}
|
||||
@@ -112,7 +112,7 @@ where
|
||||
ciphertext_modulus
|
||||
));
|
||||
|
||||
let d_bsk = CudaLweBootstrapKey::from_lwe_bootstrap_key(&bsk, &stream);
|
||||
let d_bsk = CudaLweBootstrapKey::from_lwe_bootstrap_key(&bsk, None, &stream);
|
||||
while msg != Scalar::ZERO {
|
||||
msg = msg.wrapping_sub(Scalar::ONE);
|
||||
|
||||
|
||||
@@ -1,10 +1,46 @@
|
||||
use crate::core_crypto::commons::dispersion::DispersionParameter;
|
||||
use crate::core_crypto::gpu::vec::CudaVec;
|
||||
use crate::core_crypto::gpu::{convert_lwe_programmable_bootstrap_key_async, CudaStreams};
|
||||
use crate::core_crypto::gpu::{
|
||||
convert_lwe_programmable_bootstrap_key_async, CudaModulusSwitchNoiseReductionKeyFFI,
|
||||
CudaStreams,
|
||||
};
|
||||
use crate::core_crypto::prelude::{
|
||||
lwe_bootstrap_key_size, Container, DecompositionBaseLog, DecompositionLevelCount,
|
||||
GlweDimension, LweBootstrapKey, LweDimension, PolynomialSize, UnsignedInteger,
|
||||
GlweDimension, LweBootstrapKey, LweDimension, NoiseEstimationMeasureBound, PolynomialSize,
|
||||
RSigmaFactor, UnsignedInteger, Variance,
|
||||
};
|
||||
use crate::shortint::server_key::ModulusSwitchNoiseReductionKey;
|
||||
#[derive(Debug)]
|
||||
#[allow(dead_code)]
|
||||
pub struct CudaModulusSwitchNoiseReductionKey {
|
||||
pub modulus_switch_zeros: CudaVec<u64>,
|
||||
pub ms_bound: NoiseEstimationMeasureBound,
|
||||
pub ms_r_sigma_factor: RSigmaFactor,
|
||||
pub ms_input_variance: Variance,
|
||||
pub num_zeros: u32,
|
||||
}
|
||||
|
||||
pub fn prepare_cuda_ms_noise_reduction_key_ffi(
|
||||
input_ms_key: Option<&CudaModulusSwitchNoiseReductionKey>,
|
||||
modulus: f64,
|
||||
) -> CudaModulusSwitchNoiseReductionKeyFFI {
|
||||
input_ms_key.map_or(
|
||||
CudaModulusSwitchNoiseReductionKeyFFI {
|
||||
ptr: std::ptr::null_mut(),
|
||||
num_zeros: 0,
|
||||
ms_bound: 0.0,
|
||||
ms_r_sigma: 0.0,
|
||||
ms_input_variance: 0.0,
|
||||
},
|
||||
|ms_key| CudaModulusSwitchNoiseReductionKeyFFI {
|
||||
ptr: ms_key.modulus_switch_zeros.ptr.as_ptr(),
|
||||
num_zeros: ms_key.num_zeros,
|
||||
ms_bound: ms_key.ms_bound.0,
|
||||
ms_r_sigma: ms_key.ms_r_sigma_factor.0,
|
||||
ms_input_variance: ms_key.ms_input_variance.get_modular_variance(modulus).value,
|
||||
},
|
||||
)
|
||||
}
|
||||
/// A structure representing an LWE bootstrap key with 64 bits of precision on the GPU.
|
||||
#[derive(Debug)]
|
||||
#[allow(dead_code)]
|
||||
@@ -21,12 +57,15 @@ pub struct CudaLweBootstrapKey {
|
||||
pub(crate) decomp_base_log: DecompositionBaseLog,
|
||||
// Decomposition level count
|
||||
pub(crate) decomp_level_count: DecompositionLevelCount,
|
||||
// Pointer to the noise reduction key
|
||||
pub(crate) d_ms_noise_reduction_key: Option<CudaModulusSwitchNoiseReductionKey>,
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
impl CudaLweBootstrapKey {
|
||||
pub fn from_lwe_bootstrap_key<InputBskCont: Container>(
|
||||
bsk: &LweBootstrapKey<InputBskCont>,
|
||||
ms_noise_reduction_key: Option<ModulusSwitchNoiseReductionKey>,
|
||||
streams: &CudaStreams,
|
||||
) -> Self
|
||||
where
|
||||
@@ -65,7 +104,41 @@ impl CudaLweBootstrapKey {
|
||||
decomp_level_count,
|
||||
polynomial_size,
|
||||
);
|
||||
}
|
||||
|
||||
// If noise reduction key is present, copy it to the GPU
|
||||
let d_ms_noise_reduction_key = match ms_noise_reduction_key {
|
||||
Some(ms_noise_red_key) => {
|
||||
let h_input = ms_noise_red_key
|
||||
.modulus_switch_zeros
|
||||
.as_view()
|
||||
.into_container();
|
||||
let lwe_ciphertext_count =
|
||||
ms_noise_red_key.modulus_switch_zeros.lwe_ciphertext_count();
|
||||
let mut d_zeros_vec = CudaVec::new_multi_gpu(
|
||||
input_lwe_dimension.to_lwe_size().0 * lwe_ciphertext_count.0,
|
||||
streams,
|
||||
);
|
||||
|
||||
unsafe {
|
||||
d_zeros_vec.copy_from_cpu_multi_gpu_async(h_input, streams);
|
||||
}
|
||||
|
||||
streams.synchronize();
|
||||
Some(CudaModulusSwitchNoiseReductionKey {
|
||||
modulus_switch_zeros: d_zeros_vec,
|
||||
num_zeros: ms_noise_red_key
|
||||
.modulus_switch_zeros
|
||||
.lwe_ciphertext_count()
|
||||
.0 as u32,
|
||||
ms_bound: ms_noise_red_key.ms_bound,
|
||||
ms_r_sigma_factor: ms_noise_red_key.ms_r_sigma_factor,
|
||||
ms_input_variance: ms_noise_red_key.ms_input_variance,
|
||||
})
|
||||
}
|
||||
None => None,
|
||||
};
|
||||
|
||||
streams.synchronize();
|
||||
Self {
|
||||
d_vec,
|
||||
@@ -74,6 +147,7 @@ impl CudaLweBootstrapKey {
|
||||
polynomial_size,
|
||||
decomp_base_log,
|
||||
decomp_level_count,
|
||||
d_ms_noise_reduction_key,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -3,6 +3,9 @@ pub mod entities;
|
||||
pub mod slice;
|
||||
pub mod vec;
|
||||
|
||||
use crate::core_crypto::gpu::lwe_bootstrap_key::{
|
||||
prepare_cuda_ms_noise_reduction_key_ffi, CudaModulusSwitchNoiseReductionKey,
|
||||
};
|
||||
use crate::core_crypto::gpu::vec::{CudaVec, GpuIndex};
|
||||
use crate::core_crypto::prelude::{
|
||||
CiphertextModulus, DecompositionBaseLog, DecompositionLevelCount, GlweCiphertextCount,
|
||||
@@ -117,20 +120,28 @@ pub unsafe fn programmable_bootstrap_async<T: UnsignedInteger>(
|
||||
base_log: DecompositionBaseLog,
|
||||
level: DecompositionLevelCount,
|
||||
num_samples: u32,
|
||||
noise_reduction_key: Option<&CudaModulusSwitchNoiseReductionKey>,
|
||||
ct_modulus: f64,
|
||||
) {
|
||||
let num_many_lut = 1u32;
|
||||
let lut_stride = 0u32;
|
||||
let mut pbs_buffer: *mut i8 = std::ptr::null_mut();
|
||||
|
||||
let ms_noise_reduction_key_ffi =
|
||||
prepare_cuda_ms_noise_reduction_key_ffi(noise_reduction_key, ct_modulus);
|
||||
|
||||
let allocate_ms_noise_array = noise_reduction_key.is_some();
|
||||
scratch_cuda_programmable_bootstrap_64(
|
||||
streams.ptr[0],
|
||||
streams.gpu_indexes[0].get(),
|
||||
std::ptr::addr_of_mut!(pbs_buffer),
|
||||
lwe_dimension.0 as u32,
|
||||
glwe_dimension.0 as u32,
|
||||
polynomial_size.0 as u32,
|
||||
level.0 as u32,
|
||||
num_samples,
|
||||
true,
|
||||
allocate_ms_noise_array,
|
||||
);
|
||||
|
||||
cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
@@ -143,6 +154,7 @@ pub unsafe fn programmable_bootstrap_async<T: UnsignedInteger>(
|
||||
lwe_array_in.as_c_ptr(0),
|
||||
lwe_in_indexes.as_c_ptr(0),
|
||||
bootstrapping_key.as_c_ptr(0),
|
||||
&ms_noise_reduction_key_ffi,
|
||||
pbs_buffer,
|
||||
lwe_dimension.0 as u32,
|
||||
glwe_dimension.0 as u32,
|
||||
@@ -180,18 +192,24 @@ pub unsafe fn programmable_bootstrap_128_async<T: UnsignedInteger>(
|
||||
base_log: DecompositionBaseLog,
|
||||
level: DecompositionLevelCount,
|
||||
num_samples: u32,
|
||||
noise_reduction_key: Option<&CudaModulusSwitchNoiseReductionKey>,
|
||||
ct_modulus: f64,
|
||||
) {
|
||||
let mut pbs_buffer: *mut i8 = std::ptr::null_mut();
|
||||
|
||||
let ms_noise_reduction_key_ffi =
|
||||
prepare_cuda_ms_noise_reduction_key_ffi(noise_reduction_key, ct_modulus);
|
||||
let allocate_ms_noise_array = noise_reduction_key.is_some();
|
||||
scratch_cuda_programmable_bootstrap_128(
|
||||
streams.ptr[0],
|
||||
streams.gpu_indexes[0].get(),
|
||||
std::ptr::addr_of_mut!(pbs_buffer),
|
||||
lwe_dimension.0 as u32,
|
||||
glwe_dimension.0 as u32,
|
||||
polynomial_size.0 as u32,
|
||||
level.0 as u32,
|
||||
num_samples,
|
||||
true,
|
||||
allocate_ms_noise_array,
|
||||
);
|
||||
|
||||
cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
@@ -201,6 +219,7 @@ pub unsafe fn programmable_bootstrap_128_async<T: UnsignedInteger>(
|
||||
test_vector.as_c_ptr(0),
|
||||
lwe_array_in.as_c_ptr(0),
|
||||
bootstrapping_key.as_c_ptr(0),
|
||||
&ms_noise_reduction_key_ffi,
|
||||
pbs_buffer,
|
||||
lwe_dimension.0 as u32,
|
||||
glwe_dimension.0 as u32,
|
||||
@@ -489,6 +508,59 @@ pub unsafe fn extract_lwe_samples_from_glwe_ciphertext_list_async<T: UnsignedInt
|
||||
);
|
||||
}
|
||||
|
||||
/// # Safety
|
||||
///
|
||||
/// [CudaStreams::synchronize] __must__ be called as soon as synchronization is
|
||||
/// required
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub unsafe fn cuda_modulus_switch_ciphertext_async<T: UnsignedInteger>(
|
||||
streams: &CudaStreams,
|
||||
lwe_array_out: &mut CudaVec<T>,
|
||||
log_modulus: u32,
|
||||
) {
|
||||
cuda_modulus_switch_inplace_64(
|
||||
streams.ptr[0],
|
||||
streams.gpu_indexes[0].get(),
|
||||
lwe_array_out.as_mut_c_ptr(0),
|
||||
lwe_array_out.len() as u32,
|
||||
log_modulus,
|
||||
);
|
||||
}
|
||||
|
||||
/// # Safety
|
||||
///
|
||||
/// [CudaStreams::synchronize] __must__ be called as soon as synchronization is
|
||||
/// required
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub unsafe fn cuda_improve_noise_modulus_switch_ciphertext_async<T: UnsignedInteger>(
|
||||
streams: &CudaStreams,
|
||||
lwe_array_out: &mut CudaVec<T>,
|
||||
lwe_array_in: &CudaVec<T>,
|
||||
encrypted_zeros: &CudaVec<T>,
|
||||
lwe_dimension: LweDimension,
|
||||
num_samples: u32,
|
||||
num_zeros: u32,
|
||||
input_variance: f64,
|
||||
r_sigma_factor: f64,
|
||||
bound: f64,
|
||||
log_modulus: u32,
|
||||
) {
|
||||
cuda_improve_noise_modulus_switch_64(
|
||||
streams.ptr[0],
|
||||
streams.gpu_indexes[0].get(),
|
||||
lwe_array_out.as_mut_c_ptr(0),
|
||||
lwe_array_in.as_c_ptr(0),
|
||||
encrypted_zeros.as_c_ptr(0),
|
||||
lwe_dimension.to_lwe_size().0 as u32,
|
||||
num_samples,
|
||||
num_zeros,
|
||||
input_variance,
|
||||
r_sigma_factor,
|
||||
bound,
|
||||
log_modulus,
|
||||
);
|
||||
}
|
||||
|
||||
/// Addition of a vector of LWE ciphertexts
|
||||
///
|
||||
/// # Safety
|
||||
|
||||
@@ -721,11 +721,10 @@ mod tests {
|
||||
COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
)
|
||||
} else {
|
||||
// TODO GPU DRIFT UPDATE
|
||||
(
|
||||
crate::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64
|
||||
crate::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128
|
||||
.into(),
|
||||
crate::shortint::parameters::COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
|
||||
crate::shortint::parameters::COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
)
|
||||
},
|
||||
(
|
||||
|
||||
@@ -6,11 +6,7 @@ use crate::shortint::parameters::{
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128, PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
PARAM_PKE_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
};
|
||||
#[cfg(feature = "gpu")]
|
||||
use crate::shortint::parameters::{
|
||||
COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64, PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
|
||||
};
|
||||
|
||||
use crate::{
|
||||
set_server_key, ClientKey, CompactCiphertextList, CompactCiphertextListExpander,
|
||||
CompactPublicKey, CompressedCiphertextList, CompressedCiphertextListBuilder, CompressedFheBool,
|
||||
@@ -152,12 +148,11 @@ fn test_tag_propagation_zk_pok() {
|
||||
fn test_tag_propagation_gpu() {
|
||||
test_tag_propagation(
|
||||
Device::CudaGpu,
|
||||
// TODO GPU DRIFT UPDATE
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
None,
|
||||
Some(COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64),
|
||||
Some(COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128),
|
||||
Some((
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
|
||||
PARAM_KEYSWITCH_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
)),
|
||||
)
|
||||
|
||||
@@ -676,9 +676,8 @@ mod tests {
|
||||
|
||||
for (params, comp_params) in [
|
||||
(
|
||||
// TODO GPU DRIFT UPDATE
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64.into(),
|
||||
COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128.into(),
|
||||
COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
),
|
||||
(
|
||||
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128.into(),
|
||||
|
||||
@@ -79,7 +79,7 @@ impl RadixClientKey {
|
||||
});
|
||||
|
||||
let blind_rotate_key = CudaBootstrappingKey::Classic(
|
||||
CudaLweBootstrapKey::from_lwe_bootstrap_key(&bsk, streams),
|
||||
CudaLweBootstrapKey::from_lwe_bootstrap_key(&bsk, None, streams),
|
||||
);
|
||||
|
||||
let cuda_decompression_key = CudaDecompressionKey {
|
||||
|
||||
@@ -27,7 +27,7 @@ impl CompressedDecompressionKey {
|
||||
.par_decompress_into_lwe_bootstrap_key();
|
||||
|
||||
let d_bootstrap_key =
|
||||
CudaLweBootstrapKey::from_lwe_bootstrap_key(&h_bootstrap_key, streams);
|
||||
CudaLweBootstrapKey::from_lwe_bootstrap_key(&h_bootstrap_key, None, streams);
|
||||
|
||||
let blind_rotate_key = CudaBootstrappingKey::Classic(d_bootstrap_key);
|
||||
|
||||
|
||||
@@ -263,6 +263,10 @@ impl CudaDecompressionKey {
|
||||
|
||||
match &self.blind_rotate_key {
|
||||
CudaBootstrappingKey::Classic(bsk) => {
|
||||
assert!(
|
||||
bsk.d_ms_noise_reduction_key.is_none(),
|
||||
"Decompression key should not do modulus switch noise reduction"
|
||||
);
|
||||
let lwe_dimension = bsk.output_lwe_dimension();
|
||||
|
||||
let mut output_lwe = CudaLweCiphertextList::new(
|
||||
|
||||
@@ -3,6 +3,9 @@ pub mod client_key;
|
||||
pub mod list_compression;
|
||||
pub mod server_key;
|
||||
|
||||
use crate::core_crypto::gpu::lwe_bootstrap_key::{
|
||||
prepare_cuda_ms_noise_reduction_key_ffi, CudaModulusSwitchNoiseReductionKey,
|
||||
};
|
||||
use crate::core_crypto::gpu::slice::{CudaSlice, CudaSliceMut};
|
||||
use crate::core_crypto::gpu::vec::CudaVec;
|
||||
use crate::core_crypto::gpu::CudaStreams;
|
||||
@@ -303,6 +306,7 @@ pub unsafe fn unchecked_scalar_mul_integer_radix_kb_async<T: UnsignedInteger, B:
|
||||
num_scalars: u32,
|
||||
pbs_type: PBSType,
|
||||
grouping_factor: LweBskGroupingFactor,
|
||||
noise_reduction_key: Option<&CudaModulusSwitchNoiseReductionKey>,
|
||||
) {
|
||||
assert_eq!(
|
||||
streams.gpu_indexes[0],
|
||||
@@ -325,6 +329,10 @@ pub unsafe fn unchecked_scalar_mul_integer_radix_kb_async<T: UnsignedInteger, B:
|
||||
streams.gpu_indexes[0].get(),
|
||||
keyswitch_key.gpu_index(0).get(),
|
||||
);
|
||||
let ct_modulus = lwe_array.d_blocks.ciphertext_modulus().raw_modulus_float();
|
||||
let ms_noise_reduction_key_ffi =
|
||||
prepare_cuda_ms_noise_reduction_key_ffi(noise_reduction_key, ct_modulus);
|
||||
let allocate_ms_noise_array = noise_reduction_key.is_some();
|
||||
let mut mem_ptr: *mut i8 = std::ptr::null_mut();
|
||||
let mut lwe_array_degrees = lwe_array.info.blocks.iter().map(|b| b.degree.0).collect();
|
||||
let mut lwe_array_noise_levels = lwe_array
|
||||
@@ -356,6 +364,7 @@ pub unsafe fn unchecked_scalar_mul_integer_radix_kb_async<T: UnsignedInteger, B:
|
||||
carry_modulus.0 as u32,
|
||||
pbs_type as u32,
|
||||
true,
|
||||
allocate_ms_noise_array,
|
||||
);
|
||||
|
||||
cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
|
||||
@@ -368,6 +377,7 @@ pub unsafe fn unchecked_scalar_mul_integer_radix_kb_async<T: UnsignedInteger, B:
|
||||
mem_ptr,
|
||||
bootstrapping_key.ptr.as_ptr(),
|
||||
keyswitch_key.ptr.as_ptr(),
|
||||
&ms_noise_reduction_key_ffi,
|
||||
polynomial_size.0 as u32,
|
||||
message_modulus.0 as u32,
|
||||
num_scalars,
|
||||
@@ -528,6 +538,7 @@ pub unsafe fn decompress_integer_radix_async<T: UnsignedInteger, B: Numeric>(
|
||||
storage_log_modulus,
|
||||
bodies_count,
|
||||
true,
|
||||
false,
|
||||
);
|
||||
|
||||
cuda_integer_decompress_radix_ciphertext_64(
|
||||
@@ -649,6 +660,7 @@ pub unsafe fn unchecked_mul_integer_radix_kb_assign_async<T: UnsignedInteger, B:
|
||||
num_blocks: u32,
|
||||
pbs_type: PBSType,
|
||||
grouping_factor: LweBskGroupingFactor,
|
||||
noise_reduction_key: Option<&CudaModulusSwitchNoiseReductionKey>,
|
||||
) {
|
||||
assert_eq!(
|
||||
streams.gpu_indexes[0],
|
||||
@@ -678,6 +690,13 @@ pub unsafe fn unchecked_mul_integer_radix_kb_assign_async<T: UnsignedInteger, B:
|
||||
streams.gpu_indexes[0].get(),
|
||||
keyswitch_key.gpu_index(0).get(),
|
||||
);
|
||||
let ct_modulus = radix_lwe_left
|
||||
.d_blocks
|
||||
.ciphertext_modulus()
|
||||
.raw_modulus_float();
|
||||
let ms_noise_reduction_key_ffi =
|
||||
prepare_cuda_ms_noise_reduction_key_ffi(noise_reduction_key, ct_modulus);
|
||||
let allocate_ms_noise_array = noise_reduction_key.is_some();
|
||||
let mut mem_ptr: *mut i8 = std::ptr::null_mut();
|
||||
let mut radix_lwe_left_degrees = radix_lwe_left
|
||||
.info
|
||||
@@ -739,6 +758,7 @@ pub unsafe fn unchecked_mul_integer_radix_kb_assign_async<T: UnsignedInteger, B:
|
||||
num_blocks,
|
||||
pbs_type as u32,
|
||||
true,
|
||||
allocate_ms_noise_array,
|
||||
);
|
||||
cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
streams.ptr.as_ptr(),
|
||||
@@ -751,6 +771,7 @@ pub unsafe fn unchecked_mul_integer_radix_kb_assign_async<T: UnsignedInteger, B:
|
||||
is_boolean_right,
|
||||
bootstrapping_key.ptr.as_ptr(),
|
||||
keyswitch_key.ptr.as_ptr(),
|
||||
&ms_noise_reduction_key_ffi,
|
||||
mem_ptr,
|
||||
polynomial_size.0 as u32,
|
||||
num_blocks,
|
||||
@@ -789,6 +810,7 @@ pub unsafe fn unchecked_bitop_integer_radix_kb_assign_async<T: UnsignedInteger,
|
||||
num_blocks: u32,
|
||||
pbs_type: PBSType,
|
||||
grouping_factor: LweBskGroupingFactor,
|
||||
noise_reduction_key: Option<&CudaModulusSwitchNoiseReductionKey>,
|
||||
) {
|
||||
assert_eq!(
|
||||
streams.gpu_indexes[0],
|
||||
@@ -818,6 +840,13 @@ pub unsafe fn unchecked_bitop_integer_radix_kb_assign_async<T: UnsignedInteger,
|
||||
streams.gpu_indexes[0].get(),
|
||||
keyswitch_key.gpu_index(0).get(),
|
||||
);
|
||||
let ct_modulus = radix_lwe_left
|
||||
.d_blocks
|
||||
.ciphertext_modulus()
|
||||
.raw_modulus_float();
|
||||
let ms_noise_reduction_key_ffi =
|
||||
prepare_cuda_ms_noise_reduction_key_ffi(noise_reduction_key, ct_modulus);
|
||||
let allocate_ms_noise_array = noise_reduction_key.is_some();
|
||||
let mut mem_ptr: *mut i8 = std::ptr::null_mut();
|
||||
let mut radix_lwe_left_degrees = radix_lwe_left
|
||||
.info
|
||||
@@ -879,6 +908,7 @@ pub unsafe fn unchecked_bitop_integer_radix_kb_assign_async<T: UnsignedInteger,
|
||||
pbs_type as u32,
|
||||
op as u32,
|
||||
true,
|
||||
allocate_ms_noise_array,
|
||||
);
|
||||
cuda_bitop_integer_radix_ciphertext_kb_64(
|
||||
streams.ptr.as_ptr(),
|
||||
@@ -890,6 +920,7 @@ pub unsafe fn unchecked_bitop_integer_radix_kb_assign_async<T: UnsignedInteger,
|
||||
mem_ptr,
|
||||
bootstrapping_key.ptr.as_ptr(),
|
||||
keyswitch_key.ptr.as_ptr(),
|
||||
&ms_noise_reduction_key_ffi,
|
||||
);
|
||||
cleanup_cuda_integer_bitop(
|
||||
streams.ptr.as_ptr(),
|
||||
@@ -929,6 +960,7 @@ pub unsafe fn unchecked_scalar_bitop_integer_radix_kb_assign_async<
|
||||
num_blocks: u32,
|
||||
pbs_type: PBSType,
|
||||
grouping_factor: LweBskGroupingFactor,
|
||||
noise_reduction_key: Option<&CudaModulusSwitchNoiseReductionKey>,
|
||||
) {
|
||||
assert_eq!(
|
||||
streams.gpu_indexes[0],
|
||||
@@ -958,6 +990,10 @@ pub unsafe fn unchecked_scalar_bitop_integer_radix_kb_assign_async<
|
||||
streams.gpu_indexes[0].get(),
|
||||
keyswitch_key.gpu_index(0).get(),
|
||||
);
|
||||
let ct_modulus = radix_lwe.d_blocks.ciphertext_modulus().raw_modulus_float();
|
||||
let ms_noise_reduction_key_ffi =
|
||||
prepare_cuda_ms_noise_reduction_key_ffi(noise_reduction_key, ct_modulus);
|
||||
let allocate_ms_noise_array = noise_reduction_key.is_some();
|
||||
let mut mem_ptr: *mut i8 = std::ptr::null_mut();
|
||||
let mut radix_lwe_degrees = radix_lwe.info.blocks.iter().map(|b| b.degree.0).collect();
|
||||
let mut radix_lwe_noise_levels = radix_lwe
|
||||
@@ -991,6 +1027,7 @@ pub unsafe fn unchecked_scalar_bitop_integer_radix_kb_assign_async<
|
||||
pbs_type as u32,
|
||||
op as u32,
|
||||
true,
|
||||
allocate_ms_noise_array,
|
||||
);
|
||||
cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
|
||||
streams.ptr.as_ptr(),
|
||||
@@ -1004,6 +1041,7 @@ pub unsafe fn unchecked_scalar_bitop_integer_radix_kb_assign_async<
|
||||
mem_ptr,
|
||||
bootstrapping_key.ptr.as_ptr(),
|
||||
keyswitch_key.ptr.as_ptr(),
|
||||
&ms_noise_reduction_key_ffi,
|
||||
);
|
||||
cleanup_cuda_integer_bitop(
|
||||
streams.ptr.as_ptr(),
|
||||
@@ -1040,6 +1078,7 @@ pub unsafe fn unchecked_comparison_integer_radix_kb_async<T: UnsignedInteger, B:
|
||||
is_signed: bool,
|
||||
pbs_type: PBSType,
|
||||
grouping_factor: LweBskGroupingFactor,
|
||||
noise_reduction_key: Option<&CudaModulusSwitchNoiseReductionKey>,
|
||||
) {
|
||||
assert_eq!(
|
||||
streams.gpu_indexes[0],
|
||||
@@ -1076,6 +1115,14 @@ pub unsafe fn unchecked_comparison_integer_radix_kb_async<T: UnsignedInteger, B:
|
||||
streams.gpu_indexes[0].get(),
|
||||
keyswitch_key.gpu_index(0).get(),
|
||||
);
|
||||
|
||||
let ct_modulus = radix_lwe_left
|
||||
.d_blocks
|
||||
.ciphertext_modulus()
|
||||
.raw_modulus_float();
|
||||
let ms_noise_reduction_key_ffi =
|
||||
prepare_cuda_ms_noise_reduction_key_ffi(noise_reduction_key, ct_modulus);
|
||||
let allocate_ms_noise_array = noise_reduction_key.is_some();
|
||||
let mut mem_ptr: *mut i8 = std::ptr::null_mut();
|
||||
let mut radix_lwe_out_degrees = radix_lwe_out
|
||||
.info
|
||||
@@ -1151,6 +1198,7 @@ pub unsafe fn unchecked_comparison_integer_radix_kb_async<T: UnsignedInteger, B:
|
||||
op as u32,
|
||||
is_signed,
|
||||
true,
|
||||
allocate_ms_noise_array,
|
||||
);
|
||||
|
||||
cuda_comparison_integer_radix_ciphertext_kb_64(
|
||||
@@ -1163,6 +1211,7 @@ pub unsafe fn unchecked_comparison_integer_radix_kb_async<T: UnsignedInteger, B:
|
||||
mem_ptr,
|
||||
bootstrapping_key.ptr.as_ptr(),
|
||||
keyswitch_key.ptr.as_ptr(),
|
||||
&ms_noise_reduction_key_ffi,
|
||||
);
|
||||
|
||||
cleanup_cuda_integer_comparison(
|
||||
@@ -1202,6 +1251,7 @@ pub unsafe fn unchecked_scalar_comparison_integer_radix_kb_async<T: UnsignedInte
|
||||
signed_with_positive_scalar: bool,
|
||||
pbs_type: PBSType,
|
||||
grouping_factor: LweBskGroupingFactor,
|
||||
noise_reduction_key: Option<&CudaModulusSwitchNoiseReductionKey>,
|
||||
) {
|
||||
assert_eq!(
|
||||
streams.gpu_indexes[0],
|
||||
@@ -1238,6 +1288,13 @@ pub unsafe fn unchecked_scalar_comparison_integer_radix_kb_async<T: UnsignedInte
|
||||
streams.gpu_indexes[0].get(),
|
||||
keyswitch_key.gpu_index(0).get(),
|
||||
);
|
||||
let ct_modulus = radix_lwe_in
|
||||
.d_blocks
|
||||
.ciphertext_modulus()
|
||||
.raw_modulus_float();
|
||||
let ms_noise_reduction_key_ffi =
|
||||
prepare_cuda_ms_noise_reduction_key_ffi(noise_reduction_key, ct_modulus);
|
||||
let allocate_ms_noise_array = noise_reduction_key.is_some();
|
||||
let mut mem_ptr: *mut i8 = std::ptr::null_mut();
|
||||
let mut radix_lwe_out_degrees = radix_lwe_out
|
||||
.info
|
||||
@@ -1294,6 +1351,7 @@ pub unsafe fn unchecked_scalar_comparison_integer_radix_kb_async<T: UnsignedInte
|
||||
op as u32,
|
||||
signed_with_positive_scalar,
|
||||
true,
|
||||
allocate_ms_noise_array,
|
||||
);
|
||||
|
||||
cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
|
||||
@@ -1307,6 +1365,7 @@ pub unsafe fn unchecked_scalar_comparison_integer_radix_kb_async<T: UnsignedInte
|
||||
mem_ptr,
|
||||
bootstrapping_key.ptr.as_ptr(),
|
||||
keyswitch_key.ptr.as_ptr(),
|
||||
&ms_noise_reduction_key_ffi,
|
||||
num_scalar_blocks,
|
||||
);
|
||||
|
||||
@@ -1341,6 +1400,7 @@ pub unsafe fn full_propagate_assign_async<T: UnsignedInteger, B: Numeric>(
|
||||
carry_modulus: CarryModulus,
|
||||
pbs_type: PBSType,
|
||||
grouping_factor: LweBskGroupingFactor,
|
||||
noise_reduction_key: Option<&CudaModulusSwitchNoiseReductionKey>,
|
||||
) {
|
||||
assert_eq!(
|
||||
streams.gpu_indexes[0],
|
||||
@@ -1363,6 +1423,13 @@ pub unsafe fn full_propagate_assign_async<T: UnsignedInteger, B: Numeric>(
|
||||
streams.gpu_indexes[0].get(),
|
||||
keyswitch_key.gpu_index(0).get(),
|
||||
);
|
||||
let ct_modulus = radix_lwe_input
|
||||
.d_blocks
|
||||
.ciphertext_modulus()
|
||||
.raw_modulus_float();
|
||||
let ms_noise_reduction_key_ffi =
|
||||
prepare_cuda_ms_noise_reduction_key_ffi(noise_reduction_key, ct_modulus);
|
||||
let allocate_ms_noise_array = noise_reduction_key.is_some();
|
||||
let mut mem_ptr: *mut i8 = std::ptr::null_mut();
|
||||
let mut radix_lwe_input_degrees = radix_lwe_input
|
||||
.info
|
||||
@@ -1398,6 +1465,7 @@ pub unsafe fn full_propagate_assign_async<T: UnsignedInteger, B: Numeric>(
|
||||
carry_modulus.0 as u32,
|
||||
pbs_type as u32,
|
||||
true,
|
||||
allocate_ms_noise_array,
|
||||
);
|
||||
cuda_full_propagation_64_inplace(
|
||||
streams.ptr.as_ptr(),
|
||||
@@ -1406,6 +1474,7 @@ pub unsafe fn full_propagate_assign_async<T: UnsignedInteger, B: Numeric>(
|
||||
&mut cuda_ffi_radix_lwe_input,
|
||||
mem_ptr,
|
||||
keyswitch_key.ptr.as_ptr(),
|
||||
&ms_noise_reduction_key_ffi,
|
||||
bootstrapping_key.ptr.as_ptr(),
|
||||
num_blocks,
|
||||
);
|
||||
@@ -1444,6 +1513,7 @@ pub(crate) unsafe fn propagate_single_carry_assign_async<T: UnsignedInteger, B:
|
||||
grouping_factor: LweBskGroupingFactor,
|
||||
requested_flag: OutputFlag,
|
||||
uses_carry: u32,
|
||||
noise_reduction_key: Option<&CudaModulusSwitchNoiseReductionKey>,
|
||||
) {
|
||||
assert_eq!(
|
||||
streams.gpu_indexes[0],
|
||||
@@ -1466,6 +1536,14 @@ pub(crate) unsafe fn propagate_single_carry_assign_async<T: UnsignedInteger, B:
|
||||
streams.gpu_indexes[0].get(),
|
||||
keyswitch_key.gpu_index(0).get(),
|
||||
);
|
||||
|
||||
let ct_modulus = radix_lwe_input
|
||||
.d_blocks
|
||||
.ciphertext_modulus()
|
||||
.raw_modulus_float();
|
||||
let ms_noise_reduction_key_ffi =
|
||||
prepare_cuda_ms_noise_reduction_key_ffi(noise_reduction_key, ct_modulus);
|
||||
let allocate_ms_noise_array = noise_reduction_key.is_some();
|
||||
let mut mem_ptr: *mut i8 = std::ptr::null_mut();
|
||||
let big_lwe_dimension: u32 = glwe_dimension.0 as u32 * polynomial_size.0 as u32;
|
||||
let mut radix_lwe_input_degrees = radix_lwe_input
|
||||
@@ -1527,6 +1605,7 @@ pub(crate) unsafe fn propagate_single_carry_assign_async<T: UnsignedInteger, B:
|
||||
requested_flag as u32,
|
||||
uses_carry,
|
||||
true,
|
||||
allocate_ms_noise_array,
|
||||
);
|
||||
cuda_propagate_single_carry_kb_64_inplace(
|
||||
streams.ptr.as_ptr(),
|
||||
@@ -1538,6 +1617,7 @@ pub(crate) unsafe fn propagate_single_carry_assign_async<T: UnsignedInteger, B:
|
||||
mem_ptr,
|
||||
bootstrapping_key.ptr.as_ptr(),
|
||||
keyswitch_key.ptr.as_ptr(),
|
||||
&ms_noise_reduction_key_ffi,
|
||||
requested_flag as u32,
|
||||
uses_carry,
|
||||
);
|
||||
@@ -1578,6 +1658,7 @@ pub(crate) unsafe fn add_and_propagate_single_carry_assign_async<T: UnsignedInte
|
||||
grouping_factor: LweBskGroupingFactor,
|
||||
requested_flag: OutputFlag,
|
||||
uses_carry: u32,
|
||||
noise_reduction_key: Option<&CudaModulusSwitchNoiseReductionKey>,
|
||||
) {
|
||||
assert_eq!(
|
||||
streams.gpu_indexes[0],
|
||||
@@ -1621,6 +1702,11 @@ pub(crate) unsafe fn add_and_propagate_single_carry_assign_async<T: UnsignedInte
|
||||
streams.gpu_indexes[0].get(),
|
||||
keyswitch_key.gpu_index(0).get(),
|
||||
);
|
||||
|
||||
let ct_modulus = lhs_input.d_blocks.ciphertext_modulus().raw_modulus_float();
|
||||
let ms_noise_reduction_key_ffi =
|
||||
prepare_cuda_ms_noise_reduction_key_ffi(noise_reduction_key, ct_modulus);
|
||||
let allocate_ms_noise_array = noise_reduction_key.is_some();
|
||||
let mut mem_ptr: *mut i8 = std::ptr::null_mut();
|
||||
let big_lwe_dimension: u32 = glwe_dimension.0 as u32 * polynomial_size.0 as u32;
|
||||
let mut lhs_input_degrees = lhs_input.info.blocks.iter().map(|b| b.degree.0).collect();
|
||||
@@ -1689,6 +1775,7 @@ pub(crate) unsafe fn add_and_propagate_single_carry_assign_async<T: UnsignedInte
|
||||
requested_flag as u32,
|
||||
uses_carry,
|
||||
true,
|
||||
allocate_ms_noise_array,
|
||||
);
|
||||
cuda_add_and_propagate_single_carry_kb_64_inplace(
|
||||
streams.ptr.as_ptr(),
|
||||
@@ -1701,6 +1788,7 @@ pub(crate) unsafe fn add_and_propagate_single_carry_assign_async<T: UnsignedInte
|
||||
mem_ptr,
|
||||
bootstrapping_key.ptr.as_ptr(),
|
||||
keyswitch_key.ptr.as_ptr(),
|
||||
&ms_noise_reduction_key_ffi,
|
||||
requested_flag as u32,
|
||||
uses_carry,
|
||||
);
|
||||
@@ -1741,6 +1829,7 @@ pub unsafe fn unchecked_scalar_left_shift_integer_radix_kb_assign_async<
|
||||
num_blocks: u32,
|
||||
pbs_type: PBSType,
|
||||
grouping_factor: LweBskGroupingFactor,
|
||||
noise_reduction_key: Option<&CudaModulusSwitchNoiseReductionKey>,
|
||||
) {
|
||||
assert_eq!(
|
||||
streams.gpu_indexes[0],
|
||||
@@ -1763,6 +1852,11 @@ pub unsafe fn unchecked_scalar_left_shift_integer_radix_kb_assign_async<
|
||||
streams.gpu_indexes[0].get(),
|
||||
keyswitch_key.gpu_index(0).get(),
|
||||
);
|
||||
|
||||
let ct_modulus = input.d_blocks.ciphertext_modulus().raw_modulus_float();
|
||||
let ms_noise_reduction_key_ffi =
|
||||
prepare_cuda_ms_noise_reduction_key_ffi(noise_reduction_key, ct_modulus);
|
||||
let allocate_ms_noise_array = noise_reduction_key.is_some();
|
||||
let mut mem_ptr: *mut i8 = std::ptr::null_mut();
|
||||
let mut radix_lwe_left_degrees = input.info.blocks.iter().map(|b| b.degree.0).collect();
|
||||
let mut radix_lwe_left_noise_levels =
|
||||
@@ -1793,6 +1887,7 @@ pub unsafe fn unchecked_scalar_left_shift_integer_radix_kb_assign_async<
|
||||
pbs_type as u32,
|
||||
ShiftRotateType::LeftShift as u32,
|
||||
true,
|
||||
allocate_ms_noise_array,
|
||||
);
|
||||
cuda_integer_radix_logical_scalar_shift_kb_64_inplace(
|
||||
streams.ptr.as_ptr(),
|
||||
@@ -1803,6 +1898,7 @@ pub unsafe fn unchecked_scalar_left_shift_integer_radix_kb_assign_async<
|
||||
mem_ptr,
|
||||
bootstrapping_key.ptr.as_ptr(),
|
||||
keyswitch_key.ptr.as_ptr(),
|
||||
&ms_noise_reduction_key_ffi,
|
||||
);
|
||||
cleanup_cuda_integer_radix_logical_scalar_shift(
|
||||
streams.ptr.as_ptr(),
|
||||
@@ -1840,6 +1936,7 @@ pub unsafe fn unchecked_scalar_logical_right_shift_integer_radix_kb_assign_async
|
||||
num_blocks: u32,
|
||||
pbs_type: PBSType,
|
||||
grouping_factor: LweBskGroupingFactor,
|
||||
noise_reduction_key: Option<&CudaModulusSwitchNoiseReductionKey>,
|
||||
) {
|
||||
assert_eq!(
|
||||
streams.gpu_indexes[0],
|
||||
@@ -1862,6 +1959,11 @@ pub unsafe fn unchecked_scalar_logical_right_shift_integer_radix_kb_assign_async
|
||||
streams.gpu_indexes[0].get(),
|
||||
keyswitch_key.gpu_index(0).get(),
|
||||
);
|
||||
|
||||
let ct_modulus = input.d_blocks.ciphertext_modulus().raw_modulus_float();
|
||||
let ms_noise_reduction_key_ffi =
|
||||
prepare_cuda_ms_noise_reduction_key_ffi(noise_reduction_key, ct_modulus);
|
||||
let allocate_ms_noise_array = noise_reduction_key.is_some();
|
||||
let mut mem_ptr: *mut i8 = std::ptr::null_mut();
|
||||
let mut radix_lwe_left_degrees = input.info.blocks.iter().map(|b| b.degree.0).collect();
|
||||
let mut radix_lwe_left_noise_levels =
|
||||
@@ -1892,6 +1994,7 @@ pub unsafe fn unchecked_scalar_logical_right_shift_integer_radix_kb_assign_async
|
||||
pbs_type as u32,
|
||||
ShiftRotateType::RightShift as u32,
|
||||
true,
|
||||
allocate_ms_noise_array,
|
||||
);
|
||||
cuda_integer_radix_logical_scalar_shift_kb_64_inplace(
|
||||
streams.ptr.as_ptr(),
|
||||
@@ -1902,6 +2005,7 @@ pub unsafe fn unchecked_scalar_logical_right_shift_integer_radix_kb_assign_async
|
||||
mem_ptr,
|
||||
bootstrapping_key.ptr.as_ptr(),
|
||||
keyswitch_key.ptr.as_ptr(),
|
||||
&ms_noise_reduction_key_ffi,
|
||||
);
|
||||
cleanup_cuda_integer_radix_logical_scalar_shift(
|
||||
streams.ptr.as_ptr(),
|
||||
@@ -1938,6 +2042,7 @@ pub unsafe fn unchecked_scalar_arithmetic_right_shift_integer_radix_kb_assign_as
|
||||
pbs_base_log: DecompositionBaseLog,
|
||||
pbs_type: PBSType,
|
||||
grouping_factor: LweBskGroupingFactor,
|
||||
noise_reduction_key: Option<&CudaModulusSwitchNoiseReductionKey>,
|
||||
) {
|
||||
assert_eq!(
|
||||
streams.gpu_indexes[0],
|
||||
@@ -1960,6 +2065,11 @@ pub unsafe fn unchecked_scalar_arithmetic_right_shift_integer_radix_kb_assign_as
|
||||
streams.gpu_indexes[0].get(),
|
||||
keyswitch_key.gpu_index(0).get(),
|
||||
);
|
||||
|
||||
let ct_modulus = input.d_blocks.ciphertext_modulus().raw_modulus_float();
|
||||
let ms_noise_reduction_key_ffi =
|
||||
prepare_cuda_ms_noise_reduction_key_ffi(noise_reduction_key, ct_modulus);
|
||||
let allocate_ms_noise_array = noise_reduction_key.is_some();
|
||||
let mut mem_ptr: *mut i8 = std::ptr::null_mut();
|
||||
let mut radix_lwe_left_degrees = input.info.blocks.iter().map(|b| b.degree.0).collect();
|
||||
let mut radix_lwe_left_noise_levels =
|
||||
@@ -1990,6 +2100,7 @@ pub unsafe fn unchecked_scalar_arithmetic_right_shift_integer_radix_kb_assign_as
|
||||
pbs_type as u32,
|
||||
ShiftRotateType::RightShift as u32,
|
||||
true,
|
||||
allocate_ms_noise_array,
|
||||
);
|
||||
cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace(
|
||||
streams.ptr.as_ptr(),
|
||||
@@ -2000,6 +2111,7 @@ pub unsafe fn unchecked_scalar_arithmetic_right_shift_integer_radix_kb_assign_as
|
||||
mem_ptr,
|
||||
bootstrapping_key.ptr.as_ptr(),
|
||||
keyswitch_key.ptr.as_ptr(),
|
||||
&ms_noise_reduction_key_ffi,
|
||||
);
|
||||
cleanup_cuda_integer_radix_arithmetic_scalar_shift(
|
||||
streams.ptr.as_ptr(),
|
||||
@@ -2038,6 +2150,7 @@ pub unsafe fn unchecked_right_shift_integer_radix_kb_assign_async<
|
||||
is_signed: bool,
|
||||
pbs_type: PBSType,
|
||||
grouping_factor: LweBskGroupingFactor,
|
||||
noise_reduction_key: Option<&CudaModulusSwitchNoiseReductionKey>,
|
||||
) {
|
||||
assert_eq!(
|
||||
streams.gpu_indexes[0],
|
||||
@@ -2067,7 +2180,13 @@ pub unsafe fn unchecked_right_shift_integer_radix_kb_assign_async<
|
||||
streams.gpu_indexes[0].get(),
|
||||
keyswitch_key.gpu_index(0).get(),
|
||||
);
|
||||
|
||||
let ct_modulus = radix_input
|
||||
.d_blocks
|
||||
.ciphertext_modulus()
|
||||
.raw_modulus_float();
|
||||
let ms_noise_reduction_key_ffi =
|
||||
prepare_cuda_ms_noise_reduction_key_ffi(noise_reduction_key, ct_modulus);
|
||||
let allocate_ms_noise_array = noise_reduction_key.is_some();
|
||||
let mut radix_lwe_left_degrees = radix_input.info.blocks.iter().map(|b| b.degree.0).collect();
|
||||
let mut radix_lwe_left_noise_levels = radix_input
|
||||
.info
|
||||
@@ -2116,6 +2235,7 @@ pub unsafe fn unchecked_right_shift_integer_radix_kb_assign_async<
|
||||
ShiftRotateType::RightShift as u32,
|
||||
is_signed,
|
||||
true,
|
||||
allocate_ms_noise_array,
|
||||
);
|
||||
cuda_integer_radix_shift_and_rotate_kb_64_inplace(
|
||||
streams.ptr.as_ptr(),
|
||||
@@ -2126,6 +2246,7 @@ pub unsafe fn unchecked_right_shift_integer_radix_kb_assign_async<
|
||||
mem_ptr,
|
||||
bootstrapping_key.ptr.as_ptr(),
|
||||
keyswitch_key.ptr.as_ptr(),
|
||||
&ms_noise_reduction_key_ffi,
|
||||
);
|
||||
cleanup_cuda_integer_radix_shift_and_rotate(
|
||||
streams.ptr.as_ptr(),
|
||||
@@ -2161,6 +2282,7 @@ pub unsafe fn unchecked_left_shift_integer_radix_kb_assign_async<T: UnsignedInte
|
||||
is_signed: bool,
|
||||
pbs_type: PBSType,
|
||||
grouping_factor: LweBskGroupingFactor,
|
||||
noise_reduction_key: Option<&CudaModulusSwitchNoiseReductionKey>,
|
||||
) {
|
||||
assert_eq!(
|
||||
streams.gpu_indexes[0],
|
||||
@@ -2190,7 +2312,13 @@ pub unsafe fn unchecked_left_shift_integer_radix_kb_assign_async<T: UnsignedInte
|
||||
streams.gpu_indexes[0].get(),
|
||||
keyswitch_key.gpu_index(0).get(),
|
||||
);
|
||||
|
||||
let ct_modulus = radix_input
|
||||
.d_blocks
|
||||
.ciphertext_modulus()
|
||||
.raw_modulus_float();
|
||||
let ms_noise_reduction_key_ffi =
|
||||
prepare_cuda_ms_noise_reduction_key_ffi(noise_reduction_key, ct_modulus);
|
||||
let allocate_ms_noise_array = noise_reduction_key.is_some();
|
||||
let mut radix_lwe_left_degrees = radix_input.info.blocks.iter().map(|b| b.degree.0).collect();
|
||||
let mut radix_lwe_left_noise_levels = radix_input
|
||||
.info
|
||||
@@ -2239,6 +2367,7 @@ pub unsafe fn unchecked_left_shift_integer_radix_kb_assign_async<T: UnsignedInte
|
||||
ShiftRotateType::LeftShift as u32,
|
||||
is_signed,
|
||||
true,
|
||||
allocate_ms_noise_array,
|
||||
);
|
||||
cuda_integer_radix_shift_and_rotate_kb_64_inplace(
|
||||
streams.ptr.as_ptr(),
|
||||
@@ -2249,6 +2378,7 @@ pub unsafe fn unchecked_left_shift_integer_radix_kb_assign_async<T: UnsignedInte
|
||||
mem_ptr,
|
||||
bootstrapping_key.ptr.as_ptr(),
|
||||
keyswitch_key.ptr.as_ptr(),
|
||||
&ms_noise_reduction_key_ffi,
|
||||
);
|
||||
cleanup_cuda_integer_radix_shift_and_rotate(
|
||||
streams.ptr.as_ptr(),
|
||||
@@ -2287,6 +2417,7 @@ pub unsafe fn unchecked_rotate_right_integer_radix_kb_assign_async<
|
||||
is_signed: bool,
|
||||
pbs_type: PBSType,
|
||||
grouping_factor: LweBskGroupingFactor,
|
||||
noise_reduction_key: Option<&CudaModulusSwitchNoiseReductionKey>,
|
||||
) {
|
||||
assert_eq!(
|
||||
streams.gpu_indexes[0],
|
||||
@@ -2316,7 +2447,13 @@ pub unsafe fn unchecked_rotate_right_integer_radix_kb_assign_async<
|
||||
streams.gpu_indexes[0].get(),
|
||||
keyswitch_key.gpu_index(0).get(),
|
||||
);
|
||||
|
||||
let ct_modulus = radix_input
|
||||
.d_blocks
|
||||
.ciphertext_modulus()
|
||||
.raw_modulus_float();
|
||||
let ms_noise_reduction_key_ffi =
|
||||
prepare_cuda_ms_noise_reduction_key_ffi(noise_reduction_key, ct_modulus);
|
||||
let allocate_ms_noise_array = noise_reduction_key.is_some();
|
||||
let mut radix_lwe_left_degrees = radix_input.info.blocks.iter().map(|b| b.degree.0).collect();
|
||||
let mut radix_lwe_left_noise_levels = radix_input
|
||||
.info
|
||||
@@ -2370,6 +2507,7 @@ pub unsafe fn unchecked_rotate_right_integer_radix_kb_assign_async<
|
||||
ShiftRotateType::RightRotate as u32,
|
||||
is_signed,
|
||||
true,
|
||||
allocate_ms_noise_array,
|
||||
);
|
||||
cuda_integer_radix_shift_and_rotate_kb_64_inplace(
|
||||
streams.ptr.as_ptr(),
|
||||
@@ -2380,6 +2518,7 @@ pub unsafe fn unchecked_rotate_right_integer_radix_kb_assign_async<
|
||||
mem_ptr,
|
||||
bootstrapping_key.ptr.as_ptr(),
|
||||
keyswitch_key.ptr.as_ptr(),
|
||||
&ms_noise_reduction_key_ffi,
|
||||
);
|
||||
cleanup_cuda_integer_radix_shift_and_rotate(
|
||||
streams.ptr.as_ptr(),
|
||||
@@ -2418,6 +2557,7 @@ pub unsafe fn unchecked_rotate_left_integer_radix_kb_assign_async<
|
||||
is_signed: bool,
|
||||
pbs_type: PBSType,
|
||||
grouping_factor: LweBskGroupingFactor,
|
||||
noise_reduction_key: Option<&CudaModulusSwitchNoiseReductionKey>,
|
||||
) {
|
||||
assert_eq!(
|
||||
streams.gpu_indexes[0],
|
||||
@@ -2447,7 +2587,13 @@ pub unsafe fn unchecked_rotate_left_integer_radix_kb_assign_async<
|
||||
streams.gpu_indexes[0].get(),
|
||||
keyswitch_key.gpu_index(0).get(),
|
||||
);
|
||||
|
||||
let ct_modulus = radix_input
|
||||
.d_blocks
|
||||
.ciphertext_modulus()
|
||||
.raw_modulus_float();
|
||||
let ms_noise_reduction_key_ffi =
|
||||
prepare_cuda_ms_noise_reduction_key_ffi(noise_reduction_key, ct_modulus);
|
||||
let allocate_ms_noise_array = noise_reduction_key.is_some();
|
||||
let mut radix_lwe_left_degrees = radix_input.info.blocks.iter().map(|b| b.degree.0).collect();
|
||||
let mut radix_lwe_left_noise_levels = radix_input
|
||||
.info
|
||||
@@ -2501,6 +2647,7 @@ pub unsafe fn unchecked_rotate_left_integer_radix_kb_assign_async<
|
||||
ShiftRotateType::LeftRotate as u32,
|
||||
is_signed,
|
||||
true,
|
||||
allocate_ms_noise_array,
|
||||
);
|
||||
cuda_integer_radix_shift_and_rotate_kb_64_inplace(
|
||||
streams.ptr.as_ptr(),
|
||||
@@ -2511,6 +2658,7 @@ pub unsafe fn unchecked_rotate_left_integer_radix_kb_assign_async<
|
||||
mem_ptr,
|
||||
bootstrapping_key.ptr.as_ptr(),
|
||||
keyswitch_key.ptr.as_ptr(),
|
||||
&ms_noise_reduction_key_ffi,
|
||||
);
|
||||
cleanup_cuda_integer_radix_shift_and_rotate(
|
||||
streams.ptr.as_ptr(),
|
||||
@@ -2547,6 +2695,7 @@ pub unsafe fn unchecked_cmux_integer_radix_kb_async<T: UnsignedInteger, B: Numer
|
||||
num_blocks: u32,
|
||||
pbs_type: PBSType,
|
||||
grouping_factor: LweBskGroupingFactor,
|
||||
noise_reduction_key: Option<&CudaModulusSwitchNoiseReductionKey>,
|
||||
) {
|
||||
assert_eq!(
|
||||
streams.gpu_indexes[0],
|
||||
@@ -2603,6 +2752,14 @@ pub unsafe fn unchecked_cmux_integer_radix_kb_async<T: UnsignedInteger, B: Numer
|
||||
streams.gpu_indexes[0].get(),
|
||||
keyswitch_key.gpu_index(0).get(),
|
||||
);
|
||||
let ct_modulus = radix_lwe_out
|
||||
.d_blocks
|
||||
.ciphertext_modulus()
|
||||
.raw_modulus_float();
|
||||
let ms_noise_reduction_key_ffi =
|
||||
prepare_cuda_ms_noise_reduction_key_ffi(noise_reduction_key, ct_modulus);
|
||||
let allocate_ms_noise_array = noise_reduction_key.is_some();
|
||||
|
||||
let mut radix_lwe_out_degrees = radix_lwe_out
|
||||
.info
|
||||
.blocks
|
||||
@@ -2695,6 +2852,7 @@ pub unsafe fn unchecked_cmux_integer_radix_kb_async<T: UnsignedInteger, B: Numer
|
||||
carry_modulus.0 as u32,
|
||||
pbs_type as u32,
|
||||
true,
|
||||
allocate_ms_noise_array,
|
||||
);
|
||||
cuda_cmux_integer_radix_ciphertext_kb_64(
|
||||
streams.ptr.as_ptr(),
|
||||
@@ -2707,6 +2865,7 @@ pub unsafe fn unchecked_cmux_integer_radix_kb_async<T: UnsignedInteger, B: Numer
|
||||
mem_ptr,
|
||||
bootstrapping_key.ptr.as_ptr(),
|
||||
keyswitch_key.ptr.as_ptr(),
|
||||
&ms_noise_reduction_key_ffi,
|
||||
);
|
||||
cleanup_cuda_integer_radix_cmux(
|
||||
streams.ptr.as_ptr(),
|
||||
@@ -2744,6 +2903,7 @@ pub unsafe fn unchecked_scalar_rotate_left_integer_radix_kb_assign_async<
|
||||
num_blocks: u32,
|
||||
pbs_type: PBSType,
|
||||
grouping_factor: LweBskGroupingFactor,
|
||||
noise_reduction_key: Option<&CudaModulusSwitchNoiseReductionKey>,
|
||||
) {
|
||||
assert_eq!(
|
||||
streams.gpu_indexes[0],
|
||||
@@ -2766,6 +2926,13 @@ pub unsafe fn unchecked_scalar_rotate_left_integer_radix_kb_assign_async<
|
||||
streams.gpu_indexes[0].get(),
|
||||
keyswitch_key.gpu_index(0).get(),
|
||||
);
|
||||
let ct_modulus = radix_input
|
||||
.d_blocks
|
||||
.ciphertext_modulus()
|
||||
.raw_modulus_float();
|
||||
let ms_noise_reduction_key_ffi =
|
||||
prepare_cuda_ms_noise_reduction_key_ffi(noise_reduction_key, ct_modulus);
|
||||
let allocate_ms_noise_array = noise_reduction_key.is_some();
|
||||
let mut mem_ptr: *mut i8 = std::ptr::null_mut();
|
||||
let mut radix_lwe_left_degrees = radix_input.info.blocks.iter().map(|b| b.degree.0).collect();
|
||||
let mut radix_lwe_left_noise_levels = radix_input
|
||||
@@ -2799,6 +2966,7 @@ pub unsafe fn unchecked_scalar_rotate_left_integer_radix_kb_assign_async<
|
||||
pbs_type as u32,
|
||||
ShiftRotateType::LeftShift as u32,
|
||||
true,
|
||||
allocate_ms_noise_array,
|
||||
);
|
||||
cuda_integer_radix_scalar_rotate_kb_64_inplace(
|
||||
streams.ptr.as_ptr(),
|
||||
@@ -2809,6 +2977,7 @@ pub unsafe fn unchecked_scalar_rotate_left_integer_radix_kb_assign_async<
|
||||
mem_ptr,
|
||||
bootstrapping_key.ptr.as_ptr(),
|
||||
keyswitch_key.ptr.as_ptr(),
|
||||
&ms_noise_reduction_key_ffi,
|
||||
);
|
||||
cleanup_cuda_integer_radix_scalar_rotate(
|
||||
streams.ptr.as_ptr(),
|
||||
@@ -2846,6 +3015,7 @@ pub unsafe fn unchecked_scalar_rotate_right_integer_radix_kb_assign_async<
|
||||
num_blocks: u32,
|
||||
pbs_type: PBSType,
|
||||
grouping_factor: LweBskGroupingFactor,
|
||||
noise_reduction_key: Option<&CudaModulusSwitchNoiseReductionKey>,
|
||||
) {
|
||||
assert_eq!(
|
||||
streams.gpu_indexes[0],
|
||||
@@ -2868,6 +3038,13 @@ pub unsafe fn unchecked_scalar_rotate_right_integer_radix_kb_assign_async<
|
||||
streams.gpu_indexes[0].get(),
|
||||
keyswitch_key.gpu_index(0).get(),
|
||||
);
|
||||
let ct_modulus = radix_input
|
||||
.d_blocks
|
||||
.ciphertext_modulus()
|
||||
.raw_modulus_float();
|
||||
let ms_noise_reduction_key_ffi =
|
||||
prepare_cuda_ms_noise_reduction_key_ffi(noise_reduction_key, ct_modulus);
|
||||
let allocate_ms_noise_array = noise_reduction_key.is_some();
|
||||
let mut mem_ptr: *mut i8 = std::ptr::null_mut();
|
||||
let mut radix_lwe_left_degrees = radix_input.info.blocks.iter().map(|b| b.degree.0).collect();
|
||||
let mut radix_lwe_left_noise_levels = radix_input
|
||||
@@ -2901,6 +3078,7 @@ pub unsafe fn unchecked_scalar_rotate_right_integer_radix_kb_assign_async<
|
||||
pbs_type as u32,
|
||||
ShiftRotateType::RightShift as u32,
|
||||
true,
|
||||
allocate_ms_noise_array,
|
||||
);
|
||||
cuda_integer_radix_scalar_rotate_kb_64_inplace(
|
||||
streams.ptr.as_ptr(),
|
||||
@@ -2911,6 +3089,7 @@ pub unsafe fn unchecked_scalar_rotate_right_integer_radix_kb_assign_async<
|
||||
mem_ptr,
|
||||
bootstrapping_key.ptr.as_ptr(),
|
||||
keyswitch_key.ptr.as_ptr(),
|
||||
&ms_noise_reduction_key_ffi,
|
||||
);
|
||||
cleanup_cuda_integer_radix_scalar_rotate(
|
||||
streams.ptr.as_ptr(),
|
||||
@@ -2948,6 +3127,7 @@ pub unsafe fn unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async<
|
||||
num_radixes: u32,
|
||||
pbs_type: PBSType,
|
||||
grouping_factor: LweBskGroupingFactor,
|
||||
noise_reduction_key: Option<&CudaModulusSwitchNoiseReductionKey>,
|
||||
) {
|
||||
assert_eq!(
|
||||
streams.gpu_indexes[0],
|
||||
@@ -2977,6 +3157,11 @@ pub unsafe fn unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async<
|
||||
streams.gpu_indexes[0].get(),
|
||||
keyswitch_key.gpu_index(0).get(),
|
||||
);
|
||||
let ct_modulus = radix_list.d_blocks.ciphertext_modulus().raw_modulus_float();
|
||||
let ms_noise_reduction_key_ffi =
|
||||
prepare_cuda_ms_noise_reduction_key_ffi(noise_reduction_key, ct_modulus);
|
||||
let allocate_ms_noise_array = noise_reduction_key.is_some();
|
||||
|
||||
let mut mem_ptr: *mut i8 = std::ptr::null_mut();
|
||||
let mut result_degrees = result.info.blocks.iter().map(|b| b.degree.0).collect();
|
||||
let mut result_noise_levels = result.info.blocks.iter().map(|b| b.noise_level.0).collect();
|
||||
@@ -3013,6 +3198,7 @@ pub unsafe fn unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async<
|
||||
carry_modulus.0 as u32,
|
||||
pbs_type as u32,
|
||||
true,
|
||||
allocate_ms_noise_array,
|
||||
);
|
||||
cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
streams.ptr.as_ptr(),
|
||||
@@ -3023,6 +3209,7 @@ pub unsafe fn unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async<
|
||||
mem_ptr,
|
||||
bootstrapping_key.ptr.as_ptr(),
|
||||
keyswitch_key.ptr.as_ptr(),
|
||||
&ms_noise_reduction_key_ffi,
|
||||
);
|
||||
cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
|
||||
streams.ptr.as_ptr(),
|
||||
@@ -3060,6 +3247,8 @@ pub unsafe fn apply_univariate_lut_kb_async<T: UnsignedInteger, B: Numeric>(
|
||||
carry_modulus: CarryModulus,
|
||||
pbs_type: PBSType,
|
||||
grouping_factor: LweBskGroupingFactor,
|
||||
noise_reduction_key: Option<&CudaModulusSwitchNoiseReductionKey>,
|
||||
ct_modulus: f64,
|
||||
) {
|
||||
assert_eq!(
|
||||
streams.gpu_indexes[0],
|
||||
@@ -3089,6 +3278,10 @@ pub unsafe fn apply_univariate_lut_kb_async<T: UnsignedInteger, B: Numeric>(
|
||||
streams.gpu_indexes[0].get(),
|
||||
keyswitch_key.gpu_index(0).get(),
|
||||
);
|
||||
|
||||
let ms_noise_reduction_key_ffi =
|
||||
prepare_cuda_ms_noise_reduction_key_ffi(noise_reduction_key, ct_modulus);
|
||||
let allocate_ms_noise_array = noise_reduction_key.is_some();
|
||||
let mut mem_ptr: *mut i8 = std::ptr::null_mut();
|
||||
let mut cuda_ffi_output = prepare_cuda_radix_ffi_from_slice_mut(
|
||||
output,
|
||||
@@ -3124,6 +3317,7 @@ pub unsafe fn apply_univariate_lut_kb_async<T: UnsignedInteger, B: Numeric>(
|
||||
pbs_type as u32,
|
||||
lut_degree,
|
||||
true,
|
||||
allocate_ms_noise_array,
|
||||
);
|
||||
cuda_apply_univariate_lut_kb_64(
|
||||
streams.ptr.as_ptr(),
|
||||
@@ -3133,6 +3327,7 @@ pub unsafe fn apply_univariate_lut_kb_async<T: UnsignedInteger, B: Numeric>(
|
||||
&cuda_ffi_input,
|
||||
mem_ptr,
|
||||
keyswitch_key.ptr.as_ptr(),
|
||||
&ms_noise_reduction_key_ffi,
|
||||
bootstrapping_key.ptr.as_ptr(),
|
||||
);
|
||||
cleanup_cuda_apply_univariate_lut_kb_64(
|
||||
@@ -3172,6 +3367,8 @@ pub unsafe fn apply_many_univariate_lut_kb_async<T: UnsignedInteger, B: Numeric>
|
||||
grouping_factor: LweBskGroupingFactor,
|
||||
num_many_lut: u32,
|
||||
lut_stride: u32,
|
||||
noise_reduction_key: Option<&CudaModulusSwitchNoiseReductionKey>,
|
||||
ct_modulus: f64,
|
||||
) {
|
||||
assert_eq!(
|
||||
streams.gpu_indexes[0],
|
||||
@@ -3201,6 +3398,9 @@ pub unsafe fn apply_many_univariate_lut_kb_async<T: UnsignedInteger, B: Numeric>
|
||||
streams.gpu_indexes[0].get(),
|
||||
keyswitch_key.gpu_index(0).get(),
|
||||
);
|
||||
let ms_noise_reduction_key_ffi =
|
||||
prepare_cuda_ms_noise_reduction_key_ffi(noise_reduction_key, ct_modulus);
|
||||
let allocate_ms_noise_array = noise_reduction_key.is_some();
|
||||
let mut mem_ptr: *mut i8 = std::ptr::null_mut();
|
||||
let mut cuda_ffi_output = prepare_cuda_radix_ffi_from_slice_mut(
|
||||
output,
|
||||
@@ -3237,6 +3437,7 @@ pub unsafe fn apply_many_univariate_lut_kb_async<T: UnsignedInteger, B: Numeric>
|
||||
num_many_lut,
|
||||
lut_degree,
|
||||
true,
|
||||
allocate_ms_noise_array,
|
||||
);
|
||||
cuda_apply_many_univariate_lut_kb_64(
|
||||
streams.ptr.as_ptr(),
|
||||
@@ -3246,6 +3447,7 @@ pub unsafe fn apply_many_univariate_lut_kb_async<T: UnsignedInteger, B: Numeric>
|
||||
&cuda_ffi_input,
|
||||
mem_ptr,
|
||||
keyswitch_key.ptr.as_ptr(),
|
||||
&ms_noise_reduction_key_ffi,
|
||||
bootstrapping_key.ptr.as_ptr(),
|
||||
num_many_lut,
|
||||
lut_stride,
|
||||
@@ -3287,6 +3489,8 @@ pub unsafe fn apply_bivariate_lut_kb_async<T: UnsignedInteger, B: Numeric>(
|
||||
pbs_type: PBSType,
|
||||
grouping_factor: LweBskGroupingFactor,
|
||||
shift: u32,
|
||||
noise_reduction_key: Option<&CudaModulusSwitchNoiseReductionKey>,
|
||||
ct_modulus: f64,
|
||||
) {
|
||||
assert_eq!(
|
||||
streams.gpu_indexes[0],
|
||||
@@ -3323,6 +3527,10 @@ pub unsafe fn apply_bivariate_lut_kb_async<T: UnsignedInteger, B: Numeric>(
|
||||
streams.gpu_indexes[0].get(),
|
||||
keyswitch_key.gpu_index(0).get(),
|
||||
);
|
||||
|
||||
let ms_noise_reduction_key_ffi =
|
||||
prepare_cuda_ms_noise_reduction_key_ffi(noise_reduction_key, ct_modulus);
|
||||
let allocate_ms_noise_array = noise_reduction_key.is_some();
|
||||
let mut mem_ptr: *mut i8 = std::ptr::null_mut();
|
||||
let mut cuda_ffi_output = prepare_cuda_radix_ffi_from_slice_mut(
|
||||
output,
|
||||
@@ -3365,6 +3573,7 @@ pub unsafe fn apply_bivariate_lut_kb_async<T: UnsignedInteger, B: Numeric>(
|
||||
pbs_type as u32,
|
||||
lut_degree,
|
||||
true,
|
||||
allocate_ms_noise_array,
|
||||
);
|
||||
cuda_apply_bivariate_lut_kb_64(
|
||||
streams.ptr.as_ptr(),
|
||||
@@ -3375,6 +3584,7 @@ pub unsafe fn apply_bivariate_lut_kb_async<T: UnsignedInteger, B: Numeric>(
|
||||
&cuda_ffi_input_2,
|
||||
mem_ptr,
|
||||
keyswitch_key.ptr.as_ptr(),
|
||||
&ms_noise_reduction_key_ffi,
|
||||
bootstrapping_key.ptr.as_ptr(),
|
||||
num_blocks,
|
||||
shift,
|
||||
@@ -3414,6 +3624,7 @@ pub unsafe fn unchecked_div_rem_integer_radix_kb_assign_async<T: UnsignedInteger
|
||||
num_blocks: u32,
|
||||
pbs_type: PBSType,
|
||||
grouping_factor: LweBskGroupingFactor,
|
||||
noise_reduction_key: Option<&CudaModulusSwitchNoiseReductionKey>,
|
||||
) {
|
||||
assert_eq!(
|
||||
streams.gpu_indexes[0],
|
||||
@@ -3457,6 +3668,10 @@ pub unsafe fn unchecked_div_rem_integer_radix_kb_assign_async<T: UnsignedInteger
|
||||
streams.gpu_indexes[0].get(),
|
||||
keyswitch_key.gpu_index(0).get(),
|
||||
);
|
||||
let ct_modulus = numerator.d_blocks.ciphertext_modulus().raw_modulus_float();
|
||||
let ms_noise_reduction_key_ffi =
|
||||
prepare_cuda_ms_noise_reduction_key_ffi(noise_reduction_key, ct_modulus);
|
||||
let allocate_ms_noise_array = noise_reduction_key.is_some();
|
||||
let mut mem_ptr: *mut i8 = std::ptr::null_mut();
|
||||
let mut quotient_degrees = quotient.info.blocks.iter().map(|b| b.degree.0).collect();
|
||||
let mut quotient_noise_levels = quotient
|
||||
@@ -3520,6 +3735,7 @@ pub unsafe fn unchecked_div_rem_integer_radix_kb_assign_async<T: UnsignedInteger
|
||||
carry_modulus.0 as u32,
|
||||
pbs_type as u32,
|
||||
true,
|
||||
allocate_ms_noise_array,
|
||||
);
|
||||
cuda_integer_div_rem_radix_ciphertext_kb_64(
|
||||
streams.ptr.as_ptr(),
|
||||
@@ -3533,6 +3749,7 @@ pub unsafe fn unchecked_div_rem_integer_radix_kb_assign_async<T: UnsignedInteger
|
||||
mem_ptr,
|
||||
bootstrapping_key.ptr.as_ptr(),
|
||||
keyswitch_key.ptr.as_ptr(),
|
||||
&ms_noise_reduction_key_ffi,
|
||||
);
|
||||
cleanup_cuda_integer_div_rem(
|
||||
streams.ptr.as_ptr(),
|
||||
@@ -3573,6 +3790,8 @@ pub unsafe fn compute_prefix_sum_hillis_steele_async<T: UnsignedInteger, B: Nume
|
||||
carry_modulus: CarryModulus,
|
||||
pbs_type: PBSType,
|
||||
grouping_factor: LweBskGroupingFactor,
|
||||
noise_reduction_key: Option<&CudaModulusSwitchNoiseReductionKey>,
|
||||
ct_modulus: f64,
|
||||
) {
|
||||
assert_eq!(
|
||||
streams.gpu_indexes[0],
|
||||
@@ -3602,6 +3821,10 @@ pub unsafe fn compute_prefix_sum_hillis_steele_async<T: UnsignedInteger, B: Nume
|
||||
streams.gpu_indexes[0].get(),
|
||||
keyswitch_key.gpu_index(0).get(),
|
||||
);
|
||||
|
||||
let ms_noise_reduction_key_ffi =
|
||||
prepare_cuda_ms_noise_reduction_key_ffi(noise_reduction_key, ct_modulus);
|
||||
let allocate_ms_noise_array = noise_reduction_key.is_some();
|
||||
let mut mem_ptr: *mut i8 = std::ptr::null_mut();
|
||||
let mut cuda_ffi_output = prepare_cuda_radix_ffi_from_slice_mut(
|
||||
output,
|
||||
@@ -3637,6 +3860,7 @@ pub unsafe fn compute_prefix_sum_hillis_steele_async<T: UnsignedInteger, B: Nume
|
||||
pbs_type as u32,
|
||||
lut_degree,
|
||||
true,
|
||||
allocate_ms_noise_array,
|
||||
);
|
||||
|
||||
cuda_integer_compute_prefix_sum_hillis_steele_64(
|
||||
@@ -3647,6 +3871,7 @@ pub unsafe fn compute_prefix_sum_hillis_steele_async<T: UnsignedInteger, B: Nume
|
||||
&mut cuda_ffi_generates_or_propagates,
|
||||
mem_ptr,
|
||||
keyswitch_key.ptr.as_ptr(),
|
||||
&ms_noise_reduction_key_ffi,
|
||||
bootstrapping_key.ptr.as_ptr(),
|
||||
num_blocks,
|
||||
);
|
||||
@@ -3737,6 +3962,7 @@ pub(crate) unsafe fn unchecked_unsigned_overflowing_sub_integer_radix_kb_assign_
|
||||
grouping_factor: LweBskGroupingFactor,
|
||||
compute_overflow: bool,
|
||||
uses_input_borrow: u32,
|
||||
noise_reduction_key: Option<&CudaModulusSwitchNoiseReductionKey>,
|
||||
) {
|
||||
assert_eq!(
|
||||
streams.gpu_indexes[0],
|
||||
@@ -3766,6 +3992,13 @@ pub(crate) unsafe fn unchecked_unsigned_overflowing_sub_integer_radix_kb_assign_
|
||||
streams.gpu_indexes[0].get(),
|
||||
keyswitch_key.gpu_index(0).get(),
|
||||
);
|
||||
let ct_modulus = radix_lwe_left
|
||||
.d_blocks
|
||||
.ciphertext_modulus()
|
||||
.raw_modulus_float();
|
||||
let ms_noise_reduction_key_ffi =
|
||||
prepare_cuda_ms_noise_reduction_key_ffi(noise_reduction_key, ct_modulus);
|
||||
let allocate_ms_noise_array = noise_reduction_key.is_some();
|
||||
let mut mem_ptr: *mut i8 = std::ptr::null_mut();
|
||||
let big_lwe_dimension: u32 = glwe_dimension.0 as u32 * polynomial_size.0 as u32;
|
||||
let mut radix_lwe_left_degrees = radix_lwe_left
|
||||
@@ -3843,6 +4076,7 @@ pub(crate) unsafe fn unchecked_unsigned_overflowing_sub_integer_radix_kb_assign_
|
||||
pbs_type as u32,
|
||||
compute_overflow as u32,
|
||||
true,
|
||||
allocate_ms_noise_array,
|
||||
);
|
||||
cuda_integer_overflowing_sub_kb_64_inplace(
|
||||
streams.ptr.as_ptr(),
|
||||
@@ -3855,6 +4089,7 @@ pub(crate) unsafe fn unchecked_unsigned_overflowing_sub_integer_radix_kb_assign_
|
||||
mem_ptr,
|
||||
bootstrapping_key.ptr.as_ptr(),
|
||||
keyswitch_key.ptr.as_ptr(),
|
||||
&ms_noise_reduction_key_ffi,
|
||||
compute_overflow as u32,
|
||||
uses_input_borrow,
|
||||
);
|
||||
@@ -3891,6 +4126,7 @@ pub unsafe fn unchecked_signed_abs_radix_kb_assign_async<T: UnsignedInteger, B:
|
||||
num_blocks: u32,
|
||||
pbs_type: PBSType,
|
||||
grouping_factor: LweBskGroupingFactor,
|
||||
noise_reduction_key: Option<&CudaModulusSwitchNoiseReductionKey>,
|
||||
) {
|
||||
assert_eq!(
|
||||
streams.gpu_indexes[0],
|
||||
@@ -3913,6 +4149,11 @@ pub unsafe fn unchecked_signed_abs_radix_kb_assign_async<T: UnsignedInteger, B:
|
||||
streams.gpu_indexes[0].get(),
|
||||
keyswitch_key.gpu_index(0).get(),
|
||||
);
|
||||
|
||||
let ct_modulus = ct.d_blocks.ciphertext_modulus().raw_modulus_float();
|
||||
let ms_noise_reduction_key_ffi =
|
||||
prepare_cuda_ms_noise_reduction_key_ffi(noise_reduction_key, ct_modulus);
|
||||
let allocate_ms_noise_array = noise_reduction_key.is_some();
|
||||
let mut mem_ptr: *mut i8 = std::ptr::null_mut();
|
||||
let mut ct_degrees = ct.info.blocks.iter().map(|b| b.degree.0).collect();
|
||||
let mut ct_noise_levels = ct.info.blocks.iter().map(|b| b.noise_level.0).collect();
|
||||
@@ -3937,6 +4178,7 @@ pub unsafe fn unchecked_signed_abs_radix_kb_assign_async<T: UnsignedInteger, B:
|
||||
carry_modulus.0 as u32,
|
||||
pbs_type as u32,
|
||||
true,
|
||||
allocate_ms_noise_array,
|
||||
);
|
||||
cuda_integer_abs_inplace_radix_ciphertext_kb_64(
|
||||
streams.ptr.as_ptr(),
|
||||
@@ -3947,6 +4189,7 @@ pub unsafe fn unchecked_signed_abs_radix_kb_assign_async<T: UnsignedInteger, B:
|
||||
true,
|
||||
bootstrapping_key.ptr.as_ptr(),
|
||||
keyswitch_key.ptr.as_ptr(),
|
||||
&ms_noise_reduction_key_ffi,
|
||||
);
|
||||
cleanup_cuda_integer_abs_inplace(
|
||||
streams.ptr.as_ptr(),
|
||||
@@ -3982,6 +4225,7 @@ pub unsafe fn unchecked_is_at_least_one_comparisons_block_true_integer_radix_kb_
|
||||
pbs_base_log: DecompositionBaseLog,
|
||||
pbs_type: PBSType,
|
||||
grouping_factor: LweBskGroupingFactor,
|
||||
noise_reduction_key: Option<&CudaModulusSwitchNoiseReductionKey>,
|
||||
) {
|
||||
assert_eq!(
|
||||
streams.gpu_indexes[0],
|
||||
@@ -4011,6 +4255,13 @@ pub unsafe fn unchecked_is_at_least_one_comparisons_block_true_integer_radix_kb_
|
||||
streams.gpu_indexes[0].get(),
|
||||
keyswitch_key.gpu_index(0).get(),
|
||||
);
|
||||
let ct_modulus = radix_lwe_in
|
||||
.d_blocks
|
||||
.ciphertext_modulus()
|
||||
.raw_modulus_float();
|
||||
let ms_noise_reduction_key_ffi =
|
||||
prepare_cuda_ms_noise_reduction_key_ffi(noise_reduction_key, ct_modulus);
|
||||
let allocate_ms_noise_array = noise_reduction_key.is_some();
|
||||
let mut mem_ptr: *mut i8 = std::ptr::null_mut();
|
||||
let mut radix_lwe_out_degrees = radix_lwe_out
|
||||
.info
|
||||
@@ -4065,6 +4316,7 @@ pub unsafe fn unchecked_is_at_least_one_comparisons_block_true_integer_radix_kb_
|
||||
carry_modulus.0 as u32,
|
||||
pbs_type as u32,
|
||||
true,
|
||||
allocate_ms_noise_array,
|
||||
);
|
||||
|
||||
cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
|
||||
@@ -4076,6 +4328,7 @@ pub unsafe fn unchecked_is_at_least_one_comparisons_block_true_integer_radix_kb_
|
||||
mem_ptr,
|
||||
bootstrapping_key.ptr.as_ptr(),
|
||||
keyswitch_key.ptr.as_ptr(),
|
||||
&ms_noise_reduction_key_ffi,
|
||||
radix_lwe_in.d_blocks.lwe_ciphertext_count().0 as u32,
|
||||
);
|
||||
|
||||
@@ -4114,6 +4367,7 @@ pub unsafe fn unchecked_are_all_comparisons_block_true_integer_radix_kb_async<
|
||||
pbs_base_log: DecompositionBaseLog,
|
||||
pbs_type: PBSType,
|
||||
grouping_factor: LweBskGroupingFactor,
|
||||
noise_reduction_key: Option<&CudaModulusSwitchNoiseReductionKey>,
|
||||
) {
|
||||
assert_eq!(
|
||||
streams.gpu_indexes[0],
|
||||
@@ -4143,6 +4397,13 @@ pub unsafe fn unchecked_are_all_comparisons_block_true_integer_radix_kb_async<
|
||||
streams.gpu_indexes[0].get(),
|
||||
keyswitch_key.gpu_index(0).get(),
|
||||
);
|
||||
let ct_modulus = radix_lwe_in
|
||||
.d_blocks
|
||||
.ciphertext_modulus()
|
||||
.raw_modulus_float();
|
||||
let ms_noise_reduction_key_ffi =
|
||||
prepare_cuda_ms_noise_reduction_key_ffi(noise_reduction_key, ct_modulus);
|
||||
let allocate_ms_noise_array = noise_reduction_key.is_some();
|
||||
let mut radix_lwe_out_degrees = radix_lwe_out
|
||||
.info
|
||||
.blocks
|
||||
@@ -4197,6 +4458,7 @@ pub unsafe fn unchecked_are_all_comparisons_block_true_integer_radix_kb_async<
|
||||
carry_modulus.0 as u32,
|
||||
pbs_type as u32,
|
||||
true,
|
||||
allocate_ms_noise_array,
|
||||
);
|
||||
|
||||
cuda_integer_are_all_comparisons_block_true_kb_64(
|
||||
@@ -4208,6 +4470,7 @@ pub unsafe fn unchecked_are_all_comparisons_block_true_integer_radix_kb_async<
|
||||
mem_ptr,
|
||||
bootstrapping_key.ptr.as_ptr(),
|
||||
keyswitch_key.ptr.as_ptr(),
|
||||
&ms_noise_reduction_key_ffi,
|
||||
radix_lwe_in.d_blocks.lwe_ciphertext_count().0 as u32,
|
||||
);
|
||||
|
||||
|
||||
@@ -12,6 +12,7 @@ use crate::integer::server_key::num_bits_to_represent_unsigned_value;
|
||||
use crate::integer::ClientKey;
|
||||
use crate::shortint::ciphertext::{MaxDegree, MaxNoiseLevel};
|
||||
use crate::shortint::engine::ShortintEngine;
|
||||
use crate::shortint::server_key::ModulusSwitchNoiseReductionKey;
|
||||
use crate::shortint::{CarryModulus, CiphertextModulus, MessageModulus, PBSOrder};
|
||||
mod radix;
|
||||
|
||||
@@ -95,9 +96,22 @@ impl CudaServerKey {
|
||||
pbs_params.ciphertext_modulus,
|
||||
&mut engine.encryption_generator,
|
||||
);
|
||||
|
||||
let d_bootstrap_key =
|
||||
CudaLweBootstrapKey::from_lwe_bootstrap_key(&h_bootstrap_key, streams);
|
||||
let modulus_switch_noise_reduction_key = pbs_params
|
||||
.modulus_switch_noise_reduction_params
|
||||
.map(|modulus_switch_noise_reduction_params| {
|
||||
ModulusSwitchNoiseReductionKey::new(
|
||||
modulus_switch_noise_reduction_params,
|
||||
&cks.key.small_lwe_secret_key(),
|
||||
&mut engine,
|
||||
pbs_params.ciphertext_modulus,
|
||||
pbs_params.lwe_noise_distribution,
|
||||
)
|
||||
});
|
||||
let d_bootstrap_key = CudaLweBootstrapKey::from_lwe_bootstrap_key(
|
||||
&h_bootstrap_key,
|
||||
modulus_switch_noise_reduction_key,
|
||||
streams,
|
||||
);
|
||||
|
||||
CudaBootstrappingKey::Classic(d_bootstrap_key)
|
||||
}
|
||||
@@ -210,15 +224,13 @@ impl CudaServerKey {
|
||||
let key_switching_key =
|
||||
CudaLweKeyswitchKey::from_lwe_keyswitch_key(&h_key_switching_key, streams);
|
||||
let bootstrapping_key = match bootstrapping_key {
|
||||
crate::shortint::server_key::compressed::ShortintCompressedBootstrappingKey::Classic{ bsk: h_bootstrap_key, modulus_switch_noise_reduction_key } => {
|
||||
crate::shortint::server_key::compressed::ShortintCompressedBootstrappingKey::Classic{ bsk: h_bootstrap_key, modulus_switch_noise_reduction_key, } => {
|
||||
|
||||
assert!(modulus_switch_noise_reduction_key.is_none(), "Modulus Switch Noise Reduction is not yet support on GPU");
|
||||
|
||||
let standard_bootstrapping_key =
|
||||
h_bootstrap_key.par_decompress_into_lwe_bootstrap_key();
|
||||
let ms_noise_reduction_key = modulus_switch_noise_reduction_key.map(|msnr| msnr.decompress());
|
||||
let standard_bootstrapping_key = h_bootstrap_key.par_decompress_into_lwe_bootstrap_key();
|
||||
|
||||
let d_bootstrap_key =
|
||||
CudaLweBootstrapKey::from_lwe_bootstrap_key(&standard_bootstrapping_key, streams);
|
||||
CudaLweBootstrapKey::from_lwe_bootstrap_key(&standard_bootstrapping_key, ms_noise_reduction_key, streams);
|
||||
|
||||
CudaBootstrappingKey::Classic(d_bootstrap_key)
|
||||
}
|
||||
|
||||
@@ -40,6 +40,7 @@ impl CudaServerKey {
|
||||
num_blocks,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.d_ms_noise_reduction_key.as_ref(),
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
@@ -65,6 +66,7 @@ impl CudaServerKey {
|
||||
num_blocks,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
None,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -284,6 +284,7 @@ impl CudaServerKey {
|
||||
radix_count_in_vec as u32,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.d_ms_noise_reduction_key.as_ref(),
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
@@ -308,6 +309,7 @@ impl CudaServerKey {
|
||||
radix_count_in_vec as u32,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
None,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -209,6 +209,7 @@ impl CudaServerKey {
|
||||
lwe_ciphertext_count.0 as u32,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.d_ms_noise_reduction_key.as_ref(),
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
@@ -236,6 +237,7 @@ impl CudaServerKey {
|
||||
lwe_ciphertext_count.0 as u32,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
None,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -49,6 +49,7 @@ impl CudaServerKey {
|
||||
lwe_ciphertext_count.0 as u32,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.d_ms_noise_reduction_key.as_ref(),
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
@@ -77,6 +78,7 @@ impl CudaServerKey {
|
||||
lwe_ciphertext_count.0 as u32,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
None,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -75,6 +75,7 @@ impl CudaServerKey {
|
||||
T::IS_SIGNED,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.d_ms_noise_reduction_key.as_ref(),
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
@@ -103,6 +104,7 @@ impl CudaServerKey {
|
||||
T::IS_SIGNED,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
None,
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -985,6 +987,7 @@ impl CudaServerKey {
|
||||
T::IS_SIGNED,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.d_ms_noise_reduction_key.as_ref(),
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
@@ -1013,6 +1016,7 @@ impl CudaServerKey {
|
||||
T::IS_SIGNED,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
None,
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -1079,6 +1083,7 @@ impl CudaServerKey {
|
||||
T::IS_SIGNED,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.d_ms_noise_reduction_key.as_ref(),
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
@@ -1107,6 +1112,7 @@ impl CudaServerKey {
|
||||
T::IS_SIGNED,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
None,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -49,6 +49,7 @@ impl CudaServerKey {
|
||||
num_blocks,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.d_ms_noise_reduction_key.as_ref(),
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
@@ -78,6 +79,7 @@ impl CudaServerKey {
|
||||
num_blocks,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
None,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -258,6 +258,7 @@ impl CudaServerKey {
|
||||
LweBskGroupingFactor(0),
|
||||
requested_flag,
|
||||
uses_carry,
|
||||
d_bsk.d_ms_noise_reduction_key.as_ref(),
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
@@ -282,6 +283,7 @@ impl CudaServerKey {
|
||||
d_multibit_bsk.grouping_factor,
|
||||
requested_flag,
|
||||
uses_carry,
|
||||
None,
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -335,6 +337,7 @@ impl CudaServerKey {
|
||||
LweBskGroupingFactor(0),
|
||||
requested_flag,
|
||||
uses_carry,
|
||||
d_bsk.d_ms_noise_reduction_key.as_ref(),
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
@@ -360,6 +363,7 @@ impl CudaServerKey {
|
||||
d_multibit_bsk.grouping_factor,
|
||||
requested_flag,
|
||||
uses_carry,
|
||||
None,
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -393,6 +397,7 @@ impl CudaServerKey {
|
||||
ciphertext.info.blocks.first().unwrap().carry_modulus,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.d_ms_noise_reduction_key.as_ref(),
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
@@ -413,6 +418,7 @@ impl CudaServerKey {
|
||||
ciphertext.info.blocks.first().unwrap().carry_modulus,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
None,
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -904,6 +910,7 @@ impl CudaServerKey {
|
||||
let mut output_noise_levels = vec![0_u64; num_output_blocks];
|
||||
|
||||
let num_ct_blocks = block_range.len() as u32;
|
||||
let ct_modulus = input.d_blocks.ciphertext_modulus().raw_modulus_float();
|
||||
unsafe {
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
@@ -931,6 +938,8 @@ impl CudaServerKey {
|
||||
self.carry_modulus,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.d_ms_noise_reduction_key.as_ref(),
|
||||
ct_modulus,
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
@@ -958,6 +967,8 @@ impl CudaServerKey {
|
||||
self.carry_modulus,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
None,
|
||||
ct_modulus,
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -1020,6 +1031,7 @@ impl CudaServerKey {
|
||||
let mut output_noise_levels = vec![0_u64; num_output_blocks];
|
||||
|
||||
let num_ct_blocks = block_range.len() as u32;
|
||||
let ct_modulus = input_1.d_blocks.ciphertext_modulus().raw_modulus_float();
|
||||
unsafe {
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
@@ -1049,6 +1061,8 @@ impl CudaServerKey {
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
self.message_modulus.0 as u32,
|
||||
d_bsk.d_ms_noise_reduction_key.as_ref(),
|
||||
ct_modulus,
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
@@ -1078,6 +1092,8 @@ impl CudaServerKey {
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
self.message_modulus.0 as u32,
|
||||
None,
|
||||
ct_modulus,
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -1100,7 +1116,7 @@ impl CudaServerKey {
|
||||
/// use tfhe::shortint::gen_keys;
|
||||
/// use tfhe::shortint::parameters::{
|
||||
/// PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
/// PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
|
||||
/// PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
|
||||
/// };
|
||||
/// {
|
||||
/// // Generate the client key and the server key:
|
||||
@@ -1140,11 +1156,11 @@ impl CudaServerKey {
|
||||
/// }
|
||||
/// {
|
||||
/// // Generate the client key and the server key:
|
||||
/// let (cks, sks) = gen_keys(PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64);
|
||||
/// let (cks, sks) = gen_keys(PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128);
|
||||
/// let gpu_index = 0;
|
||||
/// let streams = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
|
||||
/// // Generate the client key and the server key:
|
||||
/// let (cks, sks) = gen_keys_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64, &streams);
|
||||
/// let (cks, sks) = gen_keys_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128, &streams);
|
||||
/// let num_blocks = 2;
|
||||
/// let msg = 3;
|
||||
/// let ct = cks.encrypt_radix(msg, num_blocks);
|
||||
@@ -1197,6 +1213,7 @@ impl CudaServerKey {
|
||||
.unwrap();
|
||||
let mut output_degrees = vec![0_u64; num_ct_blocks * function_count];
|
||||
let mut output_noise_levels = vec![0_u64; num_ct_blocks * function_count];
|
||||
let ct_modulus = input.d_blocks.ciphertext_modulus().raw_modulus_float();
|
||||
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
@@ -1226,6 +1243,8 @@ impl CudaServerKey {
|
||||
LweBskGroupingFactor(0),
|
||||
function_count as u32,
|
||||
lut.sample_extraction_stride as u32,
|
||||
d_bsk.d_ms_noise_reduction_key.as_ref(),
|
||||
ct_modulus,
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
@@ -1255,6 +1274,8 @@ impl CudaServerKey {
|
||||
d_multibit_bsk.grouping_factor,
|
||||
function_count as u32,
|
||||
lut.sample_extraction_stride as u32,
|
||||
None,
|
||||
ct_modulus,
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -1320,6 +1341,7 @@ impl CudaServerKey {
|
||||
.unwrap();
|
||||
let mut generates_or_propagates_degrees = vec![0; num_blocks];
|
||||
let mut generates_or_propagates_noise_levels = vec![0; num_blocks];
|
||||
let ct_modulus = output.d_blocks.ciphertext_modulus().raw_modulus_float();
|
||||
let mut output_slice = output
|
||||
.d_blocks
|
||||
.0
|
||||
@@ -1328,7 +1350,6 @@ impl CudaServerKey {
|
||||
.unwrap();
|
||||
let mut output_degrees = vec![0_u64; num_blocks];
|
||||
let mut output_noise_levels = vec![0_u64; num_blocks];
|
||||
|
||||
unsafe {
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
@@ -1358,6 +1379,8 @@ impl CudaServerKey {
|
||||
self.carry_modulus,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.d_ms_noise_reduction_key.as_ref(),
|
||||
ct_modulus,
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
@@ -1387,6 +1410,8 @@ impl CudaServerKey {
|
||||
self.carry_modulus,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
None,
|
||||
ct_modulus,
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -1455,7 +1480,12 @@ impl CudaServerKey {
|
||||
let mut padding_block_degree = vec![0_u64; 1];
|
||||
let mut padding_block_noise_level = vec![0_u64; 1];
|
||||
let mut new_blocks = new_blocks.unwrap();
|
||||
|
||||
let ct_modulus = ct
|
||||
.to_owned()
|
||||
.as_ref()
|
||||
.d_blocks
|
||||
.ciphertext_modulus()
|
||||
.raw_modulus_float();
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
apply_univariate_lut_kb_async(
|
||||
@@ -1482,6 +1512,8 @@ impl CudaServerKey {
|
||||
self.carry_modulus,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.d_ms_noise_reduction_key.as_ref(),
|
||||
ct_modulus,
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
@@ -1509,6 +1541,8 @@ impl CudaServerKey {
|
||||
self.carry_modulus,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
None,
|
||||
ct_modulus,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -97,6 +97,7 @@ impl CudaServerKey {
|
||||
num_blocks,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.d_ms_noise_reduction_key.as_ref(),
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
@@ -120,6 +121,7 @@ impl CudaServerKey {
|
||||
num_blocks,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
None,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -49,6 +49,7 @@ impl CudaServerKey {
|
||||
is_signed,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.d_ms_noise_reduction_key.as_ref(),
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
@@ -76,6 +77,7 @@ impl CudaServerKey {
|
||||
is_signed,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
None,
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -168,6 +170,7 @@ impl CudaServerKey {
|
||||
is_signed,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.d_ms_noise_reduction_key.as_ref(),
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
@@ -195,6 +198,7 @@ impl CudaServerKey {
|
||||
is_signed,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
None,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -60,6 +60,7 @@ impl CudaServerKey {
|
||||
lwe_ciphertext_count.0 as u32,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.d_ms_noise_reduction_key.as_ref(),
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
@@ -88,6 +89,7 @@ impl CudaServerKey {
|
||||
lwe_ciphertext_count.0 as u32,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
None,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -200,6 +200,7 @@ impl CudaServerKey {
|
||||
signed_with_positive_scalar,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.d_ms_noise_reduction_key.as_ref(),
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
@@ -230,6 +231,7 @@ impl CudaServerKey {
|
||||
signed_with_positive_scalar,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
None,
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -359,6 +361,7 @@ impl CudaServerKey {
|
||||
T::IS_SIGNED,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.d_ms_noise_reduction_key.as_ref(),
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
@@ -389,6 +392,7 @@ impl CudaServerKey {
|
||||
T::IS_SIGNED,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
None,
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -430,6 +434,7 @@ impl CudaServerKey {
|
||||
d_bsk.decomp_base_log,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.d_ms_noise_reduction_key.as_ref(),
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
@@ -455,6 +460,7 @@ impl CudaServerKey {
|
||||
d_multibit_bsk.decomp_base_log,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
None,
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -498,6 +504,7 @@ impl CudaServerKey {
|
||||
d_bsk.decomp_base_log,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.d_ms_noise_reduction_key.as_ref(),
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
@@ -523,6 +530,7 @@ impl CudaServerKey {
|
||||
d_multibit_bsk.decomp_base_log,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
None,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -133,6 +133,7 @@ impl CudaServerKey {
|
||||
decomposed_scalar.len() as u32,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.d_ms_noise_reduction_key.as_ref(),
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
@@ -157,6 +158,7 @@ impl CudaServerKey {
|
||||
decomposed_scalar.len() as u32,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
None,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -68,6 +68,7 @@ impl CudaServerKey {
|
||||
lwe_ciphertext_count.0 as u32,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.d_ms_noise_reduction_key.as_ref(),
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
@@ -94,6 +95,7 @@ impl CudaServerKey {
|
||||
lwe_ciphertext_count.0 as u32,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
None,
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -175,6 +177,7 @@ impl CudaServerKey {
|
||||
lwe_ciphertext_count.0 as u32,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.d_ms_noise_reduction_key.as_ref(),
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
@@ -201,6 +204,7 @@ impl CudaServerKey {
|
||||
lwe_ciphertext_count.0 as u32,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
None,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -70,6 +70,7 @@ impl CudaServerKey {
|
||||
lwe_ciphertext_count.0 as u32,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.d_ms_noise_reduction_key.as_ref(),
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
@@ -96,6 +97,7 @@ impl CudaServerKey {
|
||||
lwe_ciphertext_count.0 as u32,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
None,
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -214,6 +216,7 @@ impl CudaServerKey {
|
||||
d_bsk.decomp_base_log,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.d_ms_noise_reduction_key.as_ref(),
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
@@ -239,6 +242,7 @@ impl CudaServerKey {
|
||||
d_multibit_bsk.decomp_base_log,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
None,
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -268,6 +272,7 @@ impl CudaServerKey {
|
||||
lwe_ciphertext_count.0 as u32,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.d_ms_noise_reduction_key.as_ref(),
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
@@ -294,6 +299,7 @@ impl CudaServerKey {
|
||||
lwe_ciphertext_count.0 as u32,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
None,
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -609,6 +615,7 @@ impl CudaServerKey {
|
||||
lwe_ciphertext_count.0 as u32,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.d_ms_noise_reduction_key.as_ref(),
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
@@ -635,6 +642,7 @@ impl CudaServerKey {
|
||||
lwe_ciphertext_count.0 as u32,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
None,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -49,6 +49,7 @@ impl CudaServerKey {
|
||||
is_signed,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.d_ms_noise_reduction_key.as_ref(),
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
@@ -76,6 +77,7 @@ impl CudaServerKey {
|
||||
is_signed,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
None,
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -166,6 +168,7 @@ impl CudaServerKey {
|
||||
is_signed,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.d_ms_noise_reduction_key.as_ref(),
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
@@ -193,6 +196,7 @@ impl CudaServerKey {
|
||||
is_signed,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
None,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -397,6 +397,7 @@ impl CudaServerKey {
|
||||
LweBskGroupingFactor(0),
|
||||
compute_overflow,
|
||||
uses_input_borrow,
|
||||
d_bsk.d_ms_noise_reduction_key.as_ref(),
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
@@ -421,6 +422,7 @@ impl CudaServerKey {
|
||||
d_multibit_bsk.grouping_factor,
|
||||
compute_overflow,
|
||||
uses_input_borrow,
|
||||
None,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -45,14 +45,12 @@ macro_rules! define_gpu_signed_comparison_test_functions {
|
||||
|
||||
// Then call our create_gpu_parameterized_test macro onto or specialized fns
|
||||
create_gpu_parameterized_test!([<integer_signed_unchecked_ $comparison_name _ $clear_type>]{
|
||||
// TODO GPU DRIFT UPDATE
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
TEST_PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
|
||||
});
|
||||
create_gpu_parameterized_test!([<integer_signed_default_ $comparison_name _ $clear_type>]{
|
||||
// TODO GPU DRIFT UPDATE
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
TEST_PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
|
||||
});
|
||||
@@ -93,26 +91,22 @@ where
|
||||
}
|
||||
|
||||
create_gpu_parameterized_test!(integer_signed_unchecked_max_128_bits {
|
||||
// TODO GPU DRIFT UPDATE
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
TEST_PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
|
||||
});
|
||||
create_gpu_parameterized_test!(integer_signed_unchecked_min_128_bits {
|
||||
// TODO GPU DRIFT UPDATE
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
TEST_PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
|
||||
});
|
||||
create_gpu_parameterized_test!(integer_signed_max_128_bits {
|
||||
// TODO GPU DRIFT UPDATE
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
TEST_PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
|
||||
});
|
||||
create_gpu_parameterized_test!(integer_signed_min_128_bits {
|
||||
// TODO GPU DRIFT UPDATE
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
TEST_PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
|
||||
});
|
||||
|
||||
@@ -43,14 +43,12 @@ macro_rules! define_gpu_signed_scalar_comparison_test_functions {
|
||||
}
|
||||
|
||||
create_gpu_parameterized_test!([<integer_signed_unchecked_scalar_ $comparison_name _ $clear_type>]{
|
||||
// TODO GPU DRIFT UPDATE
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
TEST_PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
|
||||
});
|
||||
create_gpu_parameterized_test!([<integer_signed_default_scalar_ $comparison_name $clear_type>]{
|
||||
// TODO GPU DRIFT UPDATE
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
TEST_PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
|
||||
});
|
||||
@@ -91,26 +89,22 @@ where
|
||||
}
|
||||
|
||||
create_gpu_parameterized_test!(integer_signed_unchecked_scalar_max_i128 {
|
||||
// TODO GPU DRIFT UPDATE
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
TEST_PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
|
||||
});
|
||||
create_gpu_parameterized_test!(integer_signed_unchecked_scalar_min_i128 {
|
||||
// TODO GPU DRIFT UPDATE
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
TEST_PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
|
||||
});
|
||||
create_gpu_parameterized_test!(integer_signed_scalar_max_i128 {
|
||||
// TODO GPU DRIFT UPDATE
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
TEST_PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
|
||||
});
|
||||
create_gpu_parameterized_test!(integer_signed_scalar_min_i128 {
|
||||
// TODO GPU DRIFT UPDATE
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
TEST_PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
|
||||
});
|
||||
|
||||
@@ -43,9 +43,8 @@ macro_rules! create_gpu_parameterized_test{
|
||||
($name:ident)=> {
|
||||
create_gpu_parameterized_test!($name
|
||||
{
|
||||
// TODO GPU DRIFT UPDATE
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
|
||||
TEST_PARAM_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M64,
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
TEST_PARAM_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M128,
|
||||
TEST_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
TEST_PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M64,
|
||||
TEST_PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
|
||||
|
||||
@@ -58,20 +58,17 @@ macro_rules! define_gpu_comparison_test_functions {
|
||||
}
|
||||
|
||||
create_gpu_parameterized_test!([<integer_unchecked_ $comparison_name _ $clear_type:lower>]{
|
||||
// TODO GPU DRIFT UPDATE
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
TEST_PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
|
||||
});
|
||||
create_gpu_parameterized_test!([<integer_default_ $comparison_name _ $clear_type:lower>]{
|
||||
// TODO GPU DRIFT UPDATE
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
TEST_PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
|
||||
});
|
||||
create_gpu_parameterized_test!([<multi_device_integer_default_ $comparison_name _ $clear_type:lower>]{
|
||||
// TODO GPU DRIFT UPDATE
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
TEST_PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
|
||||
});
|
||||
@@ -112,26 +109,22 @@ where
|
||||
}
|
||||
|
||||
create_gpu_parameterized_test!(integer_unchecked_min_u256 {
|
||||
// TODO GPU DRIFT UPDATE
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
TEST_PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
|
||||
});
|
||||
create_gpu_parameterized_test!(integer_unchecked_max_u256 {
|
||||
// TODO GPU DRIFT UPDATE
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
TEST_PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
|
||||
});
|
||||
create_gpu_parameterized_test!(integer_min_u256 {
|
||||
// TODO GPU DRIFT UPDATE
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
TEST_PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
|
||||
});
|
||||
create_gpu_parameterized_test!(integer_max_u256 {
|
||||
// TODO GPU DRIFT UPDATE
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
TEST_PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
|
||||
});
|
||||
|
||||
@@ -46,14 +46,12 @@ macro_rules! define_gpu_scalar_comparison_test_functions {
|
||||
}
|
||||
|
||||
create_gpu_parameterized_test!([<integer_unchecked_scalar_ $comparison_name _ $clear_type:lower>]{
|
||||
// TODO GPU DRIFT UPDATE
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
TEST_PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
|
||||
});
|
||||
create_gpu_parameterized_test!([<integer_default_scalar_ $comparison_name$clear_type:lower>]{
|
||||
// TODO GPU DRIFT UPDATE
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
TEST_PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
|
||||
});
|
||||
@@ -327,26 +325,22 @@ where
|
||||
}
|
||||
|
||||
create_gpu_parameterized_test!(integer_unchecked_scalar_min_u256 {
|
||||
// TODO GPU DRIFT UPDATE
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
TEST_PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
|
||||
});
|
||||
create_gpu_parameterized_test!(integer_unchecked_scalar_max_u256 {
|
||||
// TODO GPU DRIFT UPDATE
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
TEST_PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
|
||||
});
|
||||
create_gpu_parameterized_test!(integer_scalar_min_u256 {
|
||||
// TODO GPU DRIFT UPDATE
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
TEST_PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
|
||||
});
|
||||
create_gpu_parameterized_test!(integer_scalar_max_u256 {
|
||||
// TODO GPU DRIFT UPDATE
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
TEST_PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
|
||||
});
|
||||
@@ -359,14 +353,12 @@ define_gpu_scalar_comparison_test_functions!(gt, U256);
|
||||
define_gpu_scalar_comparison_test_functions!(ge, U256);
|
||||
|
||||
create_gpu_parameterized_test!(integer_unchecked_scalar_comparisons_edge {
|
||||
// TODO GPU DRIFT UPDATE
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
TEST_PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
|
||||
});
|
||||
create_gpu_parameterized_test!(integer_unchecked_scalar_comparisons_edge_one_block {
|
||||
// TODO GPU DRIFT UPDATE
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
TEST_PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user