mirror of
https://github.com/zama-ai/tfhe-rs.git
synced 2026-01-09 14:47:56 -05:00
chore(gpu): internal renaming
This commit is contained in:
@@ -106,7 +106,7 @@ typedef struct {
|
||||
uint32_t polynomial_size;
|
||||
} CudaPackedGlweCiphertextListFFI;
|
||||
|
||||
uint64_t scratch_cuda_apply_univariate_lut_kb_64(
|
||||
uint64_t scratch_cuda_apply_univariate_lut_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, void const *input_lut,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
|
||||
@@ -114,7 +114,7 @@ uint64_t scratch_cuda_apply_univariate_lut_kb_64(
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, uint64_t lut_degree,
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
uint64_t scratch_cuda_apply_many_univariate_lut_kb_64(
|
||||
uint64_t scratch_cuda_apply_many_univariate_lut_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, void const *input_lut,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
|
||||
@@ -122,15 +122,16 @@ uint64_t scratch_cuda_apply_many_univariate_lut_kb_64(
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
uint32_t num_many_lut, uint64_t lut_degree, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
void cuda_apply_univariate_lut_kb_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
|
||||
CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
|
||||
void *const *ksks, void *const *bsks);
|
||||
void cuda_apply_univariate_lut_64(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *output_radix_lwe,
|
||||
CudaRadixCiphertextFFI const *input_radix_lwe,
|
||||
int8_t *mem_ptr, void *const *ksks,
|
||||
void *const *bsks);
|
||||
|
||||
void cleanup_cuda_apply_univariate_lut_kb_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
void cleanup_cuda_apply_univariate_lut_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_cuda_apply_bivariate_lut_kb_64(
|
||||
uint64_t scratch_cuda_apply_bivariate_lut_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, void const *input_lut,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
|
||||
@@ -139,17 +140,17 @@ uint64_t scratch_cuda_apply_bivariate_lut_kb_64(
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, uint64_t lut_degree,
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_apply_bivariate_lut_kb_64(
|
||||
void cuda_apply_bivariate_lut_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
|
||||
CudaRadixCiphertextFFI const *input_radix_lwe_1,
|
||||
CudaRadixCiphertextFFI const *input_radix_lwe_2, int8_t *mem_ptr,
|
||||
void *const *ksks, void *const *bsks, uint32_t num_radix_blocks,
|
||||
uint32_t shift);
|
||||
|
||||
void cleanup_cuda_apply_bivariate_lut_kb_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
void cleanup_cuda_apply_bivariate_lut_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
void cuda_apply_many_univariate_lut_kb_64(
|
||||
void cuda_apply_many_univariate_lut_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
|
||||
CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
|
||||
void *const *ksks, void *const *bsks, uint32_t num_luts,
|
||||
@@ -171,7 +172,7 @@ void cuda_full_propagation_64_inplace(CudaStreamsFFI streams,
|
||||
void cleanup_cuda_full_propagation(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
uint64_t scratch_cuda_integer_mult_radix_ciphertext_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, bool const is_boolean_left,
|
||||
bool const is_boolean_right, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, uint32_t glwe_dimension, uint32_t lwe_dimension,
|
||||
@@ -180,7 +181,7 @@ uint64_t scratch_cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
uint32_t num_blocks, PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
void cuda_integer_mult_radix_ciphertext_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
CudaRadixCiphertextFFI const *radix_lwe_left, bool const is_bool_left,
|
||||
CudaRadixCiphertextFFI const *radix_lwe_right, bool const is_bool_right,
|
||||
@@ -189,17 +190,18 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
|
||||
void cleanup_cuda_integer_mult(CudaStreamsFFI streams, int8_t **mem_ptr_void);
|
||||
|
||||
void cuda_negate_integer_radix_ciphertext_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, uint32_t num_radix_blocks);
|
||||
void cuda_negate_ciphertext_64(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in,
|
||||
uint32_t message_modulus, uint32_t carry_modulus,
|
||||
uint32_t num_radix_blocks);
|
||||
|
||||
void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
|
||||
void cuda_scalar_addition_ciphertext_64_inplace(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
|
||||
void const *scalar_input, void const *h_scalar_input, uint32_t num_scalars,
|
||||
uint32_t message_modulus, uint32_t carry_modulus);
|
||||
|
||||
uint64_t scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
|
||||
uint64_t scratch_cuda_logical_scalar_shift_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
@@ -208,11 +210,12 @@ uint64_t scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
|
||||
PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_integer_radix_logical_scalar_shift_kb_64_inplace(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
|
||||
int8_t *mem_ptr, void *const *bsks, void *const *ksks);
|
||||
void cuda_logical_scalar_shift_64_inplace(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *lwe_array,
|
||||
uint32_t shift, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks);
|
||||
|
||||
uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
|
||||
uint64_t scratch_cuda_arithmetic_scalar_shift_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
@@ -221,17 +224,19 @@ uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
|
||||
PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
|
||||
int8_t *mem_ptr, void *const *bsks, void *const *ksks);
|
||||
void cuda_arithmetic_scalar_shift_64_inplace(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *lwe_array,
|
||||
uint32_t shift, int8_t *mem_ptr,
|
||||
void *const *bsks,
|
||||
void *const *ksks);
|
||||
|
||||
void cleanup_cuda_integer_radix_logical_scalar_shift(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
void cleanup_cuda_logical_scalar_shift(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
void cleanup_cuda_integer_radix_arithmetic_scalar_shift(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
void cleanup_cuda_arithmetic_scalar_shift(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_cuda_integer_radix_shift_and_rotate_kb_64(
|
||||
uint64_t scratch_cuda_shift_and_rotate_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
@@ -240,15 +245,16 @@ uint64_t scratch_cuda_integer_radix_shift_and_rotate_kb_64(
|
||||
PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type, bool is_signed,
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_integer_radix_shift_and_rotate_kb_64_inplace(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
|
||||
CudaRadixCiphertextFFI const *lwe_shift, int8_t *mem_ptr, void *const *bsks,
|
||||
void *const *ksks);
|
||||
void cuda_shift_and_rotate_64_inplace(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *lwe_array,
|
||||
CudaRadixCiphertextFFI const *lwe_shift,
|
||||
int8_t *mem_ptr, void *const *bsks,
|
||||
void *const *ksks);
|
||||
|
||||
void cleanup_cuda_integer_radix_shift_and_rotate(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
void cleanup_cuda_shift_and_rotate(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_cuda_integer_radix_comparison_kb_64(
|
||||
uint64_t scratch_cuda_comparison_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
@@ -258,13 +264,14 @@ uint64_t scratch_cuda_integer_radix_comparison_kb_64(
|
||||
bool is_signed, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_comparison_integer_radix_ciphertext_kb_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_1,
|
||||
CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks);
|
||||
void cuda_comparison_ciphertext_64(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_1,
|
||||
CudaRadixCiphertextFFI const *lwe_array_2,
|
||||
int8_t *mem_ptr, void *const *bsks,
|
||||
void *const *ksks);
|
||||
|
||||
void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
|
||||
void cuda_scalar_comparison_ciphertext_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in, void const *scalar_blocks,
|
||||
void const *h_scalar_blocks, int8_t *mem_ptr, void *const *bsks,
|
||||
@@ -273,7 +280,7 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
|
||||
void cleanup_cuda_integer_comparison(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_cuda_integer_radix_bitop_kb_64(
|
||||
uint64_t scratch_cuda_bitop_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
@@ -282,13 +289,14 @@ uint64_t scratch_cuda_integer_radix_bitop_kb_64(
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, BITOP_TYPE op_type,
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_bitop_integer_radix_ciphertext_kb_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_1,
|
||||
CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks);
|
||||
void cuda_bitop_ciphertext_64(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_1,
|
||||
CudaRadixCiphertextFFI const *lwe_array_2,
|
||||
int8_t *mem_ptr, void *const *bsks,
|
||||
void *const *ksks);
|
||||
|
||||
void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
|
||||
void cuda_scalar_bitop_ciphertext_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_input, void const *clear_blocks,
|
||||
void const *h_clear_blocks, uint32_t num_clear_blocks, int8_t *mem_ptr,
|
||||
@@ -296,26 +304,28 @@ void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
|
||||
|
||||
void cleanup_cuda_integer_bitop(CudaStreamsFFI streams, int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_cuda_integer_radix_cmux_kb_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t lwe_ciphertext_count, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
uint64_t scratch_cuda_cmux_64(CudaStreamsFFI streams, int8_t **mem_ptr,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level,
|
||||
uint32_t ks_base_log, uint32_t pbs_level,
|
||||
uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t lwe_ciphertext_count,
|
||||
uint32_t message_modulus, uint32_t carry_modulus,
|
||||
PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_cmux_integer_radix_ciphertext_kb_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_condition,
|
||||
CudaRadixCiphertextFFI const *lwe_array_true,
|
||||
CudaRadixCiphertextFFI const *lwe_array_false, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks);
|
||||
void cuda_cmux_ciphertext_64(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_condition,
|
||||
CudaRadixCiphertextFFI const *lwe_array_true,
|
||||
CudaRadixCiphertextFFI const *lwe_array_false,
|
||||
int8_t *mem_ptr, void *const *bsks,
|
||||
void *const *ksks);
|
||||
|
||||
void cleanup_cuda_integer_radix_cmux(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
void cleanup_cuda_cmux(CudaStreamsFFI streams, int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_cuda_integer_radix_scalar_rotate_kb_64(
|
||||
uint64_t scratch_cuda_scalar_rotate_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
@@ -324,14 +334,14 @@ uint64_t scratch_cuda_integer_radix_scalar_rotate_kb_64(
|
||||
PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_integer_radix_scalar_rotate_kb_64_inplace(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array, uint32_t n,
|
||||
int8_t *mem_ptr, void *const *bsks, void *const *ksks);
|
||||
void cuda_scalar_rotate_64_inplace(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *lwe_array,
|
||||
uint32_t n, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks);
|
||||
|
||||
void cleanup_cuda_integer_radix_scalar_rotate(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
void cleanup_cuda_scalar_rotate(CudaStreamsFFI streams, int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_cuda_propagate_single_carry_kb_64_inplace(
|
||||
uint64_t scratch_cuda_propagate_single_carry_64_inplace(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
@@ -340,7 +350,7 @@ uint64_t scratch_cuda_propagate_single_carry_kb_64_inplace(
|
||||
PBS_TYPE pbs_type, uint32_t requested_flag, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
uint64_t scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
|
||||
uint64_t scratch_cuda_add_and_propagate_single_carry_64_inplace(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
@@ -349,13 +359,13 @@ uint64_t scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
|
||||
PBS_TYPE pbs_type, uint32_t requested_flag, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_propagate_single_carry_kb_64_inplace(
|
||||
void cuda_propagate_single_carry_64_inplace(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
|
||||
CudaRadixCiphertextFFI *carry_out, const CudaRadixCiphertextFFI *carry_in,
|
||||
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
|
||||
uint32_t requested_flag, uint32_t uses_carry);
|
||||
|
||||
void cuda_add_and_propagate_single_carry_kb_64_inplace(
|
||||
void cuda_add_and_propagate_single_carry_64_inplace(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lhs_array,
|
||||
const CudaRadixCiphertextFFI *rhs_array, CudaRadixCiphertextFFI *carry_out,
|
||||
const CudaRadixCiphertextFFI *carry_in, int8_t *mem_ptr, void *const *bsks,
|
||||
@@ -367,7 +377,7 @@ void cleanup_cuda_propagate_single_carry(CudaStreamsFFI streams,
|
||||
void cleanup_cuda_add_and_propagate_single_carry(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_cuda_integer_overflowing_sub_kb_64_inplace(
|
||||
uint64_t scratch_cuda_integer_overflowing_sub_64_inplace(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
@@ -376,7 +386,7 @@ uint64_t scratch_cuda_integer_overflowing_sub_kb_64_inplace(
|
||||
PBS_TYPE pbs_type, uint32_t compute_overflow, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_integer_overflowing_sub_kb_64_inplace(
|
||||
void cuda_integer_overflowing_sub_64_inplace(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lhs_array,
|
||||
const CudaRadixCiphertextFFI *rhs_array,
|
||||
CudaRadixCiphertextFFI *overflow_block,
|
||||
@@ -387,7 +397,7 @@ void cuda_integer_overflowing_sub_kb_64_inplace(
|
||||
void cleanup_cuda_integer_overflowing_sub(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
uint64_t scratch_cuda_partial_sum_ciphertexts_vec_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
@@ -397,15 +407,16 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
bool reduce_degrees_for_single_carry_propagation, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
CudaRadixCiphertextFFI *radix_lwe_vec, int8_t *mem_ptr, void *const *bsks,
|
||||
void *const *ksks);
|
||||
void cuda_partial_sum_ciphertexts_vec_64(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
CudaRadixCiphertextFFI *radix_lwe_vec,
|
||||
int8_t *mem_ptr, void *const *bsks,
|
||||
void *const *ksks);
|
||||
|
||||
void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr_void);
|
||||
void cleanup_cuda_partial_sum_ciphertexts_vec(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_cuda_integer_scalar_mul_kb_64(
|
||||
uint64_t scratch_cuda_integer_scalar_mul_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
@@ -413,16 +424,15 @@ uint64_t scratch_cuda_integer_scalar_mul_kb_64(
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t num_scalar_bits,
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
|
||||
void cuda_scalar_multiplication_ciphertext_64_inplace(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
|
||||
uint64_t const *decomposed_scalar, uint64_t const *has_at_least_one_set,
|
||||
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
|
||||
uint32_t polynomial_size, uint32_t message_modulus, uint32_t num_scalars);
|
||||
|
||||
void cleanup_cuda_integer_radix_scalar_mul(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
void cleanup_cuda_scalar_mul(CudaStreamsFFI streams, int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
|
||||
uint64_t scratch_cuda_integer_div_rem_radix_ciphertext_64(
|
||||
CudaStreamsFFI streams, bool is_signed, int8_t **mem_ptr,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
|
||||
@@ -431,7 +441,7 @@ uint64_t scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_integer_div_rem_radix_ciphertext_kb_64(
|
||||
void cuda_integer_div_rem_radix_ciphertext_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *quotient,
|
||||
CudaRadixCiphertextFFI *remainder, CudaRadixCiphertextFFI const *numerator,
|
||||
CudaRadixCiphertextFFI const *divisor, bool is_signed, int8_t *mem_ptr,
|
||||
@@ -460,7 +470,7 @@ void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
|
||||
void cuda_integer_reverse_blocks_64_inplace(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *lwe_array);
|
||||
|
||||
uint64_t scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
|
||||
uint64_t scratch_cuda_integer_abs_inplace_radix_ciphertext_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, bool is_signed,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
|
||||
@@ -469,14 +479,14 @@ uint64_t scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_integer_abs_inplace_radix_ciphertext_kb_64(
|
||||
void cuda_integer_abs_inplace_radix_ciphertext_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *ct, int8_t *mem_ptr,
|
||||
bool is_signed, void *const *bsks, void *const *ksks);
|
||||
|
||||
void cleanup_cuda_integer_abs_inplace(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_cuda_integer_are_all_comparisons_block_true_kb_64(
|
||||
uint64_t scratch_cuda_integer_are_all_comparisons_block_true_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
@@ -485,7 +495,7 @@ uint64_t scratch_cuda_integer_are_all_comparisons_block_true_kb_64(
|
||||
PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_integer_are_all_comparisons_block_true_kb_64(
|
||||
void cuda_integer_are_all_comparisons_block_true_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks, uint32_t num_radix_blocks);
|
||||
@@ -493,7 +503,7 @@ void cuda_integer_are_all_comparisons_block_true_kb_64(
|
||||
void cleanup_cuda_integer_are_all_comparisons_block_true(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
|
||||
uint64_t scratch_cuda_integer_is_at_least_one_comparisons_block_true_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
@@ -502,7 +512,7 @@ uint64_t scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
|
||||
PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
|
||||
void cuda_integer_is_at_least_one_comparisons_block_true_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks, uint32_t num_radix_blocks);
|
||||
@@ -518,7 +528,7 @@ void trim_radix_blocks_lsb_64(CudaRadixCiphertextFFI *output,
|
||||
CudaRadixCiphertextFFI const *input,
|
||||
CudaStreamsFFI streams);
|
||||
|
||||
uint64_t scratch_cuda_apply_noise_squashing_kb(
|
||||
uint64_t scratch_cuda_apply_noise_squashing(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t input_glwe_dimension, uint32_t input_polynomial_size,
|
||||
@@ -528,15 +538,16 @@ uint64_t scratch_cuda_apply_noise_squashing_kb(
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_apply_noise_squashing_kb(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
|
||||
CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
|
||||
void *const *ksks, void *const *bsks);
|
||||
void cuda_apply_noise_squashing(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *output_radix_lwe,
|
||||
CudaRadixCiphertextFFI const *input_radix_lwe,
|
||||
int8_t *mem_ptr, void *const *ksks,
|
||||
void *const *bsks);
|
||||
|
||||
void cleanup_cuda_apply_noise_squashing_kb(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
void cleanup_cuda_apply_noise_squashing(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_cuda_sub_and_propagate_single_carry_kb_64_inplace(
|
||||
uint64_t scratch_cuda_sub_and_propagate_single_carry_64_inplace(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
@@ -545,7 +556,7 @@ uint64_t scratch_cuda_sub_and_propagate_single_carry_kb_64_inplace(
|
||||
PBS_TYPE pbs_type, uint32_t requested_flag, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_sub_and_propagate_single_carry_kb_64_inplace(
|
||||
void cuda_sub_and_propagate_single_carry_64_inplace(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lhs_array,
|
||||
const CudaRadixCiphertextFFI *rhs_array, CudaRadixCiphertextFFI *carry_out,
|
||||
const CudaRadixCiphertextFFI *carry_in, int8_t *mem_ptr, void *const *bsks,
|
||||
@@ -554,7 +565,7 @@ void cuda_sub_and_propagate_single_carry_kb_64_inplace(
|
||||
void cleanup_cuda_sub_and_propagate_single_carry(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_cuda_integer_unsigned_scalar_div_radix_kb_64(
|
||||
uint64_t scratch_cuda_integer_unsigned_scalar_div_radix_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
@@ -563,13 +574,13 @@ uint64_t scratch_cuda_integer_unsigned_scalar_div_radix_kb_64(
|
||||
const CudaScalarDivisorFFI *scalar_divisor_ffi, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_integer_unsigned_scalar_div_radix_kb_64(
|
||||
void cuda_integer_unsigned_scalar_div_radix_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *numerator_ct,
|
||||
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
|
||||
const CudaScalarDivisorFFI *scalar_divisor_ffi);
|
||||
|
||||
void cleanup_cuda_integer_unsigned_scalar_div_radix_kb_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr_void);
|
||||
void cleanup_cuda_integer_unsigned_scalar_div_radix_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_cuda_extend_radix_with_sign_msb_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
@@ -590,7 +601,7 @@ void cuda_extend_radix_with_sign_msb_64(CudaStreamsFFI streams,
|
||||
void cleanup_cuda_extend_radix_with_sign_msb_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_cuda_integer_signed_scalar_div_radix_kb_64(
|
||||
uint64_t scratch_cuda_integer_signed_scalar_div_radix_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
@@ -599,15 +610,15 @@ uint64_t scratch_cuda_integer_signed_scalar_div_radix_kb_64(
|
||||
const CudaScalarDivisorFFI *scalar_divisor_ffi, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_integer_signed_scalar_div_radix_kb_64(
|
||||
void cuda_integer_signed_scalar_div_radix_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *numerator_ct,
|
||||
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
|
||||
const CudaScalarDivisorFFI *scalar_divisor_ffi, uint32_t numerator_bits);
|
||||
|
||||
void cleanup_cuda_integer_signed_scalar_div_radix_kb_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
void cleanup_cuda_integer_signed_scalar_div_radix_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_integer_unsigned_scalar_div_rem_radix_kb_64(
|
||||
uint64_t scratch_integer_unsigned_scalar_div_rem_radix_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
@@ -617,7 +628,7 @@ uint64_t scratch_integer_unsigned_scalar_div_rem_radix_kb_64(
|
||||
uint32_t const active_bits_divisor, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
|
||||
void cuda_integer_unsigned_scalar_div_rem_radix_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *quotient_ct,
|
||||
CudaRadixCiphertextFFI *remainder_ct, int8_t *mem_ptr, void *const *bsks,
|
||||
void *const *ksks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
|
||||
@@ -626,10 +637,10 @@ void cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
|
||||
void const *clear_blocks, void const *h_clear_blocks,
|
||||
uint32_t num_clear_blocks);
|
||||
|
||||
void cleanup_cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
|
||||
void cleanup_cuda_integer_unsigned_scalar_div_rem_radix_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_integer_signed_scalar_div_rem_radix_kb_64(
|
||||
uint64_t scratch_integer_signed_scalar_div_rem_radix_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
@@ -639,7 +650,7 @@ uint64_t scratch_integer_signed_scalar_div_rem_radix_kb_64(
|
||||
uint32_t const active_bits_divisor, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_integer_signed_scalar_div_rem_radix_kb_64(
|
||||
void cuda_integer_signed_scalar_div_rem_radix_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *quotient_ct,
|
||||
CudaRadixCiphertextFFI *remainder_ct, int8_t *mem_ptr, void *const *bsks,
|
||||
void *const *ksks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
|
||||
@@ -647,10 +658,10 @@ void cuda_integer_signed_scalar_div_rem_radix_kb_64(
|
||||
uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
|
||||
uint32_t numerator_bits);
|
||||
|
||||
void cleanup_cuda_integer_signed_scalar_div_rem_radix_kb_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr_void);
|
||||
void cleanup_cuda_integer_signed_scalar_div_rem_radix_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_integer_count_of_consecutive_bits_kb_64(
|
||||
uint64_t scratch_integer_count_of_consecutive_bits_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
@@ -659,13 +670,13 @@ uint64_t scratch_integer_count_of_consecutive_bits_kb_64(
|
||||
Direction direction, BitValue bit_value, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_integer_count_of_consecutive_bits_kb_64(
|
||||
void cuda_integer_count_of_consecutive_bits_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_ct,
|
||||
CudaRadixCiphertextFFI const *input_ct, int8_t *mem_ptr, void *const *bsks,
|
||||
void *const *ksks);
|
||||
|
||||
void cleanup_cuda_integer_count_of_consecutive_bits_kb_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr_void);
|
||||
void cleanup_cuda_integer_count_of_consecutive_bits_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_cuda_integer_grouped_oprf_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
@@ -676,16 +687,16 @@ uint64_t scratch_cuda_integer_grouped_oprf_64(
|
||||
bool allocate_gpu_memory, uint32_t message_bits_per_block,
|
||||
uint32_t total_random_bits, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_integer_grouped_oprf_async_64(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
const void *seeded_lwe_input,
|
||||
uint32_t num_blocks_to_process,
|
||||
int8_t *mem, void *const *bsks);
|
||||
void cuda_integer_grouped_oprf_64(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
const void *seeded_lwe_input,
|
||||
uint32_t num_blocks_to_process, int8_t *mem,
|
||||
void *const *bsks);
|
||||
|
||||
void cleanup_cuda_integer_grouped_oprf_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_integer_ilog2_kb_64(
|
||||
uint64_t scratch_integer_ilog2_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
@@ -694,7 +705,7 @@ uint64_t scratch_integer_ilog2_kb_64(
|
||||
uint32_t num_bits_in_ciphertext, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_integer_ilog2_kb_64(
|
||||
void cuda_integer_ilog2_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_ct,
|
||||
CudaRadixCiphertextFFI const *input_ct,
|
||||
CudaRadixCiphertextFFI const *trivial_ct_neg_n,
|
||||
@@ -702,8 +713,8 @@ void cuda_integer_ilog2_kb_64(
|
||||
CudaRadixCiphertextFFI const *trivial_ct_m_minus_1_block, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks);
|
||||
|
||||
void cleanup_cuda_integer_ilog2_kb_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
void cleanup_cuda_integer_ilog2_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
} // extern C
|
||||
|
||||
#endif // CUDA_INTEGER_H
|
||||
|
||||
@@ -111,9 +111,9 @@ aes_flush_inplace(CudaStreams streams, CudaRadixCiphertextFFI *data,
|
||||
int_aes_encrypt_buffer<Torus> *mem, void *const *bsks,
|
||||
Torus *const *ksks) {
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, data, data, bsks, ksks, mem->luts->flush_lut,
|
||||
data->num_radix_blocks);
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(streams, data, data, bsks,
|
||||
ksks, mem->luts->flush_lut,
|
||||
data->num_radix_blocks);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -126,8 +126,8 @@ __host__ __forceinline__ void aes_scalar_add_one_flush_inplace(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *data,
|
||||
int_aes_encrypt_buffer<Torus> *mem, void *const *bsks, Torus *const *ksks) {
|
||||
|
||||
host_integer_radix_add_scalar_one_inplace<Torus>(
|
||||
streams, data, mem->params.message_modulus, mem->params.carry_modulus);
|
||||
host_add_scalar_one_inplace<Torus>(streams, data, mem->params.message_modulus,
|
||||
mem->params.carry_modulus);
|
||||
|
||||
aes_flush_inplace(streams, data, mem, bsks, ksks);
|
||||
}
|
||||
@@ -167,7 +167,7 @@ batch_vec_flush_inplace(CudaStreams streams, CudaRadixCiphertextFFI **targets,
|
||||
&dest_slice, targets[i]);
|
||||
}
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
streams, &batch_out, &batch_in, bsks, ksks, mem->luts->flush_lut,
|
||||
batch_out.num_radix_blocks);
|
||||
|
||||
@@ -220,7 +220,7 @@ __host__ void batch_vec_and_inplace(CudaStreams streams,
|
||||
&dest_rhs_slice, rhs[i]);
|
||||
}
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_bivariate_lookup_table<Torus>(
|
||||
streams, &batch_out, &batch_lhs, &batch_rhs, bsks, ksks,
|
||||
mem->luts->and_lut, batch_out.num_radix_blocks,
|
||||
mem->params.message_modulus);
|
||||
@@ -358,9 +358,9 @@ __host__ void vectorized_sbox_n_bytes(CudaStreams streams,
|
||||
|
||||
#define ADD_ONE(target) \
|
||||
do { \
|
||||
host_integer_radix_add_scalar_one_inplace<Torus>( \
|
||||
streams, target, mem->params.message_modulus, \
|
||||
mem->params.carry_modulus); \
|
||||
host_add_scalar_one_inplace<Torus>(streams, target, \
|
||||
mem->params.message_modulus, \
|
||||
mem->params.carry_modulus); \
|
||||
} while (0)
|
||||
|
||||
// Homomorphic S-Box Circuit Evaluation
|
||||
@@ -1057,7 +1057,7 @@ __host__ void vectorized_aes_full_adder_inplace(
|
||||
// The carry_lut applies the function f(x) = (x >> 1) & 1, which
|
||||
// extracts the carry bit from the previous sum. The result is stored
|
||||
// in carry_vec for the next iteration (i+1).
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
streams, carry_vec, sum_plus_carry_vec, bsks, ksks,
|
||||
mem->luts->carry_lut, num_aes_inputs);
|
||||
|
||||
@@ -1065,7 +1065,7 @@ __host__ void vectorized_aes_full_adder_inplace(
|
||||
// The flush_lut applies the function f(x) = x & 1, which extracts
|
||||
// the least significant bit of the sum. The result is written
|
||||
// directly into the state buffer, updating the IV in-place.
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
streams, &a_i_vec, sum_plus_carry_vec, bsks, ksks, mem->luts->flush_lut,
|
||||
num_aes_inputs);
|
||||
}
|
||||
@@ -1221,9 +1221,9 @@ __host__ void host_integer_key_expansion(CudaStreams streams,
|
||||
CudaRadixCiphertextFFI first_byte_bit_slice;
|
||||
as_radix_ciphertext_slice<Torus>(&first_byte_bit_slice,
|
||||
&rotated_word_buffer, bit, bit + 1);
|
||||
host_integer_radix_add_scalar_one_inplace<Torus>(
|
||||
streams, &first_byte_bit_slice, mem->params.message_modulus,
|
||||
mem->params.carry_modulus);
|
||||
host_add_scalar_one_inplace<Torus>(streams, &first_byte_bit_slice,
|
||||
mem->params.message_modulus,
|
||||
mem->params.carry_modulus);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#include "integer/abs.cuh"
|
||||
|
||||
uint64_t scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
|
||||
uint64_t scratch_cuda_integer_abs_inplace_radix_ciphertext_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, bool is_signed,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
|
||||
@@ -14,19 +14,19 @@ uint64_t scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
|
||||
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
|
||||
message_modulus, carry_modulus, noise_reduction_type);
|
||||
|
||||
return scratch_cuda_integer_abs_kb<uint64_t>(
|
||||
return scratch_cuda_integer_abs<uint64_t>(
|
||||
CudaStreams(streams), (int_abs_buffer<uint64_t> **)mem_ptr, is_signed,
|
||||
num_blocks, params, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cuda_integer_abs_inplace_radix_ciphertext_kb_64(
|
||||
void cuda_integer_abs_inplace_radix_ciphertext_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *ct, int8_t *mem_ptr,
|
||||
bool is_signed, void *const *bsks, void *const *ksks) {
|
||||
|
||||
auto mem = (int_abs_buffer<uint64_t> *)mem_ptr;
|
||||
|
||||
host_integer_abs_kb<uint64_t>(CudaStreams(streams), ct, bsks,
|
||||
(uint64_t **)(ksks), mem, is_signed);
|
||||
host_integer_abs<uint64_t>(CudaStreams(streams), ct, bsks,
|
||||
(uint64_t **)(ksks), mem, is_signed);
|
||||
}
|
||||
|
||||
void cleanup_cuda_integer_abs_inplace(CudaStreamsFFI streams,
|
||||
|
||||
@@ -10,9 +10,11 @@
|
||||
#include "radix_ciphertext.cuh"
|
||||
|
||||
template <typename Torus>
|
||||
__host__ uint64_t scratch_cuda_integer_abs_kb(
|
||||
CudaStreams streams, int_abs_buffer<Torus> **mem_ptr, bool is_signed,
|
||||
uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory) {
|
||||
__host__ uint64_t scratch_cuda_integer_abs(CudaStreams streams,
|
||||
int_abs_buffer<Torus> **mem_ptr,
|
||||
bool is_signed, uint32_t num_blocks,
|
||||
int_radix_params params,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
uint64_t size_tracker = 0;
|
||||
if (is_signed) {
|
||||
@@ -23,10 +25,10 @@ __host__ uint64_t scratch_cuda_integer_abs_kb(
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void
|
||||
host_integer_abs_kb(CudaStreams streams, CudaRadixCiphertextFFI *ct,
|
||||
void *const *bsks, uint64_t *const *ksks,
|
||||
int_abs_buffer<uint64_t> *mem_ptr, bool is_signed) {
|
||||
__host__ void host_integer_abs(CudaStreams streams, CudaRadixCiphertextFFI *ct,
|
||||
void *const *bsks, uint64_t *const *ksks,
|
||||
int_abs_buffer<uint64_t> *mem_ptr,
|
||||
bool is_signed) {
|
||||
if (!is_signed)
|
||||
return;
|
||||
|
||||
@@ -39,7 +41,7 @@ host_integer_abs_kb(CudaStreams streams, CudaRadixCiphertextFFI *ct,
|
||||
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
|
||||
mask, ct);
|
||||
|
||||
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
|
||||
host_arithmetic_scalar_shift_inplace<Torus>(
|
||||
streams, mask, num_bits_in_ciphertext - 1,
|
||||
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks);
|
||||
host_addition<Torus>(streams.stream(0), streams.gpu_index(0), ct, mask, ct,
|
||||
@@ -52,8 +54,7 @@ host_integer_abs_kb(CudaStreams streams, CudaRadixCiphertextFFI *ct,
|
||||
mem_ptr->scp_mem, bsks, ksks,
|
||||
requested_flag, uses_carry);
|
||||
|
||||
host_integer_radix_bitop_kb<Torus>(streams, ct, mask, ct, mem_ptr->bitxor_mem,
|
||||
bsks, ksks);
|
||||
host_bitop<Torus>(streams, ct, mask, ct, mem_ptr->bitxor_mem, bsks, ksks);
|
||||
}
|
||||
|
||||
#endif // TFHE_RS_ABS_CUH
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#include "integer/bitwise_ops.cuh"
|
||||
|
||||
uint64_t scratch_cuda_integer_radix_bitop_kb_64(
|
||||
uint64_t scratch_cuda_bitop_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
@@ -14,20 +14,21 @@ uint64_t scratch_cuda_integer_radix_bitop_kb_64(
|
||||
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
|
||||
message_modulus, carry_modulus, noise_reduction_type);
|
||||
|
||||
return scratch_cuda_integer_radix_bitop_kb<uint64_t>(
|
||||
return scratch_cuda_bitop<uint64_t>(
|
||||
CudaStreams(streams), (int_bitop_buffer<uint64_t> **)mem_ptr,
|
||||
lwe_ciphertext_count, params, op_type, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cuda_bitop_integer_radix_ciphertext_kb_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_1,
|
||||
CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks) {
|
||||
void cuda_bitop_ciphertext_64(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_1,
|
||||
CudaRadixCiphertextFFI const *lwe_array_2,
|
||||
int8_t *mem_ptr, void *const *bsks,
|
||||
void *const *ksks) {
|
||||
|
||||
host_integer_radix_bitop_kb<uint64_t>(
|
||||
CudaStreams(streams), lwe_array_out, lwe_array_1, lwe_array_2,
|
||||
(int_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks));
|
||||
host_bitop<uint64_t>(CudaStreams(streams), lwe_array_out, lwe_array_1,
|
||||
lwe_array_2, (int_bitop_buffer<uint64_t> *)mem_ptr, bsks,
|
||||
(uint64_t **)(ksks));
|
||||
}
|
||||
|
||||
void cleanup_cuda_integer_bitop(CudaStreamsFFI streams, int8_t **mem_ptr_void) {
|
||||
|
||||
@@ -10,11 +10,12 @@
|
||||
#include "pbs/programmable_bootstrap_multibit.cuh"
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_integer_radix_bitop_kb(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_1,
|
||||
CudaRadixCiphertextFFI const *lwe_array_2, int_bitop_buffer<Torus> *mem_ptr,
|
||||
void *const *bsks, Torus *const *ksks) {
|
||||
__host__ void host_bitop(CudaStreams streams,
|
||||
CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_1,
|
||||
CudaRadixCiphertextFFI const *lwe_array_2,
|
||||
int_bitop_buffer<Torus> *mem_ptr, void *const *bsks,
|
||||
Torus *const *ksks) {
|
||||
|
||||
PANIC_IF_FALSE(
|
||||
lwe_array_out->num_radix_blocks == lwe_array_1->num_radix_blocks &&
|
||||
@@ -41,7 +42,7 @@ __host__ void host_integer_radix_bitop_kb(
|
||||
lwe_array_1->num_radix_blocks);
|
||||
}
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_bivariate_lookup_table<Torus>(
|
||||
streams, lwe_array_out, lwe_array_1, lwe_array_2, bsks, ksks, lut,
|
||||
lwe_array_out->num_radix_blocks, lut->params.message_modulus);
|
||||
|
||||
@@ -50,10 +51,11 @@ __host__ void host_integer_radix_bitop_kb(
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ uint64_t scratch_cuda_integer_radix_bitop_kb(
|
||||
CudaStreams streams, int_bitop_buffer<Torus> **mem_ptr,
|
||||
uint32_t num_radix_blocks, int_radix_params params, BITOP_TYPE op,
|
||||
bool allocate_gpu_memory) {
|
||||
__host__ uint64_t scratch_cuda_bitop(CudaStreams streams,
|
||||
int_bitop_buffer<Torus> **mem_ptr,
|
||||
uint32_t num_radix_blocks,
|
||||
int_radix_params params, BITOP_TYPE op,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
uint64_t size_tracker = 0;
|
||||
*mem_ptr = new int_bitop_buffer<Torus>(streams, op, params, num_radix_blocks,
|
||||
|
||||
@@ -78,8 +78,8 @@ __host__ void host_extend_radix_with_sign_msb(
|
||||
streams.stream(0), streams.gpu_index(0), mem_ptr->last_block, 0, 1, input,
|
||||
input_blocks - 1, input_blocks);
|
||||
|
||||
host_apply_univariate_lut_kb(streams, mem_ptr->padding_block,
|
||||
mem_ptr->last_block, mem_ptr->lut, ksks, bsks);
|
||||
host_apply_univariate_lut(streams, mem_ptr->padding_block,
|
||||
mem_ptr->last_block, mem_ptr->lut, ksks, bsks);
|
||||
|
||||
for (uint32_t i = 0; i < num_additional_blocks; ++i) {
|
||||
uint32_t dst_block_idx = input_blocks + i;
|
||||
|
||||
@@ -1,13 +1,15 @@
|
||||
#include "integer/cmux.cuh"
|
||||
|
||||
uint64_t scratch_cuda_integer_radix_cmux_kb_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t lwe_ciphertext_count, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type) {
|
||||
uint64_t scratch_cuda_cmux_64(CudaStreamsFFI streams, int8_t **mem_ptr,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level,
|
||||
uint32_t ks_base_log, uint32_t pbs_level,
|
||||
uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t lwe_ciphertext_count,
|
||||
uint32_t message_modulus, uint32_t carry_modulus,
|
||||
PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type) {
|
||||
PUSH_RANGE("scratch cmux")
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
big_lwe_dimension, small_lwe_dimension, ks_level,
|
||||
@@ -17,29 +19,29 @@ uint64_t scratch_cuda_integer_radix_cmux_kb_64(
|
||||
std::function<uint64_t(uint64_t)> predicate_lut_f =
|
||||
[](uint64_t x) -> uint64_t { return x == 1; };
|
||||
|
||||
uint64_t ret = scratch_cuda_integer_radix_cmux_kb<uint64_t>(
|
||||
uint64_t ret = scratch_cuda_cmux<uint64_t>(
|
||||
CudaStreams(streams), (int_cmux_buffer<uint64_t> **)mem_ptr,
|
||||
predicate_lut_f, lwe_ciphertext_count, params, allocate_gpu_memory);
|
||||
POP_RANGE()
|
||||
return ret;
|
||||
}
|
||||
|
||||
void cuda_cmux_integer_radix_ciphertext_kb_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_condition,
|
||||
CudaRadixCiphertextFFI const *lwe_array_true,
|
||||
CudaRadixCiphertextFFI const *lwe_array_false, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks) {
|
||||
void cuda_cmux_ciphertext_64(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_condition,
|
||||
CudaRadixCiphertextFFI const *lwe_array_true,
|
||||
CudaRadixCiphertextFFI const *lwe_array_false,
|
||||
int8_t *mem_ptr, void *const *bsks,
|
||||
void *const *ksks) {
|
||||
PUSH_RANGE("cmux")
|
||||
host_integer_radix_cmux_kb<uint64_t>(
|
||||
CudaStreams(streams), lwe_array_out, lwe_condition, lwe_array_true,
|
||||
lwe_array_false, (int_cmux_buffer<uint64_t> *)mem_ptr, bsks,
|
||||
(uint64_t **)(ksks));
|
||||
host_cmux<uint64_t>(CudaStreams(streams), lwe_array_out, lwe_condition,
|
||||
lwe_array_true, lwe_array_false,
|
||||
(int_cmux_buffer<uint64_t> *)mem_ptr, bsks,
|
||||
(uint64_t **)(ksks));
|
||||
POP_RANGE()
|
||||
}
|
||||
|
||||
void cleanup_cuda_integer_radix_cmux(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void) {
|
||||
void cleanup_cuda_cmux(CudaStreamsFFI streams, int8_t **mem_ptr_void) {
|
||||
PUSH_RANGE("cleanup cmux")
|
||||
int_cmux_buffer<uint64_t> *mem_ptr =
|
||||
(int_cmux_buffer<uint64_t> *)(*mem_ptr_void);
|
||||
|
||||
@@ -28,7 +28,7 @@ __host__ void zero_out_if(CudaStreams streams,
|
||||
cuda_set_device(streams.gpu_index(0));
|
||||
auto params = mem_ptr->params;
|
||||
|
||||
// We can't use integer_radix_apply_bivariate_lookup_table_kb since the
|
||||
// We can't use integer_radix_apply_bivariate_lookup_table since the
|
||||
// second operand is not an array
|
||||
auto tmp_lwe_array_input = mem_ptr->tmp;
|
||||
host_pack_bivariate_blocks_with_single_block<Torus>(
|
||||
@@ -36,18 +36,19 @@ __host__ void zero_out_if(CudaStreams streams,
|
||||
lwe_condition, predicate->lwe_indexes_in, params.message_modulus,
|
||||
num_radix_blocks);
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
streams, lwe_array_out, tmp_lwe_array_input, bsks, ksks, predicate,
|
||||
num_radix_blocks);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_integer_radix_cmux_kb(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_condition,
|
||||
CudaRadixCiphertextFFI const *lwe_array_true,
|
||||
CudaRadixCiphertextFFI const *lwe_array_false,
|
||||
int_cmux_buffer<Torus> *mem_ptr, void *const *bsks, Torus *const *ksks) {
|
||||
__host__ void host_cmux(CudaStreams streams,
|
||||
CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_condition,
|
||||
CudaRadixCiphertextFFI const *lwe_array_true,
|
||||
CudaRadixCiphertextFFI const *lwe_array_false,
|
||||
int_cmux_buffer<Torus> *mem_ptr, void *const *bsks,
|
||||
Torus *const *ksks) {
|
||||
|
||||
if (lwe_array_out->num_radix_blocks != lwe_array_true->num_radix_blocks)
|
||||
PANIC("Cuda error: input and output num radix blocks must be the same")
|
||||
@@ -69,7 +70,7 @@ __host__ void host_integer_radix_cmux_kb(
|
||||
streams.stream(0), streams.gpu_index(0), mem_ptr->condition_array, i,
|
||||
i + 1, lwe_condition, 0, 1);
|
||||
}
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_bivariate_lookup_table<Torus>(
|
||||
streams, mem_ptr->buffer_out, mem_ptr->buffer_in,
|
||||
mem_ptr->condition_array, bsks, ksks, mem_ptr->predicate_lut,
|
||||
2 * num_radix_blocks, params.message_modulus);
|
||||
@@ -88,16 +89,18 @@ __host__ void host_integer_radix_cmux_kb(
|
||||
&mem_true, &mem_false, num_radix_blocks,
|
||||
params.message_modulus, params.carry_modulus);
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
streams, lwe_array_out, &mem_true, bsks, ksks,
|
||||
mem_ptr->message_extract_lut, num_radix_blocks);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ uint64_t scratch_cuda_integer_radix_cmux_kb(
|
||||
CudaStreams streams, int_cmux_buffer<Torus> **mem_ptr,
|
||||
std::function<Torus(Torus)> predicate_lut_f, uint32_t num_radix_blocks,
|
||||
int_radix_params params, bool allocate_gpu_memory) {
|
||||
__host__ uint64_t scratch_cuda_cmux(CudaStreams streams,
|
||||
int_cmux_buffer<Torus> **mem_ptr,
|
||||
std::function<Torus(Torus)> predicate_lut_f,
|
||||
uint32_t num_radix_blocks,
|
||||
int_radix_params params,
|
||||
bool allocate_gpu_memory) {
|
||||
uint64_t size_tracker = 0;
|
||||
*mem_ptr = new int_cmux_buffer<Torus>(streams, predicate_lut_f, params,
|
||||
num_radix_blocks, allocate_gpu_memory,
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#include "integer/comparison.cuh"
|
||||
|
||||
uint64_t scratch_cuda_integer_radix_comparison_kb_64(
|
||||
uint64_t scratch_cuda_comparison_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
@@ -18,7 +18,7 @@ uint64_t scratch_cuda_integer_radix_comparison_kb_64(
|
||||
switch (op_type) {
|
||||
case EQ:
|
||||
case NE:
|
||||
size_tracker += scratch_cuda_integer_radix_comparison_check_kb<uint64_t>(
|
||||
size_tracker += scratch_cuda_comparison_check<uint64_t>(
|
||||
CudaStreams(streams), (int_comparison_buffer<uint64_t> **)mem_ptr,
|
||||
num_radix_blocks, params, op_type, false, allocate_gpu_memory);
|
||||
break;
|
||||
@@ -28,7 +28,7 @@ uint64_t scratch_cuda_integer_radix_comparison_kb_64(
|
||||
case LE:
|
||||
case MAX:
|
||||
case MIN:
|
||||
size_tracker += scratch_cuda_integer_radix_comparison_check_kb<uint64_t>(
|
||||
size_tracker += scratch_cuda_comparison_check<uint64_t>(
|
||||
CudaStreams(streams), (int_comparison_buffer<uint64_t> **)mem_ptr,
|
||||
num_radix_blocks, params, op_type, is_signed, allocate_gpu_memory);
|
||||
break;
|
||||
@@ -37,11 +37,12 @@ uint64_t scratch_cuda_integer_radix_comparison_kb_64(
|
||||
return size_tracker;
|
||||
}
|
||||
|
||||
void cuda_comparison_integer_radix_ciphertext_kb_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_1,
|
||||
CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks) {
|
||||
void cuda_comparison_ciphertext_64(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_1,
|
||||
CudaRadixCiphertextFFI const *lwe_array_2,
|
||||
int8_t *mem_ptr, void *const *bsks,
|
||||
void *const *ksks) {
|
||||
PUSH_RANGE("comparison")
|
||||
if (lwe_array_1->num_radix_blocks != lwe_array_2->num_radix_blocks)
|
||||
PANIC("Cuda error: input num radix blocks must be the same")
|
||||
@@ -54,9 +55,9 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
|
||||
switch (buffer->op) {
|
||||
case EQ:
|
||||
case NE:
|
||||
host_integer_radix_equality_check_kb<uint64_t>(
|
||||
CudaStreams(streams), lwe_array_out, lwe_array_1, lwe_array_2, buffer,
|
||||
bsks, (uint64_t **)(ksks), num_radix_blocks);
|
||||
host_equality_check<uint64_t>(CudaStreams(streams), lwe_array_out,
|
||||
lwe_array_1, lwe_array_2, buffer, bsks,
|
||||
(uint64_t **)(ksks), num_radix_blocks);
|
||||
break;
|
||||
case GT:
|
||||
case GE:
|
||||
@@ -65,18 +66,18 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
|
||||
if (num_radix_blocks % 2 != 0)
|
||||
PANIC("Cuda error (comparisons): the number of radix blocks has to be "
|
||||
"even.")
|
||||
host_integer_radix_difference_check_kb<uint64_t>(
|
||||
CudaStreams(streams), lwe_array_out, lwe_array_1, lwe_array_2, buffer,
|
||||
buffer->diff_buffer->operator_f, bsks, (uint64_t **)(ksks),
|
||||
num_radix_blocks);
|
||||
host_difference_check<uint64_t>(CudaStreams(streams), lwe_array_out,
|
||||
lwe_array_1, lwe_array_2, buffer,
|
||||
buffer->diff_buffer->operator_f, bsks,
|
||||
(uint64_t **)(ksks), num_radix_blocks);
|
||||
break;
|
||||
case MAX:
|
||||
case MIN:
|
||||
if (num_radix_blocks % 2 != 0)
|
||||
PANIC("Cuda error (max/min): the number of radix blocks has to be even.")
|
||||
host_integer_radix_maxmin_kb<uint64_t>(
|
||||
CudaStreams(streams), lwe_array_out, lwe_array_1, lwe_array_2, buffer,
|
||||
bsks, (uint64_t **)(ksks), num_radix_blocks);
|
||||
host_maxmin<uint64_t>(CudaStreams(streams), lwe_array_out, lwe_array_1,
|
||||
lwe_array_2, buffer, bsks, (uint64_t **)(ksks),
|
||||
num_radix_blocks);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error: integer operation not supported")
|
||||
@@ -95,7 +96,7 @@ void cleanup_cuda_integer_comparison(CudaStreamsFFI streams,
|
||||
POP_RANGE()
|
||||
}
|
||||
|
||||
uint64_t scratch_cuda_integer_are_all_comparisons_block_true_kb_64(
|
||||
uint64_t scratch_cuda_integer_are_all_comparisons_block_true_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
@@ -109,12 +110,12 @@ uint64_t scratch_cuda_integer_are_all_comparisons_block_true_kb_64(
|
||||
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
|
||||
message_modulus, carry_modulus, noise_reduction_type);
|
||||
|
||||
return scratch_cuda_integer_radix_comparison_check_kb<uint64_t>(
|
||||
return scratch_cuda_comparison_check<uint64_t>(
|
||||
CudaStreams(streams), (int_comparison_buffer<uint64_t> **)mem_ptr,
|
||||
num_radix_blocks, params, EQ, false, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cuda_integer_are_all_comparisons_block_true_kb_64(
|
||||
void cuda_integer_are_all_comparisons_block_true_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks, uint32_t num_radix_blocks) {
|
||||
@@ -122,7 +123,7 @@ void cuda_integer_are_all_comparisons_block_true_kb_64(
|
||||
int_comparison_buffer<uint64_t> *buffer =
|
||||
(int_comparison_buffer<uint64_t> *)mem_ptr;
|
||||
|
||||
host_integer_are_all_comparisons_block_true_kb<uint64_t>(
|
||||
host_integer_are_all_comparisons_block_true<uint64_t>(
|
||||
CudaStreams(streams), lwe_array_out, lwe_array_in, buffer, bsks,
|
||||
(uint64_t **)(ksks), num_radix_blocks);
|
||||
}
|
||||
@@ -137,7 +138,7 @@ void cleanup_cuda_integer_are_all_comparisons_block_true(
|
||||
*mem_ptr_void = nullptr;
|
||||
}
|
||||
|
||||
uint64_t scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
|
||||
uint64_t scratch_cuda_integer_is_at_least_one_comparisons_block_true_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
@@ -151,12 +152,12 @@ uint64_t scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
|
||||
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
|
||||
message_modulus, carry_modulus, noise_reduction_type);
|
||||
|
||||
return scratch_cuda_integer_radix_comparison_check_kb<uint64_t>(
|
||||
return scratch_cuda_comparison_check<uint64_t>(
|
||||
CudaStreams(streams), (int_comparison_buffer<uint64_t> **)mem_ptr,
|
||||
num_radix_blocks, params, EQ, false, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
|
||||
void cuda_integer_is_at_least_one_comparisons_block_true_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks, uint32_t num_radix_blocks) {
|
||||
@@ -164,7 +165,7 @@ void cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
|
||||
int_comparison_buffer<uint64_t> *buffer =
|
||||
(int_comparison_buffer<uint64_t> *)mem_ptr;
|
||||
|
||||
host_integer_is_at_least_one_comparisons_block_true_kb<uint64_t>(
|
||||
host_integer_is_at_least_one_comparisons_block_true<uint64_t>(
|
||||
CudaStreams(streams), lwe_array_out, lwe_array_in, buffer, bsks,
|
||||
(uint64_t **)(ksks), num_radix_blocks);
|
||||
}
|
||||
|
||||
@@ -155,7 +155,7 @@ __host__ void are_all_comparisons_block_true(
|
||||
// Applies the LUT
|
||||
if (remaining_blocks == 1) {
|
||||
// In the last iteration we copy the output to the final address
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
streams, lwe_array_out, accumulator, bsks, ksks, lut, 1);
|
||||
// Reset max_value_lut_indexes before returning, otherwise if the lut is
|
||||
// reused the lut indexes will be wrong
|
||||
@@ -172,7 +172,7 @@ __host__ void are_all_comparisons_block_true(
|
||||
reset_radix_ciphertext_blocks(lwe_array_out, 1);
|
||||
return;
|
||||
} else {
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
streams, tmp_out, accumulator, bsks, ksks, lut, num_chunks);
|
||||
}
|
||||
}
|
||||
@@ -241,12 +241,12 @@ __host__ void is_at_least_one_comparisons_block_true(
|
||||
// Applies the LUT
|
||||
if (remaining_blocks == 1) {
|
||||
// In the last iteration we copy the output to the final address
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
streams, lwe_array_out, buffer->tmp_block_accumulated, bsks, ksks,
|
||||
lut, 1);
|
||||
return;
|
||||
} else {
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
streams, mem_ptr->tmp_lwe_array_out, buffer->tmp_block_accumulated,
|
||||
bsks, ksks, lut, num_chunks);
|
||||
}
|
||||
@@ -314,19 +314,19 @@ __host__ void host_compare_blocks_with_zero(
|
||||
}
|
||||
}
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
streams, lwe_array_out, sum, bsks, ksks, zero_comparison, num_sum_blocks);
|
||||
|
||||
reset_radix_ciphertext_blocks(lwe_array_out, num_sum_blocks);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_integer_radix_equality_check_kb(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_1,
|
||||
CudaRadixCiphertextFFI const *lwe_array_2,
|
||||
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
|
||||
Torus *const *ksks, uint32_t num_radix_blocks) {
|
||||
__host__ void
|
||||
host_equality_check(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_1,
|
||||
CudaRadixCiphertextFFI const *lwe_array_2,
|
||||
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
|
||||
Torus *const *ksks, uint32_t num_radix_blocks) {
|
||||
|
||||
if (lwe_array_out->lwe_dimension != lwe_array_1->lwe_dimension ||
|
||||
lwe_array_out->lwe_dimension != lwe_array_2->lwe_dimension)
|
||||
@@ -335,7 +335,7 @@ __host__ void host_integer_radix_equality_check_kb(
|
||||
|
||||
// Applies the LUT for the comparison operation
|
||||
auto comparisons = mem_ptr->tmp_block_comparisons;
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_bivariate_lookup_table<Torus>(
|
||||
streams, comparisons, lwe_array_1, lwe_array_2, bsks, ksks,
|
||||
eq_buffer->operator_lut, num_radix_blocks,
|
||||
eq_buffer->operator_lut->params.message_modulus);
|
||||
@@ -349,12 +349,12 @@ __host__ void host_integer_radix_equality_check_kb(
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void compare_radix_blocks_kb(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_left,
|
||||
CudaRadixCiphertextFFI const *lwe_array_right,
|
||||
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
|
||||
Torus *const *ksks, uint32_t num_radix_blocks) {
|
||||
__host__ void
|
||||
compare_radix_blocks(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_left,
|
||||
CudaRadixCiphertextFFI const *lwe_array_right,
|
||||
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
|
||||
Torus *const *ksks, uint32_t num_radix_blocks) {
|
||||
|
||||
if (lwe_array_out->lwe_dimension != lwe_array_left->lwe_dimension ||
|
||||
lwe_array_out->lwe_dimension != lwe_array_right->lwe_dimension)
|
||||
@@ -386,15 +386,15 @@ __host__ void compare_radix_blocks_kb(
|
||||
|
||||
// Apply LUT to compare to 0
|
||||
auto is_non_zero_lut = mem_ptr->eq_buffer->is_non_zero_lut;
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
streams, lwe_array_out, lwe_array_out, bsks, ksks, is_non_zero_lut,
|
||||
num_radix_blocks);
|
||||
|
||||
// Add one
|
||||
// Here Lhs can have the following values: (-1) % (message modulus * carry
|
||||
// modulus), 0, 1 So the output values after the addition will be: 0, 1, 2
|
||||
host_integer_radix_add_scalar_one_inplace<Torus>(
|
||||
streams, lwe_array_out, message_modulus, carry_modulus);
|
||||
host_add_scalar_one_inplace<Torus>(streams, lwe_array_out, message_modulus,
|
||||
carry_modulus);
|
||||
}
|
||||
|
||||
// Reduces a vec containing shortint blocks that encrypts a sign
|
||||
@@ -439,7 +439,7 @@ tree_sign_reduction(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
pack_blocks<Torus>(streams.stream(0), streams.gpu_index(0), y, x,
|
||||
partial_block_count, message_modulus);
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
streams, x, y, bsks, ksks, inner_tree_leaf, partial_block_count >> 1);
|
||||
|
||||
if ((partial_block_count % 2) != 0) {
|
||||
@@ -485,12 +485,12 @@ tree_sign_reduction(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
last_lut->broadcast_lut(active_streams);
|
||||
|
||||
// Last leaf
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, lwe_array_out, y, bsks, ksks, last_lut, 1);
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(streams, lwe_array_out, y,
|
||||
bsks, ksks, last_lut, 1);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_integer_radix_difference_check_kb(
|
||||
__host__ void host_difference_check(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_left,
|
||||
CudaRadixCiphertextFFI const *lwe_array_right,
|
||||
@@ -534,7 +534,7 @@ __host__ void host_integer_radix_difference_check_kb(
|
||||
|
||||
// Clean noise
|
||||
auto identity_lut = mem_ptr->identity_lut;
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
streams, diff_buffer->tmp_packed, diff_buffer->tmp_packed, bsks, ksks,
|
||||
identity_lut, 2 * packed_num_radix_blocks);
|
||||
} else {
|
||||
@@ -553,15 +553,15 @@ __host__ void host_integer_radix_difference_check_kb(
|
||||
if (!mem_ptr->is_signed) {
|
||||
// Compare packed blocks, or simply the total number of radix blocks in the
|
||||
// inputs
|
||||
compare_radix_blocks_kb<Torus>(streams, comparisons, &lhs, &rhs, mem_ptr,
|
||||
bsks, ksks, packed_num_radix_blocks);
|
||||
compare_radix_blocks<Torus>(streams, comparisons, &lhs, &rhs, mem_ptr, bsks,
|
||||
ksks, packed_num_radix_blocks);
|
||||
num_comparisons = packed_num_radix_blocks;
|
||||
} else {
|
||||
// Packing is possible
|
||||
if (carry_modulus >= message_modulus) {
|
||||
// Compare (num_radix_blocks - 2) / 2 packed blocks
|
||||
compare_radix_blocks_kb<Torus>(streams, comparisons, &lhs, &rhs, mem_ptr,
|
||||
bsks, ksks, packed_num_radix_blocks);
|
||||
compare_radix_blocks<Torus>(streams, comparisons, &lhs, &rhs, mem_ptr,
|
||||
bsks, ksks, packed_num_radix_blocks);
|
||||
|
||||
// Compare the last block before the sign block separately
|
||||
auto identity_lut = mem_ptr->identity_lut;
|
||||
@@ -573,7 +573,7 @@ __host__ void host_integer_radix_difference_check_kb(
|
||||
as_radix_ciphertext_slice<Torus>(&shifted_lwe_array_left, lwe_array_left,
|
||||
num_radix_blocks - 2,
|
||||
num_radix_blocks - 1);
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
streams, &last_left_block_before_sign_block, &shifted_lwe_array_left,
|
||||
bsks, ksks, identity_lut, 1);
|
||||
|
||||
@@ -586,7 +586,7 @@ __host__ void host_integer_radix_difference_check_kb(
|
||||
as_radix_ciphertext_slice<Torus>(&shifted_lwe_array_right,
|
||||
lwe_array_right, num_radix_blocks - 2,
|
||||
num_radix_blocks - 1);
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
streams, &last_right_block_before_sign_block,
|
||||
&shifted_lwe_array_right, bsks, ksks, identity_lut, 1);
|
||||
|
||||
@@ -594,7 +594,7 @@ __host__ void host_integer_radix_difference_check_kb(
|
||||
as_radix_ciphertext_slice<Torus>(&shifted_comparisons, comparisons,
|
||||
packed_num_radix_blocks,
|
||||
packed_num_radix_blocks + 1);
|
||||
compare_radix_blocks_kb<Torus>(
|
||||
compare_radix_blocks<Torus>(
|
||||
streams, &shifted_comparisons, &last_left_block_before_sign_block,
|
||||
&last_right_block_before_sign_block, mem_ptr, bsks, ksks, 1);
|
||||
|
||||
@@ -608,16 +608,16 @@ __host__ void host_integer_radix_difference_check_kb(
|
||||
CudaRadixCiphertextFFI last_right_block;
|
||||
as_radix_ciphertext_slice<Torus>(&last_right_block, lwe_array_right,
|
||||
num_radix_blocks - 1, num_radix_blocks);
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_bivariate_lookup_table<Torus>(
|
||||
streams, &shifted_comparisons, &last_left_block, &last_right_block,
|
||||
bsks, ksks, mem_ptr->signed_lut, 1,
|
||||
mem_ptr->signed_lut->params.message_modulus);
|
||||
num_comparisons = packed_num_radix_blocks + 2;
|
||||
|
||||
} else {
|
||||
compare_radix_blocks_kb<Torus>(streams, comparisons, lwe_array_left,
|
||||
lwe_array_right, mem_ptr, bsks, ksks,
|
||||
num_radix_blocks - 1);
|
||||
compare_radix_blocks<Torus>(streams, comparisons, lwe_array_left,
|
||||
lwe_array_right, mem_ptr, bsks, ksks,
|
||||
num_radix_blocks - 1);
|
||||
// Compare the sign block separately
|
||||
CudaRadixCiphertextFFI shifted_comparisons;
|
||||
as_radix_ciphertext_slice<Torus>(&shifted_comparisons, comparisons,
|
||||
@@ -628,7 +628,7 @@ __host__ void host_integer_radix_difference_check_kb(
|
||||
CudaRadixCiphertextFFI last_right_block;
|
||||
as_radix_ciphertext_slice<Torus>(&last_right_block, lwe_array_right,
|
||||
num_radix_blocks - 1, num_radix_blocks);
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_bivariate_lookup_table<Torus>(
|
||||
streams, &shifted_comparisons, &last_left_block, &last_right_block,
|
||||
bsks, ksks, mem_ptr->signed_lut, 1,
|
||||
mem_ptr->signed_lut->params.message_modulus);
|
||||
@@ -645,7 +645,7 @@ __host__ void host_integer_radix_difference_check_kb(
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ uint64_t scratch_cuda_integer_radix_comparison_check_kb(
|
||||
__host__ uint64_t scratch_cuda_comparison_check(
|
||||
CudaStreams streams, int_comparison_buffer<Torus> **mem_ptr,
|
||||
uint32_t num_radix_blocks, int_radix_params params, COMPARISON_TYPE op,
|
||||
bool is_signed, bool allocate_gpu_memory) {
|
||||
@@ -658,12 +658,12 @@ __host__ uint64_t scratch_cuda_integer_radix_comparison_check_kb(
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_integer_radix_maxmin_kb(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_left,
|
||||
CudaRadixCiphertextFFI const *lwe_array_right,
|
||||
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
|
||||
Torus *const *ksks, uint32_t num_radix_blocks) {
|
||||
__host__ void
|
||||
host_maxmin(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_left,
|
||||
CudaRadixCiphertextFFI const *lwe_array_right,
|
||||
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
|
||||
Torus *const *ksks, uint32_t num_radix_blocks) {
|
||||
|
||||
if (lwe_array_out->lwe_dimension != lwe_array_left->lwe_dimension ||
|
||||
lwe_array_out->lwe_dimension != lwe_array_right->lwe_dimension)
|
||||
@@ -675,18 +675,18 @@ __host__ void host_integer_radix_maxmin_kb(
|
||||
"than the number of blocks to operate on")
|
||||
|
||||
// Compute the sign
|
||||
host_integer_radix_difference_check_kb<Torus>(
|
||||
host_difference_check<Torus>(
|
||||
streams, mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right,
|
||||
mem_ptr, mem_ptr->identity_lut_f, bsks, ksks, num_radix_blocks);
|
||||
|
||||
// Selector
|
||||
host_integer_radix_cmux_kb<Torus>(
|
||||
streams, lwe_array_out, mem_ptr->tmp_lwe_array_out, lwe_array_left,
|
||||
lwe_array_right, mem_ptr->cmux_buffer, bsks, ksks);
|
||||
host_cmux<Torus>(streams, lwe_array_out, mem_ptr->tmp_lwe_array_out,
|
||||
lwe_array_left, lwe_array_right, mem_ptr->cmux_buffer, bsks,
|
||||
ksks);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_integer_are_all_comparisons_block_true_kb(
|
||||
__host__ void host_integer_are_all_comparisons_block_true(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in,
|
||||
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
|
||||
@@ -699,7 +699,7 @@ __host__ void host_integer_are_all_comparisons_block_true_kb(
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_integer_is_at_least_one_comparisons_block_true_kb(
|
||||
__host__ void host_integer_is_at_least_one_comparisons_block_true(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in,
|
||||
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
|
||||
|
||||
@@ -13,7 +13,7 @@ uint64_t scratch_cuda_integer_compress_radix_ciphertext_64(
|
||||
lwe_dimension, ks_level, ks_base_log, 0, 0, 0, message_modulus,
|
||||
carry_modulus, PBS_MS_REDUCTION_T::NO_REDUCTION);
|
||||
|
||||
return scratch_cuda_compress_integer_radix_ciphertext<uint64_t>(
|
||||
return scratch_cuda_compress_ciphertext<uint64_t>(
|
||||
CudaStreams(streams), (int_compression<uint64_t> **)mem_ptr,
|
||||
num_radix_blocks, compression_params, lwe_per_glwe, allocate_gpu_memory);
|
||||
}
|
||||
@@ -93,7 +93,7 @@ uint64_t scratch_cuda_integer_compress_radix_ciphertext_128(
|
||||
lwe_dimension, ks_level, ks_base_log, 0, 0, 0, message_modulus,
|
||||
carry_modulus, PBS_MS_REDUCTION_T::NO_REDUCTION);
|
||||
|
||||
return scratch_cuda_compress_integer_radix_ciphertext<__uint128_t>(
|
||||
return scratch_cuda_compress_ciphertext<__uint128_t>(
|
||||
CudaStreams(streams), (int_compression<__uint128_t> **)mem_ptr,
|
||||
num_radix_blocks, compression_params, lwe_per_glwe, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
@@ -401,7 +401,7 @@ host_integer_decompress(CudaStreams streams,
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ uint64_t scratch_cuda_compress_integer_radix_ciphertext(
|
||||
__host__ uint64_t scratch_cuda_compress_ciphertext(
|
||||
CudaStreams streams, int_compression<Torus> **mem_ptr,
|
||||
uint32_t num_radix_blocks, int_radix_params compression_params,
|
||||
uint32_t lwe_per_glwe, bool allocate_gpu_memory) {
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#include "integer/div_rem.cuh"
|
||||
|
||||
uint64_t scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
|
||||
uint64_t scratch_cuda_integer_div_rem_radix_ciphertext_64(
|
||||
CudaStreamsFFI streams, bool is_signed, int8_t **mem_ptr,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
|
||||
@@ -14,13 +14,13 @@ uint64_t scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
|
||||
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
|
||||
message_modulus, carry_modulus, noise_reduction_type);
|
||||
|
||||
return scratch_cuda_integer_div_rem_kb<uint64_t>(
|
||||
return scratch_cuda_integer_div_rem<uint64_t>(
|
||||
CudaStreams(streams), is_signed, (int_div_rem_memory<uint64_t> **)mem_ptr,
|
||||
num_blocks, params, allocate_gpu_memory);
|
||||
POP_RANGE()
|
||||
}
|
||||
|
||||
void cuda_integer_div_rem_radix_ciphertext_kb_64(
|
||||
void cuda_integer_div_rem_radix_ciphertext_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *quotient,
|
||||
CudaRadixCiphertextFFI *remainder, CudaRadixCiphertextFFI const *numerator,
|
||||
CudaRadixCiphertextFFI const *divisor, bool is_signed, int8_t *mem_ptr,
|
||||
@@ -28,9 +28,9 @@ void cuda_integer_div_rem_radix_ciphertext_kb_64(
|
||||
PUSH_RANGE("div")
|
||||
auto mem = (int_div_rem_memory<uint64_t> *)mem_ptr;
|
||||
|
||||
host_integer_div_rem_kb<uint64_t>(CudaStreams(streams), quotient, remainder,
|
||||
numerator, divisor, is_signed, bsks,
|
||||
(uint64_t **)(ksks), mem);
|
||||
host_integer_div_rem<uint64_t>(CudaStreams(streams), quotient, remainder,
|
||||
numerator, divisor, is_signed, bsks,
|
||||
(uint64_t **)(ksks), mem);
|
||||
POP_RANGE()
|
||||
}
|
||||
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
#include <fstream>
|
||||
|
||||
template <typename Torus>
|
||||
__host__ uint64_t scratch_cuda_integer_div_rem_kb(
|
||||
__host__ uint64_t scratch_cuda_integer_div_rem(
|
||||
CudaStreams streams, bool is_signed, int_div_rem_memory<Torus> **mem_ptr,
|
||||
uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory) {
|
||||
|
||||
@@ -26,7 +26,7 @@ __host__ uint64_t scratch_cuda_integer_div_rem_kb(
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
|
||||
__host__ void host_unsigned_integer_div_rem_block_by_block_2_2(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *quotient,
|
||||
CudaRadixCiphertextFFI *remainder, CudaRadixCiphertextFFI const *numerator,
|
||||
CudaRadixCiphertextFFI const *divisor, void *const *bsks,
|
||||
@@ -85,7 +85,7 @@ __host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
|
||||
// Computes 2*d by extending and shifting on gpu[1]
|
||||
host_extend_radix_with_trivial_zero_blocks_msb<Torus>(
|
||||
mem_ptr->d2, divisor_gpu_1, streams.get_ith(1));
|
||||
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
|
||||
host_logical_scalar_shift_inplace<Torus>(
|
||||
streams.get_ith(1), mem_ptr->d2, 1, mem_ptr->shift_mem, &bsks[1],
|
||||
&ksks[1], mem_ptr->d2->num_radix_blocks);
|
||||
|
||||
@@ -250,14 +250,14 @@ __host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
|
||||
auto o3 = mem_ptr->sub_1_overflowed;
|
||||
|
||||
// used as a bitor
|
||||
host_integer_radix_bitop_kb(streams.get_ith(0), o3, o3, mem_ptr->cmp_1,
|
||||
mem_ptr->bitor_mem_1, &bsks[0], &ksks[0]);
|
||||
host_bitop(streams.get_ith(0), o3, o3, mem_ptr->cmp_1, mem_ptr->bitor_mem_1,
|
||||
&bsks[0], &ksks[0]);
|
||||
// used as a bitor
|
||||
host_integer_radix_bitop_kb(streams.get_ith(1), o2, o2, mem_ptr->cmp_2,
|
||||
mem_ptr->bitor_mem_2, &bsks[1], &ksks[1]);
|
||||
host_bitop(streams.get_ith(1), o2, o2, mem_ptr->cmp_2, mem_ptr->bitor_mem_2,
|
||||
&bsks[1], &ksks[1]);
|
||||
// used as a bitor
|
||||
host_integer_radix_bitop_kb(streams.get_ith(2), o1, o1, mem_ptr->cmp_3,
|
||||
mem_ptr->bitor_mem_3, &bsks[2], &ksks[2]);
|
||||
host_bitop(streams.get_ith(2), o1, o1, mem_ptr->cmp_3, mem_ptr->bitor_mem_3,
|
||||
&bsks[2], &ksks[2]);
|
||||
|
||||
// cmp_1, cmp_2, cmp_3 are not needed anymore, we can reuse them as c3,
|
||||
// c2, c1. c0 is allocated on gpu[3], we take it from mem_ptr.
|
||||
@@ -337,7 +337,7 @@ __host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
|
||||
host_add_the_same_block_to_all_blocks<Torus>(streams.stream(gpu_index),
|
||||
streams.gpu_index(gpu_index),
|
||||
rx, rx, cx, 4, 4);
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
streams.get_ith(gpu_index), rx, rx, &bsks[gpu_index],
|
||||
&ksks[gpu_index], lut, rx->num_radix_blocks);
|
||||
};
|
||||
@@ -355,15 +355,15 @@ __host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
|
||||
mem_ptr->zero_out_if_not_1_lut_2, 2);
|
||||
|
||||
// calculate quotient bits GPU[2]
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
mem_ptr->sub_streams_1.get_ith(2), mem_ptr->q1, c1, &bsks[2], &ksks[2],
|
||||
mem_ptr->quotient_lut_1, 1);
|
||||
// calculate quotient bits GPU[1]
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
mem_ptr->sub_streams_1.get_ith(1), mem_ptr->q2, c2, &bsks[1], &ksks[1],
|
||||
mem_ptr->quotient_lut_2, 1);
|
||||
// calculate quotient bits GPU[0]
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
mem_ptr->sub_streams_1.get_ith(0), mem_ptr->q3, c3, &bsks[0], &ksks[0],
|
||||
mem_ptr->quotient_lut_3, 1);
|
||||
|
||||
@@ -427,10 +427,10 @@ __host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
|
||||
|
||||
streams.synchronize();
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
streams, rem_gpu_0, rem_gpu_0, bsks, ksks,
|
||||
mem_ptr->message_extract_lut_1, rem_gpu_0->num_radix_blocks);
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
mem_ptr->sub_streams_1, q3_gpu_0, q3_gpu_0, bsks, ksks,
|
||||
mem_ptr->message_extract_lut_2, 1);
|
||||
streams.synchronize();
|
||||
@@ -469,7 +469,7 @@ __host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_unsigned_integer_div_rem_kb(
|
||||
__host__ void host_unsigned_integer_div_rem(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *quotient,
|
||||
CudaRadixCiphertextFFI *remainder, CudaRadixCiphertextFFI const *numerator,
|
||||
CudaRadixCiphertextFFI const *divisor, void *const *bsks,
|
||||
@@ -486,7 +486,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
|
||||
|
||||
if (mem_ptr->params.message_modulus == 4 &&
|
||||
mem_ptr->params.carry_modulus == 4 && streams.count() >= 4) {
|
||||
host_unsigned_integer_div_rem_kb_block_by_block_2_2<Torus>(
|
||||
host_unsigned_integer_div_rem_block_by_block_2_2<Torus>(
|
||||
streams, quotient, remainder, numerator, divisor, bsks, ksks,
|
||||
mem_ptr->div_rem_2_2_mem);
|
||||
return;
|
||||
@@ -587,7 +587,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
|
||||
&last_interesting_divisor_block, interesting_divisor,
|
||||
interesting_divisor->num_radix_blocks - 1,
|
||||
interesting_divisor->num_radix_blocks);
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
streams, &last_interesting_divisor_block,
|
||||
&last_interesting_divisor_block, bsks, ksks,
|
||||
mem_ptr->masking_luts_1[shifted_mask], 1);
|
||||
@@ -614,7 +614,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
|
||||
// the estimated degree of the output is < msg_modulus
|
||||
shifted_mask = shifted_mask & full_message_mask;
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
streams, divisor_ms_blocks, divisor_ms_blocks, bsks, ksks,
|
||||
mem_ptr->masking_luts_2[shifted_mask], 1);
|
||||
}; // trim_first_divisor_ms_bits
|
||||
@@ -636,7 +636,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
|
||||
streams.stream(0), streams.gpu_index(0), mem_ptr->numerator_block_1,
|
||||
interesting_remainder1, 0);
|
||||
|
||||
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
|
||||
host_logical_scalar_shift_inplace<Torus>(
|
||||
streams, interesting_remainder1, 1, mem_ptr->shift_mem_1, bsks, ksks,
|
||||
interesting_remainder1->num_radix_blocks);
|
||||
|
||||
@@ -665,7 +665,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
|
||||
}; // left_shift_interesting_remainder1
|
||||
|
||||
auto left_shift_interesting_remainder2 = [&](CudaStreams streams) {
|
||||
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
|
||||
host_logical_scalar_shift_inplace<Torus>(
|
||||
streams, interesting_remainder2, 1, mem_ptr->shift_mem_2, bsks, ksks,
|
||||
interesting_remainder2->num_radix_blocks);
|
||||
}; // left_shift_interesting_remainder2
|
||||
@@ -773,7 +773,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
|
||||
// fills:
|
||||
// `cleaned_merged_interesting_remainder` - radix ciphertext
|
||||
auto create_clean_version_of_merged_remainder = [&](CudaStreams streams) {
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
streams, cleaned_merged_interesting_remainder,
|
||||
cleaned_merged_interesting_remainder, bsks, ksks,
|
||||
mem_ptr->message_extract_lut_1,
|
||||
@@ -811,7 +811,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
|
||||
|
||||
auto conditionally_zero_out_merged_interesting_remainder =
|
||||
[&](CudaStreams streams) {
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_bivariate_lookup_table<Torus>(
|
||||
streams, cleaned_merged_interesting_remainder,
|
||||
cleaned_merged_interesting_remainder, overflow_sum_radix, bsks,
|
||||
ksks, mem_ptr->zero_out_if_overflow_did_not_happen[factor_lut_id],
|
||||
@@ -820,7 +820,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
|
||||
|
||||
auto conditionally_zero_out_merged_new_remainder =
|
||||
[&](CudaStreams streams) {
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_bivariate_lookup_table<Torus>(
|
||||
streams, new_remainder, new_remainder, overflow_sum_radix, bsks,
|
||||
ksks, mem_ptr->zero_out_if_overflow_happened[factor_lut_id],
|
||||
new_remainder->num_radix_blocks, factor);
|
||||
@@ -828,7 +828,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
|
||||
|
||||
auto set_quotient_bit = [&](CudaStreams streams) {
|
||||
uint32_t block_of_bit = i / num_bits_in_message;
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_bivariate_lookup_table<Torus>(
|
||||
streams, mem_ptr->did_not_overflow, subtraction_overflowed,
|
||||
at_least_one_upper_block_is_non_zero, bsks, ksks,
|
||||
mem_ptr->merge_overflow_flags_luts[pos_in_block], 1,
|
||||
@@ -887,10 +887,10 @@ __host__ void host_unsigned_integer_div_rem_kb(
|
||||
|
||||
streams.synchronize();
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
mem_ptr->sub_streams_1, remainder, remainder, bsks, ksks,
|
||||
mem_ptr->message_extract_lut_1, num_blocks);
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
mem_ptr->sub_streams_2, quotient, quotient, bsks, ksks,
|
||||
mem_ptr->message_extract_lut_2, num_blocks);
|
||||
|
||||
@@ -899,7 +899,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_integer_div_rem_kb(
|
||||
__host__ void host_integer_div_rem(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *quotient,
|
||||
CudaRadixCiphertextFFI *remainder, CudaRadixCiphertextFFI const *numerator,
|
||||
CudaRadixCiphertextFFI const *divisor, bool is_signed, void *const *bsks,
|
||||
@@ -927,15 +927,15 @@ __host__ void host_integer_div_rem_kb(
|
||||
|
||||
streams.synchronize();
|
||||
|
||||
host_integer_abs_kb<Torus>(int_mem_ptr->sub_streams_1, positive_numerator,
|
||||
bsks, ksks, int_mem_ptr->abs_mem_1, true);
|
||||
host_integer_abs_kb<Torus>(int_mem_ptr->sub_streams_2, positive_divisor,
|
||||
bsks, ksks, int_mem_ptr->abs_mem_2, true);
|
||||
host_integer_abs<Torus>(int_mem_ptr->sub_streams_1, positive_numerator,
|
||||
bsks, ksks, int_mem_ptr->abs_mem_1, true);
|
||||
host_integer_abs<Torus>(int_mem_ptr->sub_streams_2, positive_divisor, bsks,
|
||||
ksks, int_mem_ptr->abs_mem_2, true);
|
||||
|
||||
int_mem_ptr->sub_streams_1.synchronize();
|
||||
int_mem_ptr->sub_streams_2.synchronize();
|
||||
|
||||
host_unsigned_integer_div_rem_kb<Torus>(
|
||||
host_unsigned_integer_div_rem<Torus>(
|
||||
int_mem_ptr->sub_streams_1, quotient, remainder, positive_numerator,
|
||||
positive_divisor, bsks, ksks, int_mem_ptr->unsigned_mem);
|
||||
|
||||
@@ -945,7 +945,7 @@ __host__ void host_integer_div_rem_kb(
|
||||
CudaRadixCiphertextFFI divisor_sign;
|
||||
as_radix_ciphertext_slice<Torus>(&divisor_sign, divisor, num_blocks - 1,
|
||||
num_blocks);
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_bivariate_lookup_table<Torus>(
|
||||
int_mem_ptr->sub_streams_2, int_mem_ptr->sign_bits_are_different,
|
||||
&numerator_sign, &divisor_sign, bsks, ksks,
|
||||
int_mem_ptr->compare_signed_bits_lut, 1,
|
||||
@@ -954,7 +954,7 @@ __host__ void host_integer_div_rem_kb(
|
||||
int_mem_ptr->sub_streams_1.synchronize();
|
||||
int_mem_ptr->sub_streams_2.synchronize();
|
||||
|
||||
host_integer_radix_negation<Torus>(
|
||||
host_negation<Torus>(
|
||||
int_mem_ptr->sub_streams_1, int_mem_ptr->negated_quotient, quotient,
|
||||
radix_params.message_modulus, radix_params.carry_modulus, num_blocks);
|
||||
|
||||
@@ -965,7 +965,7 @@ __host__ void host_integer_div_rem_kb(
|
||||
nullptr, int_mem_ptr->scp_mem_1, bsks,
|
||||
ksks, requested_flag, uses_carry);
|
||||
|
||||
host_integer_radix_negation<Torus>(
|
||||
host_negation<Torus>(
|
||||
int_mem_ptr->sub_streams_2, int_mem_ptr->negated_remainder, remainder,
|
||||
radix_params.message_modulus, radix_params.carry_modulus, num_blocks);
|
||||
|
||||
@@ -974,22 +974,21 @@ __host__ void host_integer_div_rem_kb(
|
||||
nullptr, int_mem_ptr->scp_mem_2, bsks,
|
||||
ksks, requested_flag, uses_carry);
|
||||
|
||||
host_integer_radix_cmux_kb<Torus>(
|
||||
int_mem_ptr->sub_streams_1, quotient,
|
||||
int_mem_ptr->sign_bits_are_different, int_mem_ptr->negated_quotient,
|
||||
quotient, int_mem_ptr->cmux_quotient_mem, bsks, ksks);
|
||||
host_cmux<Torus>(int_mem_ptr->sub_streams_1, quotient,
|
||||
int_mem_ptr->sign_bits_are_different,
|
||||
int_mem_ptr->negated_quotient, quotient,
|
||||
int_mem_ptr->cmux_quotient_mem, bsks, ksks);
|
||||
|
||||
host_integer_radix_cmux_kb<Torus>(
|
||||
int_mem_ptr->sub_streams_2, remainder, &numerator_sign,
|
||||
int_mem_ptr->negated_remainder, remainder,
|
||||
int_mem_ptr->cmux_remainder_mem, bsks, ksks);
|
||||
host_cmux<Torus>(int_mem_ptr->sub_streams_2, remainder, &numerator_sign,
|
||||
int_mem_ptr->negated_remainder, remainder,
|
||||
int_mem_ptr->cmux_remainder_mem, bsks, ksks);
|
||||
|
||||
int_mem_ptr->sub_streams_1.synchronize();
|
||||
int_mem_ptr->sub_streams_2.synchronize();
|
||||
} else {
|
||||
host_unsigned_integer_div_rem_kb<Torus>(streams, quotient, remainder,
|
||||
numerator, divisor, bsks, ksks,
|
||||
int_mem_ptr->unsigned_mem);
|
||||
host_unsigned_integer_div_rem<Torus>(streams, quotient, remainder,
|
||||
numerator, divisor, bsks, ksks,
|
||||
int_mem_ptr->unsigned_mem);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#include "ilog2.cuh"
|
||||
|
||||
uint64_t scratch_integer_count_of_consecutive_bits_kb_64(
|
||||
uint64_t scratch_integer_count_of_consecutive_bits_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
@@ -26,7 +26,7 @@ uint64_t scratch_integer_count_of_consecutive_bits_kb_64(
|
||||
// the leading or trailing end of an encrypted integer. The final count is
|
||||
// stored in the output ciphertext.
|
||||
//
|
||||
void cuda_integer_count_of_consecutive_bits_kb_64(
|
||||
void cuda_integer_count_of_consecutive_bits_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_ct,
|
||||
CudaRadixCiphertextFFI const *input_ct, int8_t *mem_ptr, void *const *bsks,
|
||||
void *const *ksks) {
|
||||
@@ -37,8 +37,8 @@ void cuda_integer_count_of_consecutive_bits_kb_64(
|
||||
(uint64_t **)ksks);
|
||||
}
|
||||
|
||||
void cleanup_cuda_integer_count_of_consecutive_bits_kb_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr_void) {
|
||||
void cleanup_cuda_integer_count_of_consecutive_bits_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void) {
|
||||
|
||||
int_count_of_consecutive_bits_buffer<uint64_t> *mem_ptr =
|
||||
(int_count_of_consecutive_bits_buffer<uint64_t> *)(*mem_ptr_void);
|
||||
@@ -49,7 +49,7 @@ void cleanup_cuda_integer_count_of_consecutive_bits_kb_64(
|
||||
*mem_ptr_void = nullptr;
|
||||
}
|
||||
|
||||
uint64_t scratch_integer_ilog2_kb_64(
|
||||
uint64_t scratch_integer_ilog2_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
@@ -74,7 +74,7 @@ uint64_t scratch_integer_ilog2_kb_64(
|
||||
// This is equivalent to finding the position of the most significant bit.
|
||||
// The result is stored in the output ciphertext.
|
||||
//
|
||||
void cuda_integer_ilog2_kb_64(
|
||||
void cuda_integer_ilog2_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_ct,
|
||||
CudaRadixCiphertextFFI const *input_ct,
|
||||
CudaRadixCiphertextFFI const *trivial_ct_neg_n,
|
||||
@@ -88,8 +88,8 @@ void cuda_integer_ilog2_kb_64(
|
||||
(uint64_t **)ksks);
|
||||
}
|
||||
|
||||
void cleanup_cuda_integer_ilog2_kb_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void) {
|
||||
void cleanup_cuda_integer_ilog2_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void) {
|
||||
|
||||
int_ilog2_buffer<uint64_t> *mem_ptr =
|
||||
(int_ilog2_buffer<uint64_t> *)(*mem_ptr_void);
|
||||
|
||||
@@ -14,8 +14,8 @@ __host__ void host_integer_prepare_count_of_consecutive_bits(
|
||||
|
||||
auto tmp = mem_ptr->tmp_ct;
|
||||
|
||||
host_apply_univariate_lut_kb<Torus>(streams, tmp, ciphertext,
|
||||
mem_ptr->univ_lut_mem, ksks, bsks);
|
||||
host_apply_univariate_lut<Torus>(streams, tmp, ciphertext,
|
||||
mem_ptr->univ_lut_mem, ksks, bsks);
|
||||
|
||||
if (mem_ptr->direction == Leading) {
|
||||
host_radix_blocks_reverse_inplace<Torus>(streams, tmp);
|
||||
@@ -72,7 +72,7 @@ __host__ void host_integer_count_of_consecutive_bits(
|
||||
output_start_index + 1, ct_prepared, i, i + 1);
|
||||
}
|
||||
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<Torus>(
|
||||
host_integer_partial_sum_ciphertexts_vec<Torus>(
|
||||
streams, output_ct, cts, bsks, ksks, mem_ptr->sum_mem, counter_num_blocks,
|
||||
ct_prepared->num_radix_blocks);
|
||||
|
||||
@@ -141,19 +141,19 @@ host_integer_ilog2(CudaStreams streams, CudaRadixCiphertextFFI *output_ct,
|
||||
|
||||
// Perform a partial sum of all the elements without carry propagation.
|
||||
//
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<Torus>(
|
||||
host_integer_partial_sum_ciphertexts_vec<Torus>(
|
||||
streams, mem_ptr->sum_output_not_propagated, mem_ptr->sum_input_cts, bsks,
|
||||
ksks, mem_ptr->sum_mem, mem_ptr->counter_num_blocks,
|
||||
mem_ptr->input_num_blocks + 1);
|
||||
|
||||
// Apply luts to the partial sum.
|
||||
//
|
||||
host_apply_univariate_lut_kb<Torus>(streams, mem_ptr->message_blocks_not,
|
||||
mem_ptr->sum_output_not_propagated,
|
||||
mem_ptr->lut_message_not, ksks, bsks);
|
||||
host_apply_univariate_lut_kb<Torus>(streams, mem_ptr->carry_blocks_not,
|
||||
mem_ptr->sum_output_not_propagated,
|
||||
mem_ptr->lut_carry_not, ksks, bsks);
|
||||
host_apply_univariate_lut<Torus>(streams, mem_ptr->message_blocks_not,
|
||||
mem_ptr->sum_output_not_propagated,
|
||||
mem_ptr->lut_message_not, ksks, bsks);
|
||||
host_apply_univariate_lut<Torus>(streams, mem_ptr->carry_blocks_not,
|
||||
mem_ptr->sum_output_not_propagated,
|
||||
mem_ptr->lut_carry_not, ksks, bsks);
|
||||
|
||||
// Left-shift the bitwise-negated carry blocks by one position.
|
||||
//
|
||||
@@ -190,7 +190,7 @@ host_integer_ilog2(CudaStreams streams, CudaRadixCiphertextFFI *output_ct,
|
||||
2 * mem_ptr->counter_num_blocks, 3 * mem_ptr->counter_num_blocks,
|
||||
trivial_ct_2, 0, mem_ptr->counter_num_blocks);
|
||||
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<Torus>(
|
||||
host_integer_partial_sum_ciphertexts_vec<Torus>(
|
||||
streams, output_ct, mem_ptr->sum_input_cts, bsks, ksks, mem_ptr->sum_mem,
|
||||
mem_ptr->counter_num_blocks, 3);
|
||||
|
||||
|
||||
@@ -43,7 +43,7 @@ void cleanup_cuda_full_propagation(CudaStreamsFFI streams,
|
||||
*mem_ptr_void = nullptr;
|
||||
}
|
||||
|
||||
uint64_t scratch_cuda_propagate_single_carry_kb_64_inplace(
|
||||
uint64_t scratch_cuda_propagate_single_carry_64_inplace(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
@@ -56,12 +56,12 @@ uint64_t scratch_cuda_propagate_single_carry_kb_64_inplace(
|
||||
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
|
||||
message_modulus, carry_modulus, noise_reduction_type);
|
||||
|
||||
return scratch_cuda_propagate_single_carry_kb_inplace<uint64_t>(
|
||||
return scratch_cuda_propagate_single_carry_inplace<uint64_t>(
|
||||
CudaStreams(streams), (int_sc_prop_memory<uint64_t> **)mem_ptr,
|
||||
num_blocks, params, requested_flag, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
uint64_t scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
|
||||
uint64_t scratch_cuda_add_and_propagate_single_carry_64_inplace(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
@@ -74,12 +74,12 @@ uint64_t scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
|
||||
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
|
||||
message_modulus, carry_modulus, noise_reduction_type);
|
||||
|
||||
return scratch_cuda_propagate_single_carry_kb_inplace<uint64_t>(
|
||||
return scratch_cuda_propagate_single_carry_inplace<uint64_t>(
|
||||
CudaStreams(streams), (int_sc_prop_memory<uint64_t> **)mem_ptr,
|
||||
num_blocks, params, requested_flag, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
uint64_t scratch_cuda_integer_overflowing_sub_kb_64_inplace(
|
||||
uint64_t scratch_cuda_integer_overflowing_sub_64_inplace(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
@@ -97,7 +97,7 @@ uint64_t scratch_cuda_integer_overflowing_sub_kb_64_inplace(
|
||||
num_blocks, params, compute_overflow, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cuda_propagate_single_carry_kb_64_inplace(
|
||||
void cuda_propagate_single_carry_64_inplace(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
|
||||
CudaRadixCiphertextFFI *carry_out, const CudaRadixCiphertextFFI *carry_in,
|
||||
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
|
||||
@@ -109,7 +109,7 @@ void cuda_propagate_single_carry_kb_64_inplace(
|
||||
requested_flag, uses_carry);
|
||||
}
|
||||
|
||||
void cuda_add_and_propagate_single_carry_kb_64_inplace(
|
||||
void cuda_add_and_propagate_single_carry_64_inplace(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lhs_array,
|
||||
const CudaRadixCiphertextFFI *rhs_array, CudaRadixCiphertextFFI *carry_out,
|
||||
const CudaRadixCiphertextFFI *carry_in, int8_t *mem_ptr, void *const *bsks,
|
||||
@@ -121,7 +121,7 @@ void cuda_add_and_propagate_single_carry_kb_64_inplace(
|
||||
requested_flag, uses_carry);
|
||||
}
|
||||
|
||||
void cuda_integer_overflowing_sub_kb_64_inplace(
|
||||
void cuda_integer_overflowing_sub_64_inplace(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lhs_array,
|
||||
const CudaRadixCiphertextFFI *rhs_array,
|
||||
CudaRadixCiphertextFFI *overflow_block,
|
||||
@@ -168,7 +168,7 @@ void cleanup_cuda_integer_overflowing_sub(CudaStreamsFFI streams,
|
||||
POP_RANGE()
|
||||
}
|
||||
|
||||
uint64_t scratch_cuda_apply_univariate_lut_kb_64(
|
||||
uint64_t scratch_cuda_apply_univariate_lut_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, void const *input_lut,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
|
||||
@@ -182,13 +182,13 @@ uint64_t scratch_cuda_apply_univariate_lut_kb_64(
|
||||
grouping_factor, message_modulus, carry_modulus,
|
||||
noise_reduction_type);
|
||||
|
||||
return scratch_cuda_apply_univariate_lut_kb<uint64_t>(
|
||||
return scratch_cuda_apply_univariate_lut<uint64_t>(
|
||||
CudaStreams(streams), (int_radix_lut<uint64_t> **)mem_ptr,
|
||||
static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
|
||||
lut_degree, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
uint64_t scratch_cuda_apply_many_univariate_lut_kb_64(
|
||||
uint64_t scratch_cuda_apply_many_univariate_lut_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, void const *input_lut,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
|
||||
@@ -202,24 +202,25 @@ uint64_t scratch_cuda_apply_many_univariate_lut_kb_64(
|
||||
grouping_factor, message_modulus, carry_modulus,
|
||||
noise_reduction_type);
|
||||
|
||||
return scratch_cuda_apply_many_univariate_lut_kb<uint64_t>(
|
||||
return scratch_cuda_apply_many_univariate_lut<uint64_t>(
|
||||
CudaStreams(streams), (int_radix_lut<uint64_t> **)mem_ptr,
|
||||
static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
|
||||
num_many_lut, lut_degree, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cuda_apply_univariate_lut_kb_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
|
||||
CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
|
||||
void *const *ksks, void *const *bsks) {
|
||||
void cuda_apply_univariate_lut_64(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *output_radix_lwe,
|
||||
CudaRadixCiphertextFFI const *input_radix_lwe,
|
||||
int8_t *mem_ptr, void *const *ksks,
|
||||
void *const *bsks) {
|
||||
|
||||
host_apply_univariate_lut_kb<uint64_t>(
|
||||
host_apply_univariate_lut<uint64_t>(
|
||||
CudaStreams(streams), output_radix_lwe, input_radix_lwe,
|
||||
(int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks), bsks);
|
||||
}
|
||||
|
||||
void cleanup_cuda_apply_univariate_lut_kb_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void) {
|
||||
void cleanup_cuda_apply_univariate_lut_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void) {
|
||||
PUSH_RANGE("cleanup univar lut")
|
||||
int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
|
||||
mem_ptr->release(CudaStreams(streams));
|
||||
@@ -228,19 +229,19 @@ void cleanup_cuda_apply_univariate_lut_kb_64(CudaStreamsFFI streams,
|
||||
POP_RANGE()
|
||||
}
|
||||
|
||||
void cuda_apply_many_univariate_lut_kb_64(
|
||||
void cuda_apply_many_univariate_lut_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
|
||||
CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
|
||||
void *const *ksks, void *const *bsks, uint32_t num_many_lut,
|
||||
uint32_t lut_stride) {
|
||||
|
||||
host_apply_many_univariate_lut_kb<uint64_t>(
|
||||
host_apply_many_univariate_lut<uint64_t>(
|
||||
CudaStreams(streams), output_radix_lwe, input_radix_lwe,
|
||||
(int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks), bsks,
|
||||
num_many_lut, lut_stride);
|
||||
}
|
||||
|
||||
uint64_t scratch_cuda_apply_bivariate_lut_kb_64(
|
||||
uint64_t scratch_cuda_apply_bivariate_lut_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, void const *input_lut,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
|
||||
@@ -254,27 +255,27 @@ uint64_t scratch_cuda_apply_bivariate_lut_kb_64(
|
||||
grouping_factor, message_modulus, carry_modulus,
|
||||
noise_reduction_type);
|
||||
|
||||
return scratch_cuda_apply_bivariate_lut_kb<uint64_t>(
|
||||
return scratch_cuda_apply_bivariate_lut<uint64_t>(
|
||||
CudaStreams(streams), (int_radix_lut<uint64_t> **)mem_ptr,
|
||||
static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
|
||||
lut_degree, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cuda_apply_bivariate_lut_kb_64(
|
||||
void cuda_apply_bivariate_lut_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
|
||||
CudaRadixCiphertextFFI const *input_radix_lwe_1,
|
||||
CudaRadixCiphertextFFI const *input_radix_lwe_2, int8_t *mem_ptr,
|
||||
void *const *ksks, void *const *bsks, uint32_t num_radix_blocks,
|
||||
uint32_t shift) {
|
||||
|
||||
host_apply_bivariate_lut_kb<uint64_t>(
|
||||
host_apply_bivariate_lut<uint64_t>(
|
||||
CudaStreams(streams), output_radix_lwe, input_radix_lwe_1,
|
||||
input_radix_lwe_2, (int_radix_lut<uint64_t> *)mem_ptr,
|
||||
(uint64_t **)(ksks), bsks, num_radix_blocks, shift);
|
||||
}
|
||||
|
||||
void cleanup_cuda_apply_bivariate_lut_kb_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void) {
|
||||
void cleanup_cuda_apply_bivariate_lut_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void) {
|
||||
PUSH_RANGE("cleanup bivar lut")
|
||||
int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
|
||||
mem_ptr->release(CudaStreams(streams));
|
||||
@@ -298,7 +299,7 @@ uint64_t scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
|
||||
grouping_factor, message_modulus, carry_modulus,
|
||||
noise_reduction_type);
|
||||
|
||||
return scratch_cuda_apply_bivariate_lut_kb<uint64_t>(
|
||||
return scratch_cuda_apply_bivariate_lut<uint64_t>(
|
||||
CudaStreams(streams), (int_radix_lut<uint64_t> **)mem_ptr,
|
||||
static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
|
||||
lut_degree, allocate_gpu_memory);
|
||||
@@ -360,7 +361,7 @@ uint64_t scratch_cuda_apply_noise_squashing_mem(
|
||||
return size_tracker;
|
||||
}
|
||||
|
||||
uint64_t scratch_cuda_apply_noise_squashing_kb(
|
||||
uint64_t scratch_cuda_apply_noise_squashing(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t input_glwe_dimension, uint32_t input_polynomial_size,
|
||||
@@ -381,20 +382,21 @@ uint64_t scratch_cuda_apply_noise_squashing_kb(
|
||||
original_num_blocks, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cuda_apply_noise_squashing_kb(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
|
||||
CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
|
||||
void *const *ksks, void *const *bsks) {
|
||||
void cuda_apply_noise_squashing(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *output_radix_lwe,
|
||||
CudaRadixCiphertextFFI const *input_radix_lwe,
|
||||
int8_t *mem_ptr, void *const *ksks,
|
||||
void *const *bsks) {
|
||||
|
||||
PUSH_RANGE("apply noise squashing")
|
||||
integer_radix_apply_noise_squashing_kb<uint64_t>(
|
||||
integer_radix_apply_noise_squashing<uint64_t>(
|
||||
CudaStreams(streams), output_radix_lwe, input_radix_lwe,
|
||||
(int_noise_squashing_lut<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks);
|
||||
POP_RANGE()
|
||||
}
|
||||
|
||||
void cleanup_cuda_apply_noise_squashing_kb(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void) {
|
||||
void cleanup_cuda_apply_noise_squashing(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void) {
|
||||
PUSH_RANGE("cleanup noise squashing")
|
||||
int_noise_squashing_lut<uint64_t> *mem_ptr =
|
||||
(int_noise_squashing_lut<uint64_t> *)(*mem_ptr_void);
|
||||
|
||||
@@ -503,7 +503,7 @@ __host__ void host_pack_bivariate_blocks_with_single_block(
|
||||
/// LUT In scalar bitops we use a number of blocks that may be lower or equal to
|
||||
/// the input and output numbers of blocks
|
||||
template <typename Torus>
|
||||
__host__ void integer_radix_apply_univariate_lookup_table_kb(
|
||||
__host__ void integer_radix_apply_univariate_lookup_table(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in, void *const *bsks,
|
||||
Torus *const *ksks, int_radix_lut<Torus> *lut, uint32_t num_radix_blocks) {
|
||||
@@ -607,7 +607,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void integer_radix_apply_many_univariate_lookup_table_kb(
|
||||
__host__ void integer_radix_apply_many_univariate_lookup_table(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in, void *const *bsks,
|
||||
Torus *const *ksks, int_radix_lut<Torus> *lut, uint32_t num_many_lut,
|
||||
@@ -710,7 +710,7 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void integer_radix_apply_bivariate_lookup_table_kb(
|
||||
__host__ void integer_radix_apply_bivariate_lookup_table(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_1,
|
||||
CudaRadixCiphertextFFI const *lwe_array_2, void *const *bsks,
|
||||
@@ -1279,7 +1279,7 @@ void host_compute_shifted_blocks_and_states(
|
||||
auto shifted_blocks_and_states = mem->shifted_blocks_and_states;
|
||||
auto luts_array_first_step = mem->luts_array_first_step;
|
||||
|
||||
integer_radix_apply_many_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_many_univariate_lookup_table<Torus>(
|
||||
streams, shifted_blocks_and_states, lwe_array, bsks, ksks,
|
||||
luts_array_first_step, num_many_lut, lut_stride);
|
||||
|
||||
@@ -1347,7 +1347,7 @@ void host_resolve_group_carries_sequentially(
|
||||
as_radix_ciphertext_slice<Torus>(&shifted_group_resolved_carries,
|
||||
group_resolved_carries, 1,
|
||||
blocks_to_solve + 1);
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
streams, &shifted_group_resolved_carries,
|
||||
&shifted_group_resolved_carries, bsks, ksks, luts_sequential,
|
||||
blocks_to_solve);
|
||||
@@ -1388,7 +1388,7 @@ void host_compute_prefix_sum_hillis_steele(
|
||||
auto prev_blocks = generates_or_propagates;
|
||||
int cur_total_blocks = num_radix_blocks - space;
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_bivariate_lookup_table<Torus>(
|
||||
streams, &cur_blocks, &cur_blocks, prev_blocks, bsks, ksks, luts,
|
||||
cur_total_blocks, luts->params.message_modulus);
|
||||
|
||||
@@ -1426,11 +1426,11 @@ void host_compute_propagation_simulators_and_group_carries(
|
||||
block_states, num_radix_blocks, group_size);
|
||||
|
||||
auto luts_array_second_step = mem->luts_array_second_step;
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
streams, propagation_cum_sums, propagation_cum_sums, bsks, ksks,
|
||||
luts_array_second_step, num_radix_blocks);
|
||||
|
||||
host_integer_radix_scalar_addition_inplace<Torus>(
|
||||
host_scalar_addition_inplace<Torus>(
|
||||
streams, propagation_cum_sums, mem->scalar_array_cum_sum,
|
||||
mem->h_scalar_array_cum_sum, num_radix_blocks, message_modulus,
|
||||
carry_modulus);
|
||||
@@ -1478,7 +1478,7 @@ void host_compute_shifted_blocks_and_borrow_states(
|
||||
auto shifted_blocks_and_borrow_states = mem->shifted_blocks_and_borrow_states;
|
||||
auto luts_array_first_step = mem->luts_array_first_step;
|
||||
|
||||
integer_radix_apply_many_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_many_univariate_lookup_table<Torus>(
|
||||
streams, shifted_blocks_and_borrow_states, lwe_array, bsks, ksks,
|
||||
luts_array_first_step, num_many_lut, lut_stride);
|
||||
|
||||
@@ -1682,7 +1682,7 @@ extract_n_bits(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
num_radix_blocks);
|
||||
}
|
||||
}
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
streams, lwe_array_out, lwe_array_out, bsks, ksks, bit_extract->lut,
|
||||
effective_num_radix_blocks);
|
||||
}
|
||||
@@ -1738,7 +1738,7 @@ reduce_signs(CudaStreams streams, CudaRadixCiphertextFFI *signs_array_out,
|
||||
while (num_sign_blocks > 2) {
|
||||
pack_blocks<Torus>(streams.stream(0), streams.gpu_index(0), signs_b,
|
||||
signs_a, num_sign_blocks, message_modulus);
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
streams, signs_a, signs_b, bsks, ksks, lut, num_sign_blocks / 2);
|
||||
|
||||
if (num_sign_blocks % 2 == 1)
|
||||
@@ -1768,7 +1768,7 @@ reduce_signs(CudaStreams streams, CudaRadixCiphertextFFI *signs_array_out,
|
||||
|
||||
pack_blocks<Torus>(streams.stream(0), streams.gpu_index(0), signs_b,
|
||||
signs_a, num_sign_blocks, message_modulus);
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
streams, signs_array_out, signs_b, bsks, ksks, lut, 1);
|
||||
|
||||
} else {
|
||||
@@ -1786,13 +1786,13 @@ reduce_signs(CudaStreams streams, CudaRadixCiphertextFFI *signs_array_out,
|
||||
diff_buffer->preallocated_h_lut2);
|
||||
lut->broadcast_lut(lut->active_streams);
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
streams, signs_array_out, signs_a, bsks, ksks, lut, 1);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
uint64_t scratch_cuda_apply_univariate_lut_kb(
|
||||
uint64_t scratch_cuda_apply_univariate_lut(
|
||||
CudaStreams streams, int_radix_lut<Torus> **mem_ptr, Torus const *input_lut,
|
||||
uint32_t num_radix_blocks, int_radix_params params, uint64_t lut_degree,
|
||||
bool allocate_gpu_memory) {
|
||||
@@ -1814,19 +1814,19 @@ uint64_t scratch_cuda_apply_univariate_lut_kb(
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
void host_apply_univariate_lut_kb(CudaStreams streams,
|
||||
CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
CudaRadixCiphertextFFI const *radix_lwe_in,
|
||||
int_radix_lut<Torus> *mem, Torus *const *ksks,
|
||||
void *const *bsks) {
|
||||
void host_apply_univariate_lut(CudaStreams streams,
|
||||
CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
CudaRadixCiphertextFFI const *radix_lwe_in,
|
||||
int_radix_lut<Torus> *mem, Torus *const *ksks,
|
||||
void *const *bsks) {
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
streams, radix_lwe_out, radix_lwe_in, bsks, ksks, mem,
|
||||
radix_lwe_out->num_radix_blocks);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
uint64_t scratch_cuda_apply_many_univariate_lut_kb(
|
||||
uint64_t scratch_cuda_apply_many_univariate_lut(
|
||||
CudaStreams streams, int_radix_lut<Torus> **mem_ptr, Torus const *input_lut,
|
||||
uint32_t num_radix_blocks, int_radix_params params, uint32_t num_many_lut,
|
||||
uint64_t lut_degree, bool allocate_gpu_memory) {
|
||||
@@ -1849,19 +1849,21 @@ uint64_t scratch_cuda_apply_many_univariate_lut_kb(
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
void host_apply_many_univariate_lut_kb(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
CudaRadixCiphertextFFI const *radix_lwe_in, int_radix_lut<Torus> *mem,
|
||||
Torus *const *ksks, void *const *bsks, uint32_t num_many_lut,
|
||||
uint32_t lut_stride) {
|
||||
void host_apply_many_univariate_lut(CudaStreams streams,
|
||||
CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
CudaRadixCiphertextFFI const *radix_lwe_in,
|
||||
int_radix_lut<Torus> *mem,
|
||||
Torus *const *ksks, void *const *bsks,
|
||||
uint32_t num_many_lut,
|
||||
uint32_t lut_stride) {
|
||||
|
||||
integer_radix_apply_many_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_many_univariate_lookup_table<Torus>(
|
||||
streams, radix_lwe_out, radix_lwe_in, bsks, ksks, mem, num_many_lut,
|
||||
lut_stride);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
uint64_t scratch_cuda_apply_bivariate_lut_kb(
|
||||
uint64_t scratch_cuda_apply_bivariate_lut(
|
||||
CudaStreams streams, int_radix_lut<Torus> **mem_ptr, Torus const *input_lut,
|
||||
uint32_t num_radix_blocks, int_radix_params params, uint64_t lut_degree,
|
||||
bool allocate_gpu_memory) {
|
||||
@@ -1883,21 +1885,21 @@ uint64_t scratch_cuda_apply_bivariate_lut_kb(
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
void host_apply_bivariate_lut_kb(CudaStreams streams,
|
||||
CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
CudaRadixCiphertextFFI const *radix_lwe_in_1,
|
||||
CudaRadixCiphertextFFI const *radix_lwe_in_2,
|
||||
int_radix_lut<Torus> *mem, Torus *const *ksks,
|
||||
void *const *bsks, uint32_t num_radix_blocks,
|
||||
uint32_t shift) {
|
||||
void host_apply_bivariate_lut(CudaStreams streams,
|
||||
CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
CudaRadixCiphertextFFI const *radix_lwe_in_1,
|
||||
CudaRadixCiphertextFFI const *radix_lwe_in_2,
|
||||
int_radix_lut<Torus> *mem, Torus *const *ksks,
|
||||
void *const *bsks, uint32_t num_radix_blocks,
|
||||
uint32_t shift) {
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_bivariate_lookup_table<Torus>(
|
||||
streams, radix_lwe_out, radix_lwe_in_1, radix_lwe_in_2, bsks, ksks, mem,
|
||||
num_radix_blocks, shift);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
uint64_t scratch_cuda_propagate_single_carry_kb_inplace(
|
||||
uint64_t scratch_cuda_propagate_single_carry_inplace(
|
||||
CudaStreams streams, int_sc_prop_memory<Torus> **mem_ptr,
|
||||
uint32_t num_radix_blocks, int_radix_params params, uint32_t requested_flag,
|
||||
bool allocate_gpu_memory) {
|
||||
@@ -1992,7 +1994,7 @@ void host_propagate_single_carry(CudaStreams streams,
|
||||
copy_radix_ciphertext_slice_async<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), prepared_blocks,
|
||||
num_radix_blocks, num_radix_blocks + 1, &output_flag, 0, 1);
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
streams, mem->output_flag, prepared_blocks, bsks, ksks,
|
||||
mem->lut_message_extract, num_radix_blocks + 1);
|
||||
|
||||
@@ -2004,7 +2006,7 @@ void host_propagate_single_carry(CudaStreams streams,
|
||||
mem->output_flag, num_radix_blocks, num_radix_blocks + 1);
|
||||
} else {
|
||||
auto message_extract = mem->lut_message_extract;
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
streams, lwe_array, prepared_blocks, bsks, ksks, message_extract,
|
||||
num_radix_blocks);
|
||||
}
|
||||
@@ -2077,7 +2079,7 @@ void host_add_and_propagate_single_carry(
|
||||
auto block_states = mem->shifted_blocks_state_mem->block_states;
|
||||
if (requested_flag == outputFlag::FLAG_OVERFLOW) {
|
||||
auto lut_overflow_prep = mem->lut_overflow_flag_prep;
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_bivariate_lookup_table<Torus>(
|
||||
streams, &output_flag, mem->last_lhs, mem->last_rhs, bsks, ksks,
|
||||
lut_overflow_prep, 1, lut_overflow_prep->params.message_modulus);
|
||||
} else if (requested_flag == outputFlag::FLAG_CARRY) {
|
||||
@@ -2140,7 +2142,7 @@ void host_add_and_propagate_single_carry(
|
||||
copy_radix_ciphertext_slice_async<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), prepared_blocks,
|
||||
num_radix_blocks, num_radix_blocks + 1, &output_flag, 0, 1);
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
streams, mem->output_flag, prepared_blocks, bsks, ksks,
|
||||
mem->lut_message_extract, num_radix_blocks + 1);
|
||||
|
||||
@@ -2152,7 +2154,7 @@ void host_add_and_propagate_single_carry(
|
||||
streams.stream(0), streams.gpu_index(0), carry_out, 0, 1,
|
||||
mem->output_flag, num_radix_blocks, num_radix_blocks + 1);
|
||||
} else {
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
streams, lhs_array, prepared_blocks, bsks, ksks,
|
||||
mem->lut_message_extract, num_radix_blocks);
|
||||
}
|
||||
@@ -2227,8 +2229,8 @@ void host_single_borrow_propagate(CudaStreams streams,
|
||||
(Torus *)prepared_blocks->ptr, shifted_blocks,
|
||||
simulators, big_lwe_dimension, num_radix_blocks);
|
||||
|
||||
host_integer_radix_add_scalar_one_inplace<Torus>(
|
||||
streams, prepared_blocks, message_modulus, carry_modulus);
|
||||
host_add_scalar_one_inplace<Torus>(streams, prepared_blocks, message_modulus,
|
||||
carry_modulus);
|
||||
|
||||
if (compute_overflow == outputFlag::FLAG_OVERFLOW) {
|
||||
CudaRadixCiphertextFFI shifted_simulators;
|
||||
@@ -2268,7 +2270,7 @@ void host_single_borrow_propagate(CudaStreams streams,
|
||||
|
||||
if (compute_overflow == outputFlag::FLAG_OVERFLOW) {
|
||||
auto borrow_flag = mem->lut_borrow_flag;
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
mem->sub_streams_1, overflow_block, mem->overflow_block, bsks, ksks,
|
||||
borrow_flag, 1);
|
||||
}
|
||||
@@ -2290,7 +2292,7 @@ void host_single_borrow_propagate(CudaStreams streams,
|
||||
mem->group_size);
|
||||
|
||||
auto message_extract = mem->lut_message_extract;
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
mem->sub_streams_2, lwe_array, prepared_blocks, bsks, ksks,
|
||||
message_extract, num_radix_blocks);
|
||||
|
||||
@@ -2308,7 +2310,7 @@ void host_single_borrow_propagate(CudaStreams streams,
|
||||
/// LUT In scalar bitops we use a number of blocks that may be lower or equal to
|
||||
/// the input and output numbers of blocks
|
||||
template <typename InputTorus>
|
||||
__host__ void integer_radix_apply_noise_squashing_kb(
|
||||
__host__ void integer_radix_apply_noise_squashing(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in,
|
||||
int_noise_squashing_lut<InputTorus> *lut, void *const *bsks,
|
||||
|
||||
@@ -65,7 +65,7 @@ void generate_ids_update_degrees(uint64_t *terms_degree, size_t *h_lwe_idx_in,
|
||||
* This scratch function allocates the necessary amount of data on the GPU for
|
||||
* the integer radix multiplication in keyswitch->bootstrap order.
|
||||
*/
|
||||
uint64_t scratch_cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
uint64_t scratch_cuda_integer_mult_radix_ciphertext_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, bool const is_boolean_left,
|
||||
bool const is_boolean_right, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, uint32_t glwe_dimension, uint32_t lwe_dimension,
|
||||
@@ -87,7 +87,7 @@ uint64_t scratch_cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
case 4096:
|
||||
case 8192:
|
||||
case 16384:
|
||||
return scratch_cuda_integer_mult_radix_ciphertext_kb<uint64_t>(
|
||||
return scratch_cuda_integer_mult_radix_ciphertext<uint64_t>(
|
||||
CudaStreams(streams), (int_mul_memory<uint64_t> **)mem_ptr,
|
||||
is_boolean_left, is_boolean_right, num_radix_blocks, params,
|
||||
allocate_gpu_memory);
|
||||
@@ -124,7 +124,7 @@ uint64_t scratch_cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
* ciphertext
|
||||
* - 'pbs_type' selects which PBS implementation should be used
|
||||
*/
|
||||
void cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
void cuda_integer_mult_radix_ciphertext_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
CudaRadixCiphertextFFI const *radix_lwe_left, bool const is_bool_left,
|
||||
CudaRadixCiphertextFFI const *radix_lwe_right, bool const is_bool_right,
|
||||
@@ -133,43 +133,43 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
PUSH_RANGE("mul")
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<256>>(
|
||||
host_integer_mult_radix<uint64_t, AmortizedDegree<256>>(
|
||||
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
|
||||
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
|
||||
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
break;
|
||||
case 512:
|
||||
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<512>>(
|
||||
host_integer_mult_radix<uint64_t, AmortizedDegree<512>>(
|
||||
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
|
||||
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
|
||||
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
break;
|
||||
case 1024:
|
||||
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<1024>>(
|
||||
host_integer_mult_radix<uint64_t, AmortizedDegree<1024>>(
|
||||
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
|
||||
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
|
||||
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
break;
|
||||
case 2048:
|
||||
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<2048>>(
|
||||
host_integer_mult_radix<uint64_t, AmortizedDegree<2048>>(
|
||||
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
|
||||
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
|
||||
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
break;
|
||||
case 4096:
|
||||
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<4096>>(
|
||||
host_integer_mult_radix<uint64_t, AmortizedDegree<4096>>(
|
||||
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
|
||||
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
|
||||
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
break;
|
||||
case 8192:
|
||||
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<8192>>(
|
||||
host_integer_mult_radix<uint64_t, AmortizedDegree<8192>>(
|
||||
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
|
||||
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
|
||||
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
break;
|
||||
case 16384:
|
||||
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<16384>>(
|
||||
host_integer_mult_radix<uint64_t, AmortizedDegree<16384>>(
|
||||
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
|
||||
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
|
||||
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
@@ -192,7 +192,7 @@ void cleanup_cuda_integer_mult(CudaStreamsFFI streams, int8_t **mem_ptr_void) {
|
||||
POP_RANGE()
|
||||
}
|
||||
|
||||
uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
uint64_t scratch_cuda_partial_sum_ciphertexts_vec_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
@@ -207,30 +207,31 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
ks_level, ks_base_log, pbs_level, pbs_base_log,
|
||||
grouping_factor, message_modulus, carry_modulus,
|
||||
noise_reduction_type);
|
||||
return scratch_cuda_integer_partial_sum_ciphertexts_vec_kb<uint64_t>(
|
||||
return scratch_cuda_integer_partial_sum_ciphertexts_vec<uint64_t>(
|
||||
CudaStreams(streams),
|
||||
(int_sum_ciphertexts_vec_memory<uint64_t> **)mem_ptr, num_blocks_in_radix,
|
||||
max_num_radix_in_vec, reduce_degrees_for_single_carry_propagation, params,
|
||||
allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
CudaRadixCiphertextFFI *radix_lwe_vec, int8_t *mem_ptr, void *const *bsks,
|
||||
void *const *ksks) {
|
||||
void cuda_partial_sum_ciphertexts_vec_64(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
CudaRadixCiphertextFFI *radix_lwe_vec,
|
||||
int8_t *mem_ptr, void *const *bsks,
|
||||
void *const *ksks) {
|
||||
|
||||
auto mem = (int_sum_ciphertexts_vec_memory<uint64_t> *)mem_ptr;
|
||||
if (radix_lwe_vec->num_radix_blocks % radix_lwe_out->num_radix_blocks != 0)
|
||||
PANIC("Cuda error: input vector length should be a multiple of the "
|
||||
"output's number of radix blocks")
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t>(
|
||||
host_integer_partial_sum_ciphertexts_vec<uint64_t>(
|
||||
CudaStreams(streams), radix_lwe_out, radix_lwe_vec, bsks,
|
||||
(uint64_t **)(ksks), mem, radix_lwe_out->num_radix_blocks,
|
||||
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
|
||||
}
|
||||
|
||||
void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr_void) {
|
||||
void cleanup_cuda_partial_sum_ciphertexts_vec(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void) {
|
||||
int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr =
|
||||
(int_sum_ciphertexts_vec_memory<uint64_t> *)(*mem_ptr_void);
|
||||
|
||||
|
||||
@@ -268,7 +268,7 @@ __global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ uint64_t scratch_cuda_integer_partial_sum_ciphertexts_vec_kb(
|
||||
__host__ uint64_t scratch_cuda_integer_partial_sum_ciphertexts_vec(
|
||||
CudaStreams streams, int_sum_ciphertexts_vec_memory<Torus> **mem_ptr,
|
||||
uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
|
||||
bool reduce_degrees_for_single_carry_propagation, int_radix_params params,
|
||||
@@ -283,7 +283,7 @@ __host__ uint64_t scratch_cuda_integer_partial_sum_ciphertexts_vec_kb(
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_integer_partial_sum_ciphertexts_vec_kb(
|
||||
__host__ void host_integer_partial_sum_ciphertexts_vec(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
CudaRadixCiphertextFFI *terms, void *const *bsks, uint64_t *const *ksks,
|
||||
int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr,
|
||||
@@ -412,7 +412,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
|
||||
luts_message_carry->broadcast_lut(active_streams, false);
|
||||
luts_message_carry->using_trivial_lwe_indexes = false;
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
streams, current_blocks, current_blocks, bsks, ksks,
|
||||
luts_message_carry, total_ciphertexts);
|
||||
}
|
||||
@@ -463,7 +463,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
|
||||
luts_message_carry->broadcast_lut(active_streams, false);
|
||||
luts_message_carry->using_trivial_lwe_indexes = false;
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
active_streams, current_blocks, radix_lwe_out, bsks, ksks,
|
||||
luts_message_carry, num_blocks_in_apply_lut);
|
||||
}
|
||||
@@ -483,7 +483,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
__host__ void host_integer_mult_radix_kb(
|
||||
__host__ void host_integer_mult_radix(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
CudaRadixCiphertextFFI const *radix_lwe_left, bool const is_bool_left,
|
||||
CudaRadixCiphertextFFI const *radix_lwe_right, bool const is_bool_right,
|
||||
@@ -580,7 +580,7 @@ __host__ void host_integer_mult_radix_kb(
|
||||
(Torus *)vector_lsb_rhs->ptr, (Torus *)vector_msb_rhs.ptr, num_blocks);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_bivariate_lookup_table<Torus>(
|
||||
streams, block_mul_res, block_mul_res, vector_result_sb, bsks, ksks,
|
||||
luts_array, total_block_count, luts_array->params.message_modulus);
|
||||
|
||||
@@ -608,7 +608,7 @@ __host__ void host_integer_mult_radix_kb(
|
||||
size_t b_id = i % num_blocks;
|
||||
terms_degree_msb[i] = (b_id > r_id) ? message_modulus - 2 : 0;
|
||||
}
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<Torus>(
|
||||
host_integer_partial_sum_ciphertexts_vec<Torus>(
|
||||
streams, radix_lwe_out, vector_result_sb, bsks, ksks,
|
||||
mem_ptr->sum_ciphertexts_mem, num_blocks, 2 * num_blocks);
|
||||
|
||||
@@ -621,7 +621,7 @@ __host__ void host_integer_mult_radix_kb(
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ uint64_t scratch_cuda_integer_mult_radix_ciphertext_kb(
|
||||
__host__ uint64_t scratch_cuda_integer_mult_radix_ciphertext(
|
||||
CudaStreams streams, int_mul_memory<Torus> **mem_ptr,
|
||||
bool const is_boolean_left, bool const is_boolean_right,
|
||||
uint32_t num_radix_blocks, int_radix_params params,
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
#include "integer/negation.cuh"
|
||||
|
||||
void cuda_negate_integer_radix_ciphertext_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, uint32_t num_radix_blocks) {
|
||||
void cuda_negate_ciphertext_64(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in,
|
||||
uint32_t message_modulus, uint32_t carry_modulus,
|
||||
uint32_t num_radix_blocks) {
|
||||
|
||||
host_integer_radix_negation<uint64_t>(CudaStreams(streams), lwe_array_out,
|
||||
lwe_array_in, message_modulus,
|
||||
carry_modulus, num_radix_blocks);
|
||||
host_negation<uint64_t>(CudaStreams(streams), lwe_array_out, lwe_array_in,
|
||||
message_modulus, carry_modulus, num_radix_blocks);
|
||||
}
|
||||
|
||||
@@ -17,10 +17,9 @@
|
||||
#include <vector>
|
||||
|
||||
template <typename Torus>
|
||||
__global__ void
|
||||
device_integer_radix_negation(Torus *output, Torus const *input,
|
||||
int32_t num_blocks, uint64_t lwe_dimension,
|
||||
uint64_t message_modulus, uint64_t delta) {
|
||||
__global__ void device_negation(Torus *output, Torus const *input,
|
||||
int32_t num_blocks, uint64_t lwe_dimension,
|
||||
uint64_t message_modulus, uint64_t delta) {
|
||||
int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (tid < lwe_dimension + 1) {
|
||||
bool is_body = (tid == lwe_dimension);
|
||||
@@ -49,10 +48,11 @@ device_integer_radix_negation(Torus *output, Torus const *input,
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_integer_radix_negation(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in, uint64_t message_modulus,
|
||||
uint64_t carry_modulus, uint32_t num_radix_blocks) {
|
||||
__host__ void host_negation(CudaStreams streams,
|
||||
CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in,
|
||||
uint64_t message_modulus, uint64_t carry_modulus,
|
||||
uint32_t num_radix_blocks) {
|
||||
cuda_set_device(streams.gpu_index(0));
|
||||
|
||||
if (lwe_array_out->num_radix_blocks < num_radix_blocks ||
|
||||
@@ -80,7 +80,7 @@ __host__ void host_integer_radix_negation(
|
||||
// this
|
||||
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
|
||||
|
||||
device_integer_radix_negation<Torus><<<grid, thds, 0, streams.stream(0)>>>(
|
||||
device_negation<Torus><<<grid, thds, 0, streams.stream(0)>>>(
|
||||
static_cast<Torus *>(lwe_array_out->ptr),
|
||||
static_cast<Torus *>(lwe_array_in->ptr), num_radix_blocks, lwe_dimension,
|
||||
message_modulus, delta);
|
||||
|
||||
@@ -21,11 +21,11 @@ uint64_t scratch_cuda_integer_grouped_oprf_64(
|
||||
allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cuda_integer_grouped_oprf_async_64(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
const void *seeded_lwe_input,
|
||||
uint32_t num_blocks_to_process,
|
||||
int8_t *mem, void *const *bsks) {
|
||||
void cuda_integer_grouped_oprf_64(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
const void *seeded_lwe_input,
|
||||
uint32_t num_blocks_to_process, int8_t *mem,
|
||||
void *const *bsks) {
|
||||
|
||||
host_integer_grouped_oprf<uint64_t>(
|
||||
CudaStreams(streams), radix_lwe_out, (const uint64_t *)seeded_lwe_input,
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
#include "integer/scalar_addition.cuh"
|
||||
|
||||
void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
|
||||
void cuda_scalar_addition_ciphertext_64_inplace(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
|
||||
void const *scalar_input, void const *h_scalar_input, uint32_t num_scalars,
|
||||
uint32_t message_modulus, uint32_t carry_modulus) {
|
||||
|
||||
host_integer_radix_scalar_addition_inplace<uint64_t>(
|
||||
host_scalar_addition_inplace<uint64_t>(
|
||||
CudaStreams(streams), lwe_array,
|
||||
static_cast<const uint64_t *>(scalar_input),
|
||||
static_cast<const uint64_t *>(h_scalar_input), num_scalars,
|
||||
|
||||
@@ -12,9 +12,10 @@
|
||||
#include <stdio.h>
|
||||
|
||||
template <typename Torus>
|
||||
__global__ void device_integer_radix_scalar_addition_inplace(
|
||||
Torus *lwe_array, Torus const *scalar_input, int32_t num_blocks,
|
||||
uint32_t lwe_dimension, uint64_t delta) {
|
||||
__global__ void
|
||||
device_scalar_addition_inplace(Torus *lwe_array, Torus const *scalar_input,
|
||||
int32_t num_blocks, uint32_t lwe_dimension,
|
||||
uint64_t delta) {
|
||||
|
||||
int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (tid < num_blocks) {
|
||||
@@ -24,7 +25,7 @@ __global__ void device_integer_radix_scalar_addition_inplace(
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_integer_radix_scalar_addition_inplace(
|
||||
__host__ void host_scalar_addition_inplace(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array,
|
||||
Torus const *scalar_input, Torus const *h_scalar_input,
|
||||
uint32_t num_scalars, uint32_t message_modulus, uint32_t carry_modulus) {
|
||||
@@ -45,10 +46,9 @@ __host__ void host_integer_radix_scalar_addition_inplace(
|
||||
// this
|
||||
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
|
||||
|
||||
device_integer_radix_scalar_addition_inplace<Torus>
|
||||
<<<grid, thds, 0, streams.stream(0)>>>((Torus *)lwe_array->ptr,
|
||||
scalar_input, num_scalars,
|
||||
lwe_array->lwe_dimension, delta);
|
||||
device_scalar_addition_inplace<Torus><<<grid, thds, 0, streams.stream(0)>>>(
|
||||
(Torus *)lwe_array->ptr, scalar_input, num_scalars,
|
||||
lwe_array->lwe_dimension, delta);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
for (uint i = 0; i < num_scalars; i++) {
|
||||
lwe_array->degrees[i] = lwe_array->degrees[i] + h_scalar_input[i];
|
||||
@@ -56,9 +56,9 @@ __host__ void host_integer_radix_scalar_addition_inplace(
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__global__ void device_integer_radix_add_scalar_one_inplace(
|
||||
Torus *lwe_array, int32_t num_blocks, uint32_t lwe_dimension,
|
||||
uint64_t delta) {
|
||||
__global__ void
|
||||
device_add_scalar_one_inplace(Torus *lwe_array, int32_t num_blocks,
|
||||
uint32_t lwe_dimension, uint64_t delta) {
|
||||
|
||||
int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (tid < num_blocks) {
|
||||
@@ -68,9 +68,10 @@ __global__ void device_integer_radix_add_scalar_one_inplace(
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_integer_radix_add_scalar_one_inplace(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array,
|
||||
uint32_t message_modulus, uint32_t carry_modulus) {
|
||||
__host__ void host_add_scalar_one_inplace(CudaStreams streams,
|
||||
CudaRadixCiphertextFFI *lwe_array,
|
||||
uint32_t message_modulus,
|
||||
uint32_t carry_modulus) {
|
||||
cuda_set_device(streams.gpu_index(0));
|
||||
|
||||
// Create a 1-dimensional grid of threads
|
||||
@@ -85,10 +86,9 @@ __host__ void host_integer_radix_add_scalar_one_inplace(
|
||||
// this
|
||||
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
|
||||
|
||||
device_integer_radix_add_scalar_one_inplace<Torus>
|
||||
<<<grid, thds, 0, streams.stream(0)>>>((Torus *)lwe_array->ptr,
|
||||
lwe_array->num_radix_blocks,
|
||||
lwe_array->lwe_dimension, delta);
|
||||
device_add_scalar_one_inplace<Torus><<<grid, thds, 0, streams.stream(0)>>>(
|
||||
(Torus *)lwe_array->ptr, lwe_array->num_radix_blocks,
|
||||
lwe_array->lwe_dimension, delta);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
for (uint i = 0; i < lwe_array->num_radix_blocks; i++) {
|
||||
lwe_array->degrees[i] = lwe_array->degrees[i] + 1;
|
||||
@@ -96,9 +96,10 @@ __host__ void host_integer_radix_add_scalar_one_inplace(
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__global__ void device_integer_radix_scalar_subtraction_inplace(
|
||||
Torus *lwe_array, Torus *scalar_input, int32_t num_blocks,
|
||||
uint32_t lwe_dimension, uint64_t delta) {
|
||||
__global__ void
|
||||
device_scalar_subtraction_inplace(Torus *lwe_array, Torus *scalar_input,
|
||||
int32_t num_blocks, uint32_t lwe_dimension,
|
||||
uint64_t delta) {
|
||||
|
||||
int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (tid < num_blocks) {
|
||||
@@ -110,7 +111,7 @@ __global__ void device_integer_radix_scalar_subtraction_inplace(
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_integer_radix_scalar_subtraction_inplace(
|
||||
__host__ void host_scalar_subtraction_inplace(
|
||||
CudaStreams streams, Torus *lwe_array, Torus *scalar_input,
|
||||
uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t message_modulus, uint32_t carry_modulus) {
|
||||
@@ -128,7 +129,7 @@ __host__ void host_integer_radix_scalar_subtraction_inplace(
|
||||
// this
|
||||
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
|
||||
|
||||
device_integer_radix_scalar_subtraction_inplace<Torus>
|
||||
device_scalar_subtraction_inplace<Torus>
|
||||
<<<grid, thds, 0, streams.stream(0)>>>(lwe_array, scalar_input,
|
||||
input_lwe_ciphertext_count,
|
||||
lwe_dimension, delta);
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
#include "integer/scalar_bitops.cuh"
|
||||
|
||||
void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
|
||||
void cuda_scalar_bitop_ciphertext_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_input, void const *clear_blocks,
|
||||
void const *h_clear_blocks, uint32_t num_clear_blocks, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks) {
|
||||
|
||||
host_integer_radix_scalar_bitop_kb<uint64_t>(
|
||||
host_scalar_bitop<uint64_t>(
|
||||
CudaStreams(streams), lwe_array_out, lwe_array_input,
|
||||
static_cast<const uint64_t *>(clear_blocks),
|
||||
static_cast<const uint64_t *>(h_clear_blocks), num_clear_blocks,
|
||||
|
||||
@@ -4,11 +4,12 @@
|
||||
#include "integer/bitwise_ops.cuh"
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_integer_radix_scalar_bitop_kb(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *output,
|
||||
CudaRadixCiphertextFFI const *input, Torus const *clear_blocks,
|
||||
Torus const *h_clear_blocks, uint32_t num_clear_blocks,
|
||||
int_bitop_buffer<Torus> *mem_ptr, void *const *bsks, Torus *const *ksks) {
|
||||
__host__ void
|
||||
host_scalar_bitop(CudaStreams streams, CudaRadixCiphertextFFI *output,
|
||||
CudaRadixCiphertextFFI const *input,
|
||||
Torus const *clear_blocks, Torus const *h_clear_blocks,
|
||||
uint32_t num_clear_blocks, int_bitop_buffer<Torus> *mem_ptr,
|
||||
void *const *bsks, Torus *const *ksks) {
|
||||
|
||||
if (output->num_radix_blocks != input->num_radix_blocks)
|
||||
PANIC("Cuda error: input and output num radix blocks must be equal")
|
||||
@@ -47,7 +48,7 @@ __host__ void host_integer_radix_scalar_bitop_kb(
|
||||
auto active_streams = streams.active_gpu_subset(num_clear_blocks);
|
||||
lut->broadcast_lut(active_streams, false);
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
streams, output, input, bsks, ksks, lut, num_clear_blocks);
|
||||
memcpy(output->degrees, degrees, num_clear_blocks * sizeof(uint64_t));
|
||||
|
||||
|
||||
@@ -31,7 +31,7 @@ std::pair<bool, bool> get_invert_flags(COMPARISON_TYPE compare) {
|
||||
return {invert_operands, invert_subtraction_result};
|
||||
}
|
||||
|
||||
void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
|
||||
void cuda_scalar_comparison_ciphertext_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in, void const *scalar_blocks,
|
||||
void const *h_scalar_blocks, int8_t *mem_ptr, void *const *bsks,
|
||||
@@ -46,7 +46,7 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
|
||||
switch (buffer->op) {
|
||||
case EQ:
|
||||
case NE:
|
||||
host_integer_radix_scalar_equality_check_kb<uint64_t>(
|
||||
host_scalar_equality_check<uint64_t>(
|
||||
CudaStreams(streams), lwe_array_out, lwe_array_in,
|
||||
static_cast<const uint64_t *>(scalar_blocks), buffer, bsks,
|
||||
(uint64_t **)(ksks), num_radix_blocks, num_scalar_blocks);
|
||||
@@ -58,7 +58,7 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
|
||||
if (num_radix_blocks % 2 != 0 && num_radix_blocks != 1)
|
||||
PANIC("Cuda error (scalar comparisons): the number of radix blocks has "
|
||||
"to be even or equal to 1.")
|
||||
host_integer_radix_scalar_difference_check_kb<uint64_t>(
|
||||
host_scalar_difference_check<uint64_t>(
|
||||
CudaStreams(streams), lwe_array_out, lwe_array_in,
|
||||
static_cast<const uint64_t *>(scalar_blocks),
|
||||
static_cast<const uint64_t *>(h_scalar_blocks), buffer,
|
||||
@@ -70,7 +70,7 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
|
||||
if (lwe_array_in->num_radix_blocks % 2 != 0)
|
||||
PANIC("Cuda error (scalar max/min): the number of radix blocks has to be "
|
||||
"even.")
|
||||
host_integer_radix_scalar_maxmin_kb<uint64_t>(
|
||||
host_scalar_maxmin<uint64_t>(
|
||||
CudaStreams(streams), lwe_array_out, lwe_array_in,
|
||||
static_cast<const uint64_t *>(scalar_blocks),
|
||||
static_cast<const uint64_t *>(h_scalar_blocks), buffer, bsks,
|
||||
|
||||
@@ -25,11 +25,13 @@ Torus is_x_less_than_y_given_input_borrow(Torus last_x_block,
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void scalar_compare_radix_blocks_kb(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI *lwe_array_in, Torus *scalar_blocks,
|
||||
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
|
||||
Torus *const *ksks, uint32_t num_radix_blocks) {
|
||||
__host__ void scalar_compare_radix_blocks(CudaStreams streams,
|
||||
CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI *lwe_array_in,
|
||||
Torus *scalar_blocks,
|
||||
int_comparison_buffer<Torus> *mem_ptr,
|
||||
void *const *bsks, Torus *const *ksks,
|
||||
uint32_t num_radix_blocks) {
|
||||
|
||||
if (num_radix_blocks == 0)
|
||||
return;
|
||||
@@ -62,13 +64,13 @@ __host__ void scalar_compare_radix_blocks_kb(
|
||||
subtracted_blocks, lwe_array_in);
|
||||
// Subtract
|
||||
// Here we need the true lwe sub, not the one that comes from shortint.
|
||||
host_integer_radix_scalar_subtraction_inplace<Torus>(
|
||||
host_scalar_subtraction_inplace<Torus>(
|
||||
streams, (Torus *)subtracted_blocks->ptr, scalar_blocks,
|
||||
big_lwe_dimension, num_radix_blocks, message_modulus, carry_modulus);
|
||||
|
||||
// Apply LUT to compare to 0
|
||||
auto sign_lut = mem_ptr->eq_buffer->is_non_zero_lut;
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
streams, lwe_array_out, subtracted_blocks, bsks, ksks, sign_lut,
|
||||
num_radix_blocks);
|
||||
|
||||
@@ -78,12 +80,12 @@ __host__ void scalar_compare_radix_blocks_kb(
|
||||
// Add one
|
||||
// Here Lhs can have the following values: (-1) % (message modulus * carry
|
||||
// modulus), 0, 1 So the output values after the addition will be: 0, 1, 2
|
||||
host_integer_radix_add_scalar_one_inplace<Torus>(
|
||||
streams, lwe_array_out, message_modulus, carry_modulus);
|
||||
host_add_scalar_one_inplace<Torus>(streams, lwe_array_out, message_modulus,
|
||||
carry_modulus);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void integer_radix_unsigned_scalar_difference_check_kb(
|
||||
__host__ void integer_radix_unsigned_scalar_difference_check(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
|
||||
Torus const *h_scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
|
||||
@@ -148,7 +150,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
|
||||
auto active_streams = streams.active_gpu_subset(1);
|
||||
lut->broadcast_lut(active_streams);
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
streams, lwe_array_out, mem_ptr->tmp_lwe_array_out, bsks, ksks, lut, 1);
|
||||
|
||||
} else if (num_scalar_blocks < num_radix_blocks) {
|
||||
@@ -199,7 +201,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
|
||||
// - 2 if lhs > rhs
|
||||
|
||||
auto comparisons = mem_ptr->tmp_block_comparisons;
|
||||
scalar_compare_radix_blocks_kb<Torus>(
|
||||
scalar_compare_radix_blocks<Torus>(
|
||||
lsb_streams, comparisons, diff_buffer->tmp_packed, (Torus *)rhs.ptr,
|
||||
mem_ptr, bsks, ksks, num_lsb_radix_blocks);
|
||||
|
||||
@@ -242,7 +244,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
|
||||
auto active_streams = streams.active_gpu_subset(1);
|
||||
lut->broadcast_lut(active_streams);
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_bivariate_lookup_table<Torus>(
|
||||
streams, lwe_array_out, lwe_array_lsb_out, &lwe_array_msb_out, bsks,
|
||||
ksks, lut, 1, lut->params.message_modulus);
|
||||
|
||||
@@ -276,7 +278,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
|
||||
auto active_streams = streams.active_gpu_subset(1);
|
||||
one_block_lut->broadcast_lut(active_streams);
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
streams, lwe_array_out, lwe_array_in, bsks, ksks, one_block_lut, 1);
|
||||
one_block_lut->release(streams);
|
||||
delete one_block_lut;
|
||||
@@ -305,7 +307,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
|
||||
// - 1 if lhs == rhs
|
||||
// - 2 if lhs > rhs
|
||||
auto comparisons = mem_ptr->tmp_lwe_array_out;
|
||||
scalar_compare_radix_blocks_kb<Torus>(
|
||||
scalar_compare_radix_blocks<Torus>(
|
||||
streams, comparisons, diff_buffer->tmp_packed, (Torus *)rhs.ptr,
|
||||
mem_ptr, bsks, ksks, num_lsb_radix_blocks);
|
||||
|
||||
@@ -321,7 +323,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void integer_radix_signed_scalar_difference_check_kb(
|
||||
__host__ void integer_radix_signed_scalar_difference_check(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
|
||||
Torus const *h_scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
|
||||
@@ -420,7 +422,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
|
||||
auto active_streams = streams.active_gpu_subset(1);
|
||||
lut->broadcast_lut(active_streams);
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_bivariate_lookup_table<Torus>(
|
||||
streams, lwe_array_out, are_all_msb_zeros, &sign_block, bsks, ksks, lut,
|
||||
1, lut->params.message_modulus);
|
||||
|
||||
@@ -466,7 +468,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
|
||||
// - 2 if lhs > rhs
|
||||
|
||||
auto comparisons = mem_ptr->tmp_block_comparisons;
|
||||
scalar_compare_radix_blocks_kb<Torus>(
|
||||
scalar_compare_radix_blocks<Torus>(
|
||||
lsb_streams, comparisons, diff_buffer->tmp_packed, (Torus *)rhs.ptr,
|
||||
mem_ptr, bsks, ksks, num_lsb_radix_blocks);
|
||||
|
||||
@@ -525,7 +527,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
|
||||
CudaRadixCiphertextFFI sign_block;
|
||||
as_radix_ciphertext_slice<Torus>(
|
||||
&sign_block, &msb, num_msb_radix_blocks - 1, num_msb_radix_blocks);
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_bivariate_lookup_table<Torus>(
|
||||
msb_streams, &lwe_array_msb_out, &sign_block, &are_all_msb_zeros, bsks,
|
||||
ksks, signed_msb_lut, 1, signed_msb_lut->params.message_modulus);
|
||||
lsb_streams.synchronize();
|
||||
@@ -568,7 +570,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
|
||||
auto active_streams = streams.active_gpu_subset(1);
|
||||
one_block_lut->broadcast_lut(active_streams);
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
streams, lwe_array_out, lwe_array_in, bsks, ksks, one_block_lut, 1);
|
||||
one_block_lut->release(streams);
|
||||
delete one_block_lut;
|
||||
@@ -606,7 +608,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
|
||||
// - 0 if lhs < rhs
|
||||
// - 1 if lhs == rhs
|
||||
// - 2 if lhs > rhs
|
||||
scalar_compare_radix_blocks_kb<Torus>(
|
||||
scalar_compare_radix_blocks<Torus>(
|
||||
lsb_streams, lwe_array_ct_out, diff_buffer->tmp_packed,
|
||||
(Torus *)rhs.ptr, mem_ptr, bsks, ksks, num_lsb_radix_blocks);
|
||||
CudaRadixCiphertextFFI encrypted_sign_block;
|
||||
@@ -622,7 +624,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
|
||||
scalar_sign_block, h_scalar_sign_block, 1, message_modulus,
|
||||
carry_modulus);
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_bivariate_lookup_table<Torus>(
|
||||
msb_streams, &lwe_array_sign_out, &encrypted_sign_block,
|
||||
trivial_sign_block, bsks, ksks, mem_ptr->signed_lut, 1,
|
||||
mem_ptr->signed_lut->params.message_modulus);
|
||||
@@ -639,7 +641,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_integer_radix_scalar_difference_check_kb(
|
||||
__host__ void host_scalar_difference_check(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
|
||||
Torus const *h_scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
|
||||
@@ -654,12 +656,12 @@ __host__ void host_integer_radix_scalar_difference_check_kb(
|
||||
|
||||
if (mem_ptr->is_signed) {
|
||||
// is signed and scalar is positive
|
||||
integer_radix_signed_scalar_difference_check_kb<Torus>(
|
||||
integer_radix_signed_scalar_difference_check<Torus>(
|
||||
streams, lwe_array_out, lwe_array_in, scalar_blocks, h_scalar_blocks,
|
||||
mem_ptr, sign_handler_f, bsks, ksks, num_radix_blocks,
|
||||
num_scalar_blocks);
|
||||
} else {
|
||||
integer_radix_unsigned_scalar_difference_check_kb<Torus>(
|
||||
integer_radix_unsigned_scalar_difference_check<Torus>(
|
||||
streams, lwe_array_out, lwe_array_in, scalar_blocks, h_scalar_blocks,
|
||||
mem_ptr, sign_handler_f, bsks, ksks, num_radix_blocks,
|
||||
num_scalar_blocks);
|
||||
@@ -667,12 +669,13 @@ __host__ void host_integer_radix_scalar_difference_check_kb(
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_integer_radix_scalar_maxmin_kb(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
|
||||
Torus const *h_scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
|
||||
void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks,
|
||||
uint32_t num_scalar_blocks) {
|
||||
__host__ void
|
||||
host_scalar_maxmin(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in,
|
||||
Torus const *scalar_blocks, Torus const *h_scalar_blocks,
|
||||
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
|
||||
Torus *const *ksks, uint32_t num_radix_blocks,
|
||||
uint32_t num_scalar_blocks) {
|
||||
|
||||
if (lwe_array_out->lwe_dimension != lwe_array_in->lwe_dimension)
|
||||
PANIC("Cuda error: input and output lwe dimensions must be the same")
|
||||
@@ -688,7 +691,7 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
|
||||
// - 1 if lhs == rhs
|
||||
// - 2 if lhs > rhs
|
||||
auto sign = mem_ptr->tmp_lwe_array_out;
|
||||
host_integer_radix_scalar_difference_check_kb<Torus>(
|
||||
host_scalar_difference_check<Torus>(
|
||||
streams, sign, lwe_array_in, scalar_blocks, h_scalar_blocks, mem_ptr,
|
||||
mem_ptr->identity_lut_f, bsks, ksks, num_radix_blocks, num_scalar_blocks);
|
||||
|
||||
@@ -704,13 +707,13 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
|
||||
|
||||
// Selector
|
||||
// CMUX for Max or Min
|
||||
host_integer_radix_cmux_kb<Torus>(
|
||||
streams, lwe_array_out, mem_ptr->tmp_lwe_array_out, lwe_array_left,
|
||||
lwe_array_right, mem_ptr->cmux_buffer, bsks, ksks);
|
||||
host_cmux<Torus>(streams, lwe_array_out, mem_ptr->tmp_lwe_array_out,
|
||||
lwe_array_left, lwe_array_right, mem_ptr->cmux_buffer, bsks,
|
||||
ksks);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_integer_radix_scalar_equality_check_kb(
|
||||
__host__ void host_scalar_equality_check(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
|
||||
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
|
||||
@@ -785,7 +788,7 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
|
||||
// We use false cause we only will broadcast the indexes
|
||||
scalar_comparison_luts->broadcast_lut(active_streams, false);
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
lsb_streams, mem_ptr->tmp_lwe_array_out, mem_ptr->tmp_packed_input,
|
||||
bsks, ksks, scalar_comparison_luts, num_halved_lsb_radix_blocks);
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#include "scalar_div.cuh"
|
||||
|
||||
uint64_t scratch_cuda_integer_unsigned_scalar_div_radix_kb_64(
|
||||
uint64_t scratch_cuda_integer_unsigned_scalar_div_radix_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
@@ -21,7 +21,7 @@ uint64_t scratch_cuda_integer_unsigned_scalar_div_radix_kb_64(
|
||||
scalar_divisor_ffi, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cuda_integer_unsigned_scalar_div_radix_kb_64(
|
||||
void cuda_integer_unsigned_scalar_div_radix_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *numerator_ct,
|
||||
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
|
||||
const CudaScalarDivisorFFI *scalar_divisor_ffi) {
|
||||
@@ -32,8 +32,8 @@ void cuda_integer_unsigned_scalar_div_radix_kb_64(
|
||||
scalar_divisor_ffi);
|
||||
}
|
||||
|
||||
void cleanup_cuda_integer_unsigned_scalar_div_radix_kb_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr_void) {
|
||||
void cleanup_cuda_integer_unsigned_scalar_div_radix_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void) {
|
||||
|
||||
int_unsigned_scalar_div_mem<uint64_t> *mem_ptr =
|
||||
(int_unsigned_scalar_div_mem<uint64_t> *)(*mem_ptr_void);
|
||||
@@ -44,7 +44,7 @@ void cleanup_cuda_integer_unsigned_scalar_div_radix_kb_64(
|
||||
*mem_ptr_void = nullptr;
|
||||
}
|
||||
|
||||
uint64_t scratch_cuda_integer_signed_scalar_div_radix_kb_64(
|
||||
uint64_t scratch_cuda_integer_signed_scalar_div_radix_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
@@ -59,25 +59,25 @@ uint64_t scratch_cuda_integer_signed_scalar_div_radix_kb_64(
|
||||
grouping_factor, message_modulus, carry_modulus,
|
||||
noise_reduction_type);
|
||||
|
||||
return scratch_integer_signed_scalar_div_radix_kb<uint64_t>(
|
||||
return scratch_integer_signed_scalar_div_radix<uint64_t>(
|
||||
CudaStreams(streams), params,
|
||||
(int_signed_scalar_div_mem<uint64_t> **)mem_ptr, num_blocks,
|
||||
scalar_divisor_ffi, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cuda_integer_signed_scalar_div_radix_kb_64(
|
||||
void cuda_integer_signed_scalar_div_radix_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *numerator_ct,
|
||||
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
|
||||
const CudaScalarDivisorFFI *scalar_divisor_ffi, uint32_t numerator_bits) {
|
||||
|
||||
host_integer_signed_scalar_div_radix_kb<uint64_t>(
|
||||
host_integer_signed_scalar_div_radix<uint64_t>(
|
||||
CudaStreams(streams), numerator_ct,
|
||||
(int_signed_scalar_div_mem<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks,
|
||||
scalar_divisor_ffi, numerator_bits);
|
||||
}
|
||||
|
||||
void cleanup_cuda_integer_signed_scalar_div_radix_kb_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void) {
|
||||
void cleanup_cuda_integer_signed_scalar_div_radix_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void) {
|
||||
|
||||
int_signed_scalar_div_mem<uint64_t> *mem_ptr =
|
||||
(int_signed_scalar_div_mem<uint64_t> *)(*mem_ptr_void);
|
||||
@@ -88,7 +88,7 @@ void cleanup_cuda_integer_signed_scalar_div_radix_kb_64(CudaStreamsFFI streams,
|
||||
*mem_ptr_void = nullptr;
|
||||
}
|
||||
|
||||
uint64_t scratch_integer_unsigned_scalar_div_rem_radix_kb_64(
|
||||
uint64_t scratch_integer_unsigned_scalar_div_rem_radix_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
@@ -110,7 +110,7 @@ uint64_t scratch_integer_unsigned_scalar_div_rem_radix_kb_64(
|
||||
scalar_divisor_ffi, active_bits_divisor, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
|
||||
void cuda_integer_unsigned_scalar_div_rem_radix_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *quotient_ct,
|
||||
CudaRadixCiphertextFFI *remainder_ct, int8_t *mem_ptr, void *const *bsks,
|
||||
void *const *ksks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
|
||||
@@ -127,7 +127,7 @@ void cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
|
||||
(uint64_t *)h_clear_blocks, num_clear_blocks);
|
||||
}
|
||||
|
||||
void cleanup_cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
|
||||
void cleanup_cuda_integer_unsigned_scalar_div_rem_radix_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr_void) {
|
||||
|
||||
int_unsigned_scalar_div_rem_buffer<uint64_t> *mem_ptr =
|
||||
@@ -139,7 +139,7 @@ void cleanup_cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
|
||||
*mem_ptr_void = nullptr;
|
||||
}
|
||||
|
||||
uint64_t scratch_integer_signed_scalar_div_rem_radix_kb_64(
|
||||
uint64_t scratch_integer_signed_scalar_div_rem_radix_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
@@ -161,7 +161,7 @@ uint64_t scratch_integer_signed_scalar_div_rem_radix_kb_64(
|
||||
scalar_divisor_ffi, active_bits_divisor, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cuda_integer_signed_scalar_div_rem_radix_kb_64(
|
||||
void cuda_integer_signed_scalar_div_rem_radix_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *quotient_ct,
|
||||
CudaRadixCiphertextFFI *remainder_ct, int8_t *mem_ptr, void *const *bsks,
|
||||
void *const *ksks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
|
||||
@@ -176,7 +176,7 @@ void cuda_integer_signed_scalar_div_rem_radix_kb_64(
|
||||
decomposed_divisor, num_scalars_divisor, numerator_bits);
|
||||
}
|
||||
|
||||
void cleanup_cuda_integer_signed_scalar_div_rem_radix_kb_64(
|
||||
void cleanup_cuda_integer_signed_scalar_div_rem_radix_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr_void) {
|
||||
|
||||
int_signed_scalar_div_rem_buffer<uint64_t> *mem_ptr =
|
||||
|
||||
@@ -35,7 +35,7 @@ __host__ void host_integer_unsigned_scalar_div_radix(
|
||||
}
|
||||
|
||||
if (scalar_divisor_ffi->is_divisor_pow2) {
|
||||
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
|
||||
host_logical_scalar_shift_inplace<Torus>(
|
||||
streams, numerator_ct, scalar_divisor_ffi->ilog2_divisor,
|
||||
mem_ptr->logical_scalar_shift_mem, bsks, ksks,
|
||||
numerator_ct->num_radix_blocks);
|
||||
@@ -63,15 +63,15 @@ __host__ void host_integer_unsigned_scalar_div_radix(
|
||||
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
|
||||
numerator_cpy, numerator_ct);
|
||||
|
||||
host_integer_radix_scalar_mul_high_kb<Torus>(
|
||||
streams, numerator_cpy, mem_ptr->scalar_mul_high_mem, ksks, bsks,
|
||||
scalar_divisor_ffi);
|
||||
host_scalar_mul_high<Torus>(streams, numerator_cpy,
|
||||
mem_ptr->scalar_mul_high_mem, ksks, bsks,
|
||||
scalar_divisor_ffi);
|
||||
|
||||
host_sub_and_propagate_single_carry<Torus>(
|
||||
streams, numerator_ct, numerator_cpy, nullptr, nullptr,
|
||||
mem_ptr->sub_and_propagate_mem, bsks, ksks, FLAG_NONE, (uint32_t)0);
|
||||
|
||||
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
|
||||
host_logical_scalar_shift_inplace<Torus>(
|
||||
streams, numerator_ct, (uint32_t)1, mem_ptr->logical_scalar_shift_mem,
|
||||
bsks, ksks, numerator_ct->num_radix_blocks);
|
||||
|
||||
@@ -79,7 +79,7 @@ __host__ void host_integer_unsigned_scalar_div_radix(
|
||||
streams, numerator_ct, numerator_cpy, nullptr, nullptr,
|
||||
mem_ptr->scp_mem, bsks, ksks, FLAG_NONE, (uint32_t)0);
|
||||
|
||||
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
|
||||
host_logical_scalar_shift_inplace<Torus>(
|
||||
streams, numerator_ct, scalar_divisor_ffi->shift_post - (uint32_t)1,
|
||||
mem_ptr->logical_scalar_shift_mem, bsks, ksks,
|
||||
numerator_ct->num_radix_blocks);
|
||||
@@ -87,23 +87,23 @@ __host__ void host_integer_unsigned_scalar_div_radix(
|
||||
return;
|
||||
}
|
||||
|
||||
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
|
||||
host_logical_scalar_shift_inplace<Torus>(
|
||||
streams, numerator_ct, scalar_divisor_ffi->shift_pre,
|
||||
mem_ptr->logical_scalar_shift_mem, bsks, ksks,
|
||||
numerator_ct->num_radix_blocks);
|
||||
|
||||
host_integer_radix_scalar_mul_high_kb<Torus>(streams, numerator_ct,
|
||||
mem_ptr->scalar_mul_high_mem,
|
||||
ksks, bsks, scalar_divisor_ffi);
|
||||
host_scalar_mul_high<Torus>(streams, numerator_ct,
|
||||
mem_ptr->scalar_mul_high_mem, ksks, bsks,
|
||||
scalar_divisor_ffi);
|
||||
|
||||
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
|
||||
host_logical_scalar_shift_inplace<Torus>(
|
||||
streams, numerator_ct, scalar_divisor_ffi->shift_post,
|
||||
mem_ptr->logical_scalar_shift_mem, bsks, ksks,
|
||||
numerator_ct->num_radix_blocks);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ uint64_t scratch_integer_signed_scalar_div_radix_kb(
|
||||
__host__ uint64_t scratch_integer_signed_scalar_div_radix(
|
||||
CudaStreams streams, int_radix_params params,
|
||||
int_signed_scalar_div_mem<Torus> **mem_ptr, uint32_t num_radix_blocks,
|
||||
const CudaScalarDivisorFFI *scalar_divisor_ffi,
|
||||
@@ -119,7 +119,7 @@ __host__ uint64_t scratch_integer_signed_scalar_div_radix_kb(
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_integer_signed_scalar_div_radix_kb(
|
||||
__host__ void host_integer_signed_scalar_div_radix(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *numerator_ct,
|
||||
int_signed_scalar_div_mem<Torus> *mem_ptr, void *const *bsks,
|
||||
Torus *const *ksks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
|
||||
@@ -129,7 +129,7 @@ __host__ void host_integer_signed_scalar_div_radix_kb(
|
||||
if (scalar_divisor_ffi->is_divisor_negative) {
|
||||
CudaRadixCiphertextFFI *tmp = mem_ptr->tmp_ffi;
|
||||
|
||||
host_integer_radix_negation<Torus>(
|
||||
host_negation<Torus>(
|
||||
streams, tmp, numerator_ct, mem_ptr->params.message_modulus,
|
||||
mem_ptr->params.carry_modulus, numerator_ct->num_radix_blocks);
|
||||
|
||||
@@ -152,11 +152,11 @@ __host__ void host_integer_signed_scalar_div_radix_kb(
|
||||
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
|
||||
tmp, numerator_ct);
|
||||
|
||||
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
|
||||
host_arithmetic_scalar_shift_inplace<Torus>(
|
||||
streams, tmp, scalar_divisor_ffi->chosen_multiplier_num_bits - 1,
|
||||
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks);
|
||||
|
||||
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
|
||||
host_logical_scalar_shift_inplace<Torus>(
|
||||
streams, tmp,
|
||||
numerator_bits - scalar_divisor_ffi->chosen_multiplier_num_bits,
|
||||
mem_ptr->logical_scalar_shift_mem, bsks, ksks, tmp->num_radix_blocks);
|
||||
@@ -165,7 +165,7 @@ __host__ void host_integer_signed_scalar_div_radix_kb(
|
||||
streams, tmp, numerator_ct, nullptr, nullptr, mem_ptr->scp_mem, bsks,
|
||||
ksks, FLAG_NONE, (uint32_t)0);
|
||||
|
||||
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
|
||||
host_arithmetic_scalar_shift_inplace<Torus>(
|
||||
streams, tmp, scalar_divisor_ffi->chosen_multiplier_num_bits,
|
||||
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks);
|
||||
|
||||
@@ -173,11 +173,11 @@ __host__ void host_integer_signed_scalar_div_radix_kb(
|
||||
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
|
||||
tmp, numerator_ct);
|
||||
|
||||
host_integer_radix_signed_scalar_mul_high_kb<Torus>(
|
||||
streams, tmp, mem_ptr->scalar_mul_high_mem, ksks, scalar_divisor_ffi,
|
||||
bsks);
|
||||
host_signed_scalar_mul_high<Torus>(streams, tmp,
|
||||
mem_ptr->scalar_mul_high_mem, ksks,
|
||||
scalar_divisor_ffi, bsks);
|
||||
|
||||
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
|
||||
host_arithmetic_scalar_shift_inplace<Torus>(
|
||||
streams, tmp, scalar_divisor_ffi->shift_post,
|
||||
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks);
|
||||
|
||||
@@ -185,7 +185,7 @@ __host__ void host_integer_signed_scalar_div_radix_kb(
|
||||
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
|
||||
xsign, numerator_ct);
|
||||
|
||||
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
|
||||
host_arithmetic_scalar_shift_inplace<Torus>(
|
||||
streams, xsign, numerator_bits - 1,
|
||||
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks);
|
||||
|
||||
@@ -198,15 +198,15 @@ __host__ void host_integer_signed_scalar_div_radix_kb(
|
||||
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
|
||||
tmp, numerator_ct);
|
||||
|
||||
host_integer_radix_signed_scalar_mul_high_kb<Torus>(
|
||||
streams, tmp, mem_ptr->scalar_mul_high_mem, ksks, scalar_divisor_ffi,
|
||||
bsks);
|
||||
host_signed_scalar_mul_high<Torus>(streams, tmp,
|
||||
mem_ptr->scalar_mul_high_mem, ksks,
|
||||
scalar_divisor_ffi, bsks);
|
||||
|
||||
host_add_and_propagate_single_carry<Torus>(
|
||||
streams, tmp, numerator_ct, nullptr, nullptr, mem_ptr->scp_mem, bsks,
|
||||
ksks, FLAG_NONE, (uint32_t)0);
|
||||
|
||||
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
|
||||
host_arithmetic_scalar_shift_inplace<Torus>(
|
||||
streams, tmp, scalar_divisor_ffi->shift_post,
|
||||
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks);
|
||||
|
||||
@@ -214,7 +214,7 @@ __host__ void host_integer_signed_scalar_div_radix_kb(
|
||||
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
|
||||
xsign, numerator_ct);
|
||||
|
||||
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
|
||||
host_arithmetic_scalar_shift_inplace<Torus>(
|
||||
streams, xsign, numerator_bits - 1,
|
||||
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks);
|
||||
|
||||
@@ -224,7 +224,7 @@ __host__ void host_integer_signed_scalar_div_radix_kb(
|
||||
}
|
||||
|
||||
if (scalar_divisor_ffi->is_divisor_negative) {
|
||||
host_integer_radix_negation<Torus>(
|
||||
host_negation<Torus>(
|
||||
streams, numerator_ct, tmp, mem_ptr->params.message_modulus,
|
||||
mem_ptr->params.carry_modulus, numerator_ct->num_radix_blocks);
|
||||
} else {
|
||||
@@ -270,9 +270,9 @@ __host__ void host_integer_unsigned_scalar_div_rem_radix(
|
||||
|
||||
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
|
||||
remainder_ct, numerator_ct);
|
||||
host_integer_radix_scalar_bitop_kb(
|
||||
streams, remainder_ct, remainder_ct, clear_blocks, h_clear_blocks,
|
||||
num_clear_blocks, mem_ptr->bitop_mem, bsks, ksks);
|
||||
host_scalar_bitop(streams, remainder_ct, remainder_ct, clear_blocks,
|
||||
h_clear_blocks, num_clear_blocks, mem_ptr->bitop_mem,
|
||||
bsks, ksks);
|
||||
|
||||
} else {
|
||||
if (!scalar_divisor_ffi->is_divisor_zero) {
|
||||
@@ -328,9 +328,9 @@ __host__ void host_integer_signed_scalar_div_rem_radix(
|
||||
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
|
||||
numerator_ct, quotient_ct);
|
||||
|
||||
host_integer_signed_scalar_div_radix_kb(streams, quotient_ct,
|
||||
mem_ptr->signed_div_mem, bsks, ksks,
|
||||
scalar_divisor_ffi, numerator_bits);
|
||||
host_integer_signed_scalar_div_radix(streams, quotient_ct,
|
||||
mem_ptr->signed_div_mem, bsks, ksks,
|
||||
scalar_divisor_ffi, numerator_bits);
|
||||
|
||||
host_propagate_single_carry<Torus>(streams, quotient_ct, nullptr, nullptr,
|
||||
mem_ptr->scp_mem, bsks, ksks, FLAG_NONE,
|
||||
@@ -341,10 +341,10 @@ __host__ void host_integer_signed_scalar_div_rem_radix(
|
||||
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
|
||||
remainder_ct, quotient_ct);
|
||||
|
||||
host_integer_radix_logical_scalar_shift_kb_inplace(
|
||||
streams, remainder_ct, scalar_divisor_ffi->ilog2_divisor,
|
||||
mem_ptr->logical_scalar_shift_mem, bsks, ksks,
|
||||
remainder_ct->num_radix_blocks);
|
||||
host_logical_scalar_shift_inplace(streams, remainder_ct,
|
||||
scalar_divisor_ffi->ilog2_divisor,
|
||||
mem_ptr->logical_scalar_shift_mem, bsks,
|
||||
ksks, remainder_ct->num_radix_blocks);
|
||||
|
||||
} else if (!scalar_divisor_ffi->is_divisor_zero) {
|
||||
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#include "integer/scalar_mul.cuh"
|
||||
|
||||
uint64_t scratch_cuda_integer_scalar_mul_kb_64(
|
||||
uint64_t scratch_cuda_integer_scalar_mul_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
@@ -14,12 +14,12 @@ uint64_t scratch_cuda_integer_scalar_mul_kb_64(
|
||||
grouping_factor, message_modulus, carry_modulus,
|
||||
noise_reduction_type);
|
||||
|
||||
return scratch_cuda_integer_radix_scalar_mul_kb<uint64_t>(
|
||||
return scratch_cuda_scalar_mul<uint64_t>(
|
||||
CudaStreams(streams), (int_scalar_mul_buffer<uint64_t> **)mem_ptr,
|
||||
num_blocks, params, num_scalar_bits, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
|
||||
void cuda_scalar_multiplication_ciphertext_64_inplace(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
|
||||
uint64_t const *decomposed_scalar, uint64_t const *has_at_least_one_set,
|
||||
int8_t *mem, void *const *bsks, void *const *ksks, uint32_t polynomial_size,
|
||||
@@ -31,8 +31,7 @@ void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
|
||||
(uint64_t **)(ksks), message_modulus, num_scalars);
|
||||
}
|
||||
|
||||
void cleanup_cuda_integer_radix_scalar_mul(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void) {
|
||||
void cleanup_cuda_scalar_mul(CudaStreamsFFI streams, int8_t **mem_ptr_void) {
|
||||
|
||||
int_scalar_mul_buffer<uint64_t> *mem_ptr =
|
||||
(int_scalar_mul_buffer<uint64_t> *)(*mem_ptr_void);
|
||||
|
||||
@@ -30,10 +30,12 @@ __global__ void device_small_scalar_radix_multiplication(T *output_lwe_array,
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__host__ uint64_t scratch_cuda_integer_radix_scalar_mul_kb(
|
||||
CudaStreams streams, int_scalar_mul_buffer<T> **mem_ptr,
|
||||
uint32_t num_radix_blocks, int_radix_params params,
|
||||
uint32_t num_scalar_bits, bool allocate_gpu_memory) {
|
||||
__host__ uint64_t scratch_cuda_scalar_mul(CudaStreams streams,
|
||||
int_scalar_mul_buffer<T> **mem_ptr,
|
||||
uint32_t num_radix_blocks,
|
||||
int_radix_params params,
|
||||
uint32_t num_scalar_bits,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
uint64_t size_tracker = 0;
|
||||
*mem_ptr = new int_scalar_mul_buffer<T>(streams, params, num_radix_blocks,
|
||||
@@ -67,9 +69,9 @@ __host__ void host_integer_scalar_mul_radix(
|
||||
copy_radix_ciphertext_slice_async<T>(
|
||||
streams.stream(0), streams.gpu_index(0), &shift_input, 0,
|
||||
num_radix_blocks, lwe_array, 0, num_radix_blocks);
|
||||
host_integer_radix_logical_scalar_shift_kb_inplace<T>(
|
||||
streams, &shift_input, shift_amount, mem->logical_scalar_shift_buffer,
|
||||
bsks, ksks, num_radix_blocks);
|
||||
host_logical_scalar_shift_inplace<T>(streams, &shift_input, shift_amount,
|
||||
mem->logical_scalar_shift_buffer,
|
||||
bsks, ksks, num_radix_blocks);
|
||||
} else {
|
||||
// create trivial assign for value = 0
|
||||
set_zero_radix_ciphertext_slice_async<T>(
|
||||
@@ -111,7 +113,7 @@ __host__ void host_integer_scalar_mul_radix(
|
||||
streams.gpu_index(0), lwe_array, 0,
|
||||
num_radix_blocks);
|
||||
} else {
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<T>(
|
||||
host_integer_partial_sum_ciphertexts_vec<T>(
|
||||
streams, lwe_array, all_shifted_buffer, bsks, ksks,
|
||||
mem->sum_ciphertexts_vec_mem, num_radix_blocks, j);
|
||||
|
||||
@@ -166,10 +168,11 @@ __host__ void host_integer_small_scalar_mul_radix(
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_integer_radix_scalar_mul_high_kb(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *ct,
|
||||
int_scalar_mul_high_buffer<Torus> *mem_ptr, Torus *const *ksks,
|
||||
void *const *bsks, const CudaScalarDivisorFFI *scalar_divisor_ffi) {
|
||||
__host__ void
|
||||
host_scalar_mul_high(CudaStreams streams, CudaRadixCiphertextFFI *ct,
|
||||
int_scalar_mul_high_buffer<Torus> *mem_ptr,
|
||||
Torus *const *ksks, void *const *bsks,
|
||||
const CudaScalarDivisorFFI *scalar_divisor_ffi) {
|
||||
|
||||
if (scalar_divisor_ffi->is_chosen_multiplier_zero) {
|
||||
set_zero_radix_ciphertext_slice_async<Torus>(
|
||||
@@ -186,7 +189,7 @@ __host__ void host_integer_radix_scalar_mul_high_kb(
|
||||
tmp_ffi->num_radix_blocks != 0) {
|
||||
|
||||
if (scalar_divisor_ffi->is_chosen_multiplier_pow2) {
|
||||
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
|
||||
host_logical_scalar_shift_inplace<Torus>(
|
||||
streams, tmp_ffi, scalar_divisor_ffi->ilog2_chosen_multiplier,
|
||||
mem_ptr->logical_scalar_shift_mem, bsks, (uint64_t **)ksks,
|
||||
tmp_ffi->num_radix_blocks);
|
||||
@@ -205,7 +208,7 @@ __host__ void host_integer_radix_scalar_mul_high_kb(
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_integer_radix_signed_scalar_mul_high_kb(
|
||||
__host__ void host_signed_scalar_mul_high(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *ct,
|
||||
int_signed_scalar_mul_high_buffer<Torus> *mem_ptr, Torus *const *ksks,
|
||||
const CudaScalarDivisorFFI *scalar_divisor_ffi, void *const *bsks) {
|
||||
@@ -227,7 +230,7 @@ __host__ void host_integer_radix_signed_scalar_mul_high_kb(
|
||||
tmp_ffi->num_radix_blocks != 0) {
|
||||
|
||||
if (scalar_divisor_ffi->is_chosen_multiplier_pow2) {
|
||||
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
|
||||
host_logical_scalar_shift_inplace<Torus>(
|
||||
streams, tmp_ffi, scalar_divisor_ffi->ilog2_chosen_multiplier,
|
||||
mem_ptr->logical_scalar_shift_mem, bsks, (uint64_t **)ksks,
|
||||
tmp_ffi->num_radix_blocks);
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#include "scalar_rotate.cuh"
|
||||
|
||||
uint64_t scratch_cuda_integer_radix_scalar_rotate_kb_64(
|
||||
uint64_t scratch_cuda_scalar_rotate_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
@@ -14,24 +14,24 @@ uint64_t scratch_cuda_integer_radix_scalar_rotate_kb_64(
|
||||
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
|
||||
message_modulus, carry_modulus, noise_reduction_type);
|
||||
|
||||
return scratch_cuda_integer_radix_scalar_rotate_kb<uint64_t>(
|
||||
return scratch_cuda_scalar_rotate<uint64_t>(
|
||||
CudaStreams(streams),
|
||||
(int_logical_scalar_shift_buffer<uint64_t> **)mem_ptr, num_blocks, params,
|
||||
shift_type, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cuda_integer_radix_scalar_rotate_kb_64_inplace(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array, uint32_t n,
|
||||
int8_t *mem_ptr, void *const *bsks, void *const *ksks) {
|
||||
void cuda_scalar_rotate_64_inplace(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *lwe_array,
|
||||
uint32_t n, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks) {
|
||||
|
||||
host_integer_radix_scalar_rotate_kb_inplace<uint64_t>(
|
||||
host_scalar_rotate_inplace<uint64_t>(
|
||||
CudaStreams(streams), lwe_array, n,
|
||||
(int_logical_scalar_shift_buffer<uint64_t> *)mem_ptr, bsks,
|
||||
(uint64_t **)(ksks));
|
||||
}
|
||||
|
||||
void cleanup_cuda_integer_radix_scalar_rotate(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void) {
|
||||
void cleanup_cuda_scalar_rotate(CudaStreamsFFI streams, int8_t **mem_ptr_void) {
|
||||
|
||||
int_logical_scalar_shift_buffer<uint64_t> *mem_ptr =
|
||||
(int_logical_scalar_shift_buffer<uint64_t> *)(*mem_ptr_void);
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
#include "pbs/programmable_bootstrap_multibit.cuh"
|
||||
|
||||
template <typename Torus>
|
||||
__host__ uint64_t scratch_cuda_integer_radix_scalar_rotate_kb(
|
||||
__host__ uint64_t scratch_cuda_scalar_rotate(
|
||||
CudaStreams streams, int_logical_scalar_shift_buffer<Torus> **mem_ptr,
|
||||
uint32_t num_radix_blocks, int_radix_params params,
|
||||
SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory) {
|
||||
@@ -22,10 +22,11 @@ __host__ uint64_t scratch_cuda_integer_radix_scalar_rotate_kb(
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_integer_radix_scalar_rotate_kb_inplace(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array, uint32_t n,
|
||||
int_logical_scalar_shift_buffer<Torus> *mem, void *const *bsks,
|
||||
Torus *const *ksks) {
|
||||
__host__ void
|
||||
host_scalar_rotate_inplace(CudaStreams streams,
|
||||
CudaRadixCiphertextFFI *lwe_array, uint32_t n,
|
||||
int_logical_scalar_shift_buffer<Torus> *mem,
|
||||
void *const *bsks, Torus *const *ksks) {
|
||||
|
||||
auto num_blocks = lwe_array->num_radix_blocks;
|
||||
auto params = mem->params;
|
||||
@@ -68,7 +69,7 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
|
||||
|
||||
auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_bivariate_lookup_table<Torus>(
|
||||
streams, lwe_array, receiver_blocks, giver_blocks, bsks, ksks,
|
||||
lut_bivariate, num_blocks, lut_bivariate->params.message_modulus);
|
||||
|
||||
@@ -92,7 +93,7 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
|
||||
|
||||
auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_bivariate_lookup_table<Torus>(
|
||||
streams, lwe_array, receiver_blocks, giver_blocks, bsks, ksks,
|
||||
lut_bivariate, num_blocks, lut_bivariate->params.message_modulus);
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#include "scalar_shifts.cuh"
|
||||
|
||||
uint64_t scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
|
||||
uint64_t scratch_cuda_logical_scalar_shift_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
@@ -14,7 +14,7 @@ uint64_t scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
|
||||
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
|
||||
message_modulus, carry_modulus, noise_reduction_type);
|
||||
|
||||
return scratch_cuda_integer_radix_logical_scalar_shift_kb<uint64_t>(
|
||||
return scratch_cuda_logical_scalar_shift<uint64_t>(
|
||||
CudaStreams(streams),
|
||||
(int_logical_scalar_shift_buffer<uint64_t> **)mem_ptr, num_blocks, params,
|
||||
shift_type, allocate_gpu_memory);
|
||||
@@ -24,17 +24,19 @@ uint64_t scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
|
||||
/// for the left scalar shift. It is constituted of a rotation, followed by
|
||||
/// the application of a PBS onto the rotated blocks up to num_blocks -
|
||||
/// rotations - 1 The remaining blocks are padded with zeros
|
||||
void cuda_integer_radix_logical_scalar_shift_kb_64_inplace(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
|
||||
int8_t *mem_ptr, void *const *bsks, void *const *ksks) {
|
||||
void cuda_logical_scalar_shift_64_inplace(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *lwe_array,
|
||||
uint32_t shift, int8_t *mem_ptr,
|
||||
void *const *bsks,
|
||||
void *const *ksks) {
|
||||
|
||||
host_integer_radix_logical_scalar_shift_kb_inplace<uint64_t>(
|
||||
host_logical_scalar_shift_inplace<uint64_t>(
|
||||
CudaStreams(streams), lwe_array, shift,
|
||||
(int_logical_scalar_shift_buffer<uint64_t> *)mem_ptr, bsks,
|
||||
(uint64_t **)(ksks), lwe_array->num_radix_blocks);
|
||||
}
|
||||
|
||||
uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
|
||||
uint64_t scratch_cuda_arithmetic_scalar_shift_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
@@ -48,7 +50,7 @@ uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
|
||||
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
|
||||
message_modulus, carry_modulus, noise_reduction_type);
|
||||
|
||||
return scratch_cuda_integer_radix_arithmetic_scalar_shift_kb<uint64_t>(
|
||||
return scratch_cuda_arithmetic_scalar_shift<uint64_t>(
|
||||
CudaStreams(streams),
|
||||
(int_arithmetic_scalar_shift_buffer<uint64_t> **)mem_ptr, num_blocks,
|
||||
params, shift_type, allocate_gpu_memory);
|
||||
@@ -61,18 +63,20 @@ uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
|
||||
/// sign block, and a second PBS is also applied to it to compute the padding
|
||||
/// block, which is copied onto all remaining blocks instead of padding with
|
||||
/// zeros as would be done in the logical shift.
|
||||
void cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
|
||||
int8_t *mem_ptr, void *const *bsks, void *const *ksks) {
|
||||
void cuda_arithmetic_scalar_shift_64_inplace(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *lwe_array,
|
||||
uint32_t shift, int8_t *mem_ptr,
|
||||
void *const *bsks,
|
||||
void *const *ksks) {
|
||||
|
||||
host_integer_radix_arithmetic_scalar_shift_kb_inplace<uint64_t>(
|
||||
host_arithmetic_scalar_shift_inplace<uint64_t>(
|
||||
CudaStreams(streams), lwe_array, shift,
|
||||
(int_arithmetic_scalar_shift_buffer<uint64_t> *)mem_ptr, bsks,
|
||||
(uint64_t **)(ksks));
|
||||
}
|
||||
|
||||
void cleanup_cuda_integer_radix_logical_scalar_shift(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void) {
|
||||
void cleanup_cuda_logical_scalar_shift(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void) {
|
||||
|
||||
int_logical_scalar_shift_buffer<uint64_t> *mem_ptr =
|
||||
(int_logical_scalar_shift_buffer<uint64_t> *)(*mem_ptr_void);
|
||||
@@ -82,8 +86,8 @@ void cleanup_cuda_integer_radix_logical_scalar_shift(CudaStreamsFFI streams,
|
||||
*mem_ptr_void = nullptr;
|
||||
}
|
||||
|
||||
void cleanup_cuda_integer_radix_arithmetic_scalar_shift(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void) {
|
||||
void cleanup_cuda_arithmetic_scalar_shift(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void) {
|
||||
|
||||
int_arithmetic_scalar_shift_buffer<uint64_t> *mem_ptr =
|
||||
(int_arithmetic_scalar_shift_buffer<uint64_t> *)(*mem_ptr_void);
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
#include "pbs/programmable_bootstrap_multibit.cuh"
|
||||
|
||||
template <typename Torus>
|
||||
__host__ uint64_t scratch_cuda_integer_radix_logical_scalar_shift_kb(
|
||||
__host__ uint64_t scratch_cuda_logical_scalar_shift(
|
||||
CudaStreams streams, int_logical_scalar_shift_buffer<Torus> **mem_ptr,
|
||||
uint32_t num_radix_blocks, int_radix_params params,
|
||||
SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory) {
|
||||
@@ -23,7 +23,7 @@ __host__ uint64_t scratch_cuda_integer_radix_logical_scalar_shift_kb(
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
|
||||
__host__ void host_logical_scalar_shift_inplace(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
|
||||
int_logical_scalar_shift_buffer<Torus> *mem, void *const *bsks,
|
||||
Torus *const *ksks, uint32_t num_blocks) {
|
||||
@@ -75,7 +75,7 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
|
||||
|
||||
size_t partial_block_count = num_blocks - rotations;
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_bivariate_lookup_table<Torus>(
|
||||
streams, &partial_current_blocks, &partial_current_blocks,
|
||||
&partial_previous_blocks, bsks, ksks, lut_bivariate,
|
||||
partial_block_count, lut_bivariate->params.message_modulus);
|
||||
@@ -106,7 +106,7 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
|
||||
|
||||
size_t partial_block_count = num_blocks - rotations;
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_bivariate_lookup_table<Torus>(
|
||||
streams, partial_current_blocks, partial_current_blocks,
|
||||
&partial_next_blocks, bsks, ksks, lut_bivariate, partial_block_count,
|
||||
lut_bivariate->params.message_modulus);
|
||||
@@ -114,7 +114,7 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb(
|
||||
__host__ uint64_t scratch_cuda_arithmetic_scalar_shift(
|
||||
CudaStreams streams, int_arithmetic_scalar_shift_buffer<Torus> **mem_ptr,
|
||||
uint32_t num_radix_blocks, int_radix_params params,
|
||||
SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory) {
|
||||
@@ -127,7 +127,7 @@ __host__ uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb(
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
|
||||
__host__ void host_arithmetic_scalar_shift_inplace(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
|
||||
int_arithmetic_scalar_shift_buffer<Torus> *mem, void *const *bsks,
|
||||
Torus *const *ksks) {
|
||||
@@ -197,7 +197,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
|
||||
size_t partial_block_count = num_blocks - rotations;
|
||||
auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_bivariate_lookup_table<Torus>(
|
||||
streams, partial_current_blocks, partial_current_blocks,
|
||||
&partial_next_blocks, bsks, ksks, lut_bivariate,
|
||||
partial_block_count, lut_bivariate->params.message_modulus);
|
||||
@@ -207,7 +207,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
|
||||
streams.synchronize();
|
||||
auto lut_univariate_padding_block =
|
||||
mem->lut_buffers_univariate[num_bits_in_block - 1];
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
mem->local_streams_1, &padding_block, &last_block_copy, bsks, ksks,
|
||||
lut_univariate_padding_block, 1);
|
||||
// Replace blocks 'pulled' from the left with the correct padding
|
||||
@@ -221,7 +221,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
|
||||
if (shift_within_block != 0) {
|
||||
auto lut_univariate_shift_last_block =
|
||||
mem->lut_buffers_univariate[shift_within_block - 1];
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
mem->local_streams_2, &last_block, &last_block_copy, bsks, ksks,
|
||||
lut_univariate_shift_last_block, 1);
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#include "shift_and_rotate.cuh"
|
||||
|
||||
uint64_t scratch_cuda_integer_radix_shift_and_rotate_kb_64(
|
||||
uint64_t scratch_cuda_shift_and_rotate_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
@@ -14,24 +14,25 @@ uint64_t scratch_cuda_integer_radix_shift_and_rotate_kb_64(
|
||||
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
|
||||
message_modulus, carry_modulus, noise_reduction_type);
|
||||
|
||||
return scratch_cuda_integer_radix_shift_and_rotate_kb<uint64_t>(
|
||||
return scratch_cuda_shift_and_rotate<uint64_t>(
|
||||
CudaStreams(streams), (int_shift_and_rotate_buffer<uint64_t> **)mem_ptr,
|
||||
num_blocks, params, shift_type, is_signed, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cuda_integer_radix_shift_and_rotate_kb_64_inplace(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
|
||||
CudaRadixCiphertextFFI const *lwe_shift, int8_t *mem_ptr, void *const *bsks,
|
||||
void *const *ksks) {
|
||||
void cuda_shift_and_rotate_64_inplace(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *lwe_array,
|
||||
CudaRadixCiphertextFFI const *lwe_shift,
|
||||
int8_t *mem_ptr, void *const *bsks,
|
||||
void *const *ksks) {
|
||||
|
||||
host_integer_radix_shift_and_rotate_kb_inplace<uint64_t>(
|
||||
host_shift_and_rotate_inplace<uint64_t>(
|
||||
CudaStreams(streams), lwe_array, lwe_shift,
|
||||
(int_shift_and_rotate_buffer<uint64_t> *)mem_ptr, bsks,
|
||||
(uint64_t **)(ksks));
|
||||
}
|
||||
|
||||
void cleanup_cuda_integer_radix_shift_and_rotate(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void) {
|
||||
void cleanup_cuda_shift_and_rotate(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void) {
|
||||
int_shift_and_rotate_buffer<uint64_t> *mem_ptr =
|
||||
(int_shift_and_rotate_buffer<uint64_t> *)(*mem_ptr_void);
|
||||
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
#include "scalar_mul.cuh"
|
||||
|
||||
template <typename Torus>
|
||||
__host__ uint64_t scratch_cuda_integer_radix_shift_and_rotate_kb(
|
||||
__host__ uint64_t scratch_cuda_shift_and_rotate(
|
||||
CudaStreams streams, int_shift_and_rotate_buffer<Torus> **mem_ptr,
|
||||
uint32_t num_radix_blocks, int_radix_params params,
|
||||
SHIFT_OR_ROTATE_TYPE shift_type, bool is_signed, bool allocate_gpu_memory) {
|
||||
@@ -23,11 +23,12 @@ __host__ uint64_t scratch_cuda_integer_radix_shift_and_rotate_kb(
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_integer_radix_shift_and_rotate_kb_inplace(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array,
|
||||
CudaRadixCiphertextFFI const *lwe_shift,
|
||||
int_shift_and_rotate_buffer<Torus> *mem, void *const *bsks,
|
||||
Torus *const *ksks) {
|
||||
__host__ void
|
||||
host_shift_and_rotate_inplace(CudaStreams streams,
|
||||
CudaRadixCiphertextFFI *lwe_array,
|
||||
CudaRadixCiphertextFFI const *lwe_shift,
|
||||
int_shift_and_rotate_buffer<Torus> *mem,
|
||||
void *const *bsks, Torus *const *ksks) {
|
||||
cuda_set_device(streams.gpu_index(0));
|
||||
|
||||
if (lwe_array->num_radix_blocks != lwe_shift->num_radix_blocks)
|
||||
@@ -158,7 +159,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
|
||||
|
||||
// we have
|
||||
// control_bit|b|a
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
streams, input_bits_a, mux_inputs, bsks, ksks, mux_lut, total_nb_bits);
|
||||
}
|
||||
|
||||
@@ -190,7 +191,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
|
||||
|
||||
// To give back a clean ciphertext
|
||||
auto cleaning_lut = mem->cleaning_lut;
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
streams, lwe_array, lwe_array, bsks, ksks, cleaning_lut,
|
||||
num_radix_blocks);
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#include "subtraction.cuh"
|
||||
|
||||
uint64_t scratch_cuda_sub_and_propagate_single_carry_kb_64_inplace(
|
||||
uint64_t scratch_cuda_sub_and_propagate_single_carry_64_inplace(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
@@ -19,7 +19,7 @@ uint64_t scratch_cuda_sub_and_propagate_single_carry_kb_64_inplace(
|
||||
num_blocks, params, requested_flag, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cuda_sub_and_propagate_single_carry_kb_64_inplace(
|
||||
void cuda_sub_and_propagate_single_carry_64_inplace(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lhs_array,
|
||||
const CudaRadixCiphertextFFI *rhs_array, CudaRadixCiphertextFFI *carry_out,
|
||||
const CudaRadixCiphertextFFI *carry_in, int8_t *mem_ptr, void *const *bsks,
|
||||
|
||||
@@ -36,9 +36,9 @@ void host_sub_and_propagate_single_carry(
|
||||
int_sub_and_propagate<Torus> *mem, void *const *bsks, Torus *const *ksks,
|
||||
uint32_t requested_flag, uint32_t uses_carry) {
|
||||
|
||||
host_integer_radix_negation<Torus>(
|
||||
streams, mem->neg_rhs_array, rhs_array, mem->params.message_modulus,
|
||||
mem->params.carry_modulus, mem->neg_rhs_array->num_radix_blocks);
|
||||
host_negation<Torus>(streams, mem->neg_rhs_array, rhs_array,
|
||||
mem->params.message_modulus, mem->params.carry_modulus,
|
||||
mem->neg_rhs_array->num_radix_blocks);
|
||||
|
||||
host_add_and_propagate_single_carry<Torus>(
|
||||
streams, lhs_array, mem->neg_rhs_array, carry_out, input_carries,
|
||||
@@ -46,11 +46,12 @@ void host_sub_and_propagate_single_carry(
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_integer_radix_subtraction(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in_1,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in_2, uint64_t message_modulus,
|
||||
uint64_t carry_modulus, uint32_t num_radix_blocks) {
|
||||
__host__ void host_subtraction(CudaStreams streams,
|
||||
CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in_1,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in_2,
|
||||
uint64_t message_modulus, uint64_t carry_modulus,
|
||||
uint32_t num_radix_blocks) {
|
||||
cuda_set_device(streams.gpu_index(0));
|
||||
|
||||
if (lwe_array_out->num_radix_blocks < num_radix_blocks ||
|
||||
@@ -64,16 +65,15 @@ __host__ void host_integer_radix_subtraction(
|
||||
PANIC("Cuda error: lwe_array_in and lwe_array_out lwe_dimension must be "
|
||||
"the same")
|
||||
|
||||
host_integer_radix_negation<Torus>(streams, lwe_array_out, lwe_array_in_2,
|
||||
message_modulus, carry_modulus,
|
||||
num_radix_blocks);
|
||||
host_negation<Torus>(streams, lwe_array_out, lwe_array_in_2, message_modulus,
|
||||
carry_modulus, num_radix_blocks);
|
||||
host_addition<Torus>(streams.stream(0), streams.gpu_index(0), lwe_array_out,
|
||||
lwe_array_out, lwe_array_in_1, num_radix_blocks,
|
||||
message_modulus, carry_modulus);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ uint64_t scratch_cuda_integer_overflowing_sub_kb(
|
||||
__host__ uint64_t scratch_cuda_integer_overflowing_sub(
|
||||
CudaStreams streams, int_overflowing_sub_memory<Torus> **mem_ptr,
|
||||
uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type) {
|
||||
|
||||
@@ -94,7 +94,7 @@ __host__ void host_expand_without_verification(
|
||||
into_radix_ciphertext(output, lwe_array_out, 2 * num_lwes, lwe_dimension);
|
||||
auto input = new CudaRadixCiphertextFFI;
|
||||
into_radix_ciphertext(input, lwe_array_input, 2 * num_lwes, lwe_dimension);
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
integer_radix_apply_univariate_lookup_table<Torus>(
|
||||
streams, output, input, bsks, ksks, message_and_carry_extract_luts,
|
||||
2 * num_lwes);
|
||||
}
|
||||
|
||||
@@ -302,7 +302,7 @@ const _: () = {
|
||||
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, polynomial_size) - 24usize];
|
||||
};
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_apply_univariate_lut_kb_64(
|
||||
pub fn scratch_cuda_apply_univariate_lut_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr: *mut *mut i8,
|
||||
input_lut: *const ffi::c_void,
|
||||
@@ -324,7 +324,7 @@ unsafe extern "C" {
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_apply_many_univariate_lut_kb_64(
|
||||
pub fn scratch_cuda_apply_many_univariate_lut_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr: *mut *mut i8,
|
||||
input_lut: *const ffi::c_void,
|
||||
@@ -347,7 +347,7 @@ unsafe extern "C" {
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_apply_univariate_lut_kb_64(
|
||||
pub fn cuda_apply_univariate_lut_64(
|
||||
streams: CudaStreamsFFI,
|
||||
output_radix_lwe: *mut CudaRadixCiphertextFFI,
|
||||
input_radix_lwe: *const CudaRadixCiphertextFFI,
|
||||
@@ -357,13 +357,13 @@ unsafe extern "C" {
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cleanup_cuda_apply_univariate_lut_kb_64(
|
||||
pub fn cleanup_cuda_apply_univariate_lut_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr_void: *mut *mut i8,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_apply_bivariate_lut_kb_64(
|
||||
pub fn scratch_cuda_apply_bivariate_lut_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr: *mut *mut i8,
|
||||
input_lut: *const ffi::c_void,
|
||||
@@ -385,7 +385,7 @@ unsafe extern "C" {
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_apply_bivariate_lut_kb_64(
|
||||
pub fn cuda_apply_bivariate_lut_64(
|
||||
streams: CudaStreamsFFI,
|
||||
output_radix_lwe: *mut CudaRadixCiphertextFFI,
|
||||
input_radix_lwe_1: *const CudaRadixCiphertextFFI,
|
||||
@@ -398,13 +398,10 @@ unsafe extern "C" {
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cleanup_cuda_apply_bivariate_lut_kb_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr_void: *mut *mut i8,
|
||||
);
|
||||
pub fn cleanup_cuda_apply_bivariate_lut_64(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_apply_many_univariate_lut_kb_64(
|
||||
pub fn cuda_apply_many_univariate_lut_64(
|
||||
streams: CudaStreamsFFI,
|
||||
output_radix_lwe: *mut CudaRadixCiphertextFFI,
|
||||
input_radix_lwe: *const CudaRadixCiphertextFFI,
|
||||
@@ -448,7 +445,7 @@ unsafe extern "C" {
|
||||
pub fn cleanup_cuda_full_propagation(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
pub fn scratch_cuda_integer_mult_radix_ciphertext_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr: *mut *mut i8,
|
||||
is_boolean_left: bool,
|
||||
@@ -470,7 +467,7 @@ unsafe extern "C" {
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
pub fn cuda_integer_mult_radix_ciphertext_64(
|
||||
streams: CudaStreamsFFI,
|
||||
radix_lwe_out: *mut CudaRadixCiphertextFFI,
|
||||
radix_lwe_left: *const CudaRadixCiphertextFFI,
|
||||
@@ -488,7 +485,7 @@ unsafe extern "C" {
|
||||
pub fn cleanup_cuda_integer_mult(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_negate_integer_radix_ciphertext_64(
|
||||
pub fn cuda_negate_ciphertext_64(
|
||||
streams: CudaStreamsFFI,
|
||||
lwe_array_out: *mut CudaRadixCiphertextFFI,
|
||||
lwe_array_in: *const CudaRadixCiphertextFFI,
|
||||
@@ -498,7 +495,7 @@ unsafe extern "C" {
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
|
||||
pub fn cuda_scalar_addition_ciphertext_64_inplace(
|
||||
streams: CudaStreamsFFI,
|
||||
lwe_array: *mut CudaRadixCiphertextFFI,
|
||||
scalar_input: *const ffi::c_void,
|
||||
@@ -509,7 +506,7 @@ unsafe extern "C" {
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
|
||||
pub fn scratch_cuda_logical_scalar_shift_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr: *mut *mut i8,
|
||||
glwe_dimension: u32,
|
||||
@@ -531,7 +528,7 @@ unsafe extern "C" {
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_integer_radix_logical_scalar_shift_kb_64_inplace(
|
||||
pub fn cuda_logical_scalar_shift_64_inplace(
|
||||
streams: CudaStreamsFFI,
|
||||
lwe_array: *mut CudaRadixCiphertextFFI,
|
||||
shift: u32,
|
||||
@@ -541,7 +538,7 @@ unsafe extern "C" {
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
|
||||
pub fn scratch_cuda_arithmetic_scalar_shift_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr: *mut *mut i8,
|
||||
glwe_dimension: u32,
|
||||
@@ -563,7 +560,7 @@ unsafe extern "C" {
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace(
|
||||
pub fn cuda_arithmetic_scalar_shift_64_inplace(
|
||||
streams: CudaStreamsFFI,
|
||||
lwe_array: *mut CudaRadixCiphertextFFI,
|
||||
shift: u32,
|
||||
@@ -573,19 +570,16 @@ unsafe extern "C" {
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cleanup_cuda_integer_radix_logical_scalar_shift(
|
||||
pub fn cleanup_cuda_logical_scalar_shift(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cleanup_cuda_arithmetic_scalar_shift(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr_void: *mut *mut i8,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cleanup_cuda_integer_radix_arithmetic_scalar_shift(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr_void: *mut *mut i8,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_integer_radix_shift_and_rotate_kb_64(
|
||||
pub fn scratch_cuda_shift_and_rotate_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr: *mut *mut i8,
|
||||
glwe_dimension: u32,
|
||||
@@ -608,7 +602,7 @@ unsafe extern "C" {
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_integer_radix_shift_and_rotate_kb_64_inplace(
|
||||
pub fn cuda_shift_and_rotate_64_inplace(
|
||||
streams: CudaStreamsFFI,
|
||||
lwe_array: *mut CudaRadixCiphertextFFI,
|
||||
lwe_shift: *const CudaRadixCiphertextFFI,
|
||||
@@ -618,13 +612,10 @@ unsafe extern "C" {
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cleanup_cuda_integer_radix_shift_and_rotate(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr_void: *mut *mut i8,
|
||||
);
|
||||
pub fn cleanup_cuda_shift_and_rotate(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_integer_radix_comparison_kb_64(
|
||||
pub fn scratch_cuda_comparison_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr: *mut *mut i8,
|
||||
glwe_dimension: u32,
|
||||
@@ -647,7 +638,7 @@ unsafe extern "C" {
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_comparison_integer_radix_ciphertext_kb_64(
|
||||
pub fn cuda_comparison_ciphertext_64(
|
||||
streams: CudaStreamsFFI,
|
||||
lwe_array_out: *mut CudaRadixCiphertextFFI,
|
||||
lwe_array_1: *const CudaRadixCiphertextFFI,
|
||||
@@ -658,7 +649,7 @@ unsafe extern "C" {
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
|
||||
pub fn cuda_scalar_comparison_ciphertext_64(
|
||||
streams: CudaStreamsFFI,
|
||||
lwe_array_out: *mut CudaRadixCiphertextFFI,
|
||||
lwe_array_in: *const CudaRadixCiphertextFFI,
|
||||
@@ -674,7 +665,7 @@ unsafe extern "C" {
|
||||
pub fn cleanup_cuda_integer_comparison(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_integer_radix_bitop_kb_64(
|
||||
pub fn scratch_cuda_bitop_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr: *mut *mut i8,
|
||||
glwe_dimension: u32,
|
||||
@@ -696,7 +687,7 @@ unsafe extern "C" {
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_bitop_integer_radix_ciphertext_kb_64(
|
||||
pub fn cuda_bitop_ciphertext_64(
|
||||
streams: CudaStreamsFFI,
|
||||
lwe_array_out: *mut CudaRadixCiphertextFFI,
|
||||
lwe_array_1: *const CudaRadixCiphertextFFI,
|
||||
@@ -707,7 +698,7 @@ unsafe extern "C" {
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
|
||||
pub fn cuda_scalar_bitop_ciphertext_64(
|
||||
streams: CudaStreamsFFI,
|
||||
lwe_array_out: *mut CudaRadixCiphertextFFI,
|
||||
lwe_array_input: *const CudaRadixCiphertextFFI,
|
||||
@@ -723,7 +714,7 @@ unsafe extern "C" {
|
||||
pub fn cleanup_cuda_integer_bitop(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_integer_radix_cmux_kb_64(
|
||||
pub fn scratch_cuda_cmux_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr: *mut *mut i8,
|
||||
glwe_dimension: u32,
|
||||
@@ -744,7 +735,7 @@ unsafe extern "C" {
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_cmux_integer_radix_ciphertext_kb_64(
|
||||
pub fn cuda_cmux_ciphertext_64(
|
||||
streams: CudaStreamsFFI,
|
||||
lwe_array_out: *mut CudaRadixCiphertextFFI,
|
||||
lwe_condition: *const CudaRadixCiphertextFFI,
|
||||
@@ -756,10 +747,10 @@ unsafe extern "C" {
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cleanup_cuda_integer_radix_cmux(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
|
||||
pub fn cleanup_cuda_cmux(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_integer_radix_scalar_rotate_kb_64(
|
||||
pub fn scratch_cuda_scalar_rotate_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr: *mut *mut i8,
|
||||
glwe_dimension: u32,
|
||||
@@ -781,7 +772,7 @@ unsafe extern "C" {
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_integer_radix_scalar_rotate_kb_64_inplace(
|
||||
pub fn cuda_scalar_rotate_64_inplace(
|
||||
streams: CudaStreamsFFI,
|
||||
lwe_array: *mut CudaRadixCiphertextFFI,
|
||||
n: u32,
|
||||
@@ -791,13 +782,10 @@ unsafe extern "C" {
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cleanup_cuda_integer_radix_scalar_rotate(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr_void: *mut *mut i8,
|
||||
);
|
||||
pub fn cleanup_cuda_scalar_rotate(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_propagate_single_carry_kb_64_inplace(
|
||||
pub fn scratch_cuda_propagate_single_carry_64_inplace(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr: *mut *mut i8,
|
||||
glwe_dimension: u32,
|
||||
@@ -819,7 +807,7 @@ unsafe extern "C" {
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
|
||||
pub fn scratch_cuda_add_and_propagate_single_carry_64_inplace(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr: *mut *mut i8,
|
||||
glwe_dimension: u32,
|
||||
@@ -841,7 +829,7 @@ unsafe extern "C" {
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_propagate_single_carry_kb_64_inplace(
|
||||
pub fn cuda_propagate_single_carry_64_inplace(
|
||||
streams: CudaStreamsFFI,
|
||||
lwe_array: *mut CudaRadixCiphertextFFI,
|
||||
carry_out: *mut CudaRadixCiphertextFFI,
|
||||
@@ -854,7 +842,7 @@ unsafe extern "C" {
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_add_and_propagate_single_carry_kb_64_inplace(
|
||||
pub fn cuda_add_and_propagate_single_carry_64_inplace(
|
||||
streams: CudaStreamsFFI,
|
||||
lhs_array: *mut CudaRadixCiphertextFFI,
|
||||
rhs_array: *const CudaRadixCiphertextFFI,
|
||||
@@ -877,7 +865,7 @@ unsafe extern "C" {
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_integer_overflowing_sub_kb_64_inplace(
|
||||
pub fn scratch_cuda_integer_overflowing_sub_64_inplace(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr: *mut *mut i8,
|
||||
glwe_dimension: u32,
|
||||
@@ -899,7 +887,7 @@ unsafe extern "C" {
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_integer_overflowing_sub_kb_64_inplace(
|
||||
pub fn cuda_integer_overflowing_sub_64_inplace(
|
||||
streams: CudaStreamsFFI,
|
||||
lhs_array: *mut CudaRadixCiphertextFFI,
|
||||
rhs_array: *const CudaRadixCiphertextFFI,
|
||||
@@ -919,7 +907,7 @@ unsafe extern "C" {
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
pub fn scratch_cuda_partial_sum_ciphertexts_vec_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr: *mut *mut i8,
|
||||
glwe_dimension: u32,
|
||||
@@ -941,7 +929,7 @@ unsafe extern "C" {
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
pub fn cuda_partial_sum_ciphertexts_vec_64(
|
||||
streams: CudaStreamsFFI,
|
||||
radix_lwe_out: *mut CudaRadixCiphertextFFI,
|
||||
radix_lwe_vec: *mut CudaRadixCiphertextFFI,
|
||||
@@ -951,13 +939,13 @@ unsafe extern "C" {
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
|
||||
pub fn cleanup_cuda_partial_sum_ciphertexts_vec(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr_void: *mut *mut i8,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_integer_scalar_mul_kb_64(
|
||||
pub fn scratch_cuda_integer_scalar_mul_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr: *mut *mut i8,
|
||||
glwe_dimension: u32,
|
||||
@@ -978,7 +966,7 @@ unsafe extern "C" {
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
|
||||
pub fn cuda_scalar_multiplication_ciphertext_64_inplace(
|
||||
streams: CudaStreamsFFI,
|
||||
lwe_array: *mut CudaRadixCiphertextFFI,
|
||||
decomposed_scalar: *const u64,
|
||||
@@ -992,13 +980,10 @@ unsafe extern "C" {
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cleanup_cuda_integer_radix_scalar_mul(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr_void: *mut *mut i8,
|
||||
);
|
||||
pub fn cleanup_cuda_scalar_mul(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
|
||||
pub fn scratch_cuda_integer_div_rem_radix_ciphertext_64(
|
||||
streams: CudaStreamsFFI,
|
||||
is_signed: bool,
|
||||
mem_ptr: *mut *mut i8,
|
||||
@@ -1020,7 +1005,7 @@ unsafe extern "C" {
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_integer_div_rem_radix_ciphertext_kb_64(
|
||||
pub fn cuda_integer_div_rem_radix_ciphertext_64(
|
||||
streams: CudaStreamsFFI,
|
||||
quotient: *mut CudaRadixCiphertextFFI,
|
||||
remainder: *mut CudaRadixCiphertextFFI,
|
||||
@@ -1081,7 +1066,7 @@ unsafe extern "C" {
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
|
||||
pub fn scratch_cuda_integer_abs_inplace_radix_ciphertext_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr: *mut *mut i8,
|
||||
is_signed: bool,
|
||||
@@ -1103,7 +1088,7 @@ unsafe extern "C" {
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_integer_abs_inplace_radix_ciphertext_kb_64(
|
||||
pub fn cuda_integer_abs_inplace_radix_ciphertext_64(
|
||||
streams: CudaStreamsFFI,
|
||||
ct: *mut CudaRadixCiphertextFFI,
|
||||
mem_ptr: *mut i8,
|
||||
@@ -1116,7 +1101,7 @@ unsafe extern "C" {
|
||||
pub fn cleanup_cuda_integer_abs_inplace(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_integer_are_all_comparisons_block_true_kb_64(
|
||||
pub fn scratch_cuda_integer_are_all_comparisons_block_true_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr: *mut *mut i8,
|
||||
glwe_dimension: u32,
|
||||
@@ -1137,7 +1122,7 @@ unsafe extern "C" {
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_integer_are_all_comparisons_block_true_kb_64(
|
||||
pub fn cuda_integer_are_all_comparisons_block_true_64(
|
||||
streams: CudaStreamsFFI,
|
||||
lwe_array_out: *mut CudaRadixCiphertextFFI,
|
||||
lwe_array_in: *const CudaRadixCiphertextFFI,
|
||||
@@ -1154,7 +1139,7 @@ unsafe extern "C" {
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
|
||||
pub fn scratch_cuda_integer_is_at_least_one_comparisons_block_true_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr: *mut *mut i8,
|
||||
glwe_dimension: u32,
|
||||
@@ -1175,7 +1160,7 @@ unsafe extern "C" {
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
|
||||
pub fn cuda_integer_is_at_least_one_comparisons_block_true_64(
|
||||
streams: CudaStreamsFFI,
|
||||
lwe_array_out: *mut CudaRadixCiphertextFFI,
|
||||
lwe_array_in: *const CudaRadixCiphertextFFI,
|
||||
@@ -1206,7 +1191,7 @@ unsafe extern "C" {
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_apply_noise_squashing_kb(
|
||||
pub fn scratch_cuda_apply_noise_squashing(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr: *mut *mut i8,
|
||||
lwe_dimension: u32,
|
||||
@@ -1229,7 +1214,7 @@ unsafe extern "C" {
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_apply_noise_squashing_kb(
|
||||
pub fn cuda_apply_noise_squashing(
|
||||
streams: CudaStreamsFFI,
|
||||
output_radix_lwe: *mut CudaRadixCiphertextFFI,
|
||||
input_radix_lwe: *const CudaRadixCiphertextFFI,
|
||||
@@ -1239,13 +1224,10 @@ unsafe extern "C" {
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cleanup_cuda_apply_noise_squashing_kb(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr_void: *mut *mut i8,
|
||||
);
|
||||
pub fn cleanup_cuda_apply_noise_squashing(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_sub_and_propagate_single_carry_kb_64_inplace(
|
||||
pub fn scratch_cuda_sub_and_propagate_single_carry_64_inplace(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr: *mut *mut i8,
|
||||
glwe_dimension: u32,
|
||||
@@ -1267,7 +1249,7 @@ unsafe extern "C" {
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_sub_and_propagate_single_carry_kb_64_inplace(
|
||||
pub fn cuda_sub_and_propagate_single_carry_64_inplace(
|
||||
streams: CudaStreamsFFI,
|
||||
lhs_array: *mut CudaRadixCiphertextFFI,
|
||||
rhs_array: *const CudaRadixCiphertextFFI,
|
||||
@@ -1287,7 +1269,7 @@ unsafe extern "C" {
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_integer_unsigned_scalar_div_radix_kb_64(
|
||||
pub fn scratch_cuda_integer_unsigned_scalar_div_radix_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr: *mut *mut i8,
|
||||
glwe_dimension: u32,
|
||||
@@ -1308,7 +1290,7 @@ unsafe extern "C" {
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_integer_unsigned_scalar_div_radix_kb_64(
|
||||
pub fn cuda_integer_unsigned_scalar_div_radix_64(
|
||||
streams: CudaStreamsFFI,
|
||||
numerator_ct: *mut CudaRadixCiphertextFFI,
|
||||
mem_ptr: *mut i8,
|
||||
@@ -1318,7 +1300,7 @@ unsafe extern "C" {
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cleanup_cuda_integer_unsigned_scalar_div_radix_kb_64(
|
||||
pub fn cleanup_cuda_integer_unsigned_scalar_div_radix_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr_void: *mut *mut i8,
|
||||
);
|
||||
@@ -1362,7 +1344,7 @@ unsafe extern "C" {
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_integer_signed_scalar_div_radix_kb_64(
|
||||
pub fn scratch_cuda_integer_signed_scalar_div_radix_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr: *mut *mut i8,
|
||||
glwe_dimension: u32,
|
||||
@@ -1383,7 +1365,7 @@ unsafe extern "C" {
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_integer_signed_scalar_div_radix_kb_64(
|
||||
pub fn cuda_integer_signed_scalar_div_radix_64(
|
||||
streams: CudaStreamsFFI,
|
||||
numerator_ct: *mut CudaRadixCiphertextFFI,
|
||||
mem_ptr: *mut i8,
|
||||
@@ -1394,13 +1376,13 @@ unsafe extern "C" {
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cleanup_cuda_integer_signed_scalar_div_radix_kb_64(
|
||||
pub fn cleanup_cuda_integer_signed_scalar_div_radix_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr_void: *mut *mut i8,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_integer_unsigned_scalar_div_rem_radix_kb_64(
|
||||
pub fn scratch_integer_unsigned_scalar_div_rem_radix_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr: *mut *mut i8,
|
||||
glwe_dimension: u32,
|
||||
@@ -1422,7 +1404,7 @@ unsafe extern "C" {
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
|
||||
pub fn cuda_integer_unsigned_scalar_div_rem_radix_64(
|
||||
streams: CudaStreamsFFI,
|
||||
quotient_ct: *mut CudaRadixCiphertextFFI,
|
||||
remainder_ct: *mut CudaRadixCiphertextFFI,
|
||||
@@ -1439,13 +1421,13 @@ unsafe extern "C" {
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cleanup_cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
|
||||
pub fn cleanup_cuda_integer_unsigned_scalar_div_rem_radix_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr_void: *mut *mut i8,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_integer_signed_scalar_div_rem_radix_kb_64(
|
||||
pub fn scratch_integer_signed_scalar_div_rem_radix_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr: *mut *mut i8,
|
||||
glwe_dimension: u32,
|
||||
@@ -1467,7 +1449,7 @@ unsafe extern "C" {
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_integer_signed_scalar_div_rem_radix_kb_64(
|
||||
pub fn cuda_integer_signed_scalar_div_rem_radix_64(
|
||||
streams: CudaStreamsFFI,
|
||||
quotient_ct: *mut CudaRadixCiphertextFFI,
|
||||
remainder_ct: *mut CudaRadixCiphertextFFI,
|
||||
@@ -1482,13 +1464,13 @@ unsafe extern "C" {
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cleanup_cuda_integer_signed_scalar_div_rem_radix_kb_64(
|
||||
pub fn cleanup_cuda_integer_signed_scalar_div_rem_radix_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr_void: *mut *mut i8,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_integer_count_of_consecutive_bits_kb_64(
|
||||
pub fn scratch_integer_count_of_consecutive_bits_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr: *mut *mut i8,
|
||||
glwe_dimension: u32,
|
||||
@@ -1511,7 +1493,7 @@ unsafe extern "C" {
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_integer_count_of_consecutive_bits_kb_64(
|
||||
pub fn cuda_integer_count_of_consecutive_bits_64(
|
||||
streams: CudaStreamsFFI,
|
||||
output_ct: *mut CudaRadixCiphertextFFI,
|
||||
input_ct: *const CudaRadixCiphertextFFI,
|
||||
@@ -1521,7 +1503,7 @@ unsafe extern "C" {
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cleanup_cuda_integer_count_of_consecutive_bits_kb_64(
|
||||
pub fn cleanup_cuda_integer_count_of_consecutive_bits_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr_void: *mut *mut i8,
|
||||
);
|
||||
@@ -1549,7 +1531,7 @@ unsafe extern "C" {
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_integer_grouped_oprf_async_64(
|
||||
pub fn cuda_integer_grouped_oprf_64(
|
||||
streams: CudaStreamsFFI,
|
||||
radix_lwe_out: *mut CudaRadixCiphertextFFI,
|
||||
seeded_lwe_input: *const ffi::c_void,
|
||||
@@ -1565,7 +1547,7 @@ unsafe extern "C" {
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_integer_ilog2_kb_64(
|
||||
pub fn scratch_integer_ilog2_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr: *mut *mut i8,
|
||||
glwe_dimension: u32,
|
||||
@@ -1587,7 +1569,7 @@ unsafe extern "C" {
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_integer_ilog2_kb_64(
|
||||
pub fn cuda_integer_ilog2_64(
|
||||
streams: CudaStreamsFFI,
|
||||
output_ct: *mut CudaRadixCiphertextFFI,
|
||||
input_ct: *const CudaRadixCiphertextFFI,
|
||||
@@ -1600,7 +1582,7 @@ unsafe extern "C" {
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cleanup_cuda_integer_ilog2_kb_64(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
|
||||
pub fn cleanup_cuda_integer_ilog2_64(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_integer_compress_radix_ciphertext_64(
|
||||
|
||||
@@ -11,7 +11,7 @@ use crate::integer::gpu::ciphertext::info::{CudaBlockInfo, CudaRadixCiphertextIn
|
||||
use crate::integer::gpu::ciphertext::{CudaRadixCiphertext, CudaVec, KsType, LweDimension};
|
||||
use crate::integer::gpu::key_switching_key::CudaKeySwitchingKey;
|
||||
use crate::integer::gpu::server_key::CudaBootstrappingKey;
|
||||
use crate::integer::gpu::{expand_async, PBSType};
|
||||
use crate::integer::gpu::{cuda_backend_expand, PBSType};
|
||||
use crate::shortint::ciphertext::CompactCiphertextList;
|
||||
use crate::shortint::parameters::{
|
||||
CompactCiphertextListExpansionKind, Degree, LweBskGroupingFactor, NoiseLevel,
|
||||
@@ -409,7 +409,7 @@ impl CudaFlattenedVecCompactCiphertextList {
|
||||
|
||||
match &sks.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
expand_async(
|
||||
cuda_backend_expand(
|
||||
streams,
|
||||
&mut d_output,
|
||||
d_input,
|
||||
@@ -444,7 +444,7 @@ impl CudaFlattenedVecCompactCiphertextList {
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
expand_async(
|
||||
cuda_backend_expand(
|
||||
streams,
|
||||
&mut d_output,
|
||||
d_input,
|
||||
|
||||
@@ -12,7 +12,7 @@ use crate::integer::gpu::ciphertext::squashed_noise::{
|
||||
CudaSquashedNoiseBooleanBlock, CudaSquashedNoiseRadixCiphertext,
|
||||
CudaSquashedNoiseSignedRadixCiphertext,
|
||||
};
|
||||
use crate::integer::gpu::decompress_integer_radix_async_128;
|
||||
use crate::integer::gpu::cuda_backend_decompress_128;
|
||||
use crate::integer::gpu::list_compression::server_keys::{
|
||||
CudaNoiseSquashingCompressionKey, CudaPackedGlweCiphertextList,
|
||||
};
|
||||
@@ -345,7 +345,7 @@ impl CudaCompressedSquashedNoiseCiphertextList {
|
||||
);
|
||||
|
||||
unsafe {
|
||||
decompress_integer_radix_async_128(
|
||||
cuda_backend_decompress_128(
|
||||
streams,
|
||||
&mut output_lwe,
|
||||
&self.packed_list,
|
||||
|
||||
@@ -15,8 +15,8 @@ use crate::integer::gpu::ciphertext::squashed_noise::CudaSquashedNoiseRadixCiphe
|
||||
use crate::integer::gpu::ciphertext::CudaRadixCiphertext;
|
||||
use crate::integer::gpu::server_key::CudaBootstrappingKey;
|
||||
use crate::integer::gpu::{
|
||||
compress_integer_radix_async, cuda_memcpy_async_gpu_to_gpu, decompress_integer_radix_async_64,
|
||||
get_compression_size_on_gpu, get_decompression_size_on_gpu,
|
||||
cuda_backend_compress, cuda_backend_decompress, cuda_backend_get_compression_size_on_gpu,
|
||||
cuda_backend_get_decompression_size_on_gpu, cuda_memcpy_async_gpu_to_gpu,
|
||||
};
|
||||
use crate::prelude::CastInto;
|
||||
use crate::shortint::ciphertext::{
|
||||
@@ -322,7 +322,7 @@ impl CudaCompressionKey {
|
||||
unsafe {
|
||||
let input_lwes = Self::flatten_async(ciphertexts, streams);
|
||||
|
||||
compress_integer_radix_async(
|
||||
cuda_backend_compress(
|
||||
streams,
|
||||
&mut glwe_array_out,
|
||||
&input_lwes,
|
||||
@@ -355,7 +355,7 @@ impl CudaCompressionKey {
|
||||
let compressed_polynomial_size = lwe_pksk.output_polynomial_size();
|
||||
let compressed_glwe_size = lwe_pksk.output_glwe_size();
|
||||
|
||||
get_compression_size_on_gpu(
|
||||
cuda_backend_get_compression_size_on_gpu(
|
||||
streams,
|
||||
message_modulus,
|
||||
carry_modulus,
|
||||
@@ -430,7 +430,7 @@ impl CudaDecompressionKey {
|
||||
);
|
||||
|
||||
unsafe {
|
||||
decompress_integer_radix_async_64(
|
||||
cuda_backend_decompress(
|
||||
streams,
|
||||
&mut output_lwe,
|
||||
packed_list,
|
||||
@@ -515,7 +515,7 @@ impl CudaDecompressionKey {
|
||||
);
|
||||
let lwe_dimension = bsk.output_lwe_dimension();
|
||||
|
||||
get_decompression_size_on_gpu(
|
||||
cuda_backend_get_decompression_size_on_gpu(
|
||||
streams,
|
||||
message_modulus,
|
||||
carry_modulus,
|
||||
@@ -570,7 +570,7 @@ impl CudaDecompressionKey {
|
||||
);
|
||||
let lwe_dimension = bsk.output_lwe_dimension();
|
||||
|
||||
get_decompression_size_on_gpu(
|
||||
cuda_backend_get_decompression_size_on_gpu(
|
||||
streams,
|
||||
message_modulus,
|
||||
carry_modulus,
|
||||
@@ -712,7 +712,7 @@ impl CudaNoiseSquashingCompressionKey {
|
||||
unsafe {
|
||||
let input_lwes = Self::flatten_async(ciphertexts, streams);
|
||||
|
||||
compress_integer_radix_async(
|
||||
cuda_backend_compress(
|
||||
streams,
|
||||
&mut glwe_array_out,
|
||||
&input_lwes,
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -2,7 +2,7 @@ use crate::core_crypto::gpu::CudaStreams;
|
||||
use crate::core_crypto::prelude::LweBskGroupingFactor;
|
||||
use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext;
|
||||
use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey};
|
||||
use crate::integer::gpu::{unchecked_signed_abs_radix_kb_assign_async, PBSType};
|
||||
use crate::integer::gpu::{cuda_backend_unchecked_signed_abs_assign, PBSType};
|
||||
|
||||
impl CudaServerKey {
|
||||
/// # Safety
|
||||
@@ -18,7 +18,7 @@ impl CudaServerKey {
|
||||
unsafe {
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
unchecked_signed_abs_radix_kb_assign_async(
|
||||
cuda_backend_unchecked_signed_abs_assign(
|
||||
streams,
|
||||
ct.as_mut(),
|
||||
&d_bsk.d_vec,
|
||||
@@ -44,7 +44,7 @@ impl CudaServerKey {
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
unchecked_signed_abs_radix_kb_assign_async(
|
||||
cuda_backend_unchecked_signed_abs_assign(
|
||||
streams,
|
||||
ct.as_mut(),
|
||||
&d_multibit_bsk.d_vec,
|
||||
|
||||
@@ -7,10 +7,10 @@ use crate::integer::gpu::ciphertext::{
|
||||
};
|
||||
use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey};
|
||||
use crate::integer::gpu::{
|
||||
add_and_propagate_single_carry_assign_async,
|
||||
get_add_and_propagate_single_carry_assign_async_size_on_gpu,
|
||||
get_full_propagate_assign_size_on_gpu, unchecked_add_integer_radix_assign_async,
|
||||
unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async, PBSType,
|
||||
cuda_backend_add_and_propagate_single_carry_assign,
|
||||
cuda_backend_get_add_and_propagate_single_carry_assign_size_on_gpu,
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu, cuda_backend_unchecked_add_assign,
|
||||
cuda_backend_unchecked_partial_sum_ciphertexts_assign, PBSType,
|
||||
};
|
||||
use crate::integer::server_key::radix_parallel::OutputFlag;
|
||||
use crate::shortint::ciphertext::NoiseLevel;
|
||||
@@ -153,23 +153,25 @@ impl CudaServerKey {
|
||||
ct_right.as_ref().d_blocks.lwe_ciphertext_count().0
|
||||
);
|
||||
let full_prop_mem = match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_bsk.input_lwe_dimension(),
|
||||
d_bsk.glwe_dimension(),
|
||||
d_bsk.polynomial_size(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count(),
|
||||
d_bsk.decomp_base_log(),
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
),
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_bsk.input_lwe_dimension(),
|
||||
d_bsk.glwe_dimension(),
|
||||
d_bsk.polynomial_size(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count(),
|
||||
d_bsk.decomp_base_log(),
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
)
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
get_full_propagate_assign_size_on_gpu(
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_multibit_bsk.input_lwe_dimension(),
|
||||
d_multibit_bsk.glwe_dimension(),
|
||||
@@ -199,7 +201,7 @@ impl CudaServerKey {
|
||||
let num_blocks = ct_left.as_ref().d_blocks.lwe_ciphertext_count().0 as u32;
|
||||
let add_assign_mem = match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
get_add_and_propagate_single_carry_assign_async_size_on_gpu(
|
||||
cuda_backend_get_add_and_propagate_single_carry_assign_size_on_gpu(
|
||||
streams,
|
||||
d_bsk.input_lwe_dimension(),
|
||||
d_bsk.glwe_dimension(),
|
||||
@@ -218,7 +220,7 @@ impl CudaServerKey {
|
||||
)
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
get_add_and_propagate_single_carry_assign_async_size_on_gpu(
|
||||
cuda_backend_get_add_and_propagate_single_carry_assign_size_on_gpu(
|
||||
streams,
|
||||
d_multibit_bsk.input_lwe_dimension(),
|
||||
d_multibit_bsk.glwe_dimension(),
|
||||
@@ -313,7 +315,7 @@ impl CudaServerKey {
|
||||
);
|
||||
|
||||
unsafe {
|
||||
unchecked_add_integer_radix_assign_async(streams, ciphertext_left, ciphertext_right);
|
||||
cuda_backend_unchecked_add_assign(streams, ciphertext_left, ciphertext_right);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -373,7 +375,7 @@ impl CudaServerKey {
|
||||
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async(
|
||||
cuda_backend_unchecked_partial_sum_ciphertexts_assign(
|
||||
streams,
|
||||
result.as_mut(),
|
||||
&mut terms,
|
||||
@@ -399,7 +401,7 @@ impl CudaServerKey {
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async(
|
||||
cuda_backend_unchecked_partial_sum_ciphertexts_assign(
|
||||
streams,
|
||||
result.as_mut(),
|
||||
&mut terms,
|
||||
@@ -833,7 +835,7 @@ impl CudaServerKey {
|
||||
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
add_and_propagate_single_carry_assign_async(
|
||||
cuda_backend_add_and_propagate_single_carry_assign(
|
||||
streams,
|
||||
lhs.as_mut(),
|
||||
rhs.as_ref(),
|
||||
@@ -859,7 +861,7 @@ impl CudaServerKey {
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
add_and_propagate_single_carry_assign_async(
|
||||
cuda_backend_add_and_propagate_single_carry_assign(
|
||||
streams,
|
||||
lhs.as_mut(),
|
||||
rhs.as_ref(),
|
||||
|
||||
@@ -6,9 +6,9 @@ use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey};
|
||||
|
||||
use crate::core_crypto::prelude::LweBskGroupingFactor;
|
||||
use crate::integer::gpu::{
|
||||
get_aes_ctr_encrypt_integer_radix_size_on_gpu, get_key_expansion_integer_radix_size_on_gpu,
|
||||
unchecked_aes_ctr_encrypt_integer_radix_kb_assign_async,
|
||||
unchecked_key_expansion_integer_radix_kb_assign_async, PBSType,
|
||||
cuda_backend_aes_key_expansion, cuda_backend_get_aes_ctr_encrypt_size_on_gpu,
|
||||
cuda_backend_get_aes_key_expansion_size_on_gpu, cuda_backend_unchecked_aes_ctr_encrypt,
|
||||
PBSType,
|
||||
};
|
||||
use crate::integer::{RadixCiphertext, RadixClientKey};
|
||||
use crate::shortint::Ciphertext;
|
||||
@@ -231,7 +231,7 @@ impl CudaServerKey {
|
||||
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
unchecked_aes_ctr_encrypt_integer_radix_kb_assign_async(
|
||||
cuda_backend_unchecked_aes_ctr_encrypt(
|
||||
streams,
|
||||
result.as_mut(),
|
||||
iv.as_ref(),
|
||||
@@ -256,7 +256,7 @@ impl CudaServerKey {
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
unchecked_aes_ctr_encrypt_integer_radix_kb_assign_async(
|
||||
cuda_backend_unchecked_aes_ctr_encrypt(
|
||||
streams,
|
||||
result.as_mut(),
|
||||
iv.as_ref(),
|
||||
@@ -308,7 +308,7 @@ impl CudaServerKey {
|
||||
streams: &CudaStreams,
|
||||
) -> u64 {
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => get_aes_ctr_encrypt_integer_radix_size_on_gpu(
|
||||
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_aes_ctr_encrypt_size_on_gpu(
|
||||
streams,
|
||||
num_aes_inputs as u32,
|
||||
sbox_parallelism as u32,
|
||||
@@ -326,7 +326,7 @@ impl CudaServerKey {
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
),
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
get_aes_ctr_encrypt_integer_radix_size_on_gpu(
|
||||
cuda_backend_get_aes_ctr_encrypt_size_on_gpu(
|
||||
streams,
|
||||
num_aes_inputs as u32,
|
||||
sbox_parallelism as u32,
|
||||
@@ -371,7 +371,7 @@ impl CudaServerKey {
|
||||
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
unchecked_key_expansion_integer_radix_kb_assign_async(
|
||||
cuda_backend_aes_key_expansion(
|
||||
streams,
|
||||
expanded_keys.as_mut(),
|
||||
key.as_ref(),
|
||||
@@ -392,7 +392,7 @@ impl CudaServerKey {
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
unchecked_key_expansion_integer_radix_kb_assign_async(
|
||||
cuda_backend_aes_key_expansion(
|
||||
streams,
|
||||
expanded_keys.as_mut(),
|
||||
key.as_ref(),
|
||||
@@ -428,7 +428,7 @@ impl CudaServerKey {
|
||||
/// synchronization is required
|
||||
unsafe fn get_key_expansion_size_on_gpu_async(&self, streams: &CudaStreams) -> u64 {
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => get_key_expansion_integer_radix_size_on_gpu(
|
||||
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_aes_key_expansion_size_on_gpu(
|
||||
streams,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
@@ -444,7 +444,7 @@ impl CudaServerKey {
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
),
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
get_key_expansion_integer_radix_size_on_gpu(
|
||||
cuda_backend_get_aes_key_expansion_size_on_gpu(
|
||||
streams,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
|
||||
@@ -9,8 +9,8 @@ use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
|
||||
use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext;
|
||||
use crate::integer::gpu::server_key::CudaBootstrappingKey;
|
||||
use crate::integer::gpu::{
|
||||
get_bitop_integer_radix_kb_size_on_gpu, get_full_propagate_assign_size_on_gpu,
|
||||
unchecked_bitop_integer_radix_kb_assign_async, BitOpType, CudaServerKey, PBSType,
|
||||
cuda_backend_get_bitop_size_on_gpu, cuda_backend_get_full_propagate_assign_size_on_gpu,
|
||||
cuda_backend_unchecked_bitop_assign, BitOpType, CudaServerKey, PBSType,
|
||||
};
|
||||
|
||||
impl CudaServerKey {
|
||||
@@ -209,7 +209,7 @@ impl CudaServerKey {
|
||||
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
unchecked_bitop_integer_radix_kb_assign_async(
|
||||
cuda_backend_unchecked_bitop_assign(
|
||||
streams,
|
||||
ct_left.as_mut(),
|
||||
ct_right.as_ref(),
|
||||
@@ -237,7 +237,7 @@ impl CudaServerKey {
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
unchecked_bitop_integer_radix_kb_assign_async(
|
||||
cuda_backend_unchecked_bitop_assign(
|
||||
streams,
|
||||
ct_left.as_mut(),
|
||||
ct_right.as_ref(),
|
||||
@@ -283,23 +283,25 @@ impl CudaServerKey {
|
||||
ct_right.as_ref().d_blocks.lwe_ciphertext_count()
|
||||
);
|
||||
let full_prop_mem = match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_bsk.input_lwe_dimension(),
|
||||
d_bsk.glwe_dimension(),
|
||||
d_bsk.polynomial_size(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count(),
|
||||
d_bsk.decomp_base_log(),
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
),
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_bsk.input_lwe_dimension(),
|
||||
d_bsk.glwe_dimension(),
|
||||
d_bsk.polynomial_size(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count(),
|
||||
d_bsk.decomp_base_log(),
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
)
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
get_full_propagate_assign_size_on_gpu(
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_multibit_bsk.input_lwe_dimension(),
|
||||
d_multibit_bsk.glwe_dimension(),
|
||||
@@ -329,7 +331,7 @@ impl CudaServerKey {
|
||||
let lwe_ciphertext_count = ct_left.as_ref().d_blocks.lwe_ciphertext_count();
|
||||
|
||||
let bitop_mem = match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => get_bitop_integer_radix_kb_size_on_gpu(
|
||||
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_bitop_size_on_gpu(
|
||||
streams,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
@@ -351,30 +353,28 @@ impl CudaServerKey {
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
),
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
get_bitop_integer_radix_kb_size_on_gpu(
|
||||
streams,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
d_multibit_bsk.glwe_dimension,
|
||||
d_multibit_bsk.polynomial_size,
|
||||
self.key_switching_key
|
||||
.input_key_lwe_size()
|
||||
.to_lwe_dimension(),
|
||||
self.key_switching_key
|
||||
.output_key_lwe_size()
|
||||
.to_lwe_dimension(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_multibit_bsk.decomp_level_count,
|
||||
d_multibit_bsk.decomp_base_log,
|
||||
op,
|
||||
lwe_ciphertext_count.0 as u32,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
None,
|
||||
)
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => cuda_backend_get_bitop_size_on_gpu(
|
||||
streams,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
d_multibit_bsk.glwe_dimension,
|
||||
d_multibit_bsk.polynomial_size,
|
||||
self.key_switching_key
|
||||
.input_key_lwe_size()
|
||||
.to_lwe_dimension(),
|
||||
self.key_switching_key
|
||||
.output_key_lwe_size()
|
||||
.to_lwe_dimension(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_multibit_bsk.decomp_level_count,
|
||||
d_multibit_bsk.decomp_base_log,
|
||||
op,
|
||||
lwe_ciphertext_count.0 as u32,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
None,
|
||||
),
|
||||
};
|
||||
actual_full_prop_mem.max(bitop_mem)
|
||||
}
|
||||
@@ -938,23 +938,25 @@ impl CudaServerKey {
|
||||
0
|
||||
} else {
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_bsk.input_lwe_dimension(),
|
||||
d_bsk.glwe_dimension(),
|
||||
d_bsk.polynomial_size(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count(),
|
||||
d_bsk.decomp_base_log(),
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
),
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_bsk.input_lwe_dimension(),
|
||||
d_bsk.glwe_dimension(),
|
||||
d_bsk.polynomial_size(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count(),
|
||||
d_bsk.decomp_base_log(),
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
)
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
get_full_propagate_assign_size_on_gpu(
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_multibit_bsk.input_lwe_dimension(),
|
||||
d_multibit_bsk.glwe_dimension(),
|
||||
|
||||
@@ -4,8 +4,8 @@ use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
|
||||
use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext;
|
||||
use crate::integer::gpu::server_key::CudaBootstrappingKey;
|
||||
use crate::integer::gpu::{
|
||||
get_cmux_integer_radix_kb_size_on_gpu, get_full_propagate_assign_size_on_gpu,
|
||||
unchecked_cmux_integer_radix_kb_async, CudaServerKey, PBSType,
|
||||
cuda_backend_get_cmux_size_on_gpu, cuda_backend_get_full_propagate_assign_size_on_gpu,
|
||||
cuda_backend_unchecked_cmux, CudaServerKey, PBSType,
|
||||
};
|
||||
|
||||
impl CudaServerKey {
|
||||
@@ -27,7 +27,7 @@ impl CudaServerKey {
|
||||
unsafe {
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
unchecked_cmux_integer_radix_kb_async(
|
||||
cuda_backend_unchecked_cmux(
|
||||
stream,
|
||||
result.as_mut(),
|
||||
condition,
|
||||
@@ -56,7 +56,7 @@ impl CudaServerKey {
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
unchecked_cmux_integer_radix_kb_async(
|
||||
cuda_backend_unchecked_cmux(
|
||||
stream,
|
||||
result.as_mut(),
|
||||
condition,
|
||||
@@ -150,23 +150,25 @@ impl CudaServerKey {
|
||||
false_ct.as_ref().d_blocks.lwe_ciphertext_count()
|
||||
);
|
||||
let full_prop_mem = match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_bsk.input_lwe_dimension(),
|
||||
d_bsk.glwe_dimension(),
|
||||
d_bsk.polynomial_size(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count(),
|
||||
d_bsk.decomp_base_log(),
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
),
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_bsk.input_lwe_dimension(),
|
||||
d_bsk.glwe_dimension(),
|
||||
d_bsk.polynomial_size(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count(),
|
||||
d_bsk.decomp_base_log(),
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
)
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
get_full_propagate_assign_size_on_gpu(
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_multibit_bsk.input_lwe_dimension(),
|
||||
d_multibit_bsk.glwe_dimension(),
|
||||
@@ -196,7 +198,7 @@ impl CudaServerKey {
|
||||
let lwe_ciphertext_count = true_ct.as_ref().d_blocks.lwe_ciphertext_count();
|
||||
|
||||
let cmux_mem = match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => get_cmux_integer_radix_kb_size_on_gpu(
|
||||
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_cmux_size_on_gpu(
|
||||
streams,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
@@ -217,29 +219,27 @@ impl CudaServerKey {
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
),
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
get_cmux_integer_radix_kb_size_on_gpu(
|
||||
streams,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
d_multibit_bsk.glwe_dimension,
|
||||
d_multibit_bsk.polynomial_size,
|
||||
self.key_switching_key
|
||||
.input_key_lwe_size()
|
||||
.to_lwe_dimension(),
|
||||
self.key_switching_key
|
||||
.output_key_lwe_size()
|
||||
.to_lwe_dimension(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_multibit_bsk.decomp_level_count,
|
||||
d_multibit_bsk.decomp_base_log,
|
||||
lwe_ciphertext_count.0 as u32,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
None,
|
||||
)
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => cuda_backend_get_cmux_size_on_gpu(
|
||||
streams,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
d_multibit_bsk.glwe_dimension,
|
||||
d_multibit_bsk.polynomial_size,
|
||||
self.key_switching_key
|
||||
.input_key_lwe_size()
|
||||
.to_lwe_dimension(),
|
||||
self.key_switching_key
|
||||
.output_key_lwe_size()
|
||||
.to_lwe_dimension(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_multibit_bsk.decomp_level_count,
|
||||
d_multibit_bsk.decomp_base_log,
|
||||
lwe_ciphertext_count.0 as u32,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
None,
|
||||
),
|
||||
};
|
||||
actual_full_prop_mem.max(cmux_mem)
|
||||
}
|
||||
|
||||
@@ -6,8 +6,8 @@ use crate::integer::gpu::ciphertext::info::CudaRadixCiphertextInfo;
|
||||
use crate::integer::gpu::ciphertext::{CudaIntegerRadixCiphertext, CudaRadixCiphertext};
|
||||
use crate::integer::gpu::server_key::CudaBootstrappingKey;
|
||||
use crate::integer::gpu::{
|
||||
get_comparison_integer_radix_kb_size_on_gpu, get_full_propagate_assign_size_on_gpu,
|
||||
unchecked_comparison_integer_radix_kb_async, ComparisonType, CudaServerKey, PBSType,
|
||||
cuda_backend_get_comparison_size_on_gpu, cuda_backend_get_full_propagate_assign_size_on_gpu,
|
||||
cuda_backend_unchecked_comparison, ComparisonType, CudaServerKey, PBSType,
|
||||
};
|
||||
use crate::shortint::ciphertext::Degree;
|
||||
|
||||
@@ -51,7 +51,7 @@ impl CudaServerKey {
|
||||
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
unchecked_comparison_integer_radix_kb_async(
|
||||
cuda_backend_unchecked_comparison(
|
||||
streams,
|
||||
result.as_mut().as_mut(),
|
||||
ct_left.as_ref(),
|
||||
@@ -80,7 +80,7 @@ impl CudaServerKey {
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
unchecked_comparison_integer_radix_kb_async(
|
||||
cuda_backend_unchecked_comparison(
|
||||
streams,
|
||||
result.as_mut().as_mut(),
|
||||
ct_left.as_ref(),
|
||||
@@ -365,23 +365,25 @@ impl CudaServerKey {
|
||||
ct_right.as_ref().d_blocks.lwe_ciphertext_count()
|
||||
);
|
||||
let full_prop_mem = match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_bsk.input_lwe_dimension(),
|
||||
d_bsk.glwe_dimension(),
|
||||
d_bsk.polynomial_size(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count(),
|
||||
d_bsk.decomp_base_log(),
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
),
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_bsk.input_lwe_dimension(),
|
||||
d_bsk.glwe_dimension(),
|
||||
d_bsk.polynomial_size(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count(),
|
||||
d_bsk.decomp_base_log(),
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
)
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
get_full_propagate_assign_size_on_gpu(
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_multibit_bsk.input_lwe_dimension(),
|
||||
d_multibit_bsk.glwe_dimension(),
|
||||
@@ -411,7 +413,7 @@ impl CudaServerKey {
|
||||
let lwe_ciphertext_count = ct_left.as_ref().d_blocks.lwe_ciphertext_count();
|
||||
|
||||
let comparison_mem = match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => get_comparison_integer_radix_kb_size_on_gpu(
|
||||
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_comparison_size_on_gpu(
|
||||
streams,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
@@ -435,7 +437,7 @@ impl CudaServerKey {
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
),
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
get_comparison_integer_radix_kb_size_on_gpu(
|
||||
cuda_backend_get_comparison_size_on_gpu(
|
||||
streams,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
@@ -1131,7 +1133,7 @@ impl CudaServerKey {
|
||||
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
unchecked_comparison_integer_radix_kb_async(
|
||||
cuda_backend_unchecked_comparison(
|
||||
streams,
|
||||
result.as_mut(),
|
||||
ct_left.as_ref(),
|
||||
@@ -1160,7 +1162,7 @@ impl CudaServerKey {
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
unchecked_comparison_integer_radix_kb_async(
|
||||
cuda_backend_unchecked_comparison(
|
||||
streams,
|
||||
result.as_mut(),
|
||||
ct_left.as_ref(),
|
||||
@@ -1227,7 +1229,7 @@ impl CudaServerKey {
|
||||
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
unchecked_comparison_integer_radix_kb_async(
|
||||
cuda_backend_unchecked_comparison(
|
||||
streams,
|
||||
result.as_mut(),
|
||||
ct_left.as_ref(),
|
||||
@@ -1256,7 +1258,7 @@ impl CudaServerKey {
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
unchecked_comparison_integer_radix_kb_async(
|
||||
cuda_backend_unchecked_comparison(
|
||||
streams,
|
||||
result.as_mut(),
|
||||
ct_left.as_ref(),
|
||||
|
||||
@@ -3,8 +3,8 @@ use crate::core_crypto::prelude::LweBskGroupingFactor;
|
||||
use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext;
|
||||
use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey};
|
||||
use crate::integer::gpu::{
|
||||
get_div_rem_integer_radix_kb_size_on_gpu, get_full_propagate_assign_size_on_gpu,
|
||||
unchecked_div_rem_integer_radix_kb_assign_async, PBSType,
|
||||
cuda_backend_get_div_rem_size_on_gpu, cuda_backend_get_full_propagate_assign_size_on_gpu,
|
||||
cuda_backend_unchecked_div_rem_assign, PBSType,
|
||||
};
|
||||
|
||||
impl CudaServerKey {
|
||||
@@ -26,7 +26,7 @@ impl CudaServerKey {
|
||||
let num_blocks = divisor.as_ref().d_blocks.lwe_ciphertext_count().0 as u32;
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
unchecked_div_rem_integer_radix_kb_assign_async(
|
||||
cuda_backend_unchecked_div_rem_assign(
|
||||
streams,
|
||||
quotient.as_mut(),
|
||||
remainder.as_mut(),
|
||||
@@ -56,7 +56,7 @@ impl CudaServerKey {
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
unchecked_div_rem_integer_radix_kb_assign_async(
|
||||
cuda_backend_unchecked_div_rem_assign(
|
||||
streams,
|
||||
quotient.as_mut(),
|
||||
remainder.as_mut(),
|
||||
@@ -258,23 +258,25 @@ impl CudaServerKey {
|
||||
divisor.as_ref().d_blocks.lwe_ciphertext_count()
|
||||
);
|
||||
let full_prop_mem = match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_bsk.input_lwe_dimension(),
|
||||
d_bsk.glwe_dimension(),
|
||||
d_bsk.polynomial_size(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count(),
|
||||
d_bsk.decomp_base_log(),
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
),
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_bsk.input_lwe_dimension(),
|
||||
d_bsk.glwe_dimension(),
|
||||
d_bsk.polynomial_size(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count(),
|
||||
d_bsk.decomp_base_log(),
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
)
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
get_full_propagate_assign_size_on_gpu(
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_multibit_bsk.input_lwe_dimension(),
|
||||
d_multibit_bsk.glwe_dimension(),
|
||||
@@ -304,7 +306,7 @@ impl CudaServerKey {
|
||||
let lwe_ciphertext_count = numerator.as_ref().d_blocks.lwe_ciphertext_count();
|
||||
|
||||
let mul_mem = match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => get_div_rem_integer_radix_kb_size_on_gpu(
|
||||
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_div_rem_size_on_gpu(
|
||||
streams,
|
||||
T::IS_SIGNED,
|
||||
self.message_modulus,
|
||||
@@ -326,30 +328,28 @@ impl CudaServerKey {
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
),
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
get_div_rem_integer_radix_kb_size_on_gpu(
|
||||
streams,
|
||||
T::IS_SIGNED,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
d_multibit_bsk.glwe_dimension,
|
||||
d_multibit_bsk.polynomial_size,
|
||||
self.key_switching_key
|
||||
.input_key_lwe_size()
|
||||
.to_lwe_dimension(),
|
||||
self.key_switching_key
|
||||
.output_key_lwe_size()
|
||||
.to_lwe_dimension(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_multibit_bsk.decomp_level_count,
|
||||
d_multibit_bsk.decomp_base_log,
|
||||
lwe_ciphertext_count.0 as u32,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
None,
|
||||
)
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => cuda_backend_get_div_rem_size_on_gpu(
|
||||
streams,
|
||||
T::IS_SIGNED,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
d_multibit_bsk.glwe_dimension,
|
||||
d_multibit_bsk.polynomial_size,
|
||||
self.key_switching_key
|
||||
.input_key_lwe_size()
|
||||
.to_lwe_dimension(),
|
||||
self.key_switching_key
|
||||
.output_key_lwe_size()
|
||||
.to_lwe_dimension(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_multibit_bsk.decomp_level_count,
|
||||
d_multibit_bsk.decomp_base_log,
|
||||
lwe_ciphertext_count.0 as u32,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
None,
|
||||
),
|
||||
};
|
||||
actual_full_prop_mem.max(mul_mem)
|
||||
}
|
||||
|
||||
@@ -5,7 +5,7 @@ use crate::integer::gpu::ciphertext::{
|
||||
CudaIntegerRadixCiphertext, CudaSignedRadixCiphertext, CudaUnsignedRadixCiphertext,
|
||||
};
|
||||
use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey};
|
||||
use crate::integer::gpu::{count_of_consecutive_bits_async, ilog2_async, PBSType};
|
||||
use crate::integer::gpu::{cuda_backend_count_of_consecutive_bits, cuda_backend_ilog2, PBSType};
|
||||
use crate::integer::server_key::radix_parallel::ilog2::{BitValue, Direction};
|
||||
|
||||
impl CudaServerKey {
|
||||
@@ -40,7 +40,7 @@ impl CudaServerKey {
|
||||
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
count_of_consecutive_bits_async(
|
||||
cuda_backend_count_of_consecutive_bits(
|
||||
streams,
|
||||
result.as_mut(),
|
||||
ct.as_ref(),
|
||||
@@ -63,7 +63,7 @@ impl CudaServerKey {
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
count_of_consecutive_bits_async(
|
||||
cuda_backend_count_of_consecutive_bits(
|
||||
streams,
|
||||
result.as_mut(),
|
||||
ct.as_ref(),
|
||||
@@ -279,7 +279,7 @@ impl CudaServerKey {
|
||||
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
ilog2_async(
|
||||
cuda_backend_ilog2(
|
||||
streams,
|
||||
result.as_mut(),
|
||||
ct.as_ref(),
|
||||
@@ -306,7 +306,7 @@ impl CudaServerKey {
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
ilog2_async(
|
||||
cuda_backend_ilog2(
|
||||
streams,
|
||||
result.as_mut(),
|
||||
ct.as_ref(),
|
||||
|
||||
@@ -16,11 +16,12 @@ use crate::integer::gpu::ciphertext::{
|
||||
use crate::integer::gpu::noise_squashing::keys::CudaNoiseSquashingKey;
|
||||
use crate::integer::gpu::server_key::CudaBootstrappingKey;
|
||||
use crate::integer::gpu::{
|
||||
apply_bivariate_lut_kb_async, apply_many_univariate_lut_kb_async,
|
||||
apply_univariate_lut_kb_async, compute_prefix_sum_hillis_steele_async,
|
||||
extend_radix_with_sign_msb_async, extend_radix_with_trivial_zero_blocks_msb_async,
|
||||
full_propagate_assign_async, noise_squashing_async, propagate_single_carry_assign_async,
|
||||
trim_radix_blocks_lsb_async, CudaServerKey, PBSType,
|
||||
cuda_backend_apply_bivariate_lut, cuda_backend_apply_many_univariate_lut,
|
||||
cuda_backend_apply_univariate_lut, cuda_backend_compute_prefix_sum_hillis_steele,
|
||||
cuda_backend_extend_radix_with_sign_msb,
|
||||
cuda_backend_extend_radix_with_trivial_zero_blocks_msb, cuda_backend_full_propagate_assign,
|
||||
cuda_backend_noise_squashing, cuda_backend_propagate_single_carry_assign,
|
||||
cuda_backend_trim_radix_blocks_lsb, CudaServerKey, PBSType,
|
||||
};
|
||||
use crate::integer::server_key::radix_parallel::OutputFlag;
|
||||
use crate::shortint::ciphertext::{Degree, NoiseLevel};
|
||||
@@ -239,7 +240,7 @@ impl CudaServerKey {
|
||||
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
propagate_single_carry_assign_async(
|
||||
cuda_backend_propagate_single_carry_assign(
|
||||
streams,
|
||||
ciphertext,
|
||||
carry_out.as_mut(),
|
||||
@@ -264,7 +265,7 @@ impl CudaServerKey {
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
propagate_single_carry_assign_async(
|
||||
cuda_backend_propagate_single_carry_assign(
|
||||
streams,
|
||||
ciphertext,
|
||||
carry_out.as_mut(),
|
||||
@@ -302,7 +303,7 @@ impl CudaServerKey {
|
||||
unsafe {
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
full_propagate_assign_async(
|
||||
cuda_backend_full_propagate_assign(
|
||||
streams,
|
||||
ciphertext,
|
||||
&d_bsk.d_vec,
|
||||
@@ -323,7 +324,7 @@ impl CudaServerKey {
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
full_propagate_assign_async(
|
||||
cuda_backend_full_propagate_assign(
|
||||
streams,
|
||||
ciphertext,
|
||||
&d_multibit_bsk.d_vec,
|
||||
@@ -507,7 +508,11 @@ impl CudaServerKey {
|
||||
};
|
||||
|
||||
unsafe {
|
||||
extend_radix_with_trivial_zero_blocks_msb_async(output.as_mut(), ct.as_ref(), streams);
|
||||
cuda_backend_extend_radix_with_trivial_zero_blocks_msb(
|
||||
output.as_mut(),
|
||||
ct.as_ref(),
|
||||
streams,
|
||||
);
|
||||
}
|
||||
output
|
||||
}
|
||||
@@ -581,7 +586,7 @@ impl CudaServerKey {
|
||||
unsafe { self.create_trivial_zero_radix_async(output_num_blocks, streams) };
|
||||
|
||||
unsafe {
|
||||
trim_radix_blocks_lsb_async(output.as_mut(), ct.as_ref(), streams);
|
||||
cuda_backend_trim_radix_blocks_lsb(output.as_mut(), ct.as_ref(), streams);
|
||||
}
|
||||
|
||||
output
|
||||
@@ -791,7 +796,7 @@ impl CudaServerKey {
|
||||
unsafe {
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
apply_univariate_lut_kb_async(
|
||||
cuda_backend_apply_univariate_lut(
|
||||
streams,
|
||||
&mut output_slice,
|
||||
&mut output_degrees,
|
||||
@@ -819,7 +824,7 @@ impl CudaServerKey {
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
apply_univariate_lut_kb_async(
|
||||
cuda_backend_apply_univariate_lut(
|
||||
streams,
|
||||
&mut output_slice,
|
||||
&mut output_degrees,
|
||||
@@ -909,7 +914,7 @@ impl CudaServerKey {
|
||||
unsafe {
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
apply_bivariate_lut_kb_async(
|
||||
cuda_backend_apply_bivariate_lut(
|
||||
streams,
|
||||
&mut output_slice,
|
||||
&mut output_degrees,
|
||||
@@ -939,7 +944,7 @@ impl CudaServerKey {
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
apply_bivariate_lut_kb_async(
|
||||
cuda_backend_apply_bivariate_lut(
|
||||
streams,
|
||||
&mut output_slice,
|
||||
&mut output_degrees,
|
||||
@@ -1088,7 +1093,7 @@ impl CudaServerKey {
|
||||
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
apply_many_univariate_lut_kb_async(
|
||||
cuda_backend_apply_many_univariate_lut(
|
||||
streams,
|
||||
&mut output_slice,
|
||||
&mut output_degrees,
|
||||
@@ -1118,7 +1123,7 @@ impl CudaServerKey {
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
apply_many_univariate_lut_kb_async(
|
||||
cuda_backend_apply_many_univariate_lut(
|
||||
streams,
|
||||
&mut output_slice,
|
||||
&mut output_degrees,
|
||||
@@ -1229,7 +1234,7 @@ impl CudaServerKey {
|
||||
unsafe {
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
compute_prefix_sum_hillis_steele_async(
|
||||
cuda_backend_compute_prefix_sum_hillis_steele(
|
||||
streams,
|
||||
&mut output_slice,
|
||||
&mut output_degrees,
|
||||
@@ -1259,7 +1264,7 @@ impl CudaServerKey {
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
compute_prefix_sum_hillis_steele_async(
|
||||
cuda_backend_compute_prefix_sum_hillis_steele(
|
||||
streams,
|
||||
&mut output_slice,
|
||||
&mut output_degrees,
|
||||
@@ -1324,7 +1329,7 @@ impl CudaServerKey {
|
||||
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
extend_radix_with_sign_msb_async(
|
||||
cuda_backend_extend_radix_with_sign_msb(
|
||||
streams,
|
||||
output.as_mut(),
|
||||
ct.as_ref(),
|
||||
@@ -1346,7 +1351,7 @@ impl CudaServerKey {
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
extend_radix_with_sign_msb_async(
|
||||
cuda_backend_extend_radix_with_sign_msb(
|
||||
streams,
|
||||
output.as_mut(),
|
||||
ct.as_ref(),
|
||||
@@ -1638,7 +1643,7 @@ impl CudaServerKey {
|
||||
unsafe {
|
||||
match &d_bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(bsk) => {
|
||||
noise_squashing_async(
|
||||
cuda_backend_noise_squashing(
|
||||
streams,
|
||||
&mut output_slice,
|
||||
&mut output_degrees,
|
||||
@@ -1667,7 +1672,7 @@ impl CudaServerKey {
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(mb_bsk) => {
|
||||
noise_squashing_async(
|
||||
cuda_backend_noise_squashing(
|
||||
streams,
|
||||
&mut output_slice,
|
||||
&mut output_degrees,
|
||||
|
||||
@@ -3,8 +3,8 @@ use crate::core_crypto::prelude::LweBskGroupingFactor;
|
||||
use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext;
|
||||
use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey};
|
||||
use crate::integer::gpu::{
|
||||
get_full_propagate_assign_size_on_gpu, get_mul_integer_radix_kb_size_on_gpu,
|
||||
unchecked_mul_integer_radix_kb_assign_async, PBSType,
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu, cuda_backend_get_mul_size_on_gpu,
|
||||
cuda_backend_unchecked_mul_assign, PBSType,
|
||||
};
|
||||
|
||||
impl CudaServerKey {
|
||||
@@ -80,7 +80,7 @@ impl CudaServerKey {
|
||||
let is_boolean_right = ct_right.holds_boolean_value();
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
unchecked_mul_integer_radix_kb_assign_async(
|
||||
cuda_backend_unchecked_mul_assign(
|
||||
streams,
|
||||
ct_left.as_mut(),
|
||||
is_boolean_left,
|
||||
@@ -104,7 +104,7 @@ impl CudaServerKey {
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
unchecked_mul_integer_radix_kb_assign_async(
|
||||
cuda_backend_unchecked_mul_assign(
|
||||
streams,
|
||||
ct_left.as_mut(),
|
||||
is_boolean_left,
|
||||
@@ -264,23 +264,25 @@ impl CudaServerKey {
|
||||
ct_right.as_ref().d_blocks.lwe_ciphertext_count()
|
||||
);
|
||||
let full_prop_mem = match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_bsk.input_lwe_dimension(),
|
||||
d_bsk.glwe_dimension(),
|
||||
d_bsk.polynomial_size(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count(),
|
||||
d_bsk.decomp_base_log(),
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
),
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_bsk.input_lwe_dimension(),
|
||||
d_bsk.glwe_dimension(),
|
||||
d_bsk.polynomial_size(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count(),
|
||||
d_bsk.decomp_base_log(),
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
)
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
get_full_propagate_assign_size_on_gpu(
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_multibit_bsk.input_lwe_dimension(),
|
||||
d_multibit_bsk.glwe_dimension(),
|
||||
@@ -312,7 +314,7 @@ impl CudaServerKey {
|
||||
let is_boolean_right = ct_right.holds_boolean_value();
|
||||
|
||||
let mul_mem = match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => get_mul_integer_radix_kb_size_on_gpu(
|
||||
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_mul_size_on_gpu(
|
||||
streams,
|
||||
is_boolean_left,
|
||||
is_boolean_right,
|
||||
@@ -332,7 +334,7 @@ impl CudaServerKey {
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
),
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => get_mul_integer_radix_kb_size_on_gpu(
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => cuda_backend_get_mul_size_on_gpu(
|
||||
streams,
|
||||
is_boolean_left,
|
||||
is_boolean_right,
|
||||
|
||||
@@ -3,8 +3,8 @@ use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
|
||||
use crate::integer::gpu::ciphertext::{
|
||||
CudaIntegerRadixCiphertext, CudaSignedRadixCiphertext, CudaUnsignedRadixCiphertext,
|
||||
};
|
||||
use crate::integer::gpu::cuda_backend_unchecked_negate;
|
||||
use crate::integer::gpu::server_key::CudaServerKey;
|
||||
use crate::integer::gpu::unchecked_negate_integer_radix_async;
|
||||
use crate::integer::server_key::radix_parallel::OutputFlag;
|
||||
|
||||
impl CudaServerKey {
|
||||
@@ -70,7 +70,7 @@ impl CudaServerKey {
|
||||
|
||||
let info = ctxt.as_ref().info.blocks.first().unwrap();
|
||||
|
||||
unchecked_negate_integer_radix_async(
|
||||
cuda_backend_unchecked_negate(
|
||||
streams,
|
||||
ciphertext_out.as_mut(),
|
||||
ctxt.as_ref(),
|
||||
|
||||
@@ -12,7 +12,9 @@ use crate::shortint::oprf::{create_random_from_seed_modulus_switched, raw_seeded
|
||||
|
||||
pub use tfhe_csprng::seeders::{Seed, Seeder};
|
||||
|
||||
use crate::integer::gpu::{get_grouped_oprf_size_on_gpu, grouped_oprf_async, CudaVec, PBSType};
|
||||
use crate::integer::gpu::{
|
||||
cuda_backend_get_grouped_oprf_size_on_gpu, cuda_backend_grouped_oprf, CudaVec, PBSType,
|
||||
};
|
||||
|
||||
impl CudaServerKey {
|
||||
/// Generates an encrypted `num_block` blocks unsigned integer
|
||||
@@ -372,7 +374,7 @@ impl CudaServerKey {
|
||||
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
grouped_oprf_async(
|
||||
cuda_backend_grouped_oprf(
|
||||
streams,
|
||||
result,
|
||||
&d_seeded_lwe_input,
|
||||
@@ -395,7 +397,7 @@ impl CudaServerKey {
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_bsk) => {
|
||||
grouped_oprf_async(
|
||||
cuda_backend_grouped_oprf(
|
||||
streams,
|
||||
result,
|
||||
&d_seeded_lwe_input,
|
||||
@@ -429,7 +431,7 @@ impl CudaServerKey {
|
||||
let message_bits = self.message_modulus.0.ilog2();
|
||||
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => get_grouped_oprf_size_on_gpu(
|
||||
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_grouped_oprf_size_on_gpu(
|
||||
streams,
|
||||
1,
|
||||
d_bsk.input_lwe_dimension,
|
||||
@@ -447,7 +449,7 @@ impl CudaServerKey {
|
||||
message_bits,
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
),
|
||||
CudaBootstrappingKey::MultiBit(d_bsk) => get_grouped_oprf_size_on_gpu(
|
||||
CudaBootstrappingKey::MultiBit(d_bsk) => cuda_backend_get_grouped_oprf_size_on_gpu(
|
||||
streams,
|
||||
1,
|
||||
d_bsk.input_lwe_dimension,
|
||||
|
||||
@@ -3,10 +3,9 @@ use crate::core_crypto::prelude::LweBskGroupingFactor;
|
||||
use crate::integer::gpu::ciphertext::{CudaIntegerRadixCiphertext, CudaUnsignedRadixCiphertext};
|
||||
use crate::integer::gpu::server_key::CudaBootstrappingKey;
|
||||
use crate::integer::gpu::{
|
||||
get_full_propagate_assign_size_on_gpu, get_rotate_left_integer_radix_kb_size_on_gpu,
|
||||
get_rotate_right_integer_radix_kb_size_on_gpu,
|
||||
unchecked_rotate_left_integer_radix_kb_assign_async,
|
||||
unchecked_rotate_right_integer_radix_kb_assign_async, CudaServerKey, PBSType,
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu, cuda_backend_get_rotate_left_size_on_gpu,
|
||||
cuda_backend_get_rotate_right_size_on_gpu, cuda_backend_unchecked_rotate_left_assign,
|
||||
cuda_backend_unchecked_rotate_right_assign, CudaServerKey, PBSType,
|
||||
};
|
||||
|
||||
impl CudaServerKey {
|
||||
@@ -27,7 +26,7 @@ impl CudaServerKey {
|
||||
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
unchecked_rotate_right_integer_radix_kb_assign_async(
|
||||
cuda_backend_unchecked_rotate_right_assign(
|
||||
streams,
|
||||
ct.as_mut(),
|
||||
rotate.as_ref(),
|
||||
@@ -55,7 +54,7 @@ impl CudaServerKey {
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
unchecked_rotate_right_integer_radix_kb_assign_async(
|
||||
cuda_backend_unchecked_rotate_right_assign(
|
||||
streams,
|
||||
ct.as_mut(),
|
||||
rotate.as_ref(),
|
||||
@@ -148,7 +147,7 @@ impl CudaServerKey {
|
||||
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
unchecked_rotate_left_integer_radix_kb_assign_async(
|
||||
cuda_backend_unchecked_rotate_left_assign(
|
||||
streams,
|
||||
ct.as_mut(),
|
||||
rotate.as_ref(),
|
||||
@@ -176,7 +175,7 @@ impl CudaServerKey {
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
unchecked_rotate_left_integer_radix_kb_assign_async(
|
||||
cuda_backend_unchecked_rotate_left_assign(
|
||||
streams,
|
||||
ct.as_mut(),
|
||||
rotate.as_ref(),
|
||||
@@ -574,23 +573,25 @@ impl CudaServerKey {
|
||||
ct_right.as_ref().d_blocks.lwe_ciphertext_count()
|
||||
);
|
||||
let full_prop_mem = match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_bsk.input_lwe_dimension(),
|
||||
d_bsk.glwe_dimension(),
|
||||
d_bsk.polynomial_size(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count(),
|
||||
d_bsk.decomp_base_log(),
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
),
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_bsk.input_lwe_dimension(),
|
||||
d_bsk.glwe_dimension(),
|
||||
d_bsk.polynomial_size(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count(),
|
||||
d_bsk.decomp_base_log(),
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
)
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
get_full_propagate_assign_size_on_gpu(
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_multibit_bsk.input_lwe_dimension(),
|
||||
d_multibit_bsk.glwe_dimension(),
|
||||
@@ -620,7 +621,7 @@ impl CudaServerKey {
|
||||
let lwe_ciphertext_count = ct_left.as_ref().d_blocks.lwe_ciphertext_count();
|
||||
|
||||
let rotate_mem = match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => get_rotate_left_integer_radix_kb_size_on_gpu(
|
||||
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_rotate_left_size_on_gpu(
|
||||
streams,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
@@ -643,7 +644,7 @@ impl CudaServerKey {
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
),
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
get_rotate_left_integer_radix_kb_size_on_gpu(
|
||||
cuda_backend_get_rotate_left_size_on_gpu(
|
||||
streams,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
@@ -685,23 +686,25 @@ impl CudaServerKey {
|
||||
ct_right.as_ref().d_blocks.lwe_ciphertext_count()
|
||||
);
|
||||
let full_prop_mem = match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_bsk.input_lwe_dimension(),
|
||||
d_bsk.glwe_dimension(),
|
||||
d_bsk.polynomial_size(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count(),
|
||||
d_bsk.decomp_base_log(),
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
),
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_bsk.input_lwe_dimension(),
|
||||
d_bsk.glwe_dimension(),
|
||||
d_bsk.polynomial_size(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count(),
|
||||
d_bsk.decomp_base_log(),
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
)
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
get_full_propagate_assign_size_on_gpu(
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_multibit_bsk.input_lwe_dimension(),
|
||||
d_multibit_bsk.glwe_dimension(),
|
||||
@@ -731,7 +734,7 @@ impl CudaServerKey {
|
||||
let lwe_ciphertext_count = ct_left.as_ref().d_blocks.lwe_ciphertext_count();
|
||||
|
||||
let rotate_mem = match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => get_rotate_right_integer_radix_kb_size_on_gpu(
|
||||
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_rotate_right_size_on_gpu(
|
||||
streams,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
@@ -754,7 +757,7 @@ impl CudaServerKey {
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
),
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
get_rotate_right_integer_radix_kb_size_on_gpu(
|
||||
cuda_backend_get_rotate_right_size_on_gpu(
|
||||
streams,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
|
||||
@@ -8,8 +8,9 @@ use crate::integer::gpu::ciphertext::{
|
||||
};
|
||||
use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey};
|
||||
use crate::integer::gpu::{
|
||||
get_full_propagate_assign_size_on_gpu, get_propagate_single_carry_assign_async_size_on_gpu,
|
||||
scalar_addition_integer_radix_assign_async, PBSType,
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu,
|
||||
cuda_backend_get_propagate_single_carry_assign_size_on_gpu,
|
||||
cuda_backend_scalar_addition_assign, PBSType,
|
||||
};
|
||||
use crate::integer::server_key::radix_parallel::OutputFlag;
|
||||
use crate::prelude::CastInto;
|
||||
@@ -97,7 +98,7 @@ impl CudaServerKey {
|
||||
|
||||
// If the scalar is decomposed using less than the number of blocks our ciphertext
|
||||
// has, we just don't touch ciphertext's last blocks
|
||||
scalar_addition_integer_radix_assign_async(
|
||||
cuda_backend_scalar_addition_assign(
|
||||
streams,
|
||||
ct.as_mut(),
|
||||
&d_decomposed_scalar,
|
||||
@@ -208,23 +209,25 @@ impl CudaServerKey {
|
||||
0
|
||||
} else {
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_bsk.input_lwe_dimension(),
|
||||
d_bsk.glwe_dimension(),
|
||||
d_bsk.polynomial_size(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count(),
|
||||
d_bsk.decomp_base_log(),
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
),
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_bsk.input_lwe_dimension(),
|
||||
d_bsk.glwe_dimension(),
|
||||
d_bsk.polynomial_size(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count(),
|
||||
d_bsk.decomp_base_log(),
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
)
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
get_full_propagate_assign_size_on_gpu(
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_multibit_bsk.input_lwe_dimension(),
|
||||
d_multibit_bsk.glwe_dimension(),
|
||||
@@ -246,7 +249,7 @@ impl CudaServerKey {
|
||||
let num_blocks = ct.as_ref().d_blocks.lwe_ciphertext_count().0 as u32;
|
||||
let single_carry_mem = match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
get_propagate_single_carry_assign_async_size_on_gpu(
|
||||
cuda_backend_get_propagate_single_carry_assign_size_on_gpu(
|
||||
streams,
|
||||
d_bsk.input_lwe_dimension(),
|
||||
d_bsk.glwe_dimension(),
|
||||
@@ -265,7 +268,7 @@ impl CudaServerKey {
|
||||
)
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
get_propagate_single_carry_assign_async_size_on_gpu(
|
||||
cuda_backend_get_propagate_single_carry_assign_size_on_gpu(
|
||||
streams,
|
||||
d_multibit_bsk.input_lwe_dimension(),
|
||||
d_multibit_bsk.glwe_dimension(),
|
||||
|
||||
@@ -5,8 +5,8 @@ use crate::integer::block_decomposition::{BlockDecomposer, DecomposableInto};
|
||||
use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext;
|
||||
use crate::integer::gpu::server_key::CudaBootstrappingKey;
|
||||
use crate::integer::gpu::{
|
||||
get_full_propagate_assign_size_on_gpu, get_scalar_bitop_integer_radix_kb_size_on_gpu,
|
||||
unchecked_scalar_bitop_integer_radix_kb_assign_async, BitOpType, CudaServerKey, PBSType,
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu, cuda_backend_get_scalar_bitop_size_on_gpu,
|
||||
cuda_backend_unchecked_scalar_bitop_assign, BitOpType, CudaServerKey, PBSType,
|
||||
};
|
||||
|
||||
impl CudaServerKey {
|
||||
@@ -36,7 +36,7 @@ impl CudaServerKey {
|
||||
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
unchecked_scalar_bitop_integer_radix_kb_assign_async(
|
||||
cuda_backend_unchecked_scalar_bitop_assign(
|
||||
streams,
|
||||
ct.as_mut(),
|
||||
&clear_blocks,
|
||||
@@ -65,7 +65,7 @@ impl CudaServerKey {
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
unchecked_scalar_bitop_integer_radix_kb_assign_async(
|
||||
cuda_backend_unchecked_scalar_bitop_assign(
|
||||
streams,
|
||||
ct.as_mut(),
|
||||
&clear_blocks,
|
||||
@@ -315,23 +315,25 @@ impl CudaServerKey {
|
||||
0
|
||||
} else {
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_bsk.input_lwe_dimension(),
|
||||
d_bsk.glwe_dimension(),
|
||||
d_bsk.polynomial_size(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count(),
|
||||
d_bsk.decomp_base_log(),
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
),
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_bsk.input_lwe_dimension(),
|
||||
d_bsk.glwe_dimension(),
|
||||
d_bsk.polynomial_size(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count(),
|
||||
d_bsk.decomp_base_log(),
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
)
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
get_full_propagate_assign_size_on_gpu(
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_multibit_bsk.input_lwe_dimension(),
|
||||
d_multibit_bsk.glwe_dimension(),
|
||||
@@ -352,7 +354,7 @@ impl CudaServerKey {
|
||||
let clear_blocks_mem = (lwe_ciphertext_count.0 * size_of::<u64>()) as u64;
|
||||
|
||||
let scalar_bitop_mem = match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => get_scalar_bitop_integer_radix_kb_size_on_gpu(
|
||||
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_scalar_bitop_size_on_gpu(
|
||||
streams,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
@@ -375,7 +377,7 @@ impl CudaServerKey {
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
),
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
get_scalar_bitop_integer_radix_kb_size_on_gpu(
|
||||
cuda_backend_get_scalar_bitop_size_on_gpu(
|
||||
streams,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
|
||||
@@ -8,9 +8,9 @@ use crate::integer::gpu::ciphertext::info::CudaRadixCiphertextInfo;
|
||||
use crate::integer::gpu::ciphertext::{CudaIntegerRadixCiphertext, CudaRadixCiphertext};
|
||||
use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey};
|
||||
use crate::integer::gpu::{
|
||||
unchecked_are_all_comparisons_block_true_integer_radix_kb_async,
|
||||
unchecked_is_at_least_one_comparisons_block_true_integer_radix_kb_async,
|
||||
unchecked_scalar_comparison_integer_radix_kb_async, ComparisonType, PBSType,
|
||||
cuda_backend_unchecked_are_all_comparisons_block_true,
|
||||
cuda_backend_unchecked_is_at_least_one_comparisons_block_true,
|
||||
cuda_backend_unchecked_scalar_comparison, ComparisonType, PBSType,
|
||||
};
|
||||
use crate::shortint::ciphertext::Degree;
|
||||
|
||||
@@ -124,7 +124,7 @@ impl CudaServerKey {
|
||||
ComparisonType::GT | ComparisonType::GE | ComparisonType::NE => 1,
|
||||
_ => 0,
|
||||
};
|
||||
let ct_res: T = self.create_trivial_radix(value, 1, streams);
|
||||
let ct_res: T = self.create_trivial_radix_async(value, 1, streams);
|
||||
return CudaBooleanBlock::from_cuda_radix_ciphertext(ct_res.into_inner());
|
||||
}
|
||||
|
||||
@@ -146,7 +146,7 @@ impl CudaServerKey {
|
||||
ComparisonType::LT | ComparisonType::LE | ComparisonType::NE => 1,
|
||||
_ => 0,
|
||||
};
|
||||
let ct_res: T = self.create_trivial_radix(value, 1, streams);
|
||||
let ct_res: T = self.create_trivial_radix_async(value, 1, streams);
|
||||
return CudaBooleanBlock::from_cuda_radix_ciphertext(ct_res.into_inner());
|
||||
}
|
||||
|
||||
@@ -173,7 +173,7 @@ impl CudaServerKey {
|
||||
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
unchecked_scalar_comparison_integer_radix_kb_async(
|
||||
cuda_backend_unchecked_scalar_comparison(
|
||||
streams,
|
||||
result.as_mut().as_mut(),
|
||||
ct.as_ref(),
|
||||
@@ -204,7 +204,7 @@ impl CudaServerKey {
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
unchecked_scalar_comparison_integer_radix_kb_async(
|
||||
cuda_backend_unchecked_scalar_comparison(
|
||||
streams,
|
||||
result.as_mut().as_mut(),
|
||||
ct.as_ref(),
|
||||
@@ -261,9 +261,9 @@ impl CudaServerKey {
|
||||
// Scalar is greater than the bounds, so ciphertext is smaller
|
||||
let result: T = match op {
|
||||
ComparisonType::LT | ComparisonType::LE => {
|
||||
self.create_trivial_radix(1, num_blocks, streams)
|
||||
self.create_trivial_radix_async(1, num_blocks, streams)
|
||||
}
|
||||
_ => self.create_trivial_radix(
|
||||
_ => self.create_trivial_radix_async(
|
||||
0,
|
||||
ct.as_ref().d_blocks.lwe_ciphertext_count().0,
|
||||
streams,
|
||||
@@ -275,9 +275,9 @@ impl CudaServerKey {
|
||||
// Scalar is smaller than the bounds, so ciphertext is bigger
|
||||
let result: T = match op {
|
||||
ComparisonType::GT | ComparisonType::GE => {
|
||||
self.create_trivial_radix(1, num_blocks, streams)
|
||||
self.create_trivial_radix_async(1, num_blocks, streams)
|
||||
}
|
||||
_ => self.create_trivial_radix(
|
||||
_ => self.create_trivial_radix_async(
|
||||
0,
|
||||
ct.as_ref().d_blocks.lwe_ciphertext_count().0,
|
||||
streams,
|
||||
@@ -296,7 +296,8 @@ impl CudaServerKey {
|
||||
ct, scalar, op, true, streams,
|
||||
)
|
||||
} else {
|
||||
let scalar_as_trivial = self.create_trivial_radix(scalar, num_blocks, streams);
|
||||
let scalar_as_trivial =
|
||||
self.create_trivial_radix_async(scalar, num_blocks, streams);
|
||||
self.unchecked_comparison_async(ct, &scalar_as_trivial, op, streams)
|
||||
}
|
||||
} else {
|
||||
@@ -334,7 +335,7 @@ impl CudaServerKey {
|
||||
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
unchecked_scalar_comparison_integer_radix_kb_async(
|
||||
cuda_backend_unchecked_scalar_comparison(
|
||||
streams,
|
||||
result.as_mut(),
|
||||
ct.as_ref(),
|
||||
@@ -365,7 +366,7 @@ impl CudaServerKey {
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
unchecked_scalar_comparison_integer_radix_kb_async(
|
||||
cuda_backend_unchecked_scalar_comparison(
|
||||
streams,
|
||||
result.as_mut(),
|
||||
ct.as_ref(),
|
||||
@@ -412,7 +413,7 @@ impl CudaServerKey {
|
||||
unsafe {
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
unchecked_are_all_comparisons_block_true_integer_radix_kb_async(
|
||||
cuda_backend_unchecked_are_all_comparisons_block_true(
|
||||
streams,
|
||||
boolean_res.as_mut().as_mut(),
|
||||
ct.as_ref(),
|
||||
@@ -438,7 +439,7 @@ impl CudaServerKey {
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
unchecked_are_all_comparisons_block_true_integer_radix_kb_async(
|
||||
cuda_backend_unchecked_are_all_comparisons_block_true(
|
||||
streams,
|
||||
boolean_res.as_mut().as_mut(),
|
||||
ct.as_ref(),
|
||||
@@ -482,7 +483,7 @@ impl CudaServerKey {
|
||||
unsafe {
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
unchecked_is_at_least_one_comparisons_block_true_integer_radix_kb_async(
|
||||
cuda_backend_unchecked_is_at_least_one_comparisons_block_true(
|
||||
streams,
|
||||
boolean_res.as_mut().as_mut(),
|
||||
ct.as_ref(),
|
||||
@@ -508,7 +509,7 @@ impl CudaServerKey {
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
unchecked_is_at_least_one_comparisons_block_true_integer_radix_kb_async(
|
||||
cuda_backend_unchecked_is_at_least_one_comparisons_block_true(
|
||||
streams,
|
||||
boolean_res.as_mut().as_mut(),
|
||||
ct.as_ref(),
|
||||
|
||||
@@ -6,14 +6,14 @@ use crate::integer::gpu::ciphertext::{
|
||||
};
|
||||
use crate::integer::gpu::server_key::CudaBootstrappingKey;
|
||||
use crate::integer::gpu::{
|
||||
get_full_propagate_assign_size_on_gpu, get_scalar_div_integer_radix_kb_size_on_gpu,
|
||||
get_scalar_div_rem_integer_radix_kb_size_on_gpu,
|
||||
get_signed_scalar_div_integer_radix_kb_size_on_gpu,
|
||||
get_signed_scalar_div_rem_integer_radix_kb_size_on_gpu,
|
||||
unchecked_signed_scalar_div_integer_radix_kb_assign_async,
|
||||
unchecked_signed_scalar_div_rem_integer_radix_kb_assign_async,
|
||||
unchecked_unsigned_scalar_div_integer_radix_kb_assign_async,
|
||||
unchecked_unsigned_scalar_div_rem_integer_radix_kb_assign_async, CudaServerKey, PBSType,
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu,
|
||||
cuda_backend_get_scalar_div_rem_size_on_gpu, cuda_backend_get_scalar_div_size_on_gpu,
|
||||
cuda_backend_get_signed_scalar_div_rem_size_on_gpu,
|
||||
cuda_backend_get_signed_scalar_div_size_on_gpu,
|
||||
cuda_backend_unchecked_signed_scalar_div_assign,
|
||||
cuda_backend_unchecked_signed_scalar_div_rem_assign,
|
||||
cuda_backend_unchecked_unsigned_scalar_div_assign,
|
||||
cuda_backend_unchecked_unsigned_scalar_div_rem, CudaServerKey, PBSType,
|
||||
};
|
||||
use crate::integer::server_key::radix_parallel::scalar_div_mod::SignedReciprocable;
|
||||
use crate::integer::server_key::radix_parallel::OutputFlag;
|
||||
@@ -106,7 +106,7 @@ impl CudaServerKey {
|
||||
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
unchecked_unsigned_scalar_div_integer_radix_kb_assign_async(
|
||||
cuda_backend_unchecked_unsigned_scalar_div_assign(
|
||||
streams,
|
||||
quotient.as_mut(),
|
||||
divisor,
|
||||
@@ -127,7 +127,7 @@ impl CudaServerKey {
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
unchecked_unsigned_scalar_div_integer_radix_kb_assign_async(
|
||||
cuda_backend_unchecked_unsigned_scalar_div_assign(
|
||||
streams,
|
||||
quotient.as_mut(),
|
||||
divisor,
|
||||
@@ -281,7 +281,7 @@ impl CudaServerKey {
|
||||
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
unchecked_unsigned_scalar_div_rem_integer_radix_kb_assign_async(
|
||||
cuda_backend_unchecked_unsigned_scalar_div_rem(
|
||||
streams,
|
||||
quotient.as_mut(),
|
||||
remainder.as_mut(),
|
||||
@@ -303,7 +303,7 @@ impl CudaServerKey {
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
unchecked_unsigned_scalar_div_rem_integer_radix_kb_assign_async(
|
||||
cuda_backend_unchecked_unsigned_scalar_div_rem(
|
||||
streams,
|
||||
quotient.as_mut(),
|
||||
remainder.as_mut(),
|
||||
@@ -549,11 +549,11 @@ impl CudaServerKey {
|
||||
>= to the number of bits encrypted in the ciphertext"
|
||||
);
|
||||
|
||||
let mut quotient: CudaSignedRadixCiphertext = numerator.duplicate_async(streams);
|
||||
let mut quotient: CudaSignedRadixCiphertext = numerator.duplicate(streams);
|
||||
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
unchecked_signed_scalar_div_integer_radix_kb_assign_async(
|
||||
cuda_backend_unchecked_signed_scalar_div_assign(
|
||||
streams,
|
||||
quotient.as_mut(),
|
||||
divisor,
|
||||
@@ -574,7 +574,7 @@ impl CudaServerKey {
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
unchecked_signed_scalar_div_integer_radix_kb_assign_async(
|
||||
cuda_backend_unchecked_signed_scalar_div_assign(
|
||||
streams,
|
||||
quotient.as_mut(),
|
||||
divisor,
|
||||
@@ -729,7 +729,7 @@ impl CudaServerKey {
|
||||
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
unchecked_signed_scalar_div_rem_integer_radix_kb_assign_async(
|
||||
cuda_backend_unchecked_signed_scalar_div_rem_assign(
|
||||
streams,
|
||||
quotient.as_mut(),
|
||||
remainder.as_mut(),
|
||||
@@ -751,7 +751,7 @@ impl CudaServerKey {
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
unchecked_signed_scalar_div_rem_integer_radix_kb_assign_async(
|
||||
cuda_backend_unchecked_signed_scalar_div_rem_assign(
|
||||
streams,
|
||||
quotient.as_mut(),
|
||||
remainder.as_mut(),
|
||||
@@ -885,8 +885,7 @@ impl CudaServerKey {
|
||||
Scalar: SignedReciprocable + ScalarMultiplier + DecomposableInto<u8> + CastInto<u64>,
|
||||
<<Scalar as SignedReciprocable>::Unsigned as Reciprocable>::DoublePrecision: Send,
|
||||
{
|
||||
let (_, remainder) =
|
||||
self.unchecked_signed_scalar_div_rem_async(numerator, divisor, streams);
|
||||
let (_, remainder) = self.unchecked_signed_scalar_div_rem(numerator, divisor, streams);
|
||||
|
||||
remainder
|
||||
}
|
||||
@@ -992,23 +991,25 @@ encrypted bits: {numerator_bits}, scalar bits: {}
|
||||
0
|
||||
} else {
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_bsk.input_lwe_dimension(),
|
||||
d_bsk.glwe_dimension(),
|
||||
d_bsk.polynomial_size(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count(),
|
||||
d_bsk.decomp_base_log(),
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
),
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_bsk.input_lwe_dimension(),
|
||||
d_bsk.glwe_dimension(),
|
||||
d_bsk.polynomial_size(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count(),
|
||||
d_bsk.decomp_base_log(),
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
)
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
get_full_propagate_assign_size_on_gpu(
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_multibit_bsk.input_lwe_dimension(),
|
||||
d_multibit_bsk.glwe_dimension(),
|
||||
@@ -1028,7 +1029,7 @@ encrypted bits: {numerator_bits}, scalar bits: {}
|
||||
};
|
||||
|
||||
let scalar_div_mem = match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => get_scalar_div_integer_radix_kb_size_on_gpu(
|
||||
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_scalar_div_size_on_gpu(
|
||||
streams,
|
||||
divisor,
|
||||
self.message_modulus,
|
||||
@@ -1046,7 +1047,7 @@ encrypted bits: {numerator_bits}, scalar bits: {}
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
),
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
get_scalar_div_integer_radix_kb_size_on_gpu(
|
||||
cuda_backend_get_scalar_div_size_on_gpu(
|
||||
streams,
|
||||
divisor,
|
||||
self.message_modulus,
|
||||
@@ -1092,46 +1093,42 @@ encrypted bits: {numerator_bits}, scalar bits: {}
|
||||
Scalar::BITS
|
||||
);
|
||||
|
||||
unsafe {
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
get_scalar_div_rem_integer_radix_kb_size_on_gpu(
|
||||
streams,
|
||||
divisor,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
d_bsk.glwe_dimension,
|
||||
d_bsk.polynomial_size,
|
||||
d_bsk.input_lwe_dimension,
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count,
|
||||
d_bsk.decomp_base_log,
|
||||
LweBskGroupingFactor(0),
|
||||
num_blocks,
|
||||
PBSType::Classical,
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
)
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
get_scalar_div_rem_integer_radix_kb_size_on_gpu(
|
||||
streams,
|
||||
divisor,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
d_multibit_bsk.glwe_dimension,
|
||||
d_multibit_bsk.polynomial_size,
|
||||
d_multibit_bsk.input_lwe_dimension,
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_multibit_bsk.decomp_level_count,
|
||||
d_multibit_bsk.decomp_base_log,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
num_blocks,
|
||||
PBSType::MultiBit,
|
||||
None,
|
||||
)
|
||||
}
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_scalar_div_rem_size_on_gpu(
|
||||
streams,
|
||||
divisor,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
d_bsk.glwe_dimension,
|
||||
d_bsk.polynomial_size,
|
||||
d_bsk.input_lwe_dimension,
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count,
|
||||
d_bsk.decomp_base_log,
|
||||
LweBskGroupingFactor(0),
|
||||
num_blocks,
|
||||
PBSType::Classical,
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
),
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
cuda_backend_get_scalar_div_rem_size_on_gpu(
|
||||
streams,
|
||||
divisor,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
d_multibit_bsk.glwe_dimension,
|
||||
d_multibit_bsk.polynomial_size,
|
||||
d_multibit_bsk.input_lwe_dimension,
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_multibit_bsk.decomp_level_count,
|
||||
d_multibit_bsk.decomp_base_log,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
num_blocks,
|
||||
PBSType::MultiBit,
|
||||
None,
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1174,27 +1171,25 @@ encrypted bits: {numerator_bits}, scalar bits: {}
|
||||
);
|
||||
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
get_signed_scalar_div_integer_radix_kb_size_on_gpu(
|
||||
streams,
|
||||
divisor,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
d_bsk.glwe_dimension,
|
||||
d_bsk.polynomial_size,
|
||||
d_bsk.input_lwe_dimension,
|
||||
d_bsk.decomp_base_log,
|
||||
d_bsk.decomp_level_count,
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
LweBskGroupingFactor(0),
|
||||
num_blocks,
|
||||
PBSType::Classical,
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
)
|
||||
}
|
||||
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_signed_scalar_div_size_on_gpu(
|
||||
streams,
|
||||
divisor,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
d_bsk.glwe_dimension,
|
||||
d_bsk.polynomial_size,
|
||||
d_bsk.input_lwe_dimension,
|
||||
d_bsk.decomp_base_log,
|
||||
d_bsk.decomp_level_count,
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
LweBskGroupingFactor(0),
|
||||
num_blocks,
|
||||
PBSType::Classical,
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
),
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
get_signed_scalar_div_integer_radix_kb_size_on_gpu(
|
||||
cuda_backend_get_signed_scalar_div_size_on_gpu(
|
||||
streams,
|
||||
divisor,
|
||||
self.message_modulus,
|
||||
@@ -1236,46 +1231,44 @@ encrypted bits: {numerator_bits}, scalar bits: {}
|
||||
>= to the number of bits encrypted in the ciphertext"
|
||||
);
|
||||
|
||||
unsafe {
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
get_signed_scalar_div_rem_integer_radix_kb_size_on_gpu(
|
||||
streams,
|
||||
divisor,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
d_bsk.glwe_dimension,
|
||||
d_bsk.polynomial_size,
|
||||
d_bsk.input_lwe_dimension,
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count,
|
||||
d_bsk.decomp_base_log,
|
||||
LweBskGroupingFactor(0),
|
||||
num_blocks,
|
||||
PBSType::Classical,
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
)
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
get_signed_scalar_div_rem_integer_radix_kb_size_on_gpu(
|
||||
streams,
|
||||
divisor,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
d_multibit_bsk.glwe_dimension,
|
||||
d_multibit_bsk.polynomial_size,
|
||||
d_multibit_bsk.input_lwe_dimension,
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_multibit_bsk.decomp_level_count,
|
||||
d_multibit_bsk.decomp_base_log,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
num_blocks,
|
||||
PBSType::MultiBit,
|
||||
None,
|
||||
)
|
||||
}
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
cuda_backend_get_signed_scalar_div_rem_size_on_gpu(
|
||||
streams,
|
||||
divisor,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
d_bsk.glwe_dimension,
|
||||
d_bsk.polynomial_size,
|
||||
d_bsk.input_lwe_dimension,
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count,
|
||||
d_bsk.decomp_base_log,
|
||||
LweBskGroupingFactor(0),
|
||||
num_blocks,
|
||||
PBSType::Classical,
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
)
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
cuda_backend_get_signed_scalar_div_rem_size_on_gpu(
|
||||
streams,
|
||||
divisor,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
d_multibit_bsk.glwe_dimension,
|
||||
d_multibit_bsk.polynomial_size,
|
||||
d_multibit_bsk.input_lwe_dimension,
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_multibit_bsk.decomp_level_count,
|
||||
d_multibit_bsk.decomp_base_log,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
num_blocks,
|
||||
PBSType::MultiBit,
|
||||
None,
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,8 +4,8 @@ use crate::integer::block_decomposition::{BlockDecomposer, DecomposableInto};
|
||||
use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext;
|
||||
use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey};
|
||||
use crate::integer::gpu::{
|
||||
get_full_propagate_assign_size_on_gpu, get_scalar_mul_integer_radix_kb_size_on_gpu,
|
||||
unchecked_scalar_mul_integer_radix_kb_async, PBSType,
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu, cuda_backend_get_scalar_mul_size_on_gpu,
|
||||
cuda_backend_unchecked_scalar_mul, PBSType,
|
||||
};
|
||||
use crate::integer::server_key::ScalarMultiplier;
|
||||
use crate::prelude::CastInto;
|
||||
@@ -114,7 +114,7 @@ impl CudaServerKey {
|
||||
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
unchecked_scalar_mul_integer_radix_kb_async(
|
||||
cuda_backend_unchecked_scalar_mul(
|
||||
streams,
|
||||
ct.as_mut(),
|
||||
decomposed_scalar.as_slice(),
|
||||
@@ -139,7 +139,7 @@ impl CudaServerKey {
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
unchecked_scalar_mul_integer_radix_kb_async(
|
||||
cuda_backend_unchecked_scalar_mul(
|
||||
streams,
|
||||
ct.as_mut(),
|
||||
decomposed_scalar.as_slice(),
|
||||
@@ -286,23 +286,25 @@ impl CudaServerKey {
|
||||
0
|
||||
} else {
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_bsk.input_lwe_dimension(),
|
||||
d_bsk.glwe_dimension(),
|
||||
d_bsk.polynomial_size(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count(),
|
||||
d_bsk.decomp_base_log(),
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
),
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_bsk.input_lwe_dimension(),
|
||||
d_bsk.glwe_dimension(),
|
||||
d_bsk.polynomial_size(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count(),
|
||||
d_bsk.decomp_base_log(),
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
)
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
get_full_propagate_assign_size_on_gpu(
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_multibit_bsk.input_lwe_dimension(),
|
||||
d_multibit_bsk.glwe_dimension(),
|
||||
@@ -328,7 +330,7 @@ impl CudaServerKey {
|
||||
return 0;
|
||||
}
|
||||
let scalar_mul_mem = match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => get_scalar_mul_integer_radix_kb_size_on_gpu(
|
||||
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_scalar_mul_size_on_gpu(
|
||||
streams,
|
||||
decomposed_scalar.as_slice(),
|
||||
self.message_modulus,
|
||||
@@ -348,7 +350,7 @@ impl CudaServerKey {
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
),
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
get_scalar_mul_integer_radix_kb_size_on_gpu(
|
||||
cuda_backend_get_scalar_mul_size_on_gpu(
|
||||
streams,
|
||||
decomposed_scalar.as_slice(),
|
||||
self.message_modulus,
|
||||
|
||||
@@ -3,10 +3,11 @@ use crate::core_crypto::prelude::{CastFrom, LweBskGroupingFactor};
|
||||
use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext;
|
||||
use crate::integer::gpu::server_key::CudaBootstrappingKey;
|
||||
use crate::integer::gpu::{
|
||||
get_full_propagate_assign_size_on_gpu, get_scalar_rotate_left_integer_radix_kb_size_on_gpu,
|
||||
get_scalar_rotate_right_integer_radix_kb_size_on_gpu,
|
||||
unchecked_scalar_rotate_left_integer_radix_kb_assign_async,
|
||||
unchecked_scalar_rotate_right_integer_radix_kb_assign_async, CudaServerKey, PBSType,
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu,
|
||||
cuda_backend_get_scalar_rotate_left_size_on_gpu,
|
||||
cuda_backend_unchecked_scalar_rotate_left_assign,
|
||||
cuda_backend_unchecked_scalar_rotate_right_assign, get_scalar_rotate_right_size_on_gpu,
|
||||
CudaServerKey, PBSType,
|
||||
};
|
||||
|
||||
impl CudaServerKey {
|
||||
@@ -47,7 +48,7 @@ impl CudaServerKey {
|
||||
let lwe_ciphertext_count = ct.as_ref().d_blocks.lwe_ciphertext_count();
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
unchecked_scalar_rotate_left_integer_radix_kb_assign_async(
|
||||
cuda_backend_unchecked_scalar_rotate_left_assign(
|
||||
stream,
|
||||
ct.as_mut(),
|
||||
u32::cast_from(n),
|
||||
@@ -74,7 +75,7 @@ impl CudaServerKey {
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
unchecked_scalar_rotate_left_integer_radix_kb_assign_async(
|
||||
cuda_backend_unchecked_scalar_rotate_left_assign(
|
||||
stream,
|
||||
ct.as_mut(),
|
||||
u32::cast_from(n),
|
||||
@@ -156,7 +157,7 @@ impl CudaServerKey {
|
||||
let lwe_ciphertext_count = ct.as_ref().d_blocks.lwe_ciphertext_count();
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
unchecked_scalar_rotate_right_integer_radix_kb_assign_async(
|
||||
cuda_backend_unchecked_scalar_rotate_right_assign(
|
||||
stream,
|
||||
ct.as_mut(),
|
||||
u32::cast_from(n),
|
||||
@@ -183,7 +184,7 @@ impl CudaServerKey {
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
unchecked_scalar_rotate_right_integer_radix_kb_assign_async(
|
||||
cuda_backend_unchecked_scalar_rotate_right_assign(
|
||||
stream,
|
||||
ct.as_mut(),
|
||||
u32::cast_from(n),
|
||||
@@ -287,23 +288,25 @@ impl CudaServerKey {
|
||||
0
|
||||
} else {
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_bsk.input_lwe_dimension(),
|
||||
d_bsk.glwe_dimension(),
|
||||
d_bsk.polynomial_size(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count(),
|
||||
d_bsk.decomp_base_log(),
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
),
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_bsk.input_lwe_dimension(),
|
||||
d_bsk.glwe_dimension(),
|
||||
d_bsk.polynomial_size(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count(),
|
||||
d_bsk.decomp_base_log(),
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
)
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
get_full_propagate_assign_size_on_gpu(
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_multibit_bsk.input_lwe_dimension(),
|
||||
d_multibit_bsk.glwe_dimension(),
|
||||
@@ -323,7 +326,7 @@ impl CudaServerKey {
|
||||
};
|
||||
let scalar_shift_mem = match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
get_scalar_rotate_left_integer_radix_kb_size_on_gpu(
|
||||
cuda_backend_get_scalar_rotate_left_size_on_gpu(
|
||||
streams,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
@@ -346,7 +349,7 @@ impl CudaServerKey {
|
||||
)
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
get_scalar_rotate_left_integer_radix_kb_size_on_gpu(
|
||||
cuda_backend_get_scalar_rotate_left_size_on_gpu(
|
||||
streams,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
@@ -382,23 +385,25 @@ impl CudaServerKey {
|
||||
0
|
||||
} else {
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_bsk.input_lwe_dimension(),
|
||||
d_bsk.glwe_dimension(),
|
||||
d_bsk.polynomial_size(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count(),
|
||||
d_bsk.decomp_base_log(),
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
),
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_bsk.input_lwe_dimension(),
|
||||
d_bsk.glwe_dimension(),
|
||||
d_bsk.polynomial_size(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count(),
|
||||
d_bsk.decomp_base_log(),
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
)
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
get_full_propagate_assign_size_on_gpu(
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_multibit_bsk.input_lwe_dimension(),
|
||||
d_multibit_bsk.glwe_dimension(),
|
||||
@@ -417,52 +422,48 @@ impl CudaServerKey {
|
||||
}
|
||||
};
|
||||
let scalar_shift_mem = match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
get_scalar_rotate_right_integer_radix_kb_size_on_gpu(
|
||||
streams,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
d_bsk.glwe_dimension,
|
||||
d_bsk.polynomial_size,
|
||||
self.key_switching_key
|
||||
.input_key_lwe_size()
|
||||
.to_lwe_dimension(),
|
||||
self.key_switching_key
|
||||
.output_key_lwe_size()
|
||||
.to_lwe_dimension(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count,
|
||||
d_bsk.decomp_base_log,
|
||||
lwe_ciphertext_count.0 as u32,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
)
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
get_scalar_rotate_right_integer_radix_kb_size_on_gpu(
|
||||
streams,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
d_multibit_bsk.glwe_dimension,
|
||||
d_multibit_bsk.polynomial_size,
|
||||
self.key_switching_key
|
||||
.input_key_lwe_size()
|
||||
.to_lwe_dimension(),
|
||||
self.key_switching_key
|
||||
.output_key_lwe_size()
|
||||
.to_lwe_dimension(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_multibit_bsk.decomp_level_count,
|
||||
d_multibit_bsk.decomp_base_log,
|
||||
lwe_ciphertext_count.0 as u32,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
None,
|
||||
)
|
||||
}
|
||||
CudaBootstrappingKey::Classic(d_bsk) => get_scalar_rotate_right_size_on_gpu(
|
||||
streams,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
d_bsk.glwe_dimension,
|
||||
d_bsk.polynomial_size,
|
||||
self.key_switching_key
|
||||
.input_key_lwe_size()
|
||||
.to_lwe_dimension(),
|
||||
self.key_switching_key
|
||||
.output_key_lwe_size()
|
||||
.to_lwe_dimension(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count,
|
||||
d_bsk.decomp_base_log,
|
||||
lwe_ciphertext_count.0 as u32,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
),
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => get_scalar_rotate_right_size_on_gpu(
|
||||
streams,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
d_multibit_bsk.glwe_dimension,
|
||||
d_multibit_bsk.polynomial_size,
|
||||
self.key_switching_key
|
||||
.input_key_lwe_size()
|
||||
.to_lwe_dimension(),
|
||||
self.key_switching_key
|
||||
.output_key_lwe_size()
|
||||
.to_lwe_dimension(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_multibit_bsk.decomp_level_count,
|
||||
d_multibit_bsk.decomp_base_log,
|
||||
lwe_ciphertext_count.0 as u32,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
None,
|
||||
),
|
||||
};
|
||||
full_prop_mem.max(scalar_shift_mem)
|
||||
}
|
||||
|
||||
@@ -3,13 +3,13 @@ use crate::core_crypto::prelude::{CastFrom, LweBskGroupingFactor};
|
||||
use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext;
|
||||
use crate::integer::gpu::server_key::CudaBootstrappingKey;
|
||||
use crate::integer::gpu::{
|
||||
get_full_propagate_assign_size_on_gpu,
|
||||
get_scalar_arithmetic_right_shift_integer_radix_kb_size_on_gpu,
|
||||
get_scalar_left_shift_integer_radix_kb_size_on_gpu,
|
||||
get_scalar_logical_right_shift_integer_radix_kb_size_on_gpu,
|
||||
unchecked_scalar_arithmetic_right_shift_integer_radix_kb_assign_async,
|
||||
unchecked_scalar_left_shift_integer_radix_kb_assign_async,
|
||||
unchecked_scalar_logical_right_shift_integer_radix_kb_assign_async, CudaServerKey, PBSType,
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu,
|
||||
cuda_backend_get_scalar_arithmetic_right_shift_size_on_gpu,
|
||||
cuda_backend_get_scalar_left_shift_size_on_gpu,
|
||||
cuda_backend_get_scalar_logical_right_shift_size_on_gpu,
|
||||
cuda_backend_unchecked_scalar_arithmetic_right_shift_assign,
|
||||
cuda_backend_unchecked_scalar_left_shift_assign,
|
||||
cuda_backend_unchecked_scalar_logical_right_shift_assign, CudaServerKey, PBSType,
|
||||
};
|
||||
|
||||
impl CudaServerKey {
|
||||
@@ -51,7 +51,7 @@ impl CudaServerKey {
|
||||
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
unchecked_scalar_left_shift_integer_radix_kb_assign_async(
|
||||
cuda_backend_unchecked_scalar_left_shift_assign(
|
||||
streams,
|
||||
ct.as_mut(),
|
||||
u32::cast_from(shift),
|
||||
@@ -78,7 +78,7 @@ impl CudaServerKey {
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
unchecked_scalar_left_shift_integer_radix_kb_assign_async(
|
||||
cuda_backend_unchecked_scalar_left_shift_assign(
|
||||
streams,
|
||||
ct.as_mut(),
|
||||
u32::cast_from(shift),
|
||||
@@ -198,7 +198,7 @@ impl CudaServerKey {
|
||||
if T::IS_SIGNED {
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
unchecked_scalar_arithmetic_right_shift_integer_radix_kb_assign_async(
|
||||
cuda_backend_unchecked_scalar_arithmetic_right_shift_assign(
|
||||
streams,
|
||||
ct.as_mut(),
|
||||
u32::cast_from(shift),
|
||||
@@ -224,7 +224,7 @@ impl CudaServerKey {
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
unchecked_scalar_arithmetic_right_shift_integer_radix_kb_assign_async(
|
||||
cuda_backend_unchecked_scalar_arithmetic_right_shift_assign(
|
||||
streams,
|
||||
ct.as_mut(),
|
||||
u32::cast_from(shift),
|
||||
@@ -253,7 +253,7 @@ impl CudaServerKey {
|
||||
} else {
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
unchecked_scalar_logical_right_shift_integer_radix_kb_assign_async(
|
||||
cuda_backend_unchecked_scalar_logical_right_shift_assign(
|
||||
streams,
|
||||
ct.as_mut(),
|
||||
u32::cast_from(shift),
|
||||
@@ -280,7 +280,7 @@ impl CudaServerKey {
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
unchecked_scalar_logical_right_shift_integer_radix_kb_assign_async(
|
||||
cuda_backend_unchecked_scalar_logical_right_shift_assign(
|
||||
streams,
|
||||
ct.as_mut(),
|
||||
u32::cast_from(shift),
|
||||
@@ -596,7 +596,7 @@ impl CudaServerKey {
|
||||
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
unchecked_scalar_logical_right_shift_integer_radix_kb_assign_async(
|
||||
cuda_backend_unchecked_scalar_logical_right_shift_assign(
|
||||
streams,
|
||||
ct.as_mut(),
|
||||
u32::cast_from(shift),
|
||||
@@ -623,7 +623,7 @@ impl CudaServerKey {
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
unchecked_scalar_logical_right_shift_integer_radix_kb_assign_async(
|
||||
cuda_backend_unchecked_scalar_logical_right_shift_assign(
|
||||
streams,
|
||||
ct.as_mut(),
|
||||
u32::cast_from(shift),
|
||||
@@ -662,23 +662,25 @@ impl CudaServerKey {
|
||||
0
|
||||
} else {
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_bsk.input_lwe_dimension(),
|
||||
d_bsk.glwe_dimension(),
|
||||
d_bsk.polynomial_size(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count(),
|
||||
d_bsk.decomp_base_log(),
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
),
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_bsk.input_lwe_dimension(),
|
||||
d_bsk.glwe_dimension(),
|
||||
d_bsk.polynomial_size(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count(),
|
||||
d_bsk.decomp_base_log(),
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
)
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
get_full_propagate_assign_size_on_gpu(
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_multibit_bsk.input_lwe_dimension(),
|
||||
d_multibit_bsk.glwe_dimension(),
|
||||
@@ -697,31 +699,29 @@ impl CudaServerKey {
|
||||
}
|
||||
};
|
||||
let scalar_shift_mem = match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
get_scalar_left_shift_integer_radix_kb_size_on_gpu(
|
||||
streams,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
d_bsk.glwe_dimension,
|
||||
d_bsk.polynomial_size,
|
||||
self.key_switching_key
|
||||
.input_key_lwe_size()
|
||||
.to_lwe_dimension(),
|
||||
self.key_switching_key
|
||||
.output_key_lwe_size()
|
||||
.to_lwe_dimension(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count,
|
||||
d_bsk.decomp_base_log,
|
||||
lwe_ciphertext_count.0 as u32,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
)
|
||||
}
|
||||
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_scalar_left_shift_size_on_gpu(
|
||||
streams,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
d_bsk.glwe_dimension,
|
||||
d_bsk.polynomial_size,
|
||||
self.key_switching_key
|
||||
.input_key_lwe_size()
|
||||
.to_lwe_dimension(),
|
||||
self.key_switching_key
|
||||
.output_key_lwe_size()
|
||||
.to_lwe_dimension(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count,
|
||||
d_bsk.decomp_base_log,
|
||||
lwe_ciphertext_count.0 as u32,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
),
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
get_scalar_left_shift_integer_radix_kb_size_on_gpu(
|
||||
cuda_backend_get_scalar_left_shift_size_on_gpu(
|
||||
streams,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
@@ -757,23 +757,25 @@ impl CudaServerKey {
|
||||
0
|
||||
} else {
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_bsk.input_lwe_dimension(),
|
||||
d_bsk.glwe_dimension(),
|
||||
d_bsk.polynomial_size(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count(),
|
||||
d_bsk.decomp_base_log(),
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
),
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_bsk.input_lwe_dimension(),
|
||||
d_bsk.glwe_dimension(),
|
||||
d_bsk.polynomial_size(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count(),
|
||||
d_bsk.decomp_base_log(),
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
)
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
get_full_propagate_assign_size_on_gpu(
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_multibit_bsk.input_lwe_dimension(),
|
||||
d_multibit_bsk.glwe_dimension(),
|
||||
@@ -794,7 +796,7 @@ impl CudaServerKey {
|
||||
let scalar_shift_mem = if T::IS_SIGNED {
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
get_scalar_arithmetic_right_shift_integer_radix_kb_size_on_gpu(
|
||||
cuda_backend_get_scalar_arithmetic_right_shift_size_on_gpu(
|
||||
streams,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
@@ -817,7 +819,7 @@ impl CudaServerKey {
|
||||
)
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
get_scalar_arithmetic_right_shift_integer_radix_kb_size_on_gpu(
|
||||
cuda_backend_get_scalar_arithmetic_right_shift_size_on_gpu(
|
||||
streams,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
@@ -843,7 +845,7 @@ impl CudaServerKey {
|
||||
} else {
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
get_scalar_logical_right_shift_integer_radix_kb_size_on_gpu(
|
||||
cuda_backend_get_scalar_logical_right_shift_size_on_gpu(
|
||||
streams,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
@@ -866,7 +868,7 @@ impl CudaServerKey {
|
||||
)
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
get_scalar_logical_right_shift_integer_radix_kb_size_on_gpu(
|
||||
cuda_backend_get_scalar_logical_right_shift_size_on_gpu(
|
||||
streams,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
|
||||
@@ -3,10 +3,9 @@ use crate::core_crypto::prelude::LweBskGroupingFactor;
|
||||
use crate::integer::gpu::ciphertext::{CudaIntegerRadixCiphertext, CudaUnsignedRadixCiphertext};
|
||||
use crate::integer::gpu::server_key::CudaBootstrappingKey;
|
||||
use crate::integer::gpu::{
|
||||
get_full_propagate_assign_size_on_gpu, get_left_shift_integer_radix_kb_size_on_gpu,
|
||||
get_right_shift_integer_radix_kb_size_on_gpu,
|
||||
unchecked_left_shift_integer_radix_kb_assign_async,
|
||||
unchecked_right_shift_integer_radix_kb_assign_async, CudaServerKey, PBSType,
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu, cuda_backend_get_left_shift_size_on_gpu,
|
||||
cuda_backend_get_right_shift_size_on_gpu, cuda_backend_unchecked_left_shift_assign,
|
||||
cuda_backend_unchecked_right_shift_assign, CudaServerKey, PBSType,
|
||||
};
|
||||
|
||||
impl CudaServerKey {
|
||||
@@ -27,7 +26,7 @@ impl CudaServerKey {
|
||||
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
unchecked_right_shift_integer_radix_kb_assign_async(
|
||||
cuda_backend_unchecked_right_shift_assign(
|
||||
streams,
|
||||
ct.as_mut(),
|
||||
shift.as_ref(),
|
||||
@@ -55,7 +54,7 @@ impl CudaServerKey {
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
unchecked_right_shift_integer_radix_kb_assign_async(
|
||||
cuda_backend_unchecked_right_shift_assign(
|
||||
streams,
|
||||
ct.as_mut(),
|
||||
shift.as_ref(),
|
||||
@@ -146,7 +145,7 @@ impl CudaServerKey {
|
||||
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
unchecked_left_shift_integer_radix_kb_assign_async(
|
||||
cuda_backend_unchecked_left_shift_assign(
|
||||
streams,
|
||||
ct.as_mut(),
|
||||
shift.as_ref(),
|
||||
@@ -174,7 +173,7 @@ impl CudaServerKey {
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
unchecked_left_shift_integer_radix_kb_assign_async(
|
||||
cuda_backend_unchecked_left_shift_assign(
|
||||
streams,
|
||||
ct.as_mut(),
|
||||
shift.as_ref(),
|
||||
@@ -569,23 +568,25 @@ impl CudaServerKey {
|
||||
ct_right.as_ref().d_blocks.lwe_ciphertext_count()
|
||||
);
|
||||
let full_prop_mem = match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_bsk.input_lwe_dimension(),
|
||||
d_bsk.glwe_dimension(),
|
||||
d_bsk.polynomial_size(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count(),
|
||||
d_bsk.decomp_base_log(),
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
),
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_bsk.input_lwe_dimension(),
|
||||
d_bsk.glwe_dimension(),
|
||||
d_bsk.polynomial_size(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count(),
|
||||
d_bsk.decomp_base_log(),
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
)
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
get_full_propagate_assign_size_on_gpu(
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_multibit_bsk.input_lwe_dimension(),
|
||||
d_multibit_bsk.glwe_dimension(),
|
||||
@@ -615,7 +616,7 @@ impl CudaServerKey {
|
||||
let lwe_ciphertext_count = ct_left.as_ref().d_blocks.lwe_ciphertext_count();
|
||||
|
||||
let shift_mem = match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => get_left_shift_integer_radix_kb_size_on_gpu(
|
||||
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_left_shift_size_on_gpu(
|
||||
streams,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
@@ -638,7 +639,7 @@ impl CudaServerKey {
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
),
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
get_left_shift_integer_radix_kb_size_on_gpu(
|
||||
cuda_backend_get_left_shift_size_on_gpu(
|
||||
streams,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
@@ -680,23 +681,25 @@ impl CudaServerKey {
|
||||
ct_right.as_ref().d_blocks.lwe_ciphertext_count()
|
||||
);
|
||||
let full_prop_mem = match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_bsk.input_lwe_dimension(),
|
||||
d_bsk.glwe_dimension(),
|
||||
d_bsk.polynomial_size(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count(),
|
||||
d_bsk.decomp_base_log(),
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
),
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_bsk.input_lwe_dimension(),
|
||||
d_bsk.glwe_dimension(),
|
||||
d_bsk.polynomial_size(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count(),
|
||||
d_bsk.decomp_base_log(),
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
)
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
get_full_propagate_assign_size_on_gpu(
|
||||
cuda_backend_get_full_propagate_assign_size_on_gpu(
|
||||
streams,
|
||||
d_multibit_bsk.input_lwe_dimension(),
|
||||
d_multibit_bsk.glwe_dimension(),
|
||||
@@ -726,7 +729,7 @@ impl CudaServerKey {
|
||||
let lwe_ciphertext_count = ct_left.as_ref().d_blocks.lwe_ciphertext_count();
|
||||
|
||||
let shift_mem = match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => get_right_shift_integer_radix_kb_size_on_gpu(
|
||||
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_right_shift_size_on_gpu(
|
||||
streams,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
@@ -749,7 +752,7 @@ impl CudaServerKey {
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
),
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
get_right_shift_integer_radix_kb_size_on_gpu(
|
||||
cuda_backend_get_right_shift_size_on_gpu(
|
||||
streams,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
|
||||
@@ -8,8 +8,8 @@ use crate::integer::gpu::server_key::CudaServerKey;
|
||||
|
||||
use crate::integer::gpu::server_key::CudaBootstrappingKey;
|
||||
use crate::integer::gpu::{
|
||||
sub_and_propagate_single_carry_assign_async,
|
||||
unchecked_unsigned_overflowing_sub_integer_radix_kb_assign_async, PBSType,
|
||||
cuda_backend_sub_and_propagate_single_carry_assign,
|
||||
cuda_backend_unchecked_unsigned_overflowing_sub_assign, PBSType,
|
||||
};
|
||||
use crate::integer::server_key::radix_parallel::OutputFlag;
|
||||
use crate::shortint::parameters::LweBskGroupingFactor;
|
||||
@@ -264,7 +264,7 @@ impl CudaServerKey {
|
||||
) {
|
||||
(true, true) => (ct_left, ct_right),
|
||||
(true, false) => {
|
||||
tmp_rhs = ct_right.duplicate_async(streams);
|
||||
tmp_rhs = ct_right.duplicate(streams);
|
||||
self.full_propagate_assign_async(&mut tmp_rhs, streams);
|
||||
(ct_left, &tmp_rhs)
|
||||
}
|
||||
@@ -273,7 +273,7 @@ impl CudaServerKey {
|
||||
(ct_left, ct_right)
|
||||
}
|
||||
(false, false) => {
|
||||
tmp_rhs = ct_right.duplicate_async(streams);
|
||||
tmp_rhs = ct_right.duplicate(streams);
|
||||
|
||||
self.full_propagate_assign_async(ct_left, streams);
|
||||
self.full_propagate_assign_async(&mut tmp_rhs, streams);
|
||||
@@ -281,13 +281,8 @@ impl CudaServerKey {
|
||||
}
|
||||
};
|
||||
|
||||
let _carry = self.sub_and_propagate_single_carry_assign_async(
|
||||
lhs,
|
||||
rhs,
|
||||
streams,
|
||||
None,
|
||||
OutputFlag::None,
|
||||
);
|
||||
let _carry =
|
||||
self.sub_and_propagate_single_carry_assign(lhs, rhs, streams, None, OutputFlag::None);
|
||||
}
|
||||
|
||||
pub fn get_sub_assign_size_on_gpu<T: CudaIntegerRadixCiphertext>(
|
||||
@@ -314,22 +309,22 @@ impl CudaServerKey {
|
||||
(true, true) => (ct_left, ct_right),
|
||||
(true, false) => {
|
||||
unsafe {
|
||||
tmp_rhs = ct_right.duplicate_async(stream);
|
||||
tmp_rhs = ct_right.duplicate(stream);
|
||||
self.full_propagate_assign_async(&mut tmp_rhs, stream);
|
||||
}
|
||||
(ct_left, &tmp_rhs)
|
||||
}
|
||||
(false, true) => {
|
||||
unsafe {
|
||||
tmp_lhs = ct_left.duplicate_async(stream);
|
||||
tmp_lhs = ct_left.duplicate(stream);
|
||||
self.full_propagate_assign_async(&mut tmp_lhs, stream);
|
||||
}
|
||||
(&tmp_lhs, ct_right)
|
||||
}
|
||||
(false, false) => {
|
||||
unsafe {
|
||||
tmp_lhs = ct_left.duplicate_async(stream);
|
||||
tmp_rhs = ct_right.duplicate_async(stream);
|
||||
tmp_lhs = ct_left.duplicate(stream);
|
||||
tmp_rhs = ct_right.duplicate(stream);
|
||||
|
||||
self.full_propagate_assign_async(&mut tmp_lhs, stream);
|
||||
self.full_propagate_assign_async(&mut tmp_rhs, stream);
|
||||
@@ -383,17 +378,18 @@ impl CudaServerKey {
|
||||
const INPUT_BORROW: Option<&CudaBooleanBlock> = None;
|
||||
|
||||
let mut overflow_block: CudaUnsignedRadixCiphertext =
|
||||
self.create_trivial_zero_radix(1, stream);
|
||||
self.create_trivial_zero_radix_async(1, stream);
|
||||
let ciphertext = ct_res.as_mut();
|
||||
let uses_input_borrow = INPUT_BORROW.map_or(0u32, |_block| 1u32);
|
||||
|
||||
let aux_block: CudaUnsignedRadixCiphertext = self.create_trivial_zero_radix(1, stream);
|
||||
let aux_block: CudaUnsignedRadixCiphertext =
|
||||
self.create_trivial_zero_radix_async(1, stream);
|
||||
let in_carry_dvec =
|
||||
INPUT_BORROW.map_or_else(|| aux_block.as_ref(), |block| block.as_ref().as_ref());
|
||||
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
unchecked_unsigned_overflowing_sub_integer_radix_kb_assign_async(
|
||||
cuda_backend_unchecked_unsigned_overflowing_sub_assign(
|
||||
stream,
|
||||
ciphertext,
|
||||
rhs.as_ref(),
|
||||
@@ -418,7 +414,7 @@ impl CudaServerKey {
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
unchecked_unsigned_overflowing_sub_integer_radix_kb_assign_async(
|
||||
cuda_backend_unchecked_unsigned_overflowing_sub_assign(
|
||||
stream,
|
||||
ciphertext,
|
||||
rhs.as_ref(),
|
||||
@@ -452,7 +448,7 @@ impl CudaServerKey {
|
||||
///
|
||||
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
|
||||
/// not be dropped until streams is synchronized
|
||||
pub(crate) unsafe fn sub_and_propagate_single_carry_assign_async<T>(
|
||||
pub(crate) unsafe fn sub_and_propagate_single_carry_assign<T>(
|
||||
&self,
|
||||
lhs: &mut T,
|
||||
rhs: &T,
|
||||
@@ -463,17 +459,17 @@ impl CudaServerKey {
|
||||
where
|
||||
T: CudaIntegerRadixCiphertext,
|
||||
{
|
||||
let mut carry_out: T = self.create_trivial_zero_radix(1, streams);
|
||||
let mut carry_out: T = self.create_trivial_zero_radix_async(1, streams);
|
||||
|
||||
let num_blocks = lhs.as_mut().d_blocks.lwe_ciphertext_count().0 as u32;
|
||||
let uses_carry = input_carry.map_or(0u32, |_block| 1u32);
|
||||
let aux_block: T = self.create_trivial_zero_radix(1, streams);
|
||||
let aux_block: T = self.create_trivial_zero_radix_async(1, streams);
|
||||
let in_carry: &CudaRadixCiphertext =
|
||||
input_carry.map_or_else(|| aux_block.as_ref(), |block| block.0.as_ref());
|
||||
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
sub_and_propagate_single_carry_assign_async(
|
||||
cuda_backend_sub_and_propagate_single_carry_assign(
|
||||
streams,
|
||||
lhs.as_mut(),
|
||||
rhs.as_ref(),
|
||||
@@ -499,7 +495,7 @@ impl CudaServerKey {
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
sub_and_propagate_single_carry_assign_async(
|
||||
cuda_backend_sub_and_propagate_single_carry_assign(
|
||||
streams,
|
||||
lhs.as_mut(),
|
||||
rhs.as_ref(),
|
||||
|
||||
Reference in New Issue
Block a user