chore(gpu): internal renaming

Agnes Leroy, 2025-10-13 13:59:18 +02:00
committed by Agnès Leroy
parent 6347f25668
commit c3ed1a7558
70 changed files with 1920 additions and 1965 deletions
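
Almost every change below follows one mechanical pattern: the _kb suffix is dropped from the scratch/compute/cleanup entry points and host helpers, and the integer_radix_ infix is removed where the shorter name stays unambiguous (some symbols keep it, e.g. the radix multiplication and division entry points). A minimal sketch of that rule, with rename pairs taken verbatim from the hunks below; the helper itself is illustrative only, not part of the codebase:

#include <cassert>
#include <regex>
#include <string>

// Illustrative helper mirroring this commit's rename rule (not library code).
static std::string rename_symbol(std::string s) {
  s = std::regex_replace(s, std::regex("_kb"), "");
  s = std::regex_replace(s, std::regex("integer_radix_"), "");
  return s;
}

int main() {
  // Both pairs appear verbatim in the diff below.
  assert(rename_symbol("scratch_cuda_apply_univariate_lut_kb_64") ==
         "scratch_cuda_apply_univariate_lut_64");
  assert(rename_symbol("cuda_integer_radix_logical_scalar_shift_kb_64_inplace") ==
         "cuda_logical_scalar_shift_64_inplace");
  return 0;
}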

View File

@@ -106,7 +106,7 @@ typedef struct {
uint32_t polynomial_size;
} CudaPackedGlweCiphertextListFFI;
-uint64_t scratch_cuda_apply_univariate_lut_kb_64(
+uint64_t scratch_cuda_apply_univariate_lut_64(
CudaStreamsFFI streams, int8_t **mem_ptr, void const *input_lut,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
@@ -114,7 +114,7 @@ uint64_t scratch_cuda_apply_univariate_lut_kb_64(
uint32_t input_lwe_ciphertext_count, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, uint64_t lut_degree,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
-uint64_t scratch_cuda_apply_many_univariate_lut_kb_64(
+uint64_t scratch_cuda_apply_many_univariate_lut_64(
CudaStreamsFFI streams, int8_t **mem_ptr, void const *input_lut,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
@@ -122,15 +122,16 @@ uint64_t scratch_cuda_apply_many_univariate_lut_kb_64(
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
uint32_t num_many_lut, uint64_t lut_degree, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_apply_univariate_lut_kb_64(
-CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
-CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
-void *const *ksks, void *const *bsks);
+void cuda_apply_univariate_lut_64(CudaStreamsFFI streams,
+CudaRadixCiphertextFFI *output_radix_lwe,
+CudaRadixCiphertextFFI const *input_radix_lwe,
+int8_t *mem_ptr, void *const *ksks,
+void *const *bsks);
-void cleanup_cuda_apply_univariate_lut_kb_64(CudaStreamsFFI streams,
-int8_t **mem_ptr_void);
+void cleanup_cuda_apply_univariate_lut_64(CudaStreamsFFI streams,
+int8_t **mem_ptr_void);
-uint64_t scratch_cuda_apply_bivariate_lut_kb_64(
+uint64_t scratch_cuda_apply_bivariate_lut_64(
CudaStreamsFFI streams, int8_t **mem_ptr, void const *input_lut,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
@@ -139,17 +140,17 @@ uint64_t scratch_cuda_apply_bivariate_lut_kb_64(
uint32_t carry_modulus, PBS_TYPE pbs_type, uint64_t lut_degree,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_apply_bivariate_lut_kb_64(
+void cuda_apply_bivariate_lut_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI const *input_radix_lwe_1,
CudaRadixCiphertextFFI const *input_radix_lwe_2, int8_t *mem_ptr,
void *const *ksks, void *const *bsks, uint32_t num_radix_blocks,
uint32_t shift);
-void cleanup_cuda_apply_bivariate_lut_kb_64(CudaStreamsFFI streams,
-int8_t **mem_ptr_void);
+void cleanup_cuda_apply_bivariate_lut_64(CudaStreamsFFI streams,
+int8_t **mem_ptr_void);
-void cuda_apply_many_univariate_lut_kb_64(
+void cuda_apply_many_univariate_lut_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
void *const *ksks, void *const *bsks, uint32_t num_luts,
@@ -171,7 +172,7 @@ void cuda_full_propagation_64_inplace(CudaStreamsFFI streams,
void cleanup_cuda_full_propagation(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
-uint64_t scratch_cuda_integer_mult_radix_ciphertext_kb_64(
+uint64_t scratch_cuda_integer_mult_radix_ciphertext_64(
CudaStreamsFFI streams, int8_t **mem_ptr, bool const is_boolean_left,
bool const is_boolean_right, uint32_t message_modulus,
uint32_t carry_modulus, uint32_t glwe_dimension, uint32_t lwe_dimension,
@@ -180,7 +181,7 @@ uint64_t scratch_cuda_integer_mult_radix_ciphertext_kb_64(
uint32_t num_blocks, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_integer_mult_radix_ciphertext_kb_64(
+void cuda_integer_mult_radix_ciphertext_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI const *radix_lwe_left, bool const is_bool_left,
CudaRadixCiphertextFFI const *radix_lwe_right, bool const is_bool_right,
@@ -189,17 +190,18 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
void cleanup_cuda_integer_mult(CudaStreamsFFI streams, int8_t **mem_ptr_void);
-void cuda_negate_integer_radix_ciphertext_64(
-CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
-CudaRadixCiphertextFFI const *lwe_array_in, uint32_t message_modulus,
-uint32_t carry_modulus, uint32_t num_radix_blocks);
+void cuda_negate_ciphertext_64(CudaStreamsFFI streams,
+CudaRadixCiphertextFFI *lwe_array_out,
+CudaRadixCiphertextFFI const *lwe_array_in,
+uint32_t message_modulus, uint32_t carry_modulus,
+uint32_t num_radix_blocks);
-void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
+void cuda_scalar_addition_ciphertext_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
void const *scalar_input, void const *h_scalar_input, uint32_t num_scalars,
uint32_t message_modulus, uint32_t carry_modulus);
-uint64_t scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
+uint64_t scratch_cuda_logical_scalar_shift_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -208,11 +210,12 @@ uint64_t scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_integer_radix_logical_scalar_shift_kb_64_inplace(
-CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
-int8_t *mem_ptr, void *const *bsks, void *const *ksks);
+void cuda_logical_scalar_shift_64_inplace(CudaStreamsFFI streams,
+CudaRadixCiphertextFFI *lwe_array,
+uint32_t shift, int8_t *mem_ptr,
+void *const *bsks, void *const *ksks);
-uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
+uint64_t scratch_cuda_arithmetic_scalar_shift_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -221,17 +224,19 @@ uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace(
-CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
-int8_t *mem_ptr, void *const *bsks, void *const *ksks);
+void cuda_arithmetic_scalar_shift_64_inplace(CudaStreamsFFI streams,
+CudaRadixCiphertextFFI *lwe_array,
+uint32_t shift, int8_t *mem_ptr,
+void *const *bsks,
+void *const *ksks);
-void cleanup_cuda_integer_radix_logical_scalar_shift(CudaStreamsFFI streams,
-int8_t **mem_ptr_void);
+void cleanup_cuda_logical_scalar_shift(CudaStreamsFFI streams,
+int8_t **mem_ptr_void);
-void cleanup_cuda_integer_radix_arithmetic_scalar_shift(CudaStreamsFFI streams,
-int8_t **mem_ptr_void);
+void cleanup_cuda_arithmetic_scalar_shift(CudaStreamsFFI streams,
+int8_t **mem_ptr_void);
-uint64_t scratch_cuda_integer_radix_shift_and_rotate_kb_64(
+uint64_t scratch_cuda_shift_and_rotate_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -240,15 +245,16 @@ uint64_t scratch_cuda_integer_radix_shift_and_rotate_kb_64(
PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type, bool is_signed,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_integer_radix_shift_and_rotate_kb_64_inplace(
-CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
-CudaRadixCiphertextFFI const *lwe_shift, int8_t *mem_ptr, void *const *bsks,
-void *const *ksks);
+void cuda_shift_and_rotate_64_inplace(CudaStreamsFFI streams,
+CudaRadixCiphertextFFI *lwe_array,
+CudaRadixCiphertextFFI const *lwe_shift,
+int8_t *mem_ptr, void *const *bsks,
+void *const *ksks);
-void cleanup_cuda_integer_radix_shift_and_rotate(CudaStreamsFFI streams,
-int8_t **mem_ptr_void);
+void cleanup_cuda_shift_and_rotate(CudaStreamsFFI streams,
+int8_t **mem_ptr_void);
-uint64_t scratch_cuda_integer_radix_comparison_kb_64(
+uint64_t scratch_cuda_comparison_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -258,13 +264,14 @@ uint64_t scratch_cuda_integer_radix_comparison_kb_64(
bool is_signed, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_comparison_integer_radix_ciphertext_kb_64(
-CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
-CudaRadixCiphertextFFI const *lwe_array_1,
-CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
-void *const *bsks, void *const *ksks);
+void cuda_comparison_ciphertext_64(CudaStreamsFFI streams,
+CudaRadixCiphertextFFI *lwe_array_out,
+CudaRadixCiphertextFFI const *lwe_array_1,
+CudaRadixCiphertextFFI const *lwe_array_2,
+int8_t *mem_ptr, void *const *bsks,
+void *const *ksks);
-void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
+void cuda_scalar_comparison_ciphertext_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, void const *scalar_blocks,
void const *h_scalar_blocks, int8_t *mem_ptr, void *const *bsks,
@@ -273,7 +280,7 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
void cleanup_cuda_integer_comparison(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
-uint64_t scratch_cuda_integer_radix_bitop_kb_64(
+uint64_t scratch_cuda_bitop_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -282,13 +289,14 @@ uint64_t scratch_cuda_integer_radix_bitop_kb_64(
uint32_t carry_modulus, PBS_TYPE pbs_type, BITOP_TYPE op_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_bitop_integer_radix_ciphertext_kb_64(
-CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
-CudaRadixCiphertextFFI const *lwe_array_1,
-CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
-void *const *bsks, void *const *ksks);
+void cuda_bitop_ciphertext_64(CudaStreamsFFI streams,
+CudaRadixCiphertextFFI *lwe_array_out,
+CudaRadixCiphertextFFI const *lwe_array_1,
+CudaRadixCiphertextFFI const *lwe_array_2,
+int8_t *mem_ptr, void *const *bsks,
+void *const *ksks);
-void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
+void cuda_scalar_bitop_ciphertext_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_input, void const *clear_blocks,
void const *h_clear_blocks, uint32_t num_clear_blocks, int8_t *mem_ptr,
@@ -296,26 +304,28 @@ void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
void cleanup_cuda_integer_bitop(CudaStreamsFFI streams, int8_t **mem_ptr_void);
-uint64_t scratch_cuda_integer_radix_cmux_kb_64(
-CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-uint32_t polynomial_size, uint32_t big_lwe_dimension,
-uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
-PBS_MS_REDUCTION_T noise_reduction_type);
+uint64_t scratch_cuda_cmux_64(CudaStreamsFFI streams, int8_t **mem_ptr,
+uint32_t glwe_dimension, uint32_t polynomial_size,
+uint32_t big_lwe_dimension,
+uint32_t small_lwe_dimension, uint32_t ks_level,
+uint32_t ks_base_log, uint32_t pbs_level,
+uint32_t pbs_base_log, uint32_t grouping_factor,
+uint32_t lwe_ciphertext_count,
+uint32_t message_modulus, uint32_t carry_modulus,
+PBS_TYPE pbs_type, bool allocate_gpu_memory,
+PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_cmux_integer_radix_ciphertext_kb_64(
-CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
-CudaRadixCiphertextFFI const *lwe_condition,
-CudaRadixCiphertextFFI const *lwe_array_true,
-CudaRadixCiphertextFFI const *lwe_array_false, int8_t *mem_ptr,
-void *const *bsks, void *const *ksks);
+void cuda_cmux_ciphertext_64(CudaStreamsFFI streams,
+CudaRadixCiphertextFFI *lwe_array_out,
+CudaRadixCiphertextFFI const *lwe_condition,
+CudaRadixCiphertextFFI const *lwe_array_true,
+CudaRadixCiphertextFFI const *lwe_array_false,
+int8_t *mem_ptr, void *const *bsks,
+void *const *ksks);
-void cleanup_cuda_integer_radix_cmux(CudaStreamsFFI streams,
-int8_t **mem_ptr_void);
+void cleanup_cuda_cmux(CudaStreamsFFI streams, int8_t **mem_ptr_void);
-uint64_t scratch_cuda_integer_radix_scalar_rotate_kb_64(
+uint64_t scratch_cuda_scalar_rotate_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -324,14 +334,14 @@ uint64_t scratch_cuda_integer_radix_scalar_rotate_kb_64(
PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_integer_radix_scalar_rotate_kb_64_inplace(
-CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array, uint32_t n,
-int8_t *mem_ptr, void *const *bsks, void *const *ksks);
+void cuda_scalar_rotate_64_inplace(CudaStreamsFFI streams,
+CudaRadixCiphertextFFI *lwe_array,
+uint32_t n, int8_t *mem_ptr,
+void *const *bsks, void *const *ksks);
-void cleanup_cuda_integer_radix_scalar_rotate(CudaStreamsFFI streams,
-int8_t **mem_ptr_void);
+void cleanup_cuda_scalar_rotate(CudaStreamsFFI streams, int8_t **mem_ptr_void);
-uint64_t scratch_cuda_propagate_single_carry_kb_64_inplace(
+uint64_t scratch_cuda_propagate_single_carry_64_inplace(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -340,7 +350,7 @@ uint64_t scratch_cuda_propagate_single_carry_kb_64_inplace(
PBS_TYPE pbs_type, uint32_t requested_flag, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
-uint64_t scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
+uint64_t scratch_cuda_add_and_propagate_single_carry_64_inplace(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -349,13 +359,13 @@ uint64_t scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
PBS_TYPE pbs_type, uint32_t requested_flag, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_propagate_single_carry_kb_64_inplace(
+void cuda_propagate_single_carry_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
CudaRadixCiphertextFFI *carry_out, const CudaRadixCiphertextFFI *carry_in,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
uint32_t requested_flag, uint32_t uses_carry);
-void cuda_add_and_propagate_single_carry_kb_64_inplace(
+void cuda_add_and_propagate_single_carry_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lhs_array,
const CudaRadixCiphertextFFI *rhs_array, CudaRadixCiphertextFFI *carry_out,
const CudaRadixCiphertextFFI *carry_in, int8_t *mem_ptr, void *const *bsks,
@@ -367,7 +377,7 @@ void cleanup_cuda_propagate_single_carry(CudaStreamsFFI streams,
void cleanup_cuda_add_and_propagate_single_carry(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
-uint64_t scratch_cuda_integer_overflowing_sub_kb_64_inplace(
+uint64_t scratch_cuda_integer_overflowing_sub_64_inplace(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -376,7 +386,7 @@ uint64_t scratch_cuda_integer_overflowing_sub_kb_64_inplace(
PBS_TYPE pbs_type, uint32_t compute_overflow, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_integer_overflowing_sub_kb_64_inplace(
+void cuda_integer_overflowing_sub_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lhs_array,
const CudaRadixCiphertextFFI *rhs_array,
CudaRadixCiphertextFFI *overflow_block,
@@ -387,7 +397,7 @@ void cuda_integer_overflowing_sub_kb_64_inplace(
void cleanup_cuda_integer_overflowing_sub(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
-uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
+uint64_t scratch_cuda_partial_sum_ciphertexts_vec_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
@@ -397,15 +407,16 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
bool reduce_degrees_for_single_carry_propagation, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
-CudaStreamsFFI streams, CudaRadixCiphertextFFI *radix_lwe_out,
-CudaRadixCiphertextFFI *radix_lwe_vec, int8_t *mem_ptr, void *const *bsks,
-void *const *ksks);
+void cuda_partial_sum_ciphertexts_vec_64(CudaStreamsFFI streams,
+CudaRadixCiphertextFFI *radix_lwe_out,
+CudaRadixCiphertextFFI *radix_lwe_vec,
+int8_t *mem_ptr, void *const *bsks,
+void *const *ksks);
-void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
-CudaStreamsFFI streams, int8_t **mem_ptr_void);
+void cleanup_cuda_partial_sum_ciphertexts_vec(CudaStreamsFFI streams,
+int8_t **mem_ptr_void);
-uint64_t scratch_cuda_integer_scalar_mul_kb_64(
+uint64_t scratch_cuda_integer_scalar_mul_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
@@ -413,16 +424,15 @@ uint64_t scratch_cuda_integer_scalar_mul_kb_64(
uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t num_scalar_bits,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
+void cuda_scalar_multiplication_ciphertext_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
uint64_t const *decomposed_scalar, uint64_t const *has_at_least_one_set,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
uint32_t polynomial_size, uint32_t message_modulus, uint32_t num_scalars);
-void cleanup_cuda_integer_radix_scalar_mul(CudaStreamsFFI streams,
-int8_t **mem_ptr_void);
+void cleanup_cuda_scalar_mul(CudaStreamsFFI streams, int8_t **mem_ptr_void);
-uint64_t scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
+uint64_t scratch_cuda_integer_div_rem_radix_ciphertext_64(
CudaStreamsFFI streams, bool is_signed, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
@@ -431,7 +441,7 @@ uint64_t scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_integer_div_rem_radix_ciphertext_kb_64(
+void cuda_integer_div_rem_radix_ciphertext_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *quotient,
CudaRadixCiphertextFFI *remainder, CudaRadixCiphertextFFI const *numerator,
CudaRadixCiphertextFFI const *divisor, bool is_signed, int8_t *mem_ptr,
@@ -460,7 +470,7 @@ void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
void cuda_integer_reverse_blocks_64_inplace(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *lwe_array);
-uint64_t scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
+uint64_t scratch_cuda_integer_abs_inplace_radix_ciphertext_64(
CudaStreamsFFI streams, int8_t **mem_ptr, bool is_signed,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
@@ -469,14 +479,14 @@ uint64_t scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_integer_abs_inplace_radix_ciphertext_kb_64(
+void cuda_integer_abs_inplace_radix_ciphertext_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *ct, int8_t *mem_ptr,
bool is_signed, void *const *bsks, void *const *ksks);
void cleanup_cuda_integer_abs_inplace(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
-uint64_t scratch_cuda_integer_are_all_comparisons_block_true_kb_64(
+uint64_t scratch_cuda_integer_are_all_comparisons_block_true_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -485,7 +495,7 @@ uint64_t scratch_cuda_integer_are_all_comparisons_block_true_kb_64(
PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_integer_are_all_comparisons_block_true_kb_64(
+void cuda_integer_are_all_comparisons_block_true_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, int8_t *mem_ptr,
void *const *bsks, void *const *ksks, uint32_t num_radix_blocks);
@@ -493,7 +503,7 @@ void cuda_integer_are_all_comparisons_block_true_kb_64(
void cleanup_cuda_integer_are_all_comparisons_block_true(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
-uint64_t scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
+uint64_t scratch_cuda_integer_is_at_least_one_comparisons_block_true_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -502,7 +512,7 @@ uint64_t scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
+void cuda_integer_is_at_least_one_comparisons_block_true_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, int8_t *mem_ptr,
void *const *bsks, void *const *ksks, uint32_t num_radix_blocks);
@@ -518,7 +528,7 @@ void trim_radix_blocks_lsb_64(CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input,
CudaStreamsFFI streams);
-uint64_t scratch_cuda_apply_noise_squashing_kb(
+uint64_t scratch_cuda_apply_noise_squashing(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_glwe_dimension, uint32_t input_polynomial_size,
@@ -528,15 +538,16 @@ uint64_t scratch_cuda_apply_noise_squashing_kb(
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_apply_noise_squashing_kb(
-CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
-CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
-void *const *ksks, void *const *bsks);
+void cuda_apply_noise_squashing(CudaStreamsFFI streams,
+CudaRadixCiphertextFFI *output_radix_lwe,
+CudaRadixCiphertextFFI const *input_radix_lwe,
+int8_t *mem_ptr, void *const *ksks,
+void *const *bsks);
-void cleanup_cuda_apply_noise_squashing_kb(CudaStreamsFFI streams,
-int8_t **mem_ptr_void);
+void cleanup_cuda_apply_noise_squashing(CudaStreamsFFI streams,
+int8_t **mem_ptr_void);
-uint64_t scratch_cuda_sub_and_propagate_single_carry_kb_64_inplace(
+uint64_t scratch_cuda_sub_and_propagate_single_carry_64_inplace(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -545,7 +556,7 @@ uint64_t scratch_cuda_sub_and_propagate_single_carry_kb_64_inplace(
PBS_TYPE pbs_type, uint32_t requested_flag, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_sub_and_propagate_single_carry_kb_64_inplace(
+void cuda_sub_and_propagate_single_carry_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lhs_array,
const CudaRadixCiphertextFFI *rhs_array, CudaRadixCiphertextFFI *carry_out,
const CudaRadixCiphertextFFI *carry_in, int8_t *mem_ptr, void *const *bsks,
@@ -554,7 +565,7 @@ void cuda_sub_and_propagate_single_carry_kb_64_inplace(
void cleanup_cuda_sub_and_propagate_single_carry(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
-uint64_t scratch_cuda_integer_unsigned_scalar_div_radix_kb_64(
+uint64_t scratch_cuda_integer_unsigned_scalar_div_radix_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
@@ -563,13 +574,13 @@ uint64_t scratch_cuda_integer_unsigned_scalar_div_radix_kb_64(
const CudaScalarDivisorFFI *scalar_divisor_ffi, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_integer_unsigned_scalar_div_radix_kb_64(
+void cuda_integer_unsigned_scalar_div_radix_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *numerator_ct,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
const CudaScalarDivisorFFI *scalar_divisor_ffi);
-void cleanup_cuda_integer_unsigned_scalar_div_radix_kb_64(
-CudaStreamsFFI streams, int8_t **mem_ptr_void);
+void cleanup_cuda_integer_unsigned_scalar_div_radix_64(CudaStreamsFFI streams,
+int8_t **mem_ptr_void);
uint64_t scratch_cuda_extend_radix_with_sign_msb_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
@@ -590,7 +601,7 @@ void cuda_extend_radix_with_sign_msb_64(CudaStreamsFFI streams,
void cleanup_cuda_extend_radix_with_sign_msb_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
-uint64_t scratch_cuda_integer_signed_scalar_div_radix_kb_64(
+uint64_t scratch_cuda_integer_signed_scalar_div_radix_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
@@ -599,15 +610,15 @@ uint64_t scratch_cuda_integer_signed_scalar_div_radix_kb_64(
const CudaScalarDivisorFFI *scalar_divisor_ffi, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_integer_signed_scalar_div_radix_kb_64(
+void cuda_integer_signed_scalar_div_radix_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *numerator_ct,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
const CudaScalarDivisorFFI *scalar_divisor_ffi, uint32_t numerator_bits);
-void cleanup_cuda_integer_signed_scalar_div_radix_kb_64(CudaStreamsFFI streams,
-int8_t **mem_ptr_void);
+void cleanup_cuda_integer_signed_scalar_div_radix_64(CudaStreamsFFI streams,
+int8_t **mem_ptr_void);
-uint64_t scratch_integer_unsigned_scalar_div_rem_radix_kb_64(
+uint64_t scratch_integer_unsigned_scalar_div_rem_radix_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
@@ -617,7 +628,7 @@ uint64_t scratch_integer_unsigned_scalar_div_rem_radix_kb_64(
uint32_t const active_bits_divisor, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
+void cuda_integer_unsigned_scalar_div_rem_radix_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *quotient_ct,
CudaRadixCiphertextFFI *remainder_ct, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
@@ -626,10 +637,10 @@ void cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
void const *clear_blocks, void const *h_clear_blocks,
uint32_t num_clear_blocks);
-void cleanup_cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
+void cleanup_cuda_integer_unsigned_scalar_div_rem_radix_64(
CudaStreamsFFI streams, int8_t **mem_ptr_void);
-uint64_t scratch_integer_signed_scalar_div_rem_radix_kb_64(
+uint64_t scratch_integer_signed_scalar_div_rem_radix_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
@@ -639,7 +650,7 @@ uint64_t scratch_integer_signed_scalar_div_rem_radix_kb_64(
uint32_t const active_bits_divisor, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_integer_signed_scalar_div_rem_radix_kb_64(
+void cuda_integer_signed_scalar_div_rem_radix_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *quotient_ct,
CudaRadixCiphertextFFI *remainder_ct, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
@@ -647,10 +658,10 @@ void cuda_integer_signed_scalar_div_rem_radix_kb_64(
uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
uint32_t numerator_bits);
-void cleanup_cuda_integer_signed_scalar_div_rem_radix_kb_64(
-CudaStreamsFFI streams, int8_t **mem_ptr_void);
+void cleanup_cuda_integer_signed_scalar_div_rem_radix_64(CudaStreamsFFI streams,
+int8_t **mem_ptr_void);
-uint64_t scratch_integer_count_of_consecutive_bits_kb_64(
+uint64_t scratch_integer_count_of_consecutive_bits_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
@@ -659,13 +670,13 @@ uint64_t scratch_integer_count_of_consecutive_bits_kb_64(
Direction direction, BitValue bit_value, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_integer_count_of_consecutive_bits_kb_64(
+void cuda_integer_count_of_consecutive_bits_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_ct,
CudaRadixCiphertextFFI const *input_ct, int8_t *mem_ptr, void *const *bsks,
void *const *ksks);
-void cleanup_cuda_integer_count_of_consecutive_bits_kb_64(
-CudaStreamsFFI streams, int8_t **mem_ptr_void);
+void cleanup_cuda_integer_count_of_consecutive_bits_64(CudaStreamsFFI streams,
+int8_t **mem_ptr_void);
uint64_t scratch_cuda_integer_grouped_oprf_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
@@ -676,16 +687,16 @@ uint64_t scratch_cuda_integer_grouped_oprf_64(
bool allocate_gpu_memory, uint32_t message_bits_per_block,
uint32_t total_random_bits, PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_integer_grouped_oprf_async_64(CudaStreamsFFI streams,
-CudaRadixCiphertextFFI *radix_lwe_out,
-const void *seeded_lwe_input,
-uint32_t num_blocks_to_process,
-int8_t *mem, void *const *bsks);
+void cuda_integer_grouped_oprf_64(CudaStreamsFFI streams,
+CudaRadixCiphertextFFI *radix_lwe_out,
+const void *seeded_lwe_input,
+uint32_t num_blocks_to_process, int8_t *mem,
+void *const *bsks);
void cleanup_cuda_integer_grouped_oprf_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
-uint64_t scratch_integer_ilog2_kb_64(
+uint64_t scratch_integer_ilog2_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
@@ -694,7 +705,7 @@ uint64_t scratch_integer_ilog2_kb_64(
uint32_t num_bits_in_ciphertext, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
-void cuda_integer_ilog2_kb_64(
+void cuda_integer_ilog2_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_ct,
CudaRadixCiphertextFFI const *input_ct,
CudaRadixCiphertextFFI const *trivial_ct_neg_n,
@@ -702,8 +713,8 @@ void cuda_integer_ilog2_kb_64(
CudaRadixCiphertextFFI const *trivial_ct_m_minus_1_block, int8_t *mem_ptr,
void *const *bsks, void *const *ksks);
-void cleanup_cuda_integer_ilog2_kb_64(CudaStreamsFFI streams,
-int8_t **mem_ptr_void);
+void cleanup_cuda_integer_ilog2_64(CudaStreamsFFI streams,
+int8_t **mem_ptr_void);
} // extern C
#endif // CUDA_INTEGER_H
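
Every operation in the header above keeps the same three-call contract after the rename: a scratch_* function computes (and, when allocate_gpu_memory is set, allocates) a workspace behind an int8_t pointer and returns its size in bytes, the compute call consumes that workspace, and cleanup_* releases it and nulls the pointer. A hypothetical miniature of that contract; every name below is a stand-in, not the real FFI:

#include <cstdint>
#include <cstdlib>

// Stand-in for a scratch_* entry point: size the workspace, optionally
// allocate it, and return the byte count so callers can track memory use.
static uint64_t scratch_demo(int8_t **mem_ptr, uint32_t num_blocks,
                             bool allocate_gpu_memory) {
  uint64_t size_tracker = uint64_t(num_blocks) * 1024; // stand-in sizing
  *mem_ptr = allocate_gpu_memory
                 ? static_cast<int8_t *>(std::malloc(size_tracker))
                 : nullptr;
  return size_tracker;
}

// Stand-in for the compute call: it reuses the preallocated workspace.
static void compute_demo(int8_t *mem_ptr) { (void)mem_ptr; }

// Stand-in for a cleanup_* entry point: free and null the workspace.
static void cleanup_demo(int8_t **mem_ptr_void) {
  std::free(*mem_ptr_void);
  *mem_ptr_void = nullptr;
}

int main() {
  int8_t *mem = nullptr;
  scratch_demo(&mem, 8, /*allocate_gpu_memory=*/true);
  compute_demo(mem);
  cleanup_demo(&mem);
  return 0;
}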

View File

@@ -111,9 +111,9 @@ aes_flush_inplace(CudaStreams streams, CudaRadixCiphertextFFI *data,
int_aes_encrypt_buffer<Torus> *mem, void *const *bsks,
Torus *const *ksks) {
-integer_radix_apply_univariate_lookup_table_kb<Torus>(
-streams, data, data, bsks, ksks, mem->luts->flush_lut,
-data->num_radix_blocks);
+integer_radix_apply_univariate_lookup_table<Torus>(streams, data, data, bsks,
+ksks, mem->luts->flush_lut,
+data->num_radix_blocks);
}
/**
@@ -126,8 +126,8 @@ __host__ __forceinline__ void aes_scalar_add_one_flush_inplace(
CudaStreams streams, CudaRadixCiphertextFFI *data,
int_aes_encrypt_buffer<Torus> *mem, void *const *bsks, Torus *const *ksks) {
-host_integer_radix_add_scalar_one_inplace<Torus>(
-streams, data, mem->params.message_modulus, mem->params.carry_modulus);
+host_add_scalar_one_inplace<Torus>(streams, data, mem->params.message_modulus,
+mem->params.carry_modulus);
aes_flush_inplace(streams, data, mem, bsks, ksks);
}
@@ -167,7 +167,7 @@ batch_vec_flush_inplace(CudaStreams streams, CudaRadixCiphertextFFI **targets,
&dest_slice, targets[i]);
}
-integer_radix_apply_univariate_lookup_table_kb<Torus>(
+integer_radix_apply_univariate_lookup_table<Torus>(
streams, &batch_out, &batch_in, bsks, ksks, mem->luts->flush_lut,
batch_out.num_radix_blocks);
@@ -220,7 +220,7 @@ __host__ void batch_vec_and_inplace(CudaStreams streams,
&dest_rhs_slice, rhs[i]);
}
-integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+integer_radix_apply_bivariate_lookup_table<Torus>(
streams, &batch_out, &batch_lhs, &batch_rhs, bsks, ksks,
mem->luts->and_lut, batch_out.num_radix_blocks,
mem->params.message_modulus);
@@ -358,9 +358,9 @@ __host__ void vectorized_sbox_n_bytes(CudaStreams streams,
#define ADD_ONE(target) \
do { \
-host_integer_radix_add_scalar_one_inplace<Torus>( \
-streams, target, mem->params.message_modulus, \
-mem->params.carry_modulus); \
+host_add_scalar_one_inplace<Torus>(streams, target, \
+mem->params.message_modulus, \
+mem->params.carry_modulus); \
} while (0)
// Homomorphic S-Box Circuit Evaluation
@@ -1057,7 +1057,7 @@ __host__ void vectorized_aes_full_adder_inplace(
// The carry_lut applies the function f(x) = (x >> 1) & 1, which
// extracts the carry bit from the previous sum. The result is stored
// in carry_vec for the next iteration (i+1).
-integer_radix_apply_univariate_lookup_table_kb<Torus>(
+integer_radix_apply_univariate_lookup_table<Torus>(
streams, carry_vec, sum_plus_carry_vec, bsks, ksks,
mem->luts->carry_lut, num_aes_inputs);
@@ -1065,7 +1065,7 @@ __host__ void vectorized_aes_full_adder_inplace(
// The flush_lut applies the function f(x) = x & 1, which extracts
// the least significant bit of the sum. The result is written
// directly into the state buffer, updating the IV in-place.
-integer_radix_apply_univariate_lookup_table_kb<Torus>(
+integer_radix_apply_univariate_lookup_table<Torus>(
streams, &a_i_vec, sum_plus_carry_vec, bsks, ksks, mem->luts->flush_lut,
num_aes_inputs);
}
@@ -1221,9 +1221,9 @@ __host__ void host_integer_key_expansion(CudaStreams streams,
CudaRadixCiphertextFFI first_byte_bit_slice;
as_radix_ciphertext_slice<Torus>(&first_byte_bit_slice,
&rotated_word_buffer, bit, bit + 1);
-host_integer_radix_add_scalar_one_inplace<Torus>(
-streams, &first_byte_bit_slice, mem->params.message_modulus,
-mem->params.carry_modulus);
+host_add_scalar_one_inplace<Torus>(streams, &first_byte_bit_slice,
+mem->params.message_modulus,
+mem->params.carry_modulus);
}
}
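
The carry_lut and flush_lut comments in the full-adder hunk above describe an ordinary ripple-carry adder over bit-sized blocks. In the clear, the same computation is (plaintext sketch, not library code):

#include <cassert>
#include <cstdint>

// f(x) = (x >> 1) & 1: extracts the carry bit of a small partial sum.
static uint64_t carry_lut(uint64_t x) { return (x >> 1) & 1; }
// f(x) = x & 1: extracts the least significant bit of the partial sum.
static uint64_t flush_lut(uint64_t x) { return x & 1; }

int main() {
  uint8_t a = 0b1011, b = 0b0110; // 4-bit operands, one bit per block
  uint8_t carry = 0, out = 0;
  for (int i = 0; i < 4; ++i) {
    uint64_t sum_plus_carry = ((a >> i) & 1) + ((b >> i) & 1) + carry;
    carry = carry_lut(sum_plus_carry);     // carry for iteration i + 1
    out |= flush_lut(sum_plus_carry) << i; // result bit, written in place
  }
  assert(out == ((a + b) & 0xF)); // 11 + 6 = 17, truncated to 4 bits = 1
  return 0;
}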

View File

@@ -1,6 +1,6 @@
#include "integer/abs.cuh"
-uint64_t scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
+uint64_t scratch_cuda_integer_abs_inplace_radix_ciphertext_64(
CudaStreamsFFI streams, int8_t **mem_ptr, bool is_signed,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
@@ -14,19 +14,19 @@ uint64_t scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus, noise_reduction_type);
-return scratch_cuda_integer_abs_kb<uint64_t>(
+return scratch_cuda_integer_abs<uint64_t>(
CudaStreams(streams), (int_abs_buffer<uint64_t> **)mem_ptr, is_signed,
num_blocks, params, allocate_gpu_memory);
}
-void cuda_integer_abs_inplace_radix_ciphertext_kb_64(
+void cuda_integer_abs_inplace_radix_ciphertext_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *ct, int8_t *mem_ptr,
bool is_signed, void *const *bsks, void *const *ksks) {
auto mem = (int_abs_buffer<uint64_t> *)mem_ptr;
-host_integer_abs_kb<uint64_t>(CudaStreams(streams), ct, bsks,
-(uint64_t **)(ksks), mem, is_signed);
+host_integer_abs<uint64_t>(CudaStreams(streams), ct, bsks,
+(uint64_t **)(ksks), mem, is_signed);
}
void cleanup_cuda_integer_abs_inplace(CudaStreamsFFI streams,

View File

@@ -10,9 +10,11 @@
#include "radix_ciphertext.cuh"
template <typename Torus>
-__host__ uint64_t scratch_cuda_integer_abs_kb(
-CudaStreams streams, int_abs_buffer<Torus> **mem_ptr, bool is_signed,
-uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory) {
+__host__ uint64_t scratch_cuda_integer_abs(CudaStreams streams,
+int_abs_buffer<Torus> **mem_ptr,
+bool is_signed, uint32_t num_blocks,
+int_radix_params params,
+bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
if (is_signed) {
@@ -23,10 +25,10 @@ __host__ uint64_t scratch_cuda_integer_abs_kb(
}
template <typename Torus>
-__host__ void
-host_integer_abs_kb(CudaStreams streams, CudaRadixCiphertextFFI *ct,
-void *const *bsks, uint64_t *const *ksks,
-int_abs_buffer<uint64_t> *mem_ptr, bool is_signed) {
+__host__ void host_integer_abs(CudaStreams streams, CudaRadixCiphertextFFI *ct,
+void *const *bsks, uint64_t *const *ksks,
+int_abs_buffer<uint64_t> *mem_ptr,
+bool is_signed) {
if (!is_signed)
return;
@@ -39,7 +41,7 @@ host_integer_abs_kb(CudaStreams streams, CudaRadixCiphertextFFI *ct,
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
mask, ct);
-host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
+host_arithmetic_scalar_shift_inplace<Torus>(
streams, mask, num_bits_in_ciphertext - 1,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks);
host_addition<Torus>(streams.stream(0), streams.gpu_index(0), ct, mask, ct,
@@ -52,8 +54,7 @@ host_integer_abs_kb(CudaStreams streams, CudaRadixCiphertextFFI *ct,
mem_ptr->scp_mem, bsks, ksks,
requested_flag, uses_carry);
-host_integer_radix_bitop_kb<Torus>(streams, ct, mask, ct, mem_ptr->bitxor_mem,
-bsks, ksks);
+host_bitop<Torus>(streams, ct, mask, ct, mem_ptr->bitxor_mem, bsks, ksks);
}
#endif // TFHE_RS_ABS_CUH
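
host_integer_abs above is the homomorphic form of the classic branchless absolute value: an arithmetic shift spreads the sign bit into a mask, the mask is added, carries are propagated, and the mask is XORed back in. The plaintext identity it relies on (sketch; right-shifting a negative value is arithmetic on all mainstream compilers, and only guaranteed by the standard from C++20 on):

#include <cassert>
#include <cstdint>

static int64_t abs_branchless(int64_t x) {
  int64_t mask = x >> 63;   // all ones if x < 0, else zero
  return (x + mask) ^ mask; // x - 1 then bitwise NOT when x < 0
}

int main() {
  assert(abs_branchless(-42) == 42);
  assert(abs_branchless(7) == 7);
  assert(abs_branchless(0) == 0);
  return 0;
}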

View File

@@ -1,6 +1,6 @@
#include "integer/bitwise_ops.cuh"
-uint64_t scratch_cuda_integer_radix_bitop_kb_64(
+uint64_t scratch_cuda_bitop_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -14,20 +14,21 @@ uint64_t scratch_cuda_integer_radix_bitop_kb_64(
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus, noise_reduction_type);
-return scratch_cuda_integer_radix_bitop_kb<uint64_t>(
+return scratch_cuda_bitop<uint64_t>(
CudaStreams(streams), (int_bitop_buffer<uint64_t> **)mem_ptr,
lwe_ciphertext_count, params, op_type, allocate_gpu_memory);
}
-void cuda_bitop_integer_radix_ciphertext_kb_64(
-CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
-CudaRadixCiphertextFFI const *lwe_array_1,
-CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
-void *const *bsks, void *const *ksks) {
+void cuda_bitop_ciphertext_64(CudaStreamsFFI streams,
+CudaRadixCiphertextFFI *lwe_array_out,
+CudaRadixCiphertextFFI const *lwe_array_1,
+CudaRadixCiphertextFFI const *lwe_array_2,
+int8_t *mem_ptr, void *const *bsks,
+void *const *ksks) {
-host_integer_radix_bitop_kb<uint64_t>(
-CudaStreams(streams), lwe_array_out, lwe_array_1, lwe_array_2,
-(int_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks));
+host_bitop<uint64_t>(CudaStreams(streams), lwe_array_out, lwe_array_1,
+lwe_array_2, (int_bitop_buffer<uint64_t> *)mem_ptr, bsks,
+(uint64_t **)(ksks));
}
void cleanup_cuda_integer_bitop(CudaStreamsFFI streams, int8_t **mem_ptr_void) {

View File

@@ -10,11 +10,12 @@
#include "pbs/programmable_bootstrap_multibit.cuh"
template <typename Torus>
-__host__ void host_integer_radix_bitop_kb(
-CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
-CudaRadixCiphertextFFI const *lwe_array_1,
-CudaRadixCiphertextFFI const *lwe_array_2, int_bitop_buffer<Torus> *mem_ptr,
-void *const *bsks, Torus *const *ksks) {
+__host__ void host_bitop(CudaStreams streams,
+CudaRadixCiphertextFFI *lwe_array_out,
+CudaRadixCiphertextFFI const *lwe_array_1,
+CudaRadixCiphertextFFI const *lwe_array_2,
+int_bitop_buffer<Torus> *mem_ptr, void *const *bsks,
+Torus *const *ksks) {
PANIC_IF_FALSE(
lwe_array_out->num_radix_blocks == lwe_array_1->num_radix_blocks &&
@@ -41,7 +42,7 @@ __host__ void host_integer_radix_bitop_kb(
lwe_array_1->num_radix_blocks);
}
-integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+integer_radix_apply_bivariate_lookup_table<Torus>(
streams, lwe_array_out, lwe_array_1, lwe_array_2, bsks, ksks, lut,
lwe_array_out->num_radix_blocks, lut->params.message_modulus);
@@ -50,10 +51,11 @@ __host__ void host_integer_radix_bitop_kb(
}
template <typename Torus>
-__host__ uint64_t scratch_cuda_integer_radix_bitop_kb(
-CudaStreams streams, int_bitop_buffer<Torus> **mem_ptr,
-uint32_t num_radix_blocks, int_radix_params params, BITOP_TYPE op,
-bool allocate_gpu_memory) {
+__host__ uint64_t scratch_cuda_bitop(CudaStreams streams,
+int_bitop_buffer<Torus> **mem_ptr,
+uint32_t num_radix_blocks,
+int_radix_params params, BITOP_TYPE op,
+bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_bitop_buffer<Torus>(streams, op, params, num_radix_blocks,
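
host_bitop above evaluates a bivariate lookup table: the two operand blocks are packed into a single block as lhs * message_modulus + rhs (the message_modulus passed on the last line of the apply call is the packing shift), and one table lookup over the packed value yields the per-block result. A plaintext sketch of that packing, assuming 2-bit message blocks:

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  const uint64_t message_modulus = 4; // 2-bit message blocks
  // Build the AND table over packed block pairs, mirroring a bitop LUT.
  std::vector<uint64_t> and_lut(message_modulus * message_modulus);
  for (uint64_t lhs = 0; lhs < message_modulus; ++lhs)
    for (uint64_t rhs = 0; rhs < message_modulus; ++rhs)
      and_lut[lhs * message_modulus + rhs] = lhs & rhs;

  uint64_t lhs = 0b10, rhs = 0b11;
  assert(and_lut[lhs * message_modulus + rhs] == (lhs & rhs));
  return 0;
}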

View File

@@ -78,8 +78,8 @@ __host__ void host_extend_radix_with_sign_msb(
streams.stream(0), streams.gpu_index(0), mem_ptr->last_block, 0, 1, input,
input_blocks - 1, input_blocks);
-host_apply_univariate_lut_kb(streams, mem_ptr->padding_block,
-mem_ptr->last_block, mem_ptr->lut, ksks, bsks);
+host_apply_univariate_lut(streams, mem_ptr->padding_block,
+mem_ptr->last_block, mem_ptr->lut, ksks, bsks);
for (uint32_t i = 0; i < num_additional_blocks; ++i) {
uint32_t dst_block_idx = input_blocks + i;
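
host_extend_radix_with_sign_msb above derives one padding block from the most significant block through a univariate LUT, and the loop then replicates it num_additional_blocks times. In the clear, the block-wise sign extension looks like this (sketch, assuming 2-bit blocks):

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  const uint64_t message_modulus = 4;       // 2-bit blocks
  std::vector<uint64_t> blocks = {2, 1, 3}; // little-endian; last block holds the sign
  // Padding LUT: an all-ones block when the sign bit of the last block is set.
  uint64_t sign_bit = blocks.back() >> 1;
  uint64_t padding_block = sign_bit ? message_modulus - 1 : 0;
  const uint32_t num_additional_blocks = 2;
  for (uint32_t i = 0; i < num_additional_blocks; ++i)
    blocks.push_back(padding_block); // the copy loop in the hunk above
  assert(blocks.size() == 5 && blocks[3] == 3 && blocks[4] == 3);
  return 0;
}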

View File

@@ -1,13 +1,15 @@
#include "integer/cmux.cuh"
-uint64_t scratch_cuda_integer_radix_cmux_kb_64(
-CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-uint32_t polynomial_size, uint32_t big_lwe_dimension,
-uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
-PBS_MS_REDUCTION_T noise_reduction_type) {
+uint64_t scratch_cuda_cmux_64(CudaStreamsFFI streams, int8_t **mem_ptr,
+uint32_t glwe_dimension, uint32_t polynomial_size,
+uint32_t big_lwe_dimension,
+uint32_t small_lwe_dimension, uint32_t ks_level,
+uint32_t ks_base_log, uint32_t pbs_level,
+uint32_t pbs_base_log, uint32_t grouping_factor,
+uint32_t lwe_ciphertext_count,
+uint32_t message_modulus, uint32_t carry_modulus,
+PBS_TYPE pbs_type, bool allocate_gpu_memory,
+PBS_MS_REDUCTION_T noise_reduction_type) {
PUSH_RANGE("scratch cmux")
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -17,29 +19,29 @@ uint64_t scratch_cuda_integer_radix_cmux_kb_64(
std::function<uint64_t(uint64_t)> predicate_lut_f =
[](uint64_t x) -> uint64_t { return x == 1; };
-uint64_t ret = scratch_cuda_integer_radix_cmux_kb<uint64_t>(
+uint64_t ret = scratch_cuda_cmux<uint64_t>(
CudaStreams(streams), (int_cmux_buffer<uint64_t> **)mem_ptr,
predicate_lut_f, lwe_ciphertext_count, params, allocate_gpu_memory);
POP_RANGE()
return ret;
}
-void cuda_cmux_integer_radix_ciphertext_kb_64(
-CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
-CudaRadixCiphertextFFI const *lwe_condition,
-CudaRadixCiphertextFFI const *lwe_array_true,
-CudaRadixCiphertextFFI const *lwe_array_false, int8_t *mem_ptr,
-void *const *bsks, void *const *ksks) {
+void cuda_cmux_ciphertext_64(CudaStreamsFFI streams,
+CudaRadixCiphertextFFI *lwe_array_out,
+CudaRadixCiphertextFFI const *lwe_condition,
+CudaRadixCiphertextFFI const *lwe_array_true,
+CudaRadixCiphertextFFI const *lwe_array_false,
+int8_t *mem_ptr, void *const *bsks,
+void *const *ksks) {
PUSH_RANGE("cmux")
-host_integer_radix_cmux_kb<uint64_t>(
-CudaStreams(streams), lwe_array_out, lwe_condition, lwe_array_true,
-lwe_array_false, (int_cmux_buffer<uint64_t> *)mem_ptr, bsks,
-(uint64_t **)(ksks));
+host_cmux<uint64_t>(CudaStreams(streams), lwe_array_out, lwe_condition,
+lwe_array_true, lwe_array_false,
+(int_cmux_buffer<uint64_t> *)mem_ptr, bsks,
+(uint64_t **)(ksks));
POP_RANGE()
}
-void cleanup_cuda_integer_radix_cmux(CudaStreamsFFI streams,
-int8_t **mem_ptr_void) {
+void cleanup_cuda_cmux(CudaStreamsFFI streams, int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup cmux")
int_cmux_buffer<uint64_t> *mem_ptr =
(int_cmux_buffer<uint64_t> *)(*mem_ptr_void);

View File

@@ -28,7 +28,7 @@ __host__ void zero_out_if(CudaStreams streams,
cuda_set_device(streams.gpu_index(0));
auto params = mem_ptr->params;
-// We can't use integer_radix_apply_bivariate_lookup_table_kb since the
+// We can't use integer_radix_apply_bivariate_lookup_table since the
// second operand is not an array
auto tmp_lwe_array_input = mem_ptr->tmp;
host_pack_bivariate_blocks_with_single_block<Torus>(
@@ -36,18 +36,19 @@ __host__ void zero_out_if(CudaStreams streams,
lwe_condition, predicate->lwe_indexes_in, params.message_modulus,
num_radix_blocks);
-integer_radix_apply_univariate_lookup_table_kb<Torus>(
+integer_radix_apply_univariate_lookup_table<Torus>(
streams, lwe_array_out, tmp_lwe_array_input, bsks, ksks, predicate,
num_radix_blocks);
}
template <typename Torus>
-__host__ void host_integer_radix_cmux_kb(
-CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
-CudaRadixCiphertextFFI const *lwe_condition,
-CudaRadixCiphertextFFI const *lwe_array_true,
-CudaRadixCiphertextFFI const *lwe_array_false,
-int_cmux_buffer<Torus> *mem_ptr, void *const *bsks, Torus *const *ksks) {
+__host__ void host_cmux(CudaStreams streams,
+CudaRadixCiphertextFFI *lwe_array_out,
+CudaRadixCiphertextFFI const *lwe_condition,
+CudaRadixCiphertextFFI const *lwe_array_true,
+CudaRadixCiphertextFFI const *lwe_array_false,
+int_cmux_buffer<Torus> *mem_ptr, void *const *bsks,
+Torus *const *ksks) {
if (lwe_array_out->num_radix_blocks != lwe_array_true->num_radix_blocks)
PANIC("Cuda error: input and output num radix blocks must be the same")
@@ -69,7 +70,7 @@ __host__ void host_integer_radix_cmux_kb(
streams.stream(0), streams.gpu_index(0), mem_ptr->condition_array, i,
i + 1, lwe_condition, 0, 1);
}
-integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+integer_radix_apply_bivariate_lookup_table<Torus>(
streams, mem_ptr->buffer_out, mem_ptr->buffer_in,
mem_ptr->condition_array, bsks, ksks, mem_ptr->predicate_lut,
2 * num_radix_blocks, params.message_modulus);
@@ -88,16 +89,18 @@ __host__ void host_integer_radix_cmux_kb(
&mem_true, &mem_false, num_radix_blocks,
params.message_modulus, params.carry_modulus);
-integer_radix_apply_univariate_lookup_table_kb<Torus>(
+integer_radix_apply_univariate_lookup_table<Torus>(
streams, lwe_array_out, &mem_true, bsks, ksks,
mem_ptr->message_extract_lut, num_radix_blocks);
}
template <typename Torus>
-__host__ uint64_t scratch_cuda_integer_radix_cmux_kb(
-CudaStreams streams, int_cmux_buffer<Torus> **mem_ptr,
-std::function<Torus(Torus)> predicate_lut_f, uint32_t num_radix_blocks,
-int_radix_params params, bool allocate_gpu_memory) {
+__host__ uint64_t scratch_cuda_cmux(CudaStreams streams,
+int_cmux_buffer<Torus> **mem_ptr,
+std::function<Torus(Torus)> predicate_lut_f,
+uint32_t num_radix_blocks,
+int_radix_params params,
+bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_cmux_buffer<Torus>(streams, predicate_lut_f, params,
num_radix_blocks, allocate_gpu_memory,
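
host_cmux above has the classic selector shape: the condition block, normalized to 0 or 1 by predicate_lut_f (which maps x to x == 1), zeroes out the branch that is not taken, the two branches are combined, and a final message-extract LUT cleans the result. Plaintext shape of the selection (sketch, not library code):

#include <cassert>
#include <cstdint>

// Mirrors zero_out_if above: keep the block only when the predicate holds.
static uint64_t zero_out_if_clear(uint64_t block, bool keep) {
  return keep ? block : 0;
}

static uint64_t cmux_clear(bool condition, uint64_t ct_true, uint64_t ct_false) {
  return zero_out_if_clear(ct_true, condition) +
         zero_out_if_clear(ct_false, !condition);
}

int main() {
  assert(cmux_clear(true, 5, 9) == 5);
  assert(cmux_clear(false, 5, 9) == 9);
  return 0;
}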

View File

@@ -1,6 +1,6 @@
#include "integer/comparison.cuh"
-uint64_t scratch_cuda_integer_radix_comparison_kb_64(
+uint64_t scratch_cuda_comparison_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -18,7 +18,7 @@ uint64_t scratch_cuda_integer_radix_comparison_kb_64(
switch (op_type) {
case EQ:
case NE:
-size_tracker += scratch_cuda_integer_radix_comparison_check_kb<uint64_t>(
+size_tracker += scratch_cuda_comparison_check<uint64_t>(
CudaStreams(streams), (int_comparison_buffer<uint64_t> **)mem_ptr,
num_radix_blocks, params, op_type, false, allocate_gpu_memory);
break;
@@ -28,7 +28,7 @@ uint64_t scratch_cuda_integer_radix_comparison_kb_64(
case LE:
case MAX:
case MIN:
-size_tracker += scratch_cuda_integer_radix_comparison_check_kb<uint64_t>(
+size_tracker += scratch_cuda_comparison_check<uint64_t>(
CudaStreams(streams), (int_comparison_buffer<uint64_t> **)mem_ptr,
num_radix_blocks, params, op_type, is_signed, allocate_gpu_memory);
break;
@@ -37,11 +37,12 @@ uint64_t scratch_cuda_integer_radix_comparison_kb_64(
return size_tracker;
}
-void cuda_comparison_integer_radix_ciphertext_kb_64(
-CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
-CudaRadixCiphertextFFI const *lwe_array_1,
-CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
-void *const *bsks, void *const *ksks) {
+void cuda_comparison_ciphertext_64(CudaStreamsFFI streams,
+CudaRadixCiphertextFFI *lwe_array_out,
+CudaRadixCiphertextFFI const *lwe_array_1,
+CudaRadixCiphertextFFI const *lwe_array_2,
+int8_t *mem_ptr, void *const *bsks,
+void *const *ksks) {
PUSH_RANGE("comparison")
if (lwe_array_1->num_radix_blocks != lwe_array_2->num_radix_blocks)
PANIC("Cuda error: input num radix blocks must be the same")
@@ -54,9 +55,9 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
switch (buffer->op) {
case EQ:
case NE:
-host_integer_radix_equality_check_kb<uint64_t>(
-CudaStreams(streams), lwe_array_out, lwe_array_1, lwe_array_2, buffer,
-bsks, (uint64_t **)(ksks), num_radix_blocks);
+host_equality_check<uint64_t>(CudaStreams(streams), lwe_array_out,
+lwe_array_1, lwe_array_2, buffer, bsks,
+(uint64_t **)(ksks), num_radix_blocks);
break;
case GT:
case GE:
@@ -65,18 +66,18 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
if (num_radix_blocks % 2 != 0)
PANIC("Cuda error (comparisons): the number of radix blocks has to be "
"even.")
-host_integer_radix_difference_check_kb<uint64_t>(
-CudaStreams(streams), lwe_array_out, lwe_array_1, lwe_array_2, buffer,
-buffer->diff_buffer->operator_f, bsks, (uint64_t **)(ksks),
-num_radix_blocks);
+host_difference_check<uint64_t>(CudaStreams(streams), lwe_array_out,
+lwe_array_1, lwe_array_2, buffer,
+buffer->diff_buffer->operator_f, bsks,
+(uint64_t **)(ksks), num_radix_blocks);
break;
case MAX:
case MIN:
if (num_radix_blocks % 2 != 0)
PANIC("Cuda error (max/min): the number of radix blocks has to be even.")
-host_integer_radix_maxmin_kb<uint64_t>(
-CudaStreams(streams), lwe_array_out, lwe_array_1, lwe_array_2, buffer,
-bsks, (uint64_t **)(ksks), num_radix_blocks);
+host_maxmin<uint64_t>(CudaStreams(streams), lwe_array_out, lwe_array_1,
+lwe_array_2, buffer, bsks, (uint64_t **)(ksks),
+num_radix_blocks);
break;
default:
PANIC("Cuda error: integer operation not supported")
@@ -95,7 +96,7 @@ void cleanup_cuda_integer_comparison(CudaStreamsFFI streams,
POP_RANGE()
}
-uint64_t scratch_cuda_integer_are_all_comparisons_block_true_kb_64(
+uint64_t scratch_cuda_integer_are_all_comparisons_block_true_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -109,12 +110,12 @@ uint64_t scratch_cuda_integer_are_all_comparisons_block_true_kb_64(
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus, noise_reduction_type);
-return scratch_cuda_integer_radix_comparison_check_kb<uint64_t>(
+return scratch_cuda_comparison_check<uint64_t>(
CudaStreams(streams), (int_comparison_buffer<uint64_t> **)mem_ptr,
num_radix_blocks, params, EQ, false, allocate_gpu_memory);
}
-void cuda_integer_are_all_comparisons_block_true_kb_64(
+void cuda_integer_are_all_comparisons_block_true_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, int8_t *mem_ptr,
void *const *bsks, void *const *ksks, uint32_t num_radix_blocks) {
@@ -122,7 +123,7 @@ void cuda_integer_are_all_comparisons_block_true_kb_64(
int_comparison_buffer<uint64_t> *buffer =
(int_comparison_buffer<uint64_t> *)mem_ptr;
-host_integer_are_all_comparisons_block_true_kb<uint64_t>(
+host_integer_are_all_comparisons_block_true<uint64_t>(
CudaStreams(streams), lwe_array_out, lwe_array_in, buffer, bsks,
(uint64_t **)(ksks), num_radix_blocks);
}
@@ -137,7 +138,7 @@ void cleanup_cuda_integer_are_all_comparisons_block_true(
*mem_ptr_void = nullptr;
}
-uint64_t scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
+uint64_t scratch_cuda_integer_is_at_least_one_comparisons_block_true_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -151,12 +152,12 @@ uint64_t scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus, noise_reduction_type);
-return scratch_cuda_integer_radix_comparison_check_kb<uint64_t>(
+return scratch_cuda_comparison_check<uint64_t>(
CudaStreams(streams), (int_comparison_buffer<uint64_t> **)mem_ptr,
num_radix_blocks, params, EQ, false, allocate_gpu_memory);
}
-void cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
+void cuda_integer_is_at_least_one_comparisons_block_true_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, int8_t *mem_ptr,
void *const *bsks, void *const *ksks, uint32_t num_radix_blocks) {
@@ -164,7 +165,7 @@ void cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
int_comparison_buffer<uint64_t> *buffer =
(int_comparison_buffer<uint64_t> *)mem_ptr;
-host_integer_is_at_least_one_comparisons_block_true_kb<uint64_t>(
+host_integer_is_at_least_one_comparisons_block_true<uint64_t>(
CudaStreams(streams), lwe_array_out, lwe_array_in, buffer, bsks,
(uint64_t **)(ksks), num_radix_blocks);
}

View File

@@ -155,7 +155,7 @@ __host__ void are_all_comparisons_block_true(
// Applies the LUT
if (remaining_blocks == 1) {
// In the last iteration we copy the output to the final address
-integer_radix_apply_univariate_lookup_table_kb<Torus>(
+integer_radix_apply_univariate_lookup_table<Torus>(
streams, lwe_array_out, accumulator, bsks, ksks, lut, 1);
// Reset max_value_lut_indexes before returning, otherwise if the lut is
// reused the lut indexes will be wrong
@@ -172,7 +172,7 @@ __host__ void are_all_comparisons_block_true(
reset_radix_ciphertext_blocks(lwe_array_out, 1);
return;
} else {
-integer_radix_apply_univariate_lookup_table_kb<Torus>(
+integer_radix_apply_univariate_lookup_table<Torus>(
streams, tmp_out, accumulator, bsks, ksks, lut, num_chunks);
}
}
@@ -241,12 +241,12 @@ __host__ void is_at_least_one_comparisons_block_true(
// Applies the LUT
if (remaining_blocks == 1) {
// In the last iteration we copy the output to the final address
-integer_radix_apply_univariate_lookup_table_kb<Torus>(
+integer_radix_apply_univariate_lookup_table<Torus>(
streams, lwe_array_out, buffer->tmp_block_accumulated, bsks, ksks,
lut, 1);
return;
} else {
-integer_radix_apply_univariate_lookup_table_kb<Torus>(
+integer_radix_apply_univariate_lookup_table<Torus>(
streams, mem_ptr->tmp_lwe_array_out, buffer->tmp_block_accumulated,
bsks, ksks, lut, num_chunks);
}
@@ -314,19 +314,19 @@ __host__ void host_compare_blocks_with_zero(
}
}
-integer_radix_apply_univariate_lookup_table_kb<Torus>(
+integer_radix_apply_univariate_lookup_table<Torus>(
streams, lwe_array_out, sum, bsks, ksks, zero_comparison, num_sum_blocks);
reset_radix_ciphertext_blocks(lwe_array_out, num_sum_blocks);
}
template <typename Torus>
-__host__ void host_integer_radix_equality_check_kb(
-CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
-CudaRadixCiphertextFFI const *lwe_array_1,
-CudaRadixCiphertextFFI const *lwe_array_2,
-int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
-Torus *const *ksks, uint32_t num_radix_blocks) {
+__host__ void
+host_equality_check(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
+CudaRadixCiphertextFFI const *lwe_array_1,
+CudaRadixCiphertextFFI const *lwe_array_2,
+int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
+Torus *const *ksks, uint32_t num_radix_blocks) {
if (lwe_array_out->lwe_dimension != lwe_array_1->lwe_dimension ||
lwe_array_out->lwe_dimension != lwe_array_2->lwe_dimension)
@@ -335,7 +335,7 @@ __host__ void host_integer_radix_equality_check_kb(
// Applies the LUT for the comparison operation
auto comparisons = mem_ptr->tmp_block_comparisons;
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
integer_radix_apply_bivariate_lookup_table<Torus>(
streams, comparisons, lwe_array_1, lwe_array_2, bsks, ksks,
eq_buffer->operator_lut, num_radix_blocks,
eq_buffer->operator_lut->params.message_modulus);
@@ -349,12 +349,12 @@ __host__ void host_integer_radix_equality_check_kb(
}
template <typename Torus>
__host__ void compare_radix_blocks_kb(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_left,
CudaRadixCiphertextFFI const *lwe_array_right,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks, uint32_t num_radix_blocks) {
__host__ void
compare_radix_blocks(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_left,
CudaRadixCiphertextFFI const *lwe_array_right,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks, uint32_t num_radix_blocks) {
if (lwe_array_out->lwe_dimension != lwe_array_left->lwe_dimension ||
lwe_array_out->lwe_dimension != lwe_array_right->lwe_dimension)
@@ -386,15 +386,15 @@ __host__ void compare_radix_blocks_kb(
// Apply LUT to compare to 0
auto is_non_zero_lut = mem_ptr->eq_buffer->is_non_zero_lut;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, lwe_array_out, lwe_array_out, bsks, ksks, is_non_zero_lut,
num_radix_blocks);
// Add one
// Here Lhs can have the following values: (-1) % (message modulus * carry
// modulus), 0, or 1, so the output values after the addition will be 0, 1, or 2
host_integer_radix_add_scalar_one_inplace<Torus>(
streams, lwe_array_out, message_modulus, carry_modulus);
host_add_scalar_one_inplace<Torus>(streams, lwe_array_out, message_modulus,
carry_modulus);
}
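
The wrap-around arithmetic from the comment above, checked in plaintext: a block difference of -1, 0, or 1, reduced modulo the plaintext space and incremented by one, yields the 0/1/2 ordering encoding. A sketch assuming message_modulus * carry_modulus = 16:

#include <cstdint>
#include <cstdio>
#include <initializer_list>

int main() {
  const int64_t modulus = 16; // assumption: message_modulus * carry_modulus
  for (int64_t diff : {-1, 0, 1}) {
    int64_t lhs = ((diff % modulus) + modulus) % modulus; // (-1) wraps to 15
    int64_t encoded = (lhs + 1) % modulus;                // the "add one"
    std::printf("diff=%2lld -> encoded=%lld\n", (long long)diff,
                (long long)encoded); // prints 0, 1, 2
  }
}
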
// Reduces a vec containing shortint blocks that encrypts a sign
@@ -439,7 +439,7 @@ tree_sign_reduction(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
pack_blocks<Torus>(streams.stream(0), streams.gpu_index(0), y, x,
partial_block_count, message_modulus);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, x, y, bsks, ksks, inner_tree_leaf, partial_block_count >> 1);
if ((partial_block_count % 2) != 0) {
@@ -485,12 +485,12 @@ tree_sign_reduction(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
last_lut->broadcast_lut(active_streams);
// Last leaf
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, lwe_array_out, y, bsks, ksks, last_lut, 1);
integer_radix_apply_univariate_lookup_table<Torus>(streams, lwe_array_out, y,
bsks, ksks, last_lut, 1);
}
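
A plaintext sketch of the tree reduction itself, under the assumptions that message_modulus is 4 and that a sign block encodes 0 (LT), 1 (EQ), or 2 (GT): adjacent blocks are packed as hi * message_modulus + lo so a single LUT can fold each pair, halving the count per level, with an odd leftover carried up as in the branch above.

#include <cstdint>
#include <vector>

// assumption: the more significant sign wins unless it is EQ
static uint64_t fold_pair(uint64_t packed) {
  uint64_t hi = packed / 4, lo = packed % 4;
  return hi == 1 ? lo : hi;
}

int main() {
  std::vector<uint64_t> signs = {1, 2, 1, 1, 0}; // lsb-first per-block signs
  while (signs.size() > 1) {
    std::vector<uint64_t> next;
    for (size_t i = 0; i + 1 < signs.size(); i += 2)
      next.push_back(fold_pair(signs[i + 1] * 4 + signs[i]));
    if (signs.size() % 2 != 0)
      next.push_back(signs.back()); // odd leftover, kept for the next level
    signs.swap(next);
  }
  return (int)signs[0]; // 0, 1, or 2
}
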
template <typename Torus>
__host__ void host_integer_radix_difference_check_kb(
__host__ void host_difference_check(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_left,
CudaRadixCiphertextFFI const *lwe_array_right,
@@ -534,7 +534,7 @@ __host__ void host_integer_radix_difference_check_kb(
// Clean noise
auto identity_lut = mem_ptr->identity_lut;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, diff_buffer->tmp_packed, diff_buffer->tmp_packed, bsks, ksks,
identity_lut, 2 * packed_num_radix_blocks);
} else {
@@ -553,15 +553,15 @@ __host__ void host_integer_radix_difference_check_kb(
if (!mem_ptr->is_signed) {
// Compare packed blocks, or simply the total number of radix blocks in the
// inputs
compare_radix_blocks_kb<Torus>(streams, comparisons, &lhs, &rhs, mem_ptr,
bsks, ksks, packed_num_radix_blocks);
compare_radix_blocks<Torus>(streams, comparisons, &lhs, &rhs, mem_ptr, bsks,
ksks, packed_num_radix_blocks);
num_comparisons = packed_num_radix_blocks;
} else {
// Packing is possible
if (carry_modulus >= message_modulus) {
// Compare (num_radix_blocks - 2) / 2 packed blocks
compare_radix_blocks_kb<Torus>(streams, comparisons, &lhs, &rhs, mem_ptr,
bsks, ksks, packed_num_radix_blocks);
compare_radix_blocks<Torus>(streams, comparisons, &lhs, &rhs, mem_ptr,
bsks, ksks, packed_num_radix_blocks);
// Compare the last block before the sign block separately
auto identity_lut = mem_ptr->identity_lut;
@@ -573,7 +573,7 @@ __host__ void host_integer_radix_difference_check_kb(
as_radix_ciphertext_slice<Torus>(&shifted_lwe_array_left, lwe_array_left,
num_radix_blocks - 2,
num_radix_blocks - 1);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, &last_left_block_before_sign_block, &shifted_lwe_array_left,
bsks, ksks, identity_lut, 1);
@@ -586,7 +586,7 @@ __host__ void host_integer_radix_difference_check_kb(
as_radix_ciphertext_slice<Torus>(&shifted_lwe_array_right,
lwe_array_right, num_radix_blocks - 2,
num_radix_blocks - 1);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, &last_right_block_before_sign_block,
&shifted_lwe_array_right, bsks, ksks, identity_lut, 1);
@@ -594,7 +594,7 @@ __host__ void host_integer_radix_difference_check_kb(
as_radix_ciphertext_slice<Torus>(&shifted_comparisons, comparisons,
packed_num_radix_blocks,
packed_num_radix_blocks + 1);
compare_radix_blocks_kb<Torus>(
compare_radix_blocks<Torus>(
streams, &shifted_comparisons, &last_left_block_before_sign_block,
&last_right_block_before_sign_block, mem_ptr, bsks, ksks, 1);
@@ -608,16 +608,16 @@ __host__ void host_integer_radix_difference_check_kb(
CudaRadixCiphertextFFI last_right_block;
as_radix_ciphertext_slice<Torus>(&last_right_block, lwe_array_right,
num_radix_blocks - 1, num_radix_blocks);
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
integer_radix_apply_bivariate_lookup_table<Torus>(
streams, &shifted_comparisons, &last_left_block, &last_right_block,
bsks, ksks, mem_ptr->signed_lut, 1,
mem_ptr->signed_lut->params.message_modulus);
num_comparisons = packed_num_radix_blocks + 2;
} else {
compare_radix_blocks_kb<Torus>(streams, comparisons, lwe_array_left,
lwe_array_right, mem_ptr, bsks, ksks,
num_radix_blocks - 1);
compare_radix_blocks<Torus>(streams, comparisons, lwe_array_left,
lwe_array_right, mem_ptr, bsks, ksks,
num_radix_blocks - 1);
// Compare the sign block separately
CudaRadixCiphertextFFI shifted_comparisons;
as_radix_ciphertext_slice<Torus>(&shifted_comparisons, comparisons,
@@ -628,7 +628,7 @@ __host__ void host_integer_radix_difference_check_kb(
CudaRadixCiphertextFFI last_right_block;
as_radix_ciphertext_slice<Torus>(&last_right_block, lwe_array_right,
num_radix_blocks - 1, num_radix_blocks);
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
integer_radix_apply_bivariate_lookup_table<Torus>(
streams, &shifted_comparisons, &last_left_block, &last_right_block,
bsks, ksks, mem_ptr->signed_lut, 1,
mem_ptr->signed_lut->params.message_modulus);
@@ -645,7 +645,7 @@ __host__ void host_integer_radix_difference_check_kb(
}
template <typename Torus>
__host__ uint64_t scratch_cuda_integer_radix_comparison_check_kb(
__host__ uint64_t scratch_cuda_comparison_check(
CudaStreams streams, int_comparison_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params, COMPARISON_TYPE op,
bool is_signed, bool allocate_gpu_memory) {
@@ -658,12 +658,12 @@ __host__ uint64_t scratch_cuda_integer_radix_comparison_check_kb(
}
template <typename Torus>
__host__ void host_integer_radix_maxmin_kb(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_left,
CudaRadixCiphertextFFI const *lwe_array_right,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks, uint32_t num_radix_blocks) {
__host__ void
host_maxmin(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_left,
CudaRadixCiphertextFFI const *lwe_array_right,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks, uint32_t num_radix_blocks) {
if (lwe_array_out->lwe_dimension != lwe_array_left->lwe_dimension ||
lwe_array_out->lwe_dimension != lwe_array_right->lwe_dimension)
@@ -675,18 +675,18 @@ __host__ void host_integer_radix_maxmin_kb(
"than the number of blocks to operate on")
// Compute the sign
host_integer_radix_difference_check_kb<Torus>(
host_difference_check<Torus>(
streams, mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right,
mem_ptr, mem_ptr->identity_lut_f, bsks, ksks, num_radix_blocks);
// Selector
host_integer_radix_cmux_kb<Torus>(
streams, lwe_array_out, mem_ptr->tmp_lwe_array_out, lwe_array_left,
lwe_array_right, mem_ptr->cmux_buffer, bsks, ksks);
host_cmux<Torus>(streams, lwe_array_out, mem_ptr->tmp_lwe_array_out,
lwe_array_left, lwe_array_right, mem_ptr->cmux_buffer, bsks,
ksks);
}
template <typename Torus>
__host__ void host_integer_are_all_comparisons_block_true_kb(
__host__ void host_integer_are_all_comparisons_block_true(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
@@ -699,7 +699,7 @@ __host__ void host_integer_are_all_comparisons_block_true_kb(
}
template <typename Torus>
__host__ void host_integer_is_at_least_one_comparisons_block_true_kb(
__host__ void host_integer_is_at_least_one_comparisons_block_true(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,

View File

@@ -13,7 +13,7 @@ uint64_t scratch_cuda_integer_compress_radix_ciphertext_64(
lwe_dimension, ks_level, ks_base_log, 0, 0, 0, message_modulus,
carry_modulus, PBS_MS_REDUCTION_T::NO_REDUCTION);
return scratch_cuda_compress_integer_radix_ciphertext<uint64_t>(
return scratch_cuda_compress_ciphertext<uint64_t>(
CudaStreams(streams), (int_compression<uint64_t> **)mem_ptr,
num_radix_blocks, compression_params, lwe_per_glwe, allocate_gpu_memory);
}
@@ -93,7 +93,7 @@ uint64_t scratch_cuda_integer_compress_radix_ciphertext_128(
lwe_dimension, ks_level, ks_base_log, 0, 0, 0, message_modulus,
carry_modulus, PBS_MS_REDUCTION_T::NO_REDUCTION);
return scratch_cuda_compress_integer_radix_ciphertext<__uint128_t>(
return scratch_cuda_compress_ciphertext<__uint128_t>(
CudaStreams(streams), (int_compression<__uint128_t> **)mem_ptr,
num_radix_blocks, compression_params, lwe_per_glwe, allocate_gpu_memory);
}

View File

@@ -401,7 +401,7 @@ host_integer_decompress(CudaStreams streams,
}
template <typename Torus>
__host__ uint64_t scratch_cuda_compress_integer_radix_ciphertext(
__host__ uint64_t scratch_cuda_compress_ciphertext(
CudaStreams streams, int_compression<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params compression_params,
uint32_t lwe_per_glwe, bool allocate_gpu_memory) {

View File

@@ -1,6 +1,6 @@
#include "integer/div_rem.cuh"
uint64_t scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
uint64_t scratch_cuda_integer_div_rem_radix_ciphertext_64(
CudaStreamsFFI streams, bool is_signed, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
@@ -14,13 +14,13 @@ uint64_t scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_integer_div_rem_kb<uint64_t>(
return scratch_cuda_integer_div_rem<uint64_t>(
CudaStreams(streams), is_signed, (int_div_rem_memory<uint64_t> **)mem_ptr,
num_blocks, params, allocate_gpu_memory);
POP_RANGE()
}
void cuda_integer_div_rem_radix_ciphertext_kb_64(
void cuda_integer_div_rem_radix_ciphertext_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *quotient,
CudaRadixCiphertextFFI *remainder, CudaRadixCiphertextFFI const *numerator,
CudaRadixCiphertextFFI const *divisor, bool is_signed, int8_t *mem_ptr,
@@ -28,9 +28,9 @@ void cuda_integer_div_rem_radix_ciphertext_kb_64(
PUSH_RANGE("div")
auto mem = (int_div_rem_memory<uint64_t> *)mem_ptr;
host_integer_div_rem_kb<uint64_t>(CudaStreams(streams), quotient, remainder,
numerator, divisor, is_signed, bsks,
(uint64_t **)(ksks), mem);
host_integer_div_rem<uint64_t>(CudaStreams(streams), quotient, remainder,
numerator, divisor, is_signed, bsks,
(uint64_t **)(ksks), mem);
POP_RANGE()
}

View File

@@ -14,7 +14,7 @@
#include <fstream>
template <typename Torus>
__host__ uint64_t scratch_cuda_integer_div_rem_kb(
__host__ uint64_t scratch_cuda_integer_div_rem(
CudaStreams streams, bool is_signed, int_div_rem_memory<Torus> **mem_ptr,
uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory) {
@@ -26,7 +26,7 @@ __host__ uint64_t scratch_cuda_integer_div_rem_kb(
}
template <typename Torus>
__host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
__host__ void host_unsigned_integer_div_rem_block_by_block_2_2(
CudaStreams streams, CudaRadixCiphertextFFI *quotient,
CudaRadixCiphertextFFI *remainder, CudaRadixCiphertextFFI const *numerator,
CudaRadixCiphertextFFI const *divisor, void *const *bsks,
@@ -85,7 +85,7 @@ __host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
// Computes 2*d by extending and shifting on gpu[1]
host_extend_radix_with_trivial_zero_blocks_msb<Torus>(
mem_ptr->d2, divisor_gpu_1, streams.get_ith(1));
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
host_logical_scalar_shift_inplace<Torus>(
streams.get_ith(1), mem_ptr->d2, 1, mem_ptr->shift_mem, &bsks[1],
&ksks[1], mem_ptr->d2->num_radix_blocks);
@@ -250,14 +250,14 @@ __host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
auto o3 = mem_ptr->sub_1_overflowed;
// used as a bitor
host_integer_radix_bitop_kb(streams.get_ith(0), o3, o3, mem_ptr->cmp_1,
mem_ptr->bitor_mem_1, &bsks[0], &ksks[0]);
host_bitop(streams.get_ith(0), o3, o3, mem_ptr->cmp_1, mem_ptr->bitor_mem_1,
&bsks[0], &ksks[0]);
// used as a bitor
host_integer_radix_bitop_kb(streams.get_ith(1), o2, o2, mem_ptr->cmp_2,
mem_ptr->bitor_mem_2, &bsks[1], &ksks[1]);
host_bitop(streams.get_ith(1), o2, o2, mem_ptr->cmp_2, mem_ptr->bitor_mem_2,
&bsks[1], &ksks[1]);
// used as a bitor
host_integer_radix_bitop_kb(streams.get_ith(2), o1, o1, mem_ptr->cmp_3,
mem_ptr->bitor_mem_3, &bsks[2], &ksks[2]);
host_bitop(streams.get_ith(2), o1, o1, mem_ptr->cmp_3, mem_ptr->bitor_mem_3,
&bsks[2], &ksks[2]);
// cmp_1, cmp_2, cmp_3 are not needed anymore, so we can reuse them as c3,
// c2, c1. c0 is allocated on gpu[3]; we take it from mem_ptr.
@@ -337,7 +337,7 @@ __host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
host_add_the_same_block_to_all_blocks<Torus>(streams.stream(gpu_index),
streams.gpu_index(gpu_index),
rx, rx, cx, 4, 4);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams.get_ith(gpu_index), rx, rx, &bsks[gpu_index],
&ksks[gpu_index], lut, rx->num_radix_blocks);
};
@@ -355,15 +355,15 @@ __host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
mem_ptr->zero_out_if_not_1_lut_2, 2);
// calculate quotient bits GPU[2]
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
mem_ptr->sub_streams_1.get_ith(2), mem_ptr->q1, c1, &bsks[2], &ksks[2],
mem_ptr->quotient_lut_1, 1);
// calculate quotient bits GPU[1]
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
mem_ptr->sub_streams_1.get_ith(1), mem_ptr->q2, c2, &bsks[1], &ksks[1],
mem_ptr->quotient_lut_2, 1);
// calculate quotient bits GPU[0]
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
mem_ptr->sub_streams_1.get_ith(0), mem_ptr->q3, c3, &bsks[0], &ksks[0],
mem_ptr->quotient_lut_3, 1);
@@ -427,10 +427,10 @@ __host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
streams.synchronize();
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, rem_gpu_0, rem_gpu_0, bsks, ksks,
mem_ptr->message_extract_lut_1, rem_gpu_0->num_radix_blocks);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
mem_ptr->sub_streams_1, q3_gpu_0, q3_gpu_0, bsks, ksks,
mem_ptr->message_extract_lut_2, 1);
streams.synchronize();
@@ -469,7 +469,7 @@ __host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
}
template <typename Torus>
__host__ void host_unsigned_integer_div_rem_kb(
__host__ void host_unsigned_integer_div_rem(
CudaStreams streams, CudaRadixCiphertextFFI *quotient,
CudaRadixCiphertextFFI *remainder, CudaRadixCiphertextFFI const *numerator,
CudaRadixCiphertextFFI const *divisor, void *const *bsks,
@@ -486,7 +486,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
if (mem_ptr->params.message_modulus == 4 &&
mem_ptr->params.carry_modulus == 4 && streams.count() >= 4) {
host_unsigned_integer_div_rem_kb_block_by_block_2_2<Torus>(
host_unsigned_integer_div_rem_block_by_block_2_2<Torus>(
streams, quotient, remainder, numerator, divisor, bsks, ksks,
mem_ptr->div_rem_2_2_mem);
return;
@@ -587,7 +587,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
&last_interesting_divisor_block, interesting_divisor,
interesting_divisor->num_radix_blocks - 1,
interesting_divisor->num_radix_blocks);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, &last_interesting_divisor_block,
&last_interesting_divisor_block, bsks, ksks,
mem_ptr->masking_luts_1[shifted_mask], 1);
@@ -614,7 +614,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
// the estimated degree of the output is < msg_modulus
shifted_mask = shifted_mask & full_message_mask;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, divisor_ms_blocks, divisor_ms_blocks, bsks, ksks,
mem_ptr->masking_luts_2[shifted_mask], 1);
}; // trim_first_divisor_ms_bits
@@ -636,7 +636,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
streams.stream(0), streams.gpu_index(0), mem_ptr->numerator_block_1,
interesting_remainder1, 0);
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
host_logical_scalar_shift_inplace<Torus>(
streams, interesting_remainder1, 1, mem_ptr->shift_mem_1, bsks, ksks,
interesting_remainder1->num_radix_blocks);
@@ -665,7 +665,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
}; // left_shift_interesting_remainder1
auto left_shift_interesting_remainder2 = [&](CudaStreams streams) {
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
host_logical_scalar_shift_inplace<Torus>(
streams, interesting_remainder2, 1, mem_ptr->shift_mem_2, bsks, ksks,
interesting_remainder2->num_radix_blocks);
}; // left_shift_interesting_remainder2
@@ -773,7 +773,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
// fills:
// `cleaned_merged_interesting_remainder` - radix ciphertext
auto create_clean_version_of_merged_remainder = [&](CudaStreams streams) {
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, cleaned_merged_interesting_remainder,
cleaned_merged_interesting_remainder, bsks, ksks,
mem_ptr->message_extract_lut_1,
@@ -811,7 +811,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
auto conditionally_zero_out_merged_interesting_remainder =
[&](CudaStreams streams) {
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
integer_radix_apply_bivariate_lookup_table<Torus>(
streams, cleaned_merged_interesting_remainder,
cleaned_merged_interesting_remainder, overflow_sum_radix, bsks,
ksks, mem_ptr->zero_out_if_overflow_did_not_happen[factor_lut_id],
@@ -820,7 +820,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
auto conditionally_zero_out_merged_new_remainder =
[&](CudaStreams streams) {
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
integer_radix_apply_bivariate_lookup_table<Torus>(
streams, new_remainder, new_remainder, overflow_sum_radix, bsks,
ksks, mem_ptr->zero_out_if_overflow_happened[factor_lut_id],
new_remainder->num_radix_blocks, factor);
@@ -828,7 +828,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
auto set_quotient_bit = [&](CudaStreams streams) {
uint32_t block_of_bit = i / num_bits_in_message;
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
integer_radix_apply_bivariate_lookup_table<Torus>(
streams, mem_ptr->did_not_overflow, subtraction_overflowed,
at_least_one_upper_block_is_non_zero, bsks, ksks,
mem_ptr->merge_overflow_flags_luts[pos_in_block], 1,
@@ -887,10 +887,10 @@ __host__ void host_unsigned_integer_div_rem_kb(
streams.synchronize();
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
mem_ptr->sub_streams_1, remainder, remainder, bsks, ksks,
mem_ptr->message_extract_lut_1, num_blocks);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
mem_ptr->sub_streams_2, quotient, quotient, bsks, ksks,
mem_ptr->message_extract_lut_2, num_blocks);
@@ -899,7 +899,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
}
template <typename Torus>
__host__ void host_integer_div_rem_kb(
__host__ void host_integer_div_rem(
CudaStreams streams, CudaRadixCiphertextFFI *quotient,
CudaRadixCiphertextFFI *remainder, CudaRadixCiphertextFFI const *numerator,
CudaRadixCiphertextFFI const *divisor, bool is_signed, void *const *bsks,
@@ -927,15 +927,15 @@ __host__ void host_integer_div_rem_kb(
streams.synchronize();
host_integer_abs_kb<Torus>(int_mem_ptr->sub_streams_1, positive_numerator,
bsks, ksks, int_mem_ptr->abs_mem_1, true);
host_integer_abs_kb<Torus>(int_mem_ptr->sub_streams_2, positive_divisor,
bsks, ksks, int_mem_ptr->abs_mem_2, true);
host_integer_abs<Torus>(int_mem_ptr->sub_streams_1, positive_numerator,
bsks, ksks, int_mem_ptr->abs_mem_1, true);
host_integer_abs<Torus>(int_mem_ptr->sub_streams_2, positive_divisor, bsks,
ksks, int_mem_ptr->abs_mem_2, true);
int_mem_ptr->sub_streams_1.synchronize();
int_mem_ptr->sub_streams_2.synchronize();
host_unsigned_integer_div_rem_kb<Torus>(
host_unsigned_integer_div_rem<Torus>(
int_mem_ptr->sub_streams_1, quotient, remainder, positive_numerator,
positive_divisor, bsks, ksks, int_mem_ptr->unsigned_mem);
@@ -945,7 +945,7 @@ __host__ void host_integer_div_rem_kb(
CudaRadixCiphertextFFI divisor_sign;
as_radix_ciphertext_slice<Torus>(&divisor_sign, divisor, num_blocks - 1,
num_blocks);
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
integer_radix_apply_bivariate_lookup_table<Torus>(
int_mem_ptr->sub_streams_2, int_mem_ptr->sign_bits_are_different,
&numerator_sign, &divisor_sign, bsks, ksks,
int_mem_ptr->compare_signed_bits_lut, 1,
@@ -954,7 +954,7 @@ __host__ void host_integer_div_rem_kb(
int_mem_ptr->sub_streams_1.synchronize();
int_mem_ptr->sub_streams_2.synchronize();
host_integer_radix_negation<Torus>(
host_negation<Torus>(
int_mem_ptr->sub_streams_1, int_mem_ptr->negated_quotient, quotient,
radix_params.message_modulus, radix_params.carry_modulus, num_blocks);
@@ -965,7 +965,7 @@ __host__ void host_integer_div_rem_kb(
nullptr, int_mem_ptr->scp_mem_1, bsks,
ksks, requested_flag, uses_carry);
host_integer_radix_negation<Torus>(
host_negation<Torus>(
int_mem_ptr->sub_streams_2, int_mem_ptr->negated_remainder, remainder,
radix_params.message_modulus, radix_params.carry_modulus, num_blocks);
@@ -974,22 +974,21 @@ __host__ void host_integer_div_rem_kb(
nullptr, int_mem_ptr->scp_mem_2, bsks,
ksks, requested_flag, uses_carry);
host_integer_radix_cmux_kb<Torus>(
int_mem_ptr->sub_streams_1, quotient,
int_mem_ptr->sign_bits_are_different, int_mem_ptr->negated_quotient,
quotient, int_mem_ptr->cmux_quotient_mem, bsks, ksks);
host_cmux<Torus>(int_mem_ptr->sub_streams_1, quotient,
int_mem_ptr->sign_bits_are_different,
int_mem_ptr->negated_quotient, quotient,
int_mem_ptr->cmux_quotient_mem, bsks, ksks);
host_integer_radix_cmux_kb<Torus>(
int_mem_ptr->sub_streams_2, remainder, &numerator_sign,
int_mem_ptr->negated_remainder, remainder,
int_mem_ptr->cmux_remainder_mem, bsks, ksks);
host_cmux<Torus>(int_mem_ptr->sub_streams_2, remainder, &numerator_sign,
int_mem_ptr->negated_remainder, remainder,
int_mem_ptr->cmux_remainder_mem, bsks, ksks);
int_mem_ptr->sub_streams_1.synchronize();
int_mem_ptr->sub_streams_2.synchronize();
} else {
host_unsigned_integer_div_rem_kb<Torus>(streams, quotient, remainder,
numerator, divisor, bsks, ksks,
int_mem_ptr->unsigned_mem);
host_unsigned_integer_div_rem<Torus>(streams, quotient, remainder,
numerator, divisor, bsks, ksks,
int_mem_ptr->unsigned_mem);
}
}

View File

@@ -1,6 +1,6 @@
#include "ilog2.cuh"
uint64_t scratch_integer_count_of_consecutive_bits_kb_64(
uint64_t scratch_integer_count_of_consecutive_bits_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
@@ -26,7 +26,7 @@ uint64_t scratch_integer_count_of_consecutive_bits_kb_64(
// the leading or trailing end of an encrypted integer. The final count is
// stored in the output ciphertext.
//
void cuda_integer_count_of_consecutive_bits_kb_64(
void cuda_integer_count_of_consecutive_bits_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_ct,
CudaRadixCiphertextFFI const *input_ct, int8_t *mem_ptr, void *const *bsks,
void *const *ksks) {
@@ -37,8 +37,8 @@ void cuda_integer_count_of_consecutive_bits_kb_64(
(uint64_t **)ksks);
}
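
A plaintext reference for the operation these entry points expose, assuming we count leading zeros of a 16-bit value (the bit value and direction are parameters of the real buffer):

#include <cstdint>
#include <cstdio>

static uint32_t count_leading_zeros_16(uint16_t x) {
  uint32_t count = 0;
  for (int bit = 15; bit >= 0 && ((x >> bit) & 1) == 0; --bit)
    ++count; // stop at the first set bit
  return count;
}

int main() {
  std::printf("%u %u %u\n", count_leading_zeros_16(0), // 16
              count_leading_zeros_16(1),               // 15
              count_leading_zeros_16(0x0800));         // 4
}
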
void cleanup_cuda_integer_count_of_consecutive_bits_kb_64(
CudaStreamsFFI streams, int8_t **mem_ptr_void) {
void cleanup_cuda_integer_count_of_consecutive_bits_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_count_of_consecutive_bits_buffer<uint64_t> *mem_ptr =
(int_count_of_consecutive_bits_buffer<uint64_t> *)(*mem_ptr_void);
@@ -49,7 +49,7 @@ void cleanup_cuda_integer_count_of_consecutive_bits_kb_64(
*mem_ptr_void = nullptr;
}
uint64_t scratch_integer_ilog2_kb_64(
uint64_t scratch_integer_ilog2_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
@@ -74,7 +74,7 @@ uint64_t scratch_integer_ilog2_kb_64(
// This is equivalent to finding the position of the most significant bit.
// The result is stored in the output ciphertext.
//
void cuda_integer_ilog2_kb_64(
void cuda_integer_ilog2_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_ct,
CudaRadixCiphertextFFI const *input_ct,
CudaRadixCiphertextFFI const *trivial_ct_neg_n,
@@ -88,8 +88,8 @@ void cuda_integer_ilog2_kb_64(
(uint64_t **)ksks);
}
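
A plaintext reference matching the documented semantics above: ilog2(x) is the position of the most significant set bit, i.e. floor(log2(x)) for x > 0.

#include <cstdint>
#include <cstdio>

static uint32_t ilog2_ref(uint64_t x) {
  uint32_t pos = 0;
  while (x >>= 1)
    ++pos; // one shift per bit position above the lowest
  return pos;
}

int main() {
  std::printf("%u %u %u\n", ilog2_ref(1), ilog2_ref(8), ilog2_ref(1000)); // 0 3 9
}
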
void cleanup_cuda_integer_ilog2_kb_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
void cleanup_cuda_integer_ilog2_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_ilog2_buffer<uint64_t> *mem_ptr =
(int_ilog2_buffer<uint64_t> *)(*mem_ptr_void);

View File

@@ -14,8 +14,8 @@ __host__ void host_integer_prepare_count_of_consecutive_bits(
auto tmp = mem_ptr->tmp_ct;
host_apply_univariate_lut_kb<Torus>(streams, tmp, ciphertext,
mem_ptr->univ_lut_mem, ksks, bsks);
host_apply_univariate_lut<Torus>(streams, tmp, ciphertext,
mem_ptr->univ_lut_mem, ksks, bsks);
if (mem_ptr->direction == Leading) {
host_radix_blocks_reverse_inplace<Torus>(streams, tmp);
@@ -72,7 +72,7 @@ __host__ void host_integer_count_of_consecutive_bits(
output_start_index + 1, ct_prepared, i, i + 1);
}
host_integer_partial_sum_ciphertexts_vec_kb<Torus>(
host_integer_partial_sum_ciphertexts_vec<Torus>(
streams, output_ct, cts, bsks, ksks, mem_ptr->sum_mem, counter_num_blocks,
ct_prepared->num_radix_blocks);
@@ -141,19 +141,19 @@ host_integer_ilog2(CudaStreams streams, CudaRadixCiphertextFFI *output_ct,
// Perform a partial sum of all the elements without carry propagation.
//
host_integer_partial_sum_ciphertexts_vec_kb<Torus>(
host_integer_partial_sum_ciphertexts_vec<Torus>(
streams, mem_ptr->sum_output_not_propagated, mem_ptr->sum_input_cts, bsks,
ksks, mem_ptr->sum_mem, mem_ptr->counter_num_blocks,
mem_ptr->input_num_blocks + 1);
// Apply luts to the partial sum.
//
host_apply_univariate_lut_kb<Torus>(streams, mem_ptr->message_blocks_not,
mem_ptr->sum_output_not_propagated,
mem_ptr->lut_message_not, ksks, bsks);
host_apply_univariate_lut_kb<Torus>(streams, mem_ptr->carry_blocks_not,
mem_ptr->sum_output_not_propagated,
mem_ptr->lut_carry_not, ksks, bsks);
host_apply_univariate_lut<Torus>(streams, mem_ptr->message_blocks_not,
mem_ptr->sum_output_not_propagated,
mem_ptr->lut_message_not, ksks, bsks);
host_apply_univariate_lut<Torus>(streams, mem_ptr->carry_blocks_not,
mem_ptr->sum_output_not_propagated,
mem_ptr->lut_carry_not, ksks, bsks);
// Left-shift the bitwise-negated carry blocks by one position.
//
@@ -190,7 +190,7 @@ host_integer_ilog2(CudaStreams streams, CudaRadixCiphertextFFI *output_ct,
2 * mem_ptr->counter_num_blocks, 3 * mem_ptr->counter_num_blocks,
trivial_ct_2, 0, mem_ptr->counter_num_blocks);
host_integer_partial_sum_ciphertexts_vec_kb<Torus>(
host_integer_partial_sum_ciphertexts_vec<Torus>(
streams, output_ct, mem_ptr->sum_input_cts, bsks, ksks, mem_ptr->sum_mem,
mem_ptr->counter_num_blocks, 3);

View File

@@ -43,7 +43,7 @@ void cleanup_cuda_full_propagation(CudaStreamsFFI streams,
*mem_ptr_void = nullptr;
}
uint64_t scratch_cuda_propagate_single_carry_kb_64_inplace(
uint64_t scratch_cuda_propagate_single_carry_64_inplace(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -56,12 +56,12 @@ uint64_t scratch_cuda_propagate_single_carry_kb_64_inplace(
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_propagate_single_carry_kb_inplace<uint64_t>(
return scratch_cuda_propagate_single_carry_inplace<uint64_t>(
CudaStreams(streams), (int_sc_prop_memory<uint64_t> **)mem_ptr,
num_blocks, params, requested_flag, allocate_gpu_memory);
}
uint64_t scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
uint64_t scratch_cuda_add_and_propagate_single_carry_64_inplace(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -74,12 +74,12 @@ uint64_t scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_propagate_single_carry_kb_inplace<uint64_t>(
return scratch_cuda_propagate_single_carry_inplace<uint64_t>(
CudaStreams(streams), (int_sc_prop_memory<uint64_t> **)mem_ptr,
num_blocks, params, requested_flag, allocate_gpu_memory);
}
uint64_t scratch_cuda_integer_overflowing_sub_kb_64_inplace(
uint64_t scratch_cuda_integer_overflowing_sub_64_inplace(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -97,7 +97,7 @@ uint64_t scratch_cuda_integer_overflowing_sub_kb_64_inplace(
num_blocks, params, compute_overflow, allocate_gpu_memory);
}
void cuda_propagate_single_carry_kb_64_inplace(
void cuda_propagate_single_carry_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
CudaRadixCiphertextFFI *carry_out, const CudaRadixCiphertextFFI *carry_in,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
@@ -109,7 +109,7 @@ void cuda_propagate_single_carry_kb_64_inplace(
requested_flag, uses_carry);
}
void cuda_add_and_propagate_single_carry_kb_64_inplace(
void cuda_add_and_propagate_single_carry_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lhs_array,
const CudaRadixCiphertextFFI *rhs_array, CudaRadixCiphertextFFI *carry_out,
const CudaRadixCiphertextFFI *carry_in, int8_t *mem_ptr, void *const *bsks,
@@ -121,7 +121,7 @@ void cuda_add_and_propagate_single_carry_kb_64_inplace(
requested_flag, uses_carry);
}
void cuda_integer_overflowing_sub_kb_64_inplace(
void cuda_integer_overflowing_sub_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lhs_array,
const CudaRadixCiphertextFFI *rhs_array,
CudaRadixCiphertextFFI *overflow_block,
@@ -168,7 +168,7 @@ void cleanup_cuda_integer_overflowing_sub(CudaStreamsFFI streams,
POP_RANGE()
}
uint64_t scratch_cuda_apply_univariate_lut_kb_64(
uint64_t scratch_cuda_apply_univariate_lut_64(
CudaStreamsFFI streams, int8_t **mem_ptr, void const *input_lut,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
@@ -182,13 +182,13 @@ uint64_t scratch_cuda_apply_univariate_lut_kb_64(
grouping_factor, message_modulus, carry_modulus,
noise_reduction_type);
return scratch_cuda_apply_univariate_lut_kb<uint64_t>(
return scratch_cuda_apply_univariate_lut<uint64_t>(
CudaStreams(streams), (int_radix_lut<uint64_t> **)mem_ptr,
static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
lut_degree, allocate_gpu_memory);
}
uint64_t scratch_cuda_apply_many_univariate_lut_kb_64(
uint64_t scratch_cuda_apply_many_univariate_lut_64(
CudaStreamsFFI streams, int8_t **mem_ptr, void const *input_lut,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
@@ -202,24 +202,25 @@ uint64_t scratch_cuda_apply_many_univariate_lut_kb_64(
grouping_factor, message_modulus, carry_modulus,
noise_reduction_type);
return scratch_cuda_apply_many_univariate_lut_kb<uint64_t>(
return scratch_cuda_apply_many_univariate_lut<uint64_t>(
CudaStreams(streams), (int_radix_lut<uint64_t> **)mem_ptr,
static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
num_many_lut, lut_degree, allocate_gpu_memory);
}
void cuda_apply_univariate_lut_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
void *const *ksks, void *const *bsks) {
void cuda_apply_univariate_lut_64(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI const *input_radix_lwe,
int8_t *mem_ptr, void *const *ksks,
void *const *bsks) {
host_apply_univariate_lut_kb<uint64_t>(
host_apply_univariate_lut<uint64_t>(
CudaStreams(streams), output_radix_lwe, input_radix_lwe,
(int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks), bsks);
}
void cleanup_cuda_apply_univariate_lut_kb_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
void cleanup_cuda_apply_univariate_lut_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup univar lut")
int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
@@ -228,19 +229,19 @@ void cleanup_cuda_apply_univariate_lut_kb_64(CudaStreamsFFI streams,
POP_RANGE()
}
void cuda_apply_many_univariate_lut_kb_64(
void cuda_apply_many_univariate_lut_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
void *const *ksks, void *const *bsks, uint32_t num_many_lut,
uint32_t lut_stride) {
host_apply_many_univariate_lut_kb<uint64_t>(
host_apply_many_univariate_lut<uint64_t>(
CudaStreams(streams), output_radix_lwe, input_radix_lwe,
(int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks), bsks,
num_many_lut, lut_stride);
}
uint64_t scratch_cuda_apply_bivariate_lut_kb_64(
uint64_t scratch_cuda_apply_bivariate_lut_64(
CudaStreamsFFI streams, int8_t **mem_ptr, void const *input_lut,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
@@ -254,27 +255,27 @@ uint64_t scratch_cuda_apply_bivariate_lut_kb_64(
grouping_factor, message_modulus, carry_modulus,
noise_reduction_type);
return scratch_cuda_apply_bivariate_lut_kb<uint64_t>(
return scratch_cuda_apply_bivariate_lut<uint64_t>(
CudaStreams(streams), (int_radix_lut<uint64_t> **)mem_ptr,
static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
lut_degree, allocate_gpu_memory);
}
void cuda_apply_bivariate_lut_kb_64(
void cuda_apply_bivariate_lut_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI const *input_radix_lwe_1,
CudaRadixCiphertextFFI const *input_radix_lwe_2, int8_t *mem_ptr,
void *const *ksks, void *const *bsks, uint32_t num_radix_blocks,
uint32_t shift) {
host_apply_bivariate_lut_kb<uint64_t>(
host_apply_bivariate_lut<uint64_t>(
CudaStreams(streams), output_radix_lwe, input_radix_lwe_1,
input_radix_lwe_2, (int_radix_lut<uint64_t> *)mem_ptr,
(uint64_t **)(ksks), bsks, num_radix_blocks, shift);
}
void cleanup_cuda_apply_bivariate_lut_kb_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
void cleanup_cuda_apply_bivariate_lut_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup bivar lut")
int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
@@ -298,7 +299,7 @@ uint64_t scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
grouping_factor, message_modulus, carry_modulus,
noise_reduction_type);
return scratch_cuda_apply_bivariate_lut_kb<uint64_t>(
return scratch_cuda_apply_bivariate_lut<uint64_t>(
CudaStreams(streams), (int_radix_lut<uint64_t> **)mem_ptr,
static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
lut_degree, allocate_gpu_memory);
@@ -360,7 +361,7 @@ uint64_t scratch_cuda_apply_noise_squashing_mem(
return size_tracker;
}
uint64_t scratch_cuda_apply_noise_squashing_kb(
uint64_t scratch_cuda_apply_noise_squashing(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_glwe_dimension, uint32_t input_polynomial_size,
@@ -381,20 +382,21 @@ uint64_t scratch_cuda_apply_noise_squashing_kb(
original_num_blocks, allocate_gpu_memory);
}
void cuda_apply_noise_squashing_kb(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
void *const *ksks, void *const *bsks) {
void cuda_apply_noise_squashing(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI const *input_radix_lwe,
int8_t *mem_ptr, void *const *ksks,
void *const *bsks) {
PUSH_RANGE("apply noise squashing")
integer_radix_apply_noise_squashing_kb<uint64_t>(
integer_radix_apply_noise_squashing<uint64_t>(
CudaStreams(streams), output_radix_lwe, input_radix_lwe,
(int_noise_squashing_lut<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks);
POP_RANGE()
}
void cleanup_cuda_apply_noise_squashing_kb(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
void cleanup_cuda_apply_noise_squashing(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup noise squashing")
int_noise_squashing_lut<uint64_t> *mem_ptr =
(int_noise_squashing_lut<uint64_t> *)(*mem_ptr_void);

View File

@@ -503,7 +503,7 @@ __host__ void host_pack_bivariate_blocks_with_single_block(
/// LUT. In scalar bitops we use a number of blocks that may be lower than or
/// equal to the input and output numbers of blocks
template <typename Torus>
__host__ void integer_radix_apply_univariate_lookup_table_kb(
__host__ void integer_radix_apply_univariate_lookup_table(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, void *const *bsks,
Torus *const *ksks, int_radix_lut<Torus> *lut, uint32_t num_radix_blocks) {
@@ -607,7 +607,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
}
template <typename Torus>
__host__ void integer_radix_apply_many_univariate_lookup_table_kb(
__host__ void integer_radix_apply_many_univariate_lookup_table(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, void *const *bsks,
Torus *const *ksks, int_radix_lut<Torus> *lut, uint32_t num_many_lut,
@@ -710,7 +710,7 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
}
template <typename Torus>
__host__ void integer_radix_apply_bivariate_lookup_table_kb(
__host__ void integer_radix_apply_bivariate_lookup_table(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_1,
CudaRadixCiphertextFFI const *lwe_array_2, void *const *bsks,
@@ -1279,7 +1279,7 @@ void host_compute_shifted_blocks_and_states(
auto shifted_blocks_and_states = mem->shifted_blocks_and_states;
auto luts_array_first_step = mem->luts_array_first_step;
integer_radix_apply_many_univariate_lookup_table_kb<Torus>(
integer_radix_apply_many_univariate_lookup_table<Torus>(
streams, shifted_blocks_and_states, lwe_array, bsks, ksks,
luts_array_first_step, num_many_lut, lut_stride);
@@ -1347,7 +1347,7 @@ void host_resolve_group_carries_sequentially(
as_radix_ciphertext_slice<Torus>(&shifted_group_resolved_carries,
group_resolved_carries, 1,
blocks_to_solve + 1);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, &shifted_group_resolved_carries,
&shifted_group_resolved_carries, bsks, ksks, luts_sequential,
blocks_to_solve);
@@ -1388,7 +1388,7 @@ void host_compute_prefix_sum_hillis_steele(
auto prev_blocks = generates_or_propagates;
int cur_total_blocks = num_radix_blocks - space;
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
integer_radix_apply_bivariate_lookup_table<Torus>(
streams, &cur_blocks, &cur_blocks, prev_blocks, bsks, ksks, luts,
cur_total_blocks, luts->params.message_modulus);
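
A plaintext sketch of the Hillis-Steele scan driving this loop: each round, every position combines with the position `space` slots behind it, `space` doubles, and n blocks resolve in ceil(log2(n)) rounds. Plain addition stands in for the generate/propagate combine the bivariate LUT performs on encrypted blocks:

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  std::vector<uint64_t> blocks = {1, 0, 2, 1, 0, 3, 1, 1};
  for (size_t space = 1; space < blocks.size(); space *= 2)
    // walk downwards so every read still sees the previous round's value
    for (size_t i = blocks.size() - 1; i >= space; --i)
      blocks[i] = blocks[i] + blocks[i - space]; // the bivariate combine
  for (uint64_t b : blocks)
    std::printf("%llu ", (unsigned long long)b); // inclusive prefix sums
  std::printf("\n");
}
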
@@ -1426,11 +1426,11 @@ void host_compute_propagation_simulators_and_group_carries(
block_states, num_radix_blocks, group_size);
auto luts_array_second_step = mem->luts_array_second_step;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, propagation_cum_sums, propagation_cum_sums, bsks, ksks,
luts_array_second_step, num_radix_blocks);
host_integer_radix_scalar_addition_inplace<Torus>(
host_scalar_addition_inplace<Torus>(
streams, propagation_cum_sums, mem->scalar_array_cum_sum,
mem->h_scalar_array_cum_sum, num_radix_blocks, message_modulus,
carry_modulus);
@@ -1478,7 +1478,7 @@ void host_compute_shifted_blocks_and_borrow_states(
auto shifted_blocks_and_borrow_states = mem->shifted_blocks_and_borrow_states;
auto luts_array_first_step = mem->luts_array_first_step;
integer_radix_apply_many_univariate_lookup_table_kb<Torus>(
integer_radix_apply_many_univariate_lookup_table<Torus>(
streams, shifted_blocks_and_borrow_states, lwe_array, bsks, ksks,
luts_array_first_step, num_many_lut, lut_stride);
@@ -1682,7 +1682,7 @@ extract_n_bits(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
num_radix_blocks);
}
}
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, lwe_array_out, lwe_array_out, bsks, ksks, bit_extract->lut,
effective_num_radix_blocks);
}
@@ -1738,7 +1738,7 @@ reduce_signs(CudaStreams streams, CudaRadixCiphertextFFI *signs_array_out,
while (num_sign_blocks > 2) {
pack_blocks<Torus>(streams.stream(0), streams.gpu_index(0), signs_b,
signs_a, num_sign_blocks, message_modulus);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, signs_a, signs_b, bsks, ksks, lut, num_sign_blocks / 2);
if (num_sign_blocks % 2 == 1)
@@ -1768,7 +1768,7 @@ reduce_signs(CudaStreams streams, CudaRadixCiphertextFFI *signs_array_out,
pack_blocks<Torus>(streams.stream(0), streams.gpu_index(0), signs_b,
signs_a, num_sign_blocks, message_modulus);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, signs_array_out, signs_b, bsks, ksks, lut, 1);
} else {
@@ -1786,13 +1786,13 @@ reduce_signs(CudaStreams streams, CudaRadixCiphertextFFI *signs_array_out,
diff_buffer->preallocated_h_lut2);
lut->broadcast_lut(lut->active_streams);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, signs_array_out, signs_a, bsks, ksks, lut, 1);
}
}
template <typename Torus>
uint64_t scratch_cuda_apply_univariate_lut_kb(
uint64_t scratch_cuda_apply_univariate_lut(
CudaStreams streams, int_radix_lut<Torus> **mem_ptr, Torus const *input_lut,
uint32_t num_radix_blocks, int_radix_params params, uint64_t lut_degree,
bool allocate_gpu_memory) {
@@ -1814,19 +1814,19 @@ uint64_t scratch_cuda_apply_univariate_lut_kb(
}
template <typename Torus>
void host_apply_univariate_lut_kb(CudaStreams streams,
CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI const *radix_lwe_in,
int_radix_lut<Torus> *mem, Torus *const *ksks,
void *const *bsks) {
void host_apply_univariate_lut(CudaStreams streams,
CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI const *radix_lwe_in,
int_radix_lut<Torus> *mem, Torus *const *ksks,
void *const *bsks) {
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, radix_lwe_out, radix_lwe_in, bsks, ksks, mem,
radix_lwe_out->num_radix_blocks);
}
template <typename Torus>
uint64_t scratch_cuda_apply_many_univariate_lut_kb(
uint64_t scratch_cuda_apply_many_univariate_lut(
CudaStreams streams, int_radix_lut<Torus> **mem_ptr, Torus const *input_lut,
uint32_t num_radix_blocks, int_radix_params params, uint32_t num_many_lut,
uint64_t lut_degree, bool allocate_gpu_memory) {
@@ -1849,19 +1849,21 @@ uint64_t scratch_cuda_apply_many_univariate_lut_kb(
}
template <typename Torus>
void host_apply_many_univariate_lut_kb(
CudaStreams streams, CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI const *radix_lwe_in, int_radix_lut<Torus> *mem,
Torus *const *ksks, void *const *bsks, uint32_t num_many_lut,
uint32_t lut_stride) {
void host_apply_many_univariate_lut(CudaStreams streams,
CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI const *radix_lwe_in,
int_radix_lut<Torus> *mem,
Torus *const *ksks, void *const *bsks,
uint32_t num_many_lut,
uint32_t lut_stride) {
integer_radix_apply_many_univariate_lookup_table_kb<Torus>(
integer_radix_apply_many_univariate_lookup_table<Torus>(
streams, radix_lwe_out, radix_lwe_in, bsks, ksks, mem, num_many_lut,
lut_stride);
}
template <typename Torus>
uint64_t scratch_cuda_apply_bivariate_lut_kb(
uint64_t scratch_cuda_apply_bivariate_lut(
CudaStreams streams, int_radix_lut<Torus> **mem_ptr, Torus const *input_lut,
uint32_t num_radix_blocks, int_radix_params params, uint64_t lut_degree,
bool allocate_gpu_memory) {
@@ -1883,21 +1885,21 @@ uint64_t scratch_cuda_apply_bivariate_lut_kb(
}
template <typename Torus>
void host_apply_bivariate_lut_kb(CudaStreams streams,
CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI const *radix_lwe_in_1,
CudaRadixCiphertextFFI const *radix_lwe_in_2,
int_radix_lut<Torus> *mem, Torus *const *ksks,
void *const *bsks, uint32_t num_radix_blocks,
uint32_t shift) {
void host_apply_bivariate_lut(CudaStreams streams,
CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI const *radix_lwe_in_1,
CudaRadixCiphertextFFI const *radix_lwe_in_2,
int_radix_lut<Torus> *mem, Torus *const *ksks,
void *const *bsks, uint32_t num_radix_blocks,
uint32_t shift) {
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
integer_radix_apply_bivariate_lookup_table<Torus>(
streams, radix_lwe_out, radix_lwe_in_1, radix_lwe_in_2, bsks, ksks, mem,
num_radix_blocks, shift);
}
template <typename Torus>
uint64_t scratch_cuda_propagate_single_carry_kb_inplace(
uint64_t scratch_cuda_propagate_single_carry_inplace(
CudaStreams streams, int_sc_prop_memory<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params, uint32_t requested_flag,
bool allocate_gpu_memory) {
@@ -1992,7 +1994,7 @@ void host_propagate_single_carry(CudaStreams streams,
copy_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), prepared_blocks,
num_radix_blocks, num_radix_blocks + 1, &output_flag, 0, 1);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, mem->output_flag, prepared_blocks, bsks, ksks,
mem->lut_message_extract, num_radix_blocks + 1);
@@ -2004,7 +2006,7 @@ void host_propagate_single_carry(CudaStreams streams,
mem->output_flag, num_radix_blocks, num_radix_blocks + 1);
} else {
auto message_extract = mem->lut_message_extract;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, lwe_array, prepared_blocks, bsks, ksks, message_extract,
num_radix_blocks);
}
@@ -2077,7 +2079,7 @@ void host_add_and_propagate_single_carry(
auto block_states = mem->shifted_blocks_state_mem->block_states;
if (requested_flag == outputFlag::FLAG_OVERFLOW) {
auto lut_overflow_prep = mem->lut_overflow_flag_prep;
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
integer_radix_apply_bivariate_lookup_table<Torus>(
streams, &output_flag, mem->last_lhs, mem->last_rhs, bsks, ksks,
lut_overflow_prep, 1, lut_overflow_prep->params.message_modulus);
} else if (requested_flag == outputFlag::FLAG_CARRY) {
@@ -2140,7 +2142,7 @@ void host_add_and_propagate_single_carry(
copy_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), prepared_blocks,
num_radix_blocks, num_radix_blocks + 1, &output_flag, 0, 1);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, mem->output_flag, prepared_blocks, bsks, ksks,
mem->lut_message_extract, num_radix_blocks + 1);
@@ -2152,7 +2154,7 @@ void host_add_and_propagate_single_carry(
streams.stream(0), streams.gpu_index(0), carry_out, 0, 1,
mem->output_flag, num_radix_blocks, num_radix_blocks + 1);
} else {
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, lhs_array, prepared_blocks, bsks, ksks,
mem->lut_message_extract, num_radix_blocks);
}
@@ -2227,8 +2229,8 @@ void host_single_borrow_propagate(CudaStreams streams,
(Torus *)prepared_blocks->ptr, shifted_blocks,
simulators, big_lwe_dimension, num_radix_blocks);
host_integer_radix_add_scalar_one_inplace<Torus>(
streams, prepared_blocks, message_modulus, carry_modulus);
host_add_scalar_one_inplace<Torus>(streams, prepared_blocks, message_modulus,
carry_modulus);
if (compute_overflow == outputFlag::FLAG_OVERFLOW) {
CudaRadixCiphertextFFI shifted_simulators;
@@ -2268,7 +2270,7 @@ void host_single_borrow_propagate(CudaStreams streams,
if (compute_overflow == outputFlag::FLAG_OVERFLOW) {
auto borrow_flag = mem->lut_borrow_flag;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
mem->sub_streams_1, overflow_block, mem->overflow_block, bsks, ksks,
borrow_flag, 1);
}
@@ -2290,7 +2292,7 @@ void host_single_borrow_propagate(CudaStreams streams,
mem->group_size);
auto message_extract = mem->lut_message_extract;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
mem->sub_streams_2, lwe_array, prepared_blocks, bsks, ksks,
message_extract, num_radix_blocks);
@@ -2308,7 +2310,7 @@ void host_single_borrow_propagate(CudaStreams streams,
/// LUT. In scalar bitops we use a number of blocks that may be lower than or
/// equal to the input and output numbers of blocks
template <typename InputTorus>
__host__ void integer_radix_apply_noise_squashing_kb(
__host__ void integer_radix_apply_noise_squashing(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in,
int_noise_squashing_lut<InputTorus> *lut, void *const *bsks,

View File

@@ -65,7 +65,7 @@ void generate_ids_update_degrees(uint64_t *terms_degree, size_t *h_lwe_idx_in,
* This scratch function allocates the necessary amount of data on the GPU for
* the integer radix multiplication in keyswitch->bootstrap order.
*/
uint64_t scratch_cuda_integer_mult_radix_ciphertext_kb_64(
uint64_t scratch_cuda_integer_mult_radix_ciphertext_64(
CudaStreamsFFI streams, int8_t **mem_ptr, bool const is_boolean_left,
bool const is_boolean_right, uint32_t message_modulus,
uint32_t carry_modulus, uint32_t glwe_dimension, uint32_t lwe_dimension,
@@ -87,7 +87,7 @@ uint64_t scratch_cuda_integer_mult_radix_ciphertext_kb_64(
case 4096:
case 8192:
case 16384:
return scratch_cuda_integer_mult_radix_ciphertext_kb<uint64_t>(
return scratch_cuda_integer_mult_radix_ciphertext<uint64_t>(
CudaStreams(streams), (int_mul_memory<uint64_t> **)mem_ptr,
is_boolean_left, is_boolean_right, num_radix_blocks, params,
allocate_gpu_memory);
@@ -124,7 +124,7 @@ uint64_t scratch_cuda_integer_mult_radix_ciphertext_kb_64(
* ciphertext
* - 'pbs_type' selects which PBS implementation should be used
*/
void cuda_integer_mult_radix_ciphertext_kb_64(
void cuda_integer_mult_radix_ciphertext_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI const *radix_lwe_left, bool const is_bool_left,
CudaRadixCiphertextFFI const *radix_lwe_right, bool const is_bool_right,
@@ -133,43 +133,43 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
PUSH_RANGE("mul")
switch (polynomial_size) {
case 256:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<256>>(
host_integer_mult_radix<uint64_t, AmortizedDegree<256>>(
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
break;
case 512:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<512>>(
host_integer_mult_radix<uint64_t, AmortizedDegree<512>>(
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
break;
case 1024:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<1024>>(
host_integer_mult_radix<uint64_t, AmortizedDegree<1024>>(
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
break;
case 2048:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<2048>>(
host_integer_mult_radix<uint64_t, AmortizedDegree<2048>>(
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
break;
case 4096:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<4096>>(
host_integer_mult_radix<uint64_t, AmortizedDegree<4096>>(
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
break;
case 8192:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<8192>>(
host_integer_mult_radix<uint64_t, AmortizedDegree<8192>>(
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
break;
case 16384:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<16384>>(
host_integer_mult_radix<uint64_t, AmortizedDegree<16384>>(
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
@@ -192,7 +192,7 @@ void cleanup_cuda_integer_mult(CudaStreamsFFI streams, int8_t **mem_ptr_void) {
POP_RANGE()
}
uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
uint64_t scratch_cuda_partial_sum_ciphertexts_vec_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
@@ -207,30 +207,31 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
ks_level, ks_base_log, pbs_level, pbs_base_log,
grouping_factor, message_modulus, carry_modulus,
noise_reduction_type);
return scratch_cuda_integer_partial_sum_ciphertexts_vec_kb<uint64_t>(
return scratch_cuda_integer_partial_sum_ciphertexts_vec<uint64_t>(
CudaStreams(streams),
(int_sum_ciphertexts_vec_memory<uint64_t> **)mem_ptr, num_blocks_in_radix,
max_num_radix_in_vec, reduce_degrees_for_single_carry_propagation, params,
allocate_gpu_memory);
}
void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI *radix_lwe_vec, int8_t *mem_ptr, void *const *bsks,
void *const *ksks) {
void cuda_partial_sum_ciphertexts_vec_64(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI *radix_lwe_vec,
int8_t *mem_ptr, void *const *bsks,
void *const *ksks) {
auto mem = (int_sum_ciphertexts_vec_memory<uint64_t> *)mem_ptr;
if (radix_lwe_vec->num_radix_blocks % radix_lwe_out->num_radix_blocks != 0)
PANIC("Cuda error: input vector length should be a multiple of the "
"output's number of radix blocks")
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t>(
host_integer_partial_sum_ciphertexts_vec<uint64_t>(
CudaStreams(streams), radix_lwe_out, radix_lwe_vec, bsks,
(uint64_t **)(ksks), mem, radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
}
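
A minimal sketch of the shape contract this entry point enforces (hypothetical standalone C++, not part of the change): the input vector packs several radix integers back to back, so its block count must be a whole multiple of the output's.

#include <cassert>
#include <cstdint>

int main() {
  // Hypothetical sizes: 8 input blocks holding two 4-block radix integers.
  uint32_t out_blocks = 4;
  uint32_t vec_blocks = 8;
  assert(vec_blocks % out_blocks == 0);                // mirrors the PANIC above
  uint32_t num_radix_in_vec = vec_blocks / out_blocks; // radix integers to sum
  assert(num_radix_in_vec == 2);
  return 0;
}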
void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
CudaStreamsFFI streams, int8_t **mem_ptr_void) {
void cleanup_cuda_partial_sum_ciphertexts_vec(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr =
(int_sum_ciphertexts_vec_memory<uint64_t> *)(*mem_ptr_void);


@@ -268,7 +268,7 @@ __global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
}
template <typename Torus>
__host__ uint64_t scratch_cuda_integer_partial_sum_ciphertexts_vec_kb(
__host__ uint64_t scratch_cuda_integer_partial_sum_ciphertexts_vec(
CudaStreams streams, int_sum_ciphertexts_vec_memory<Torus> **mem_ptr,
uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
bool reduce_degrees_for_single_carry_propagation, int_radix_params params,
@@ -283,7 +283,7 @@ __host__ uint64_t scratch_cuda_integer_partial_sum_ciphertexts_vec_kb(
}
template <typename Torus>
__host__ void host_integer_partial_sum_ciphertexts_vec_kb(
__host__ void host_integer_partial_sum_ciphertexts_vec(
CudaStreams streams, CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI *terms, void *const *bsks, uint64_t *const *ksks,
int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr,
@@ -412,7 +412,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
luts_message_carry->broadcast_lut(active_streams, false);
luts_message_carry->using_trivial_lwe_indexes = false;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, current_blocks, current_blocks, bsks, ksks,
luts_message_carry, total_ciphertexts);
}
@@ -463,7 +463,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
luts_message_carry->broadcast_lut(active_streams, false);
luts_message_carry->using_trivial_lwe_indexes = false;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
active_streams, current_blocks, radix_lwe_out, bsks, ksks,
luts_message_carry, num_blocks_in_apply_lut);
}
@@ -483,7 +483,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
}
template <typename Torus, class params>
__host__ void host_integer_mult_radix_kb(
__host__ void host_integer_mult_radix(
CudaStreams streams, CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI const *radix_lwe_left, bool const is_bool_left,
CudaRadixCiphertextFFI const *radix_lwe_right, bool const is_bool_right,
@@ -580,7 +580,7 @@ __host__ void host_integer_mult_radix_kb(
(Torus *)vector_lsb_rhs->ptr, (Torus *)vector_msb_rhs.ptr, num_blocks);
check_cuda_error(cudaGetLastError());
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
integer_radix_apply_bivariate_lookup_table<Torus>(
streams, block_mul_res, block_mul_res, vector_result_sb, bsks, ksks,
luts_array, total_block_count, luts_array->params.message_modulus);
@@ -608,7 +608,7 @@ __host__ void host_integer_mult_radix_kb(
size_t b_id = i % num_blocks;
terms_degree_msb[i] = (b_id > r_id) ? message_modulus - 2 : 0;
}
host_integer_partial_sum_ciphertexts_vec_kb<Torus>(
host_integer_partial_sum_ciphertexts_vec<Torus>(
streams, radix_lwe_out, vector_result_sb, bsks, ksks,
mem_ptr->sum_ciphertexts_mem, num_blocks, 2 * num_blocks);
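
The lsb/msb term vectors summed here mirror clear-radix schoolbook multiplication, where each block product splits into a low part kept at position i+j and a high part at position i+j+1. A hypothetical clear-integer sketch, with base B standing in for message_modulus:

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  const uint64_t B = 4;                             // message_modulus
  std::vector<uint64_t> lhs = {3, 1}, rhs = {2, 3}; // 7 and 14 in base 4, LSB first
  std::vector<uint64_t> acc(4, 0);
  for (size_t i = 0; i < lhs.size(); i++)
    for (size_t j = 0; j < rhs.size(); j++) {
      uint64_t p = lhs[i] * rhs[j];
      acc[i + j] += p % B;     // lsb partial product
      acc[i + j + 1] += p / B; // msb partial product
    }
  for (size_t k = 0; k + 1 < acc.size(); k++) { // carry propagation
    acc[k + 1] += acc[k] / B;
    acc[k] %= B;
  }
  uint64_t val = 0;
  for (size_t k = acc.size(); k-- > 0;)
    val = val * B + acc[k];
  assert(val == 7 * 14);
  return 0;
}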
@@ -621,7 +621,7 @@ __host__ void host_integer_mult_radix_kb(
}
template <typename Torus>
__host__ uint64_t scratch_cuda_integer_mult_radix_ciphertext_kb(
__host__ uint64_t scratch_cuda_integer_mult_radix_ciphertext(
CudaStreams streams, int_mul_memory<Torus> **mem_ptr,
bool const is_boolean_left, bool const is_boolean_right,
uint32_t num_radix_blocks, int_radix_params params,


@@ -1,11 +1,11 @@
#include "integer/negation.cuh"
void cuda_negate_integer_radix_ciphertext_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, uint32_t message_modulus,
uint32_t carry_modulus, uint32_t num_radix_blocks) {
void cuda_negate_ciphertext_64(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in,
uint32_t message_modulus, uint32_t carry_modulus,
uint32_t num_radix_blocks) {
host_integer_radix_negation<uint64_t>(CudaStreams(streams), lwe_array_out,
lwe_array_in, message_modulus,
carry_modulus, num_radix_blocks);
host_negation<uint64_t>(CudaStreams(streams), lwe_array_out, lwe_array_in,
message_modulus, carry_modulus, num_radix_blocks);
}


@@ -17,10 +17,9 @@
#include <vector>
template <typename Torus>
__global__ void
device_integer_radix_negation(Torus *output, Torus const *input,
int32_t num_blocks, uint64_t lwe_dimension,
uint64_t message_modulus, uint64_t delta) {
__global__ void device_negation(Torus *output, Torus const *input,
int32_t num_blocks, uint64_t lwe_dimension,
uint64_t message_modulus, uint64_t delta) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < lwe_dimension + 1) {
bool is_body = (tid == lwe_dimension);
@@ -49,10 +48,11 @@ device_integer_radix_negation(Torus *output, Torus const *input,
}
template <typename Torus>
__host__ void host_integer_radix_negation(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, uint64_t message_modulus,
uint64_t carry_modulus, uint32_t num_radix_blocks) {
__host__ void host_negation(CudaStreams streams,
CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in,
uint64_t message_modulus, uint64_t carry_modulus,
uint32_t num_radix_blocks) {
cuda_set_device(streams.gpu_index(0));
if (lwe_array_out->num_radix_blocks < num_radix_blocks ||
@@ -80,7 +80,7 @@ __host__ void host_integer_radix_negation(
// this
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
device_integer_radix_negation<Torus><<<grid, thds, 0, streams.stream(0)>>>(
device_negation<Torus><<<grid, thds, 0, streams.stream(0)>>>(
static_cast<Torus *>(lwe_array_out->ptr),
static_cast<Torus *>(lwe_array_in->ptr), num_radix_blocks, lwe_dimension,
message_modulus, delta);
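
The delta used above is the usual plaintext scaling factor; a worked example as hypothetical standalone code, assuming a 64-bit torus with one padding bit and 2_2 parameters (message_modulus = 4, carry_modulus = 4):

#include <cassert>
#include <cstdint>

int main() {
  uint64_t message_modulus = 4, carry_modulus = 4;
  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
  assert(delta == (uint64_t)1 << 59); // 2^63 / 16
  uint64_t m = 3;
  uint64_t plaintext = m * delta; // message sits in the top bits, below padding
  assert(plaintext >> 59 == 3);
  return 0;
}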


@@ -21,11 +21,11 @@ uint64_t scratch_cuda_integer_grouped_oprf_64(
allocate_gpu_memory);
}
void cuda_integer_grouped_oprf_async_64(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *radix_lwe_out,
const void *seeded_lwe_input,
uint32_t num_blocks_to_process,
int8_t *mem, void *const *bsks) {
void cuda_integer_grouped_oprf_64(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *radix_lwe_out,
const void *seeded_lwe_input,
uint32_t num_blocks_to_process, int8_t *mem,
void *const *bsks) {
host_integer_grouped_oprf<uint64_t>(
CudaStreams(streams), radix_lwe_out, (const uint64_t *)seeded_lwe_input,


@@ -1,11 +1,11 @@
#include "integer/scalar_addition.cuh"
void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
void cuda_scalar_addition_ciphertext_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
void const *scalar_input, void const *h_scalar_input, uint32_t num_scalars,
uint32_t message_modulus, uint32_t carry_modulus) {
host_integer_radix_scalar_addition_inplace<uint64_t>(
host_scalar_addition_inplace<uint64_t>(
CudaStreams(streams), lwe_array,
static_cast<const uint64_t *>(scalar_input),
static_cast<const uint64_t *>(h_scalar_input), num_scalars,


@@ -12,9 +12,10 @@
#include <stdio.h>
template <typename Torus>
__global__ void device_integer_radix_scalar_addition_inplace(
Torus *lwe_array, Torus const *scalar_input, int32_t num_blocks,
uint32_t lwe_dimension, uint64_t delta) {
__global__ void
device_scalar_addition_inplace(Torus *lwe_array, Torus const *scalar_input,
int32_t num_blocks, uint32_t lwe_dimension,
uint64_t delta) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < num_blocks) {
@@ -24,7 +25,7 @@ __global__ void device_integer_radix_scalar_addition_inplace(
}
template <typename Torus>
__host__ void host_integer_radix_scalar_addition_inplace(
__host__ void host_scalar_addition_inplace(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array,
Torus const *scalar_input, Torus const *h_scalar_input,
uint32_t num_scalars, uint32_t message_modulus, uint32_t carry_modulus) {
@@ -45,10 +46,9 @@ __host__ void host_integer_radix_scalar_addition_inplace(
// this
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
device_integer_radix_scalar_addition_inplace<Torus>
<<<grid, thds, 0, streams.stream(0)>>>((Torus *)lwe_array->ptr,
scalar_input, num_scalars,
lwe_array->lwe_dimension, delta);
device_scalar_addition_inplace<Torus><<<grid, thds, 0, streams.stream(0)>>>(
(Torus *)lwe_array->ptr, scalar_input, num_scalars,
lwe_array->lwe_dimension, delta);
check_cuda_error(cudaGetLastError());
for (uint i = 0; i < num_scalars; i++) {
lwe_array->degrees[i] = lwe_array->degrees[i] + h_scalar_input[i];
@@ -56,9 +56,9 @@ __host__ void host_integer_radix_scalar_addition_inplace(
}
template <typename Torus>
__global__ void device_integer_radix_add_scalar_one_inplace(
Torus *lwe_array, int32_t num_blocks, uint32_t lwe_dimension,
uint64_t delta) {
__global__ void
device_add_scalar_one_inplace(Torus *lwe_array, int32_t num_blocks,
uint32_t lwe_dimension, uint64_t delta) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < num_blocks) {
@@ -68,9 +68,10 @@ __global__ void device_integer_radix_add_scalar_one_inplace(
}
template <typename Torus>
__host__ void host_integer_radix_add_scalar_one_inplace(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array,
uint32_t message_modulus, uint32_t carry_modulus) {
__host__ void host_add_scalar_one_inplace(CudaStreams streams,
CudaRadixCiphertextFFI *lwe_array,
uint32_t message_modulus,
uint32_t carry_modulus) {
cuda_set_device(streams.gpu_index(0));
// Create a 1-dimensional grid of threads
@@ -85,10 +86,9 @@ __host__ void host_integer_radix_add_scalar_one_inplace(
// this
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
device_integer_radix_add_scalar_one_inplace<Torus>
<<<grid, thds, 0, streams.stream(0)>>>((Torus *)lwe_array->ptr,
lwe_array->num_radix_blocks,
lwe_array->lwe_dimension, delta);
device_add_scalar_one_inplace<Torus><<<grid, thds, 0, streams.stream(0)>>>(
(Torus *)lwe_array->ptr, lwe_array->num_radix_blocks,
lwe_array->lwe_dimension, delta);
check_cuda_error(cudaGetLastError());
for (uint i = 0; i < lwe_array->num_radix_blocks; i++) {
lwe_array->degrees[i] = lwe_array->degrees[i] + 1;
@@ -96,9 +96,10 @@ __host__ void host_integer_radix_add_scalar_one_inplace(
}
template <typename Torus>
__global__ void device_integer_radix_scalar_subtraction_inplace(
Torus *lwe_array, Torus *scalar_input, int32_t num_blocks,
uint32_t lwe_dimension, uint64_t delta) {
__global__ void
device_scalar_subtraction_inplace(Torus *lwe_array, Torus *scalar_input,
int32_t num_blocks, uint32_t lwe_dimension,
uint64_t delta) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < num_blocks) {
@@ -110,7 +111,7 @@ __global__ void device_integer_radix_scalar_subtraction_inplace(
}
template <typename Torus>
__host__ void host_integer_radix_scalar_subtraction_inplace(
__host__ void host_scalar_subtraction_inplace(
CudaStreams streams, Torus *lwe_array, Torus *scalar_input,
uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus) {
@@ -128,7 +129,7 @@ __host__ void host_integer_radix_scalar_subtraction_inplace(
// this
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
device_integer_radix_scalar_subtraction_inplace<Torus>
device_scalar_subtraction_inplace<Torus>
<<<grid, thds, 0, streams.stream(0)>>>(lwe_array, scalar_input,
input_lwe_ciphertext_count,
lwe_dimension, delta);

View File

@@ -1,12 +1,12 @@
#include "integer/scalar_bitops.cuh"
void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
void cuda_scalar_bitop_ciphertext_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_input, void const *clear_blocks,
void const *h_clear_blocks, uint32_t num_clear_blocks, int8_t *mem_ptr,
void *const *bsks, void *const *ksks) {
host_integer_radix_scalar_bitop_kb<uint64_t>(
host_scalar_bitop<uint64_t>(
CudaStreams(streams), lwe_array_out, lwe_array_input,
static_cast<const uint64_t *>(clear_blocks),
static_cast<const uint64_t *>(h_clear_blocks), num_clear_blocks,


@@ -4,11 +4,12 @@
#include "integer/bitwise_ops.cuh"
template <typename Torus>
__host__ void host_integer_radix_scalar_bitop_kb(
CudaStreams streams, CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input, Torus const *clear_blocks,
Torus const *h_clear_blocks, uint32_t num_clear_blocks,
int_bitop_buffer<Torus> *mem_ptr, void *const *bsks, Torus *const *ksks) {
__host__ void
host_scalar_bitop(CudaStreams streams, CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input,
Torus const *clear_blocks, Torus const *h_clear_blocks,
uint32_t num_clear_blocks, int_bitop_buffer<Torus> *mem_ptr,
void *const *bsks, Torus *const *ksks) {
if (output->num_radix_blocks != input->num_radix_blocks)
PANIC("Cuda error: input and output num radix blocks must be equal")
@@ -47,7 +48,7 @@ __host__ void host_integer_radix_scalar_bitop_kb(
auto active_streams = streams.active_gpu_subset(num_clear_blocks);
lut->broadcast_lut(active_streams, false);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, output, input, bsks, ksks, lut, num_clear_blocks);
memcpy(output->degrees, degrees, num_clear_blocks * sizeof(uint64_t));


@@ -31,7 +31,7 @@ std::pair<bool, bool> get_invert_flags(COMPARISON_TYPE compare) {
return {invert_operands, invert_subtraction_result};
}
void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
void cuda_scalar_comparison_ciphertext_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, void const *scalar_blocks,
void const *h_scalar_blocks, int8_t *mem_ptr, void *const *bsks,
@@ -46,7 +46,7 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
switch (buffer->op) {
case EQ:
case NE:
host_integer_radix_scalar_equality_check_kb<uint64_t>(
host_scalar_equality_check<uint64_t>(
CudaStreams(streams), lwe_array_out, lwe_array_in,
static_cast<const uint64_t *>(scalar_blocks), buffer, bsks,
(uint64_t **)(ksks), num_radix_blocks, num_scalar_blocks);
@@ -58,7 +58,7 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
if (num_radix_blocks % 2 != 0 && num_radix_blocks != 1)
PANIC("Cuda error (scalar comparisons): the number of radix blocks has "
"to be even or equal to 1.")
host_integer_radix_scalar_difference_check_kb<uint64_t>(
host_scalar_difference_check<uint64_t>(
CudaStreams(streams), lwe_array_out, lwe_array_in,
static_cast<const uint64_t *>(scalar_blocks),
static_cast<const uint64_t *>(h_scalar_blocks), buffer,
@@ -70,7 +70,7 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
if (lwe_array_in->num_radix_blocks % 2 != 0)
PANIC("Cuda error (scalar max/min): the number of radix blocks has to be "
"even.")
host_integer_radix_scalar_maxmin_kb<uint64_t>(
host_scalar_maxmin<uint64_t>(
CudaStreams(streams), lwe_array_out, lwe_array_in,
static_cast<const uint64_t *>(scalar_blocks),
static_cast<const uint64_t *>(h_scalar_blocks), buffer, bsks,


@@ -25,11 +25,13 @@ Torus is_x_less_than_y_given_input_borrow(Torus last_x_block,
}
template <typename Torus>
__host__ void scalar_compare_radix_blocks_kb(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI *lwe_array_in, Torus *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks, uint32_t num_radix_blocks) {
__host__ void scalar_compare_radix_blocks(CudaStreams streams,
CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI *lwe_array_in,
Torus *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr,
void *const *bsks, Torus *const *ksks,
uint32_t num_radix_blocks) {
if (num_radix_blocks == 0)
return;
@@ -62,13 +64,13 @@ __host__ void scalar_compare_radix_blocks_kb(
subtracted_blocks, lwe_array_in);
// Subtract
// Here we need the true lwe sub, not the one that comes from shortint.
host_integer_radix_scalar_subtraction_inplace<Torus>(
host_scalar_subtraction_inplace<Torus>(
streams, (Torus *)subtracted_blocks->ptr, scalar_blocks,
big_lwe_dimension, num_radix_blocks, message_modulus, carry_modulus);
// Apply LUT to compare to 0
auto sign_lut = mem_ptr->eq_buffer->is_non_zero_lut;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, lwe_array_out, subtracted_blocks, bsks, ksks, sign_lut,
num_radix_blocks);
@@ -78,12 +80,12 @@ __host__ void scalar_compare_radix_blocks_kb(
// Add one
// Here Lhs can have the following values: (-1) % (message modulus * carry
// modulus), 0, 1. So the output values after the addition will be: 0, 1, 2
host_integer_radix_add_scalar_one_inplace<Torus>(
streams, lwe_array_out, message_modulus, carry_modulus);
host_add_scalar_one_inplace<Torus>(streams, lwe_array_out, message_modulus,
carry_modulus);
}
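
A hypothetical clear-value sketch of the encoding described in the comments above, assuming message_modulus = carry_modulus = 4 so block values live mod 16:

#include <cassert>
#include <cstdint>

int main() {
  const int64_t mod = 4 * 4;                   // message_modulus * carry_modulus
  for (int64_t cmp : {-1, 0, 1}) {             // per-block comparison result
    int64_t block = ((cmp % mod) + mod) % mod; // -1 wraps to 15
    int64_t out = (block + 1) % mod;           // "add scalar one"
    assert(out == cmp + 1);                    // {15, 0, 1} -> {0, 1, 2}
  }
  return 0;
}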
template <typename Torus>
__host__ void integer_radix_unsigned_scalar_difference_check_kb(
__host__ void integer_radix_unsigned_scalar_difference_check(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
Torus const *h_scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
@@ -148,7 +150,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
auto active_streams = streams.active_gpu_subset(1);
lut->broadcast_lut(active_streams);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, lwe_array_out, mem_ptr->tmp_lwe_array_out, bsks, ksks, lut, 1);
} else if (num_scalar_blocks < num_radix_blocks) {
@@ -199,7 +201,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
// - 2 if lhs > rhs
auto comparisons = mem_ptr->tmp_block_comparisons;
scalar_compare_radix_blocks_kb<Torus>(
scalar_compare_radix_blocks<Torus>(
lsb_streams, comparisons, diff_buffer->tmp_packed, (Torus *)rhs.ptr,
mem_ptr, bsks, ksks, num_lsb_radix_blocks);
@@ -242,7 +244,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
auto active_streams = streams.active_gpu_subset(1);
lut->broadcast_lut(active_streams);
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
integer_radix_apply_bivariate_lookup_table<Torus>(
streams, lwe_array_out, lwe_array_lsb_out, &lwe_array_msb_out, bsks,
ksks, lut, 1, lut->params.message_modulus);
@@ -276,7 +278,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
auto active_streams = streams.active_gpu_subset(1);
one_block_lut->broadcast_lut(active_streams);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, lwe_array_out, lwe_array_in, bsks, ksks, one_block_lut, 1);
one_block_lut->release(streams);
delete one_block_lut;
@@ -305,7 +307,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
// - 1 if lhs == rhs
// - 2 if lhs > rhs
auto comparisons = mem_ptr->tmp_lwe_array_out;
scalar_compare_radix_blocks_kb<Torus>(
scalar_compare_radix_blocks<Torus>(
streams, comparisons, diff_buffer->tmp_packed, (Torus *)rhs.ptr,
mem_ptr, bsks, ksks, num_lsb_radix_blocks);
@@ -321,7 +323,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
}
template <typename Torus>
__host__ void integer_radix_signed_scalar_difference_check_kb(
__host__ void integer_radix_signed_scalar_difference_check(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
Torus const *h_scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
@@ -420,7 +422,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
auto active_streams = streams.active_gpu_subset(1);
lut->broadcast_lut(active_streams);
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
integer_radix_apply_bivariate_lookup_table<Torus>(
streams, lwe_array_out, are_all_msb_zeros, &sign_block, bsks, ksks, lut,
1, lut->params.message_modulus);
@@ -466,7 +468,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
// - 2 if lhs > rhs
auto comparisons = mem_ptr->tmp_block_comparisons;
scalar_compare_radix_blocks_kb<Torus>(
scalar_compare_radix_blocks<Torus>(
lsb_streams, comparisons, diff_buffer->tmp_packed, (Torus *)rhs.ptr,
mem_ptr, bsks, ksks, num_lsb_radix_blocks);
@@ -525,7 +527,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
CudaRadixCiphertextFFI sign_block;
as_radix_ciphertext_slice<Torus>(
&sign_block, &msb, num_msb_radix_blocks - 1, num_msb_radix_blocks);
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
integer_radix_apply_bivariate_lookup_table<Torus>(
msb_streams, &lwe_array_msb_out, &sign_block, &are_all_msb_zeros, bsks,
ksks, signed_msb_lut, 1, signed_msb_lut->params.message_modulus);
lsb_streams.synchronize();
@@ -568,7 +570,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
auto active_streams = streams.active_gpu_subset(1);
one_block_lut->broadcast_lut(active_streams);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, lwe_array_out, lwe_array_in, bsks, ksks, one_block_lut, 1);
one_block_lut->release(streams);
delete one_block_lut;
@@ -606,7 +608,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
// - 0 if lhs < rhs
// - 1 if lhs == rhs
// - 2 if lhs > rhs
scalar_compare_radix_blocks_kb<Torus>(
scalar_compare_radix_blocks<Torus>(
lsb_streams, lwe_array_ct_out, diff_buffer->tmp_packed,
(Torus *)rhs.ptr, mem_ptr, bsks, ksks, num_lsb_radix_blocks);
CudaRadixCiphertextFFI encrypted_sign_block;
@@ -622,7 +624,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
scalar_sign_block, h_scalar_sign_block, 1, message_modulus,
carry_modulus);
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
integer_radix_apply_bivariate_lookup_table<Torus>(
msb_streams, &lwe_array_sign_out, &encrypted_sign_block,
trivial_sign_block, bsks, ksks, mem_ptr->signed_lut, 1,
mem_ptr->signed_lut->params.message_modulus);
@@ -639,7 +641,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
}
template <typename Torus>
__host__ void host_integer_radix_scalar_difference_check_kb(
__host__ void host_scalar_difference_check(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
Torus const *h_scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
@@ -654,12 +656,12 @@ __host__ void host_integer_radix_scalar_difference_check_kb(
if (mem_ptr->is_signed) {
// is signed and scalar is positive
integer_radix_signed_scalar_difference_check_kb<Torus>(
integer_radix_signed_scalar_difference_check<Torus>(
streams, lwe_array_out, lwe_array_in, scalar_blocks, h_scalar_blocks,
mem_ptr, sign_handler_f, bsks, ksks, num_radix_blocks,
num_scalar_blocks);
} else {
integer_radix_unsigned_scalar_difference_check_kb<Torus>(
integer_radix_unsigned_scalar_difference_check<Torus>(
streams, lwe_array_out, lwe_array_in, scalar_blocks, h_scalar_blocks,
mem_ptr, sign_handler_f, bsks, ksks, num_radix_blocks,
num_scalar_blocks);
@@ -667,12 +669,13 @@ __host__ void host_integer_radix_scalar_difference_check_kb(
}
template <typename Torus>
__host__ void host_integer_radix_scalar_maxmin_kb(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
Torus const *h_scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks,
uint32_t num_scalar_blocks) {
__host__ void
host_scalar_maxmin(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in,
Torus const *scalar_blocks, Torus const *h_scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks, uint32_t num_radix_blocks,
uint32_t num_scalar_blocks) {
if (lwe_array_out->lwe_dimension != lwe_array_in->lwe_dimension)
PANIC("Cuda error: input and output lwe dimensions must be the same")
@@ -688,7 +691,7 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
// - 1 if lhs == rhs
// - 2 if lhs > rhs
auto sign = mem_ptr->tmp_lwe_array_out;
host_integer_radix_scalar_difference_check_kb<Torus>(
host_scalar_difference_check<Torus>(
streams, sign, lwe_array_in, scalar_blocks, h_scalar_blocks, mem_ptr,
mem_ptr->identity_lut_f, bsks, ksks, num_radix_blocks, num_scalar_blocks);
@@ -704,13 +707,13 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
// Selector
// CMUX for Max or Min
host_integer_radix_cmux_kb<Torus>(
streams, lwe_array_out, mem_ptr->tmp_lwe_array_out, lwe_array_left,
lwe_array_right, mem_ptr->cmux_buffer, bsks, ksks);
host_cmux<Torus>(streams, lwe_array_out, mem_ptr->tmp_lwe_array_out,
lwe_array_left, lwe_array_right, mem_ptr->cmux_buffer, bsks,
ksks);
}
template <typename Torus>
__host__ void host_integer_radix_scalar_equality_check_kb(
__host__ void host_scalar_equality_check(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
@@ -785,7 +788,7 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
// We use false because we will only broadcast the indexes
scalar_comparison_luts->broadcast_lut(active_streams, false);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
lsb_streams, mem_ptr->tmp_lwe_array_out, mem_ptr->tmp_packed_input,
bsks, ksks, scalar_comparison_luts, num_halved_lsb_radix_blocks);
}


@@ -1,6 +1,6 @@
#include "scalar_div.cuh"
uint64_t scratch_cuda_integer_unsigned_scalar_div_radix_kb_64(
uint64_t scratch_cuda_integer_unsigned_scalar_div_radix_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
@@ -21,7 +21,7 @@ uint64_t scratch_cuda_integer_unsigned_scalar_div_radix_kb_64(
scalar_divisor_ffi, allocate_gpu_memory);
}
void cuda_integer_unsigned_scalar_div_radix_kb_64(
void cuda_integer_unsigned_scalar_div_radix_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *numerator_ct,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
const CudaScalarDivisorFFI *scalar_divisor_ffi) {
@@ -32,8 +32,8 @@ void cuda_integer_unsigned_scalar_div_radix_kb_64(
scalar_divisor_ffi);
}
void cleanup_cuda_integer_unsigned_scalar_div_radix_kb_64(
CudaStreamsFFI streams, int8_t **mem_ptr_void) {
void cleanup_cuda_integer_unsigned_scalar_div_radix_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_unsigned_scalar_div_mem<uint64_t> *mem_ptr =
(int_unsigned_scalar_div_mem<uint64_t> *)(*mem_ptr_void);
@@ -44,7 +44,7 @@ void cleanup_cuda_integer_unsigned_scalar_div_radix_kb_64(
*mem_ptr_void = nullptr;
}
uint64_t scratch_cuda_integer_signed_scalar_div_radix_kb_64(
uint64_t scratch_cuda_integer_signed_scalar_div_radix_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
@@ -59,25 +59,25 @@ uint64_t scratch_cuda_integer_signed_scalar_div_radix_kb_64(
grouping_factor, message_modulus, carry_modulus,
noise_reduction_type);
return scratch_integer_signed_scalar_div_radix_kb<uint64_t>(
return scratch_integer_signed_scalar_div_radix<uint64_t>(
CudaStreams(streams), params,
(int_signed_scalar_div_mem<uint64_t> **)mem_ptr, num_blocks,
scalar_divisor_ffi, allocate_gpu_memory);
}
void cuda_integer_signed_scalar_div_radix_kb_64(
void cuda_integer_signed_scalar_div_radix_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *numerator_ct,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
const CudaScalarDivisorFFI *scalar_divisor_ffi, uint32_t numerator_bits) {
host_integer_signed_scalar_div_radix_kb<uint64_t>(
host_integer_signed_scalar_div_radix<uint64_t>(
CudaStreams(streams), numerator_ct,
(int_signed_scalar_div_mem<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks,
scalar_divisor_ffi, numerator_bits);
}
void cleanup_cuda_integer_signed_scalar_div_radix_kb_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
void cleanup_cuda_integer_signed_scalar_div_radix_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_signed_scalar_div_mem<uint64_t> *mem_ptr =
(int_signed_scalar_div_mem<uint64_t> *)(*mem_ptr_void);
@@ -88,7 +88,7 @@ void cleanup_cuda_integer_signed_scalar_div_radix_kb_64(CudaStreamsFFI streams,
*mem_ptr_void = nullptr;
}
uint64_t scratch_integer_unsigned_scalar_div_rem_radix_kb_64(
uint64_t scratch_integer_unsigned_scalar_div_rem_radix_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
@@ -110,7 +110,7 @@ uint64_t scratch_integer_unsigned_scalar_div_rem_radix_kb_64(
scalar_divisor_ffi, active_bits_divisor, allocate_gpu_memory);
}
void cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
void cuda_integer_unsigned_scalar_div_rem_radix_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *quotient_ct,
CudaRadixCiphertextFFI *remainder_ct, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
@@ -127,7 +127,7 @@ void cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
(uint64_t *)h_clear_blocks, num_clear_blocks);
}
void cleanup_cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
void cleanup_cuda_integer_unsigned_scalar_div_rem_radix_64(
CudaStreamsFFI streams, int8_t **mem_ptr_void) {
int_unsigned_scalar_div_rem_buffer<uint64_t> *mem_ptr =
@@ -139,7 +139,7 @@ void cleanup_cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
*mem_ptr_void = nullptr;
}
uint64_t scratch_integer_signed_scalar_div_rem_radix_kb_64(
uint64_t scratch_integer_signed_scalar_div_rem_radix_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
@@ -161,7 +161,7 @@ uint64_t scratch_integer_signed_scalar_div_rem_radix_kb_64(
scalar_divisor_ffi, active_bits_divisor, allocate_gpu_memory);
}
void cuda_integer_signed_scalar_div_rem_radix_kb_64(
void cuda_integer_signed_scalar_div_rem_radix_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *quotient_ct,
CudaRadixCiphertextFFI *remainder_ct, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
@@ -176,7 +176,7 @@ void cuda_integer_signed_scalar_div_rem_radix_kb_64(
decomposed_divisor, num_scalars_divisor, numerator_bits);
}
void cleanup_cuda_integer_signed_scalar_div_rem_radix_kb_64(
void cleanup_cuda_integer_signed_scalar_div_rem_radix_64(
CudaStreamsFFI streams, int8_t **mem_ptr_void) {
int_signed_scalar_div_rem_buffer<uint64_t> *mem_ptr =


@@ -35,7 +35,7 @@ __host__ void host_integer_unsigned_scalar_div_radix(
}
if (scalar_divisor_ffi->is_divisor_pow2) {
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
host_logical_scalar_shift_inplace<Torus>(
streams, numerator_ct, scalar_divisor_ffi->ilog2_divisor,
mem_ptr->logical_scalar_shift_mem, bsks, ksks,
numerator_ct->num_radix_blocks);
@@ -63,15 +63,15 @@ __host__ void host_integer_unsigned_scalar_div_radix(
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
numerator_cpy, numerator_ct);
host_integer_radix_scalar_mul_high_kb<Torus>(
streams, numerator_cpy, mem_ptr->scalar_mul_high_mem, ksks, bsks,
scalar_divisor_ffi);
host_scalar_mul_high<Torus>(streams, numerator_cpy,
mem_ptr->scalar_mul_high_mem, ksks, bsks,
scalar_divisor_ffi);
host_sub_and_propagate_single_carry<Torus>(
streams, numerator_ct, numerator_cpy, nullptr, nullptr,
mem_ptr->sub_and_propagate_mem, bsks, ksks, FLAG_NONE, (uint32_t)0);
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
host_logical_scalar_shift_inplace<Torus>(
streams, numerator_ct, (uint32_t)1, mem_ptr->logical_scalar_shift_mem,
bsks, ksks, numerator_ct->num_radix_blocks);
@@ -79,7 +79,7 @@ __host__ void host_integer_unsigned_scalar_div_radix(
streams, numerator_ct, numerator_cpy, nullptr, nullptr,
mem_ptr->scp_mem, bsks, ksks, FLAG_NONE, (uint32_t)0);
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
host_logical_scalar_shift_inplace<Torus>(
streams, numerator_ct, scalar_divisor_ffi->shift_post - (uint32_t)1,
mem_ptr->logical_scalar_shift_mem, bsks, ksks,
numerator_ct->num_radix_blocks);
@@ -87,23 +87,23 @@ __host__ void host_integer_unsigned_scalar_div_radix(
return;
}
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
host_logical_scalar_shift_inplace<Torus>(
streams, numerator_ct, scalar_divisor_ffi->shift_pre,
mem_ptr->logical_scalar_shift_mem, bsks, ksks,
numerator_ct->num_radix_blocks);
host_integer_radix_scalar_mul_high_kb<Torus>(streams, numerator_ct,
mem_ptr->scalar_mul_high_mem,
ksks, bsks, scalar_divisor_ffi);
host_scalar_mul_high<Torus>(streams, numerator_ct,
mem_ptr->scalar_mul_high_mem, ksks, bsks,
scalar_divisor_ffi);
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
host_logical_scalar_shift_inplace<Torus>(
streams, numerator_ct, scalar_divisor_ffi->shift_post,
mem_ptr->logical_scalar_shift_mem, bsks, ksks,
numerator_ct->num_radix_blocks);
}
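
The shift_pre / mul-high / subtract / shift_post sequence above follows the classic Granlund-Montgomery division-by-invariant-integer scheme. A hypothetical clear 32-bit analogue for d = 7, with the multiplier and shifts precomputed the way scalar_divisor_ffi would carry them:

#include <cassert>
#include <cstdint>

// Divide by the constant 7 without a division instruction.
// M = ceil(2^35 / 7) - 2^32 = 0x24924925 plays the "chosen multiplier" role.
uint32_t div7(uint32_t n) {
  uint32_t hi = (uint32_t)(((uint64_t)0x24924925 * n) >> 32); // scalar_mul_high
  uint32_t t = (n - hi) >> 1; // subtract, then logical shift by 1
  return (hi + t) >> 2;       // add back, then shift by shift_post - 1
}

int main() {
  for (uint32_t n : {0u, 6u, 7u, 13u, 14u, 98u, 4294967295u})
    assert(div7(n) == n / 7);
  return 0;
}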
template <typename Torus>
__host__ uint64_t scratch_integer_signed_scalar_div_radix_kb(
__host__ uint64_t scratch_integer_signed_scalar_div_radix(
CudaStreams streams, int_radix_params params,
int_signed_scalar_div_mem<Torus> **mem_ptr, uint32_t num_radix_blocks,
const CudaScalarDivisorFFI *scalar_divisor_ffi,
@@ -119,7 +119,7 @@ __host__ uint64_t scratch_integer_signed_scalar_div_radix_kb(
}
template <typename Torus>
__host__ void host_integer_signed_scalar_div_radix_kb(
__host__ void host_integer_signed_scalar_div_radix(
CudaStreams streams, CudaRadixCiphertextFFI *numerator_ct,
int_signed_scalar_div_mem<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
@@ -129,7 +129,7 @@ __host__ void host_integer_signed_scalar_div_radix_kb(
if (scalar_divisor_ffi->is_divisor_negative) {
CudaRadixCiphertextFFI *tmp = mem_ptr->tmp_ffi;
host_integer_radix_negation<Torus>(
host_negation<Torus>(
streams, tmp, numerator_ct, mem_ptr->params.message_modulus,
mem_ptr->params.carry_modulus, numerator_ct->num_radix_blocks);
@@ -152,11 +152,11 @@ __host__ void host_integer_signed_scalar_div_radix_kb(
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
tmp, numerator_ct);
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
host_arithmetic_scalar_shift_inplace<Torus>(
streams, tmp, scalar_divisor_ffi->chosen_multiplier_num_bits - 1,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks);
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
host_logical_scalar_shift_inplace<Torus>(
streams, tmp,
numerator_bits - scalar_divisor_ffi->chosen_multiplier_num_bits,
mem_ptr->logical_scalar_shift_mem, bsks, ksks, tmp->num_radix_blocks);
@@ -165,7 +165,7 @@ __host__ void host_integer_signed_scalar_div_radix_kb(
streams, tmp, numerator_ct, nullptr, nullptr, mem_ptr->scp_mem, bsks,
ksks, FLAG_NONE, (uint32_t)0);
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
host_arithmetic_scalar_shift_inplace<Torus>(
streams, tmp, scalar_divisor_ffi->chosen_multiplier_num_bits,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks);
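
For the power-of-two path, the arithmetic-then-logical shift pair above computes the usual round-toward-zero correction for signed division. A hypothetical clear 32-bit sketch (assumes arithmetic right shift on signed values, k >= 1):

#include <cassert>
#include <cstdint>

int32_t sdiv_pow2(int32_t n, uint32_t k) {    // divide by 2^k, truncating
  int32_t sign = n >> 31;                     // arithmetic shift: 0 or -1
  uint32_t bias = (uint32_t)sign >> (32 - k); // logical shift: 0 or 2^k - 1
  return (n + (int32_t)bias) >> k;            // arithmetic shift by k
}

int main() {
  assert(sdiv_pow2(-7, 1) == -3 && sdiv_pow2(7, 1) == 3);
  assert(sdiv_pow2(-8, 2) == -2 && sdiv_pow2(5, 2) == 1);
  return 0;
}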
@@ -173,11 +173,11 @@ __host__ void host_integer_signed_scalar_div_radix_kb(
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
tmp, numerator_ct);
host_integer_radix_signed_scalar_mul_high_kb<Torus>(
streams, tmp, mem_ptr->scalar_mul_high_mem, ksks, scalar_divisor_ffi,
bsks);
host_signed_scalar_mul_high<Torus>(streams, tmp,
mem_ptr->scalar_mul_high_mem, ksks,
scalar_divisor_ffi, bsks);
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
host_arithmetic_scalar_shift_inplace<Torus>(
streams, tmp, scalar_divisor_ffi->shift_post,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks);
@@ -185,7 +185,7 @@ __host__ void host_integer_signed_scalar_div_radix_kb(
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
xsign, numerator_ct);
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
host_arithmetic_scalar_shift_inplace<Torus>(
streams, xsign, numerator_bits - 1,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks);
@@ -198,15 +198,15 @@ __host__ void host_integer_signed_scalar_div_radix_kb(
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
tmp, numerator_ct);
host_integer_radix_signed_scalar_mul_high_kb<Torus>(
streams, tmp, mem_ptr->scalar_mul_high_mem, ksks, scalar_divisor_ffi,
bsks);
host_signed_scalar_mul_high<Torus>(streams, tmp,
mem_ptr->scalar_mul_high_mem, ksks,
scalar_divisor_ffi, bsks);
host_add_and_propagate_single_carry<Torus>(
streams, tmp, numerator_ct, nullptr, nullptr, mem_ptr->scp_mem, bsks,
ksks, FLAG_NONE, (uint32_t)0);
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
host_arithmetic_scalar_shift_inplace<Torus>(
streams, tmp, scalar_divisor_ffi->shift_post,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks);
@@ -214,7 +214,7 @@ __host__ void host_integer_signed_scalar_div_radix_kb(
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
xsign, numerator_ct);
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
host_arithmetic_scalar_shift_inplace<Torus>(
streams, xsign, numerator_bits - 1,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks);
@@ -224,7 +224,7 @@ __host__ void host_integer_signed_scalar_div_radix_kb(
}
if (scalar_divisor_ffi->is_divisor_negative) {
host_integer_radix_negation<Torus>(
host_negation<Torus>(
streams, numerator_ct, tmp, mem_ptr->params.message_modulus,
mem_ptr->params.carry_modulus, numerator_ct->num_radix_blocks);
} else {
@@ -270,9 +270,9 @@ __host__ void host_integer_unsigned_scalar_div_rem_radix(
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
remainder_ct, numerator_ct);
host_integer_radix_scalar_bitop_kb(
streams, remainder_ct, remainder_ct, clear_blocks, h_clear_blocks,
num_clear_blocks, mem_ptr->bitop_mem, bsks, ksks);
host_scalar_bitop(streams, remainder_ct, remainder_ct, clear_blocks,
h_clear_blocks, num_clear_blocks, mem_ptr->bitop_mem,
bsks, ksks);
} else {
if (!scalar_divisor_ffi->is_divisor_zero) {
@@ -328,9 +328,9 @@ __host__ void host_integer_signed_scalar_div_rem_radix(
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
numerator_ct, quotient_ct);
host_integer_signed_scalar_div_radix_kb(streams, quotient_ct,
mem_ptr->signed_div_mem, bsks, ksks,
scalar_divisor_ffi, numerator_bits);
host_integer_signed_scalar_div_radix(streams, quotient_ct,
mem_ptr->signed_div_mem, bsks, ksks,
scalar_divisor_ffi, numerator_bits);
host_propagate_single_carry<Torus>(streams, quotient_ct, nullptr, nullptr,
mem_ptr->scp_mem, bsks, ksks, FLAG_NONE,
@@ -341,10 +341,10 @@ __host__ void host_integer_signed_scalar_div_rem_radix(
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
remainder_ct, quotient_ct);
host_integer_radix_logical_scalar_shift_kb_inplace(
streams, remainder_ct, scalar_divisor_ffi->ilog2_divisor,
mem_ptr->logical_scalar_shift_mem, bsks, ksks,
remainder_ct->num_radix_blocks);
host_logical_scalar_shift_inplace(streams, remainder_ct,
scalar_divisor_ffi->ilog2_divisor,
mem_ptr->logical_scalar_shift_mem, bsks,
ksks, remainder_ct->num_radix_blocks);
} else if (!scalar_divisor_ffi->is_divisor_zero) {
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),


@@ -1,6 +1,6 @@
#include "integer/scalar_mul.cuh"
uint64_t scratch_cuda_integer_scalar_mul_kb_64(
uint64_t scratch_cuda_integer_scalar_mul_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
@@ -14,12 +14,12 @@ uint64_t scratch_cuda_integer_scalar_mul_kb_64(
grouping_factor, message_modulus, carry_modulus,
noise_reduction_type);
return scratch_cuda_integer_radix_scalar_mul_kb<uint64_t>(
return scratch_cuda_scalar_mul<uint64_t>(
CudaStreams(streams), (int_scalar_mul_buffer<uint64_t> **)mem_ptr,
num_blocks, params, num_scalar_bits, allocate_gpu_memory);
}
void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
void cuda_scalar_multiplication_ciphertext_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
uint64_t const *decomposed_scalar, uint64_t const *has_at_least_one_set,
int8_t *mem, void *const *bsks, void *const *ksks, uint32_t polynomial_size,
@@ -31,8 +31,7 @@ void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
(uint64_t **)(ksks), message_modulus, num_scalars);
}
void cleanup_cuda_integer_radix_scalar_mul(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
void cleanup_cuda_scalar_mul(CudaStreamsFFI streams, int8_t **mem_ptr_void) {
int_scalar_mul_buffer<uint64_t> *mem_ptr =
(int_scalar_mul_buffer<uint64_t> *)(*mem_ptr_void);


@@ -30,10 +30,12 @@ __global__ void device_small_scalar_radix_multiplication(T *output_lwe_array,
}
template <typename T>
__host__ uint64_t scratch_cuda_integer_radix_scalar_mul_kb(
CudaStreams streams, int_scalar_mul_buffer<T> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
uint32_t num_scalar_bits, bool allocate_gpu_memory) {
__host__ uint64_t scratch_cuda_scalar_mul(CudaStreams streams,
int_scalar_mul_buffer<T> **mem_ptr,
uint32_t num_radix_blocks,
int_radix_params params,
uint32_t num_scalar_bits,
bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_scalar_mul_buffer<T>(streams, params, num_radix_blocks,
@@ -67,9 +69,9 @@ __host__ void host_integer_scalar_mul_radix(
copy_radix_ciphertext_slice_async<T>(
streams.stream(0), streams.gpu_index(0), &shift_input, 0,
num_radix_blocks, lwe_array, 0, num_radix_blocks);
host_integer_radix_logical_scalar_shift_kb_inplace<T>(
streams, &shift_input, shift_amount, mem->logical_scalar_shift_buffer,
bsks, ksks, num_radix_blocks);
host_logical_scalar_shift_inplace<T>(streams, &shift_input, shift_amount,
mem->logical_scalar_shift_buffer,
bsks, ksks, num_radix_blocks);
} else {
// create trivial assign for value = 0
set_zero_radix_ciphertext_slice_async<T>(
@@ -111,7 +113,7 @@ __host__ void host_integer_scalar_mul_radix(
streams.gpu_index(0), lwe_array, 0,
num_radix_blocks);
} else {
host_integer_partial_sum_ciphertexts_vec_kb<T>(
host_integer_partial_sum_ciphertexts_vec<T>(
streams, lwe_array, all_shifted_buffer, bsks, ksks,
mem->sum_ciphertexts_vec_mem, num_radix_blocks, j);
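
The loop above is a double-and-add: each set bit of the scalar contributes a shifted copy of the input, and the copies are reduced by the partial-sum routine. A hypothetical clear-integer equivalent:

#include <cassert>
#include <cstdint>

uint64_t scalar_mul(uint64_t x, uint64_t scalar) {
  uint64_t acc = 0;
  for (uint32_t bit = 0; bit < 64; bit++)
    if ((scalar >> bit) & 1)
      acc += x << bit; // "logical scalar shift", then summed
  return acc;
}

int main() {
  assert(scalar_mul(7, 13) == 91);
  return 0;
}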
@@ -166,10 +168,11 @@ __host__ void host_integer_small_scalar_mul_radix(
}
template <typename Torus>
__host__ void host_integer_radix_scalar_mul_high_kb(
CudaStreams streams, CudaRadixCiphertextFFI *ct,
int_scalar_mul_high_buffer<Torus> *mem_ptr, Torus *const *ksks,
void *const *bsks, const CudaScalarDivisorFFI *scalar_divisor_ffi) {
__host__ void
host_scalar_mul_high(CudaStreams streams, CudaRadixCiphertextFFI *ct,
int_scalar_mul_high_buffer<Torus> *mem_ptr,
Torus *const *ksks, void *const *bsks,
const CudaScalarDivisorFFI *scalar_divisor_ffi) {
if (scalar_divisor_ffi->is_chosen_multiplier_zero) {
set_zero_radix_ciphertext_slice_async<Torus>(
@@ -186,7 +189,7 @@ __host__ void host_integer_radix_scalar_mul_high_kb(
tmp_ffi->num_radix_blocks != 0) {
if (scalar_divisor_ffi->is_chosen_multiplier_pow2) {
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
host_logical_scalar_shift_inplace<Torus>(
streams, tmp_ffi, scalar_divisor_ffi->ilog2_chosen_multiplier,
mem_ptr->logical_scalar_shift_mem, bsks, (uint64_t **)ksks,
tmp_ffi->num_radix_blocks);
@@ -205,7 +208,7 @@ __host__ void host_integer_radix_scalar_mul_high_kb(
}
template <typename Torus>
__host__ void host_integer_radix_signed_scalar_mul_high_kb(
__host__ void host_signed_scalar_mul_high(
CudaStreams streams, CudaRadixCiphertextFFI *ct,
int_signed_scalar_mul_high_buffer<Torus> *mem_ptr, Torus *const *ksks,
const CudaScalarDivisorFFI *scalar_divisor_ffi, void *const *bsks) {
@@ -227,7 +230,7 @@ __host__ void host_integer_radix_signed_scalar_mul_high_kb(
tmp_ffi->num_radix_blocks != 0) {
if (scalar_divisor_ffi->is_chosen_multiplier_pow2) {
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
host_logical_scalar_shift_inplace<Torus>(
streams, tmp_ffi, scalar_divisor_ffi->ilog2_chosen_multiplier,
mem_ptr->logical_scalar_shift_mem, bsks, (uint64_t **)ksks,
tmp_ffi->num_radix_blocks);


@@ -1,6 +1,6 @@
#include "scalar_rotate.cuh"
uint64_t scratch_cuda_integer_radix_scalar_rotate_kb_64(
uint64_t scratch_cuda_scalar_rotate_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -14,24 +14,24 @@ uint64_t scratch_cuda_integer_radix_scalar_rotate_kb_64(
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_integer_radix_scalar_rotate_kb<uint64_t>(
return scratch_cuda_scalar_rotate<uint64_t>(
CudaStreams(streams),
(int_logical_scalar_shift_buffer<uint64_t> **)mem_ptr, num_blocks, params,
shift_type, allocate_gpu_memory);
}
void cuda_integer_radix_scalar_rotate_kb_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array, uint32_t n,
int8_t *mem_ptr, void *const *bsks, void *const *ksks) {
void cuda_scalar_rotate_64_inplace(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *lwe_array,
uint32_t n, int8_t *mem_ptr,
void *const *bsks, void *const *ksks) {
host_integer_radix_scalar_rotate_kb_inplace<uint64_t>(
host_scalar_rotate_inplace<uint64_t>(
CudaStreams(streams), lwe_array, n,
(int_logical_scalar_shift_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks));
}
void cleanup_cuda_integer_radix_scalar_rotate(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
void cleanup_cuda_scalar_rotate(CudaStreamsFFI streams, int8_t **mem_ptr_void) {
int_logical_scalar_shift_buffer<uint64_t> *mem_ptr =
(int_logical_scalar_shift_buffer<uint64_t> *)(*mem_ptr_void);


@@ -9,7 +9,7 @@
#include "pbs/programmable_bootstrap_multibit.cuh"
template <typename Torus>
__host__ uint64_t scratch_cuda_integer_radix_scalar_rotate_kb(
__host__ uint64_t scratch_cuda_scalar_rotate(
CudaStreams streams, int_logical_scalar_shift_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory) {
@@ -22,10 +22,11 @@ __host__ uint64_t scratch_cuda_integer_radix_scalar_rotate_kb(
}
template <typename Torus>
__host__ void host_integer_radix_scalar_rotate_kb_inplace(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array, uint32_t n,
int_logical_scalar_shift_buffer<Torus> *mem, void *const *bsks,
Torus *const *ksks) {
__host__ void
host_scalar_rotate_inplace(CudaStreams streams,
CudaRadixCiphertextFFI *lwe_array, uint32_t n,
int_logical_scalar_shift_buffer<Torus> *mem,
void *const *bsks, Torus *const *ksks) {
auto num_blocks = lwe_array->num_radix_blocks;
auto params = mem->params;
@@ -68,7 +69,7 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
integer_radix_apply_bivariate_lookup_table<Torus>(
streams, lwe_array, receiver_blocks, giver_blocks, bsks, ksks,
lut_bivariate, num_blocks, lut_bivariate->params.message_modulus);
@@ -92,7 +93,7 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
integer_radix_apply_bivariate_lookup_table<Torus>(
streams, lwe_array, receiver_blocks, giver_blocks, bsks, ksks,
lut_bivariate, num_blocks, lut_bivariate->params.message_modulus);
}


@@ -1,6 +1,6 @@
#include "scalar_shifts.cuh"
uint64_t scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
uint64_t scratch_cuda_logical_scalar_shift_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -14,7 +14,7 @@ uint64_t scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_integer_radix_logical_scalar_shift_kb<uint64_t>(
return scratch_cuda_logical_scalar_shift<uint64_t>(
CudaStreams(streams),
(int_logical_scalar_shift_buffer<uint64_t> **)mem_ptr, num_blocks, params,
shift_type, allocate_gpu_memory);
@@ -24,17 +24,19 @@ uint64_t scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
/// for the left scalar shift. It consists of a rotation, followed by
/// the application of a PBS onto the rotated blocks up to num_blocks -
/// rotations - 1. The remaining blocks are padded with zeros.
void cuda_integer_radix_logical_scalar_shift_kb_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
int8_t *mem_ptr, void *const *bsks, void *const *ksks) {
void cuda_logical_scalar_shift_64_inplace(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *lwe_array,
uint32_t shift, int8_t *mem_ptr,
void *const *bsks,
void *const *ksks) {
host_integer_radix_logical_scalar_shift_kb_inplace<uint64_t>(
host_logical_scalar_shift_inplace<uint64_t>(
CudaStreams(streams), lwe_array, shift,
(int_logical_scalar_shift_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks), lwe_array->num_radix_blocks);
}
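
A hypothetical clear sketch of the decomposition the doc comment describes, assuming 2-bit blocks (message_modulus = 4): whole-block rotations first, then a sub-block shift combining each block with its neighbour, with zero padding for the vacated blocks.

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  const uint32_t bits = 2;                     // log2(message_modulus)
  std::vector<uint32_t> blocks = {3, 1, 2, 0}; // LSB first: value 0b00100111
  uint32_t shift = 3;
  uint32_t rotations = shift / bits; // 1 whole block
  uint32_t sub = shift % bits;       // 1 bit within a block
  std::vector<uint32_t> out(blocks.size(), 0);
  for (size_t i = rotations; i < blocks.size(); i++) {
    uint32_t cur = blocks[i - rotations];
    uint32_t prev = (i > rotations) ? blocks[i - rotations - 1] : 0;
    // stands in for the bivariate PBS combining (cur, prev)
    out[i] = ((cur << sub) | (prev >> (bits - sub))) & 3;
  }
  // blocks [0, rotations) stay zero, like the trivially padded blocks
  assert(out == std::vector<uint32_t>({0, 2, 3, 0})); // 39 << 3, kept mod 2^8
  return 0;
}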
uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
uint64_t scratch_cuda_arithmetic_scalar_shift_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -48,7 +50,7 @@ uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_integer_radix_arithmetic_scalar_shift_kb<uint64_t>(
return scratch_cuda_arithmetic_scalar_shift<uint64_t>(
CudaStreams(streams),
(int_arithmetic_scalar_shift_buffer<uint64_t> **)mem_ptr, num_blocks,
params, shift_type, allocate_gpu_memory);
@@ -61,18 +63,20 @@ uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
/// sign block, and a second PBS is also applied to it to compute the padding
/// block, which is copied onto all remaining blocks instead of padding with
/// zeros as would be done in the logical shift.
void cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
int8_t *mem_ptr, void *const *bsks, void *const *ksks) {
void cuda_arithmetic_scalar_shift_64_inplace(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *lwe_array,
uint32_t shift, int8_t *mem_ptr,
void *const *bsks,
void *const *ksks) {
host_integer_radix_arithmetic_scalar_shift_kb_inplace<uint64_t>(
host_arithmetic_scalar_shift_inplace<uint64_t>(
CudaStreams(streams), lwe_array, shift,
(int_arithmetic_scalar_shift_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks));
}
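
The difference from the logical shift is only in what fills the vacated high blocks; a hypothetical clear 8-bit illustration (assumes arithmetic right shift on signed values):

#include <cassert>
#include <cstdint>

int main() {
  int8_t x = -96;                   // 0b10100000
  int8_t logical = (uint8_t)x >> 3; // 0b00010100 = 20: zeros shifted in
  int8_t arithmetic = x >> 3;       // 0b11110100 = -12: sign "padding block"
  assert(logical == 20 && arithmetic == -12);
  return 0;
}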
void cleanup_cuda_integer_radix_logical_scalar_shift(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
void cleanup_cuda_logical_scalar_shift(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_logical_scalar_shift_buffer<uint64_t> *mem_ptr =
(int_logical_scalar_shift_buffer<uint64_t> *)(*mem_ptr_void);
@@ -82,8 +86,8 @@ void cleanup_cuda_integer_radix_logical_scalar_shift(CudaStreamsFFI streams,
*mem_ptr_void = nullptr;
}
void cleanup_cuda_integer_radix_arithmetic_scalar_shift(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
void cleanup_cuda_arithmetic_scalar_shift(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_arithmetic_scalar_shift_buffer<uint64_t> *mem_ptr =
(int_arithmetic_scalar_shift_buffer<uint64_t> *)(*mem_ptr_void);


@@ -10,7 +10,7 @@
#include "pbs/programmable_bootstrap_multibit.cuh"
template <typename Torus>
__host__ uint64_t scratch_cuda_integer_radix_logical_scalar_shift_kb(
__host__ uint64_t scratch_cuda_logical_scalar_shift(
CudaStreams streams, int_logical_scalar_shift_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory) {
@@ -23,7 +23,7 @@ __host__ uint64_t scratch_cuda_integer_radix_logical_scalar_shift_kb(
}
template <typename Torus>
__host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
__host__ void host_logical_scalar_shift_inplace(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
int_logical_scalar_shift_buffer<Torus> *mem, void *const *bsks,
Torus *const *ksks, uint32_t num_blocks) {
@@ -75,7 +75,7 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
size_t partial_block_count = num_blocks - rotations;
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
integer_radix_apply_bivariate_lookup_table<Torus>(
streams, &partial_current_blocks, &partial_current_blocks,
&partial_previous_blocks, bsks, ksks, lut_bivariate,
partial_block_count, lut_bivariate->params.message_modulus);
@@ -106,7 +106,7 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
size_t partial_block_count = num_blocks - rotations;
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
integer_radix_apply_bivariate_lookup_table<Torus>(
streams, partial_current_blocks, partial_current_blocks,
&partial_next_blocks, bsks, ksks, lut_bivariate, partial_block_count,
lut_bivariate->params.message_modulus);
@@ -114,7 +114,7 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
}
template <typename Torus>
__host__ uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb(
__host__ uint64_t scratch_cuda_arithmetic_scalar_shift(
CudaStreams streams, int_arithmetic_scalar_shift_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory) {
@@ -127,7 +127,7 @@ __host__ uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb(
}
template <typename Torus>
__host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
__host__ void host_arithmetic_scalar_shift_inplace(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
int_arithmetic_scalar_shift_buffer<Torus> *mem, void *const *bsks,
Torus *const *ksks) {
@@ -197,7 +197,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
size_t partial_block_count = num_blocks - rotations;
auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
integer_radix_apply_bivariate_lookup_table<Torus>(
streams, partial_current_blocks, partial_current_blocks,
&partial_next_blocks, bsks, ksks, lut_bivariate,
partial_block_count, lut_bivariate->params.message_modulus);
@@ -207,7 +207,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
streams.synchronize();
auto lut_univariate_padding_block =
mem->lut_buffers_univariate[num_bits_in_block - 1];
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
mem->local_streams_1, &padding_block, &last_block_copy, bsks, ksks,
lut_univariate_padding_block, 1);
// Replace blocks 'pulled' from the left with the correct padding
@@ -221,7 +221,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
if (shift_within_block != 0) {
auto lut_univariate_shift_last_block =
mem->lut_buffers_univariate[shift_within_block - 1];
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
mem->local_streams_2, &last_block, &last_block_copy, bsks, ksks,
lut_univariate_shift_last_block, 1);
}


@@ -1,6 +1,6 @@
#include "shift_and_rotate.cuh"
uint64_t scratch_cuda_integer_radix_shift_and_rotate_kb_64(
uint64_t scratch_cuda_shift_and_rotate_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -14,24 +14,25 @@ uint64_t scratch_cuda_integer_radix_shift_and_rotate_kb_64(
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_integer_radix_shift_and_rotate_kb<uint64_t>(
return scratch_cuda_shift_and_rotate<uint64_t>(
CudaStreams(streams), (int_shift_and_rotate_buffer<uint64_t> **)mem_ptr,
num_blocks, params, shift_type, is_signed, allocate_gpu_memory);
}
void cuda_integer_radix_shift_and_rotate_kb_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
CudaRadixCiphertextFFI const *lwe_shift, int8_t *mem_ptr, void *const *bsks,
void *const *ksks) {
void cuda_shift_and_rotate_64_inplace(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *lwe_array,
CudaRadixCiphertextFFI const *lwe_shift,
int8_t *mem_ptr, void *const *bsks,
void *const *ksks) {
host_integer_radix_shift_and_rotate_kb_inplace<uint64_t>(
host_shift_and_rotate_inplace<uint64_t>(
CudaStreams(streams), lwe_array, lwe_shift,
(int_shift_and_rotate_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks));
}
void cleanup_cuda_integer_radix_shift_and_rotate(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
void cleanup_cuda_shift_and_rotate(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_shift_and_rotate_buffer<uint64_t> *mem_ptr =
(int_shift_and_rotate_buffer<uint64_t> *)(*mem_ptr_void);
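
Note: the renamed C API keeps the usual three-phase contract: a scratch_* call that allocates the buffer behind mem_ptr, the in-place compute call, and a cleanup_* call. A hedged Rust-side sketch of that lifecycle, assuming the generated bindings and valid device handles are in scope (the scratch call's full parameter list is elided in this diff, so it is only named in a comment, and this snippet is not standalone-runnable):

unsafe {
    let mut mem: *mut i8 = std::ptr::null_mut();
    // scratch_cuda_shift_and_rotate_64(streams, &mut mem, /* dimensions, levels, ... */)
    // allocates the temporaries behind `mem` and reports their size.
    cuda_shift_and_rotate_64_inplace(streams, lwe_array, lwe_shift, mem, bsks, ksks);
    cleanup_cuda_shift_and_rotate(streams, &mut mem);
}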

View File

@@ -11,7 +11,7 @@
#include "scalar_mul.cuh"
template <typename Torus>
__host__ uint64_t scratch_cuda_integer_radix_shift_and_rotate_kb(
__host__ uint64_t scratch_cuda_shift_and_rotate(
CudaStreams streams, int_shift_and_rotate_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
SHIFT_OR_ROTATE_TYPE shift_type, bool is_signed, bool allocate_gpu_memory) {
@@ -23,11 +23,12 @@ __host__ uint64_t scratch_cuda_integer_radix_shift_and_rotate_kb(
}
template <typename Torus>
__host__ void host_integer_radix_shift_and_rotate_kb_inplace(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array,
CudaRadixCiphertextFFI const *lwe_shift,
int_shift_and_rotate_buffer<Torus> *mem, void *const *bsks,
Torus *const *ksks) {
__host__ void
host_shift_and_rotate_inplace(CudaStreams streams,
CudaRadixCiphertextFFI *lwe_array,
CudaRadixCiphertextFFI const *lwe_shift,
int_shift_and_rotate_buffer<Torus> *mem,
void *const *bsks, Torus *const *ksks) {
cuda_set_device(streams.gpu_index(0));
if (lwe_array->num_radix_blocks != lwe_shift->num_radix_blocks)
@@ -158,7 +159,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
// we have
// control_bit|b|a
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, input_bits_a, mux_inputs, bsks, ksks, mux_lut, total_nb_bits);
}
@@ -190,7 +191,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
// To give back a clean ciphertext
auto cleaning_lut = mem->cleaning_lut;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, lwe_array, lwe_array, bsks, ksks, cleaning_lut,
num_radix_blocks);
}
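
Note: the mux step above packs three bits per LUT input, control_bit|b|a, and selects b when the control bit is set. A plain-integer model of that table (illustrative only):

fn mux_lut(packed: u8) -> u8 {
    let a = packed & 1;
    let b = (packed >> 1) & 1;
    let control = (packed >> 2) & 1;
    if control == 1 { b } else { a }
}

fn main() {
    assert_eq!(mux_lut(0b101), 0); // control=1 selects b=0
    assert_eq!(mux_lut(0b110), 1); // control=1 selects b=1
    assert_eq!(mux_lut(0b001), 1); // control=0 selects a=1
}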

View File

@@ -1,6 +1,6 @@
#include "subtraction.cuh"
uint64_t scratch_cuda_sub_and_propagate_single_carry_kb_64_inplace(
uint64_t scratch_cuda_sub_and_propagate_single_carry_64_inplace(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -19,7 +19,7 @@ uint64_t scratch_cuda_sub_and_propagate_single_carry_kb_64_inplace(
num_blocks, params, requested_flag, allocate_gpu_memory);
}
void cuda_sub_and_propagate_single_carry_kb_64_inplace(
void cuda_sub_and_propagate_single_carry_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lhs_array,
const CudaRadixCiphertextFFI *rhs_array, CudaRadixCiphertextFFI *carry_out,
const CudaRadixCiphertextFFI *carry_in, int8_t *mem_ptr, void *const *bsks,

View File

@@ -36,9 +36,9 @@ void host_sub_and_propagate_single_carry(
int_sub_and_propagate<Torus> *mem, void *const *bsks, Torus *const *ksks,
uint32_t requested_flag, uint32_t uses_carry) {
host_integer_radix_negation<Torus>(
streams, mem->neg_rhs_array, rhs_array, mem->params.message_modulus,
mem->params.carry_modulus, mem->neg_rhs_array->num_radix_blocks);
host_negation<Torus>(streams, mem->neg_rhs_array, rhs_array,
mem->params.message_modulus, mem->params.carry_modulus,
mem->neg_rhs_array->num_radix_blocks);
host_add_and_propagate_single_carry<Torus>(
streams, lhs_array, mem->neg_rhs_array, carry_out, input_carries,
@@ -46,11 +46,12 @@ void host_sub_and_propagate_single_carry(
}
template <typename Torus>
__host__ void host_integer_radix_subtraction(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in_1,
CudaRadixCiphertextFFI const *lwe_array_in_2, uint64_t message_modulus,
uint64_t carry_modulus, uint32_t num_radix_blocks) {
__host__ void host_subtraction(CudaStreams streams,
CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in_1,
CudaRadixCiphertextFFI const *lwe_array_in_2,
uint64_t message_modulus, uint64_t carry_modulus,
uint32_t num_radix_blocks) {
cuda_set_device(streams.gpu_index(0));
if (lwe_array_out->num_radix_blocks < num_radix_blocks ||
@@ -64,16 +65,15 @@ __host__ void host_integer_radix_subtraction(
PANIC("Cuda error: lwe_array_in and lwe_array_out lwe_dimension must be "
"the same")
host_integer_radix_negation<Torus>(streams, lwe_array_out, lwe_array_in_2,
message_modulus, carry_modulus,
num_radix_blocks);
host_negation<Torus>(streams, lwe_array_out, lwe_array_in_2, message_modulus,
carry_modulus, num_radix_blocks);
host_addition<Torus>(streams.stream(0), streams.gpu_index(0), lwe_array_out,
lwe_array_out, lwe_array_in_1, num_radix_blocks,
message_modulus, carry_modulus);
}
template <typename Torus>
__host__ uint64_t scratch_cuda_integer_overflowing_sub_kb(
__host__ uint64_t scratch_cuda_integer_overflowing_sub(
CudaStreams streams, int_overflowing_sub_memory<Torus> **mem_ptr,
uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
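
Note: both subtraction paths above reduce to the same identity: negate the right operand, then add. A plain modular-arithmetic model (no FHE, illustrative only):

fn negate(x: u64, m: u64) -> u64 {
    (m - x % m) % m
}

fn subtract(a: u64, b: u64, m: u64) -> u64 {
    (a % m + negate(b, m)) % m // a - b ≡ a + (-b) (mod m)
}

fn main() {
    let m = 1 << 4;
    assert_eq!(subtract(9, 13, m), 12); // 9 - 13 ≡ -4 ≡ 12 (mod 16)
}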

View File

@@ -94,7 +94,7 @@ __host__ void host_expand_without_verification(
into_radix_ciphertext(output, lwe_array_out, 2 * num_lwes, lwe_dimension);
auto input = new CudaRadixCiphertextFFI;
into_radix_ciphertext(input, lwe_array_input, 2 * num_lwes, lwe_dimension);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
integer_radix_apply_univariate_lookup_table<Torus>(
streams, output, input, bsks, ksks, message_and_carry_extract_luts,
2 * num_lwes);
}
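
Note: host_expand_without_verification doubles the LWE count because each expanded block goes through two tables, one extracting the message and one the carry. In shortint encoding terms (a fact about the encoding, not something introduced by this commit):

fn message(x: u64, msg_mod: u64) -> u64 { x % msg_mod }
fn carry(x: u64, msg_mod: u64) -> u64 { x / msg_mod }

fn main() {
    let msg_mod = 4; // 2-bit message space
    assert_eq!(message(13, msg_mod), 1);
    assert_eq!(carry(13, msg_mod), 3);
}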

View File

@@ -302,7 +302,7 @@ const _: () = {
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, polynomial_size) - 24usize];
};
unsafe extern "C" {
pub fn scratch_cuda_apply_univariate_lut_kb_64(
pub fn scratch_cuda_apply_univariate_lut_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
input_lut: *const ffi::c_void,
@@ -324,7 +324,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn scratch_cuda_apply_many_univariate_lut_kb_64(
pub fn scratch_cuda_apply_many_univariate_lut_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
input_lut: *const ffi::c_void,
@@ -347,7 +347,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_apply_univariate_lut_kb_64(
pub fn cuda_apply_univariate_lut_64(
streams: CudaStreamsFFI,
output_radix_lwe: *mut CudaRadixCiphertextFFI,
input_radix_lwe: *const CudaRadixCiphertextFFI,
@@ -357,13 +357,13 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn cleanup_cuda_apply_univariate_lut_kb_64(
pub fn cleanup_cuda_apply_univariate_lut_64(
streams: CudaStreamsFFI,
mem_ptr_void: *mut *mut i8,
);
}
unsafe extern "C" {
pub fn scratch_cuda_apply_bivariate_lut_kb_64(
pub fn scratch_cuda_apply_bivariate_lut_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
input_lut: *const ffi::c_void,
@@ -385,7 +385,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_apply_bivariate_lut_kb_64(
pub fn cuda_apply_bivariate_lut_64(
streams: CudaStreamsFFI,
output_radix_lwe: *mut CudaRadixCiphertextFFI,
input_radix_lwe_1: *const CudaRadixCiphertextFFI,
@@ -398,13 +398,10 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn cleanup_cuda_apply_bivariate_lut_kb_64(
streams: CudaStreamsFFI,
mem_ptr_void: *mut *mut i8,
);
pub fn cleanup_cuda_apply_bivariate_lut_64(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
}
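
Note: the pattern across all of these bindings is mechanical: the _kb infix (historically marking the keyswitch-then-bootstrap variant) is dropped while the _64 width suffix stays. A self-checking sample of the mapping, with names taken from this diff:

const RENAMES: &[(&str, &str)] = &[
    ("scratch_cuda_apply_univariate_lut_kb_64", "scratch_cuda_apply_univariate_lut_64"),
    ("cuda_apply_bivariate_lut_kb_64", "cuda_apply_bivariate_lut_64"),
    ("cleanup_cuda_apply_bivariate_lut_kb_64", "cleanup_cuda_apply_bivariate_lut_64"),
];

fn main() {
    for (old, new) in RENAMES {
        assert_eq!(old.replace("_kb_64", "_64"), *new); // drop the `_kb` infix
    }
}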
unsafe extern "C" {
pub fn cuda_apply_many_univariate_lut_kb_64(
pub fn cuda_apply_many_univariate_lut_64(
streams: CudaStreamsFFI,
output_radix_lwe: *mut CudaRadixCiphertextFFI,
input_radix_lwe: *const CudaRadixCiphertextFFI,
@@ -448,7 +445,7 @@ unsafe extern "C" {
pub fn cleanup_cuda_full_propagation(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
}
unsafe extern "C" {
pub fn scratch_cuda_integer_mult_radix_ciphertext_kb_64(
pub fn scratch_cuda_integer_mult_radix_ciphertext_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
is_boolean_left: bool,
@@ -470,7 +467,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_mult_radix_ciphertext_kb_64(
pub fn cuda_integer_mult_radix_ciphertext_64(
streams: CudaStreamsFFI,
radix_lwe_out: *mut CudaRadixCiphertextFFI,
radix_lwe_left: *const CudaRadixCiphertextFFI,
@@ -488,7 +485,7 @@ unsafe extern "C" {
pub fn cleanup_cuda_integer_mult(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
}
unsafe extern "C" {
pub fn cuda_negate_integer_radix_ciphertext_64(
pub fn cuda_negate_ciphertext_64(
streams: CudaStreamsFFI,
lwe_array_out: *mut CudaRadixCiphertextFFI,
lwe_array_in: *const CudaRadixCiphertextFFI,
@@ -498,7 +495,7 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
pub fn cuda_scalar_addition_ciphertext_64_inplace(
streams: CudaStreamsFFI,
lwe_array: *mut CudaRadixCiphertextFFI,
scalar_input: *const ffi::c_void,
@@ -509,7 +506,7 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
pub fn scratch_cuda_logical_scalar_shift_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
@@ -531,7 +528,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_radix_logical_scalar_shift_kb_64_inplace(
pub fn cuda_logical_scalar_shift_64_inplace(
streams: CudaStreamsFFI,
lwe_array: *mut CudaRadixCiphertextFFI,
shift: u32,
@@ -541,7 +538,7 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
pub fn scratch_cuda_arithmetic_scalar_shift_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
@@ -563,7 +560,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace(
pub fn cuda_arithmetic_scalar_shift_64_inplace(
streams: CudaStreamsFFI,
lwe_array: *mut CudaRadixCiphertextFFI,
shift: u32,
@@ -573,19 +570,16 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn cleanup_cuda_integer_radix_logical_scalar_shift(
pub fn cleanup_cuda_logical_scalar_shift(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
}
unsafe extern "C" {
pub fn cleanup_cuda_arithmetic_scalar_shift(
streams: CudaStreamsFFI,
mem_ptr_void: *mut *mut i8,
);
}
unsafe extern "C" {
pub fn cleanup_cuda_integer_radix_arithmetic_scalar_shift(
streams: CudaStreamsFFI,
mem_ptr_void: *mut *mut i8,
);
}
unsafe extern "C" {
pub fn scratch_cuda_integer_radix_shift_and_rotate_kb_64(
pub fn scratch_cuda_shift_and_rotate_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
@@ -608,7 +602,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_radix_shift_and_rotate_kb_64_inplace(
pub fn cuda_shift_and_rotate_64_inplace(
streams: CudaStreamsFFI,
lwe_array: *mut CudaRadixCiphertextFFI,
lwe_shift: *const CudaRadixCiphertextFFI,
@@ -618,13 +612,10 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn cleanup_cuda_integer_radix_shift_and_rotate(
streams: CudaStreamsFFI,
mem_ptr_void: *mut *mut i8,
);
pub fn cleanup_cuda_shift_and_rotate(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
}
unsafe extern "C" {
pub fn scratch_cuda_integer_radix_comparison_kb_64(
pub fn scratch_cuda_comparison_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
@@ -647,7 +638,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_comparison_integer_radix_ciphertext_kb_64(
pub fn cuda_comparison_ciphertext_64(
streams: CudaStreamsFFI,
lwe_array_out: *mut CudaRadixCiphertextFFI,
lwe_array_1: *const CudaRadixCiphertextFFI,
@@ -658,7 +649,7 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
pub fn cuda_scalar_comparison_ciphertext_64(
streams: CudaStreamsFFI,
lwe_array_out: *mut CudaRadixCiphertextFFI,
lwe_array_in: *const CudaRadixCiphertextFFI,
@@ -674,7 +665,7 @@ unsafe extern "C" {
pub fn cleanup_cuda_integer_comparison(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
}
unsafe extern "C" {
pub fn scratch_cuda_integer_radix_bitop_kb_64(
pub fn scratch_cuda_bitop_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
@@ -696,7 +687,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_bitop_integer_radix_ciphertext_kb_64(
pub fn cuda_bitop_ciphertext_64(
streams: CudaStreamsFFI,
lwe_array_out: *mut CudaRadixCiphertextFFI,
lwe_array_1: *const CudaRadixCiphertextFFI,
@@ -707,7 +698,7 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
pub fn cuda_scalar_bitop_ciphertext_64(
streams: CudaStreamsFFI,
lwe_array_out: *mut CudaRadixCiphertextFFI,
lwe_array_input: *const CudaRadixCiphertextFFI,
@@ -723,7 +714,7 @@ unsafe extern "C" {
pub fn cleanup_cuda_integer_bitop(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
}
unsafe extern "C" {
pub fn scratch_cuda_integer_radix_cmux_kb_64(
pub fn scratch_cuda_cmux_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
@@ -744,7 +735,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_cmux_integer_radix_ciphertext_kb_64(
pub fn cuda_cmux_ciphertext_64(
streams: CudaStreamsFFI,
lwe_array_out: *mut CudaRadixCiphertextFFI,
lwe_condition: *const CudaRadixCiphertextFFI,
@@ -756,10 +747,10 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn cleanup_cuda_integer_radix_cmux(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
pub fn cleanup_cuda_cmux(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
}
unsafe extern "C" {
pub fn scratch_cuda_integer_radix_scalar_rotate_kb_64(
pub fn scratch_cuda_scalar_rotate_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
@@ -781,7 +772,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_radix_scalar_rotate_kb_64_inplace(
pub fn cuda_scalar_rotate_64_inplace(
streams: CudaStreamsFFI,
lwe_array: *mut CudaRadixCiphertextFFI,
n: u32,
@@ -791,13 +782,10 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn cleanup_cuda_integer_radix_scalar_rotate(
streams: CudaStreamsFFI,
mem_ptr_void: *mut *mut i8,
);
pub fn cleanup_cuda_scalar_rotate(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
}
unsafe extern "C" {
pub fn scratch_cuda_propagate_single_carry_kb_64_inplace(
pub fn scratch_cuda_propagate_single_carry_64_inplace(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
@@ -819,7 +807,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
pub fn scratch_cuda_add_and_propagate_single_carry_64_inplace(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
@@ -841,7 +829,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_propagate_single_carry_kb_64_inplace(
pub fn cuda_propagate_single_carry_64_inplace(
streams: CudaStreamsFFI,
lwe_array: *mut CudaRadixCiphertextFFI,
carry_out: *mut CudaRadixCiphertextFFI,
@@ -854,7 +842,7 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn cuda_add_and_propagate_single_carry_kb_64_inplace(
pub fn cuda_add_and_propagate_single_carry_64_inplace(
streams: CudaStreamsFFI,
lhs_array: *mut CudaRadixCiphertextFFI,
rhs_array: *const CudaRadixCiphertextFFI,
@@ -877,7 +865,7 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn scratch_cuda_integer_overflowing_sub_kb_64_inplace(
pub fn scratch_cuda_integer_overflowing_sub_64_inplace(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
@@ -899,7 +887,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_overflowing_sub_kb_64_inplace(
pub fn cuda_integer_overflowing_sub_64_inplace(
streams: CudaStreamsFFI,
lhs_array: *mut CudaRadixCiphertextFFI,
rhs_array: *const CudaRadixCiphertextFFI,
@@ -919,7 +907,7 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
pub fn scratch_cuda_partial_sum_ciphertexts_vec_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
@@ -941,7 +929,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
pub fn cuda_partial_sum_ciphertexts_vec_64(
streams: CudaStreamsFFI,
radix_lwe_out: *mut CudaRadixCiphertextFFI,
radix_lwe_vec: *mut CudaRadixCiphertextFFI,
@@ -951,13 +939,13 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
pub fn cleanup_cuda_partial_sum_ciphertexts_vec(
streams: CudaStreamsFFI,
mem_ptr_void: *mut *mut i8,
);
}
unsafe extern "C" {
pub fn scratch_cuda_integer_scalar_mul_kb_64(
pub fn scratch_cuda_integer_scalar_mul_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
@@ -978,7 +966,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
pub fn cuda_scalar_multiplication_ciphertext_64_inplace(
streams: CudaStreamsFFI,
lwe_array: *mut CudaRadixCiphertextFFI,
decomposed_scalar: *const u64,
@@ -992,13 +980,10 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn cleanup_cuda_integer_radix_scalar_mul(
streams: CudaStreamsFFI,
mem_ptr_void: *mut *mut i8,
);
pub fn cleanup_cuda_scalar_mul(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
}
unsafe extern "C" {
pub fn scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
pub fn scratch_cuda_integer_div_rem_radix_ciphertext_64(
streams: CudaStreamsFFI,
is_signed: bool,
mem_ptr: *mut *mut i8,
@@ -1020,7 +1005,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_div_rem_radix_ciphertext_kb_64(
pub fn cuda_integer_div_rem_radix_ciphertext_64(
streams: CudaStreamsFFI,
quotient: *mut CudaRadixCiphertextFFI,
remainder: *mut CudaRadixCiphertextFFI,
@@ -1081,7 +1066,7 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
pub fn scratch_cuda_integer_abs_inplace_radix_ciphertext_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
is_signed: bool,
@@ -1103,7 +1088,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_abs_inplace_radix_ciphertext_kb_64(
pub fn cuda_integer_abs_inplace_radix_ciphertext_64(
streams: CudaStreamsFFI,
ct: *mut CudaRadixCiphertextFFI,
mem_ptr: *mut i8,
@@ -1116,7 +1101,7 @@ unsafe extern "C" {
pub fn cleanup_cuda_integer_abs_inplace(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
}
unsafe extern "C" {
pub fn scratch_cuda_integer_are_all_comparisons_block_true_kb_64(
pub fn scratch_cuda_integer_are_all_comparisons_block_true_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
@@ -1137,7 +1122,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_are_all_comparisons_block_true_kb_64(
pub fn cuda_integer_are_all_comparisons_block_true_64(
streams: CudaStreamsFFI,
lwe_array_out: *mut CudaRadixCiphertextFFI,
lwe_array_in: *const CudaRadixCiphertextFFI,
@@ -1154,7 +1139,7 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
pub fn scratch_cuda_integer_is_at_least_one_comparisons_block_true_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
@@ -1175,7 +1160,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
pub fn cuda_integer_is_at_least_one_comparisons_block_true_64(
streams: CudaStreamsFFI,
lwe_array_out: *mut CudaRadixCiphertextFFI,
lwe_array_in: *const CudaRadixCiphertextFFI,
@@ -1206,7 +1191,7 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn scratch_cuda_apply_noise_squashing_kb(
pub fn scratch_cuda_apply_noise_squashing(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
lwe_dimension: u32,
@@ -1229,7 +1214,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_apply_noise_squashing_kb(
pub fn cuda_apply_noise_squashing(
streams: CudaStreamsFFI,
output_radix_lwe: *mut CudaRadixCiphertextFFI,
input_radix_lwe: *const CudaRadixCiphertextFFI,
@@ -1239,13 +1224,10 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn cleanup_cuda_apply_noise_squashing_kb(
streams: CudaStreamsFFI,
mem_ptr_void: *mut *mut i8,
);
pub fn cleanup_cuda_apply_noise_squashing(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
}
unsafe extern "C" {
pub fn scratch_cuda_sub_and_propagate_single_carry_kb_64_inplace(
pub fn scratch_cuda_sub_and_propagate_single_carry_64_inplace(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
@@ -1267,7 +1249,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_sub_and_propagate_single_carry_kb_64_inplace(
pub fn cuda_sub_and_propagate_single_carry_64_inplace(
streams: CudaStreamsFFI,
lhs_array: *mut CudaRadixCiphertextFFI,
rhs_array: *const CudaRadixCiphertextFFI,
@@ -1287,7 +1269,7 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn scratch_cuda_integer_unsigned_scalar_div_radix_kb_64(
pub fn scratch_cuda_integer_unsigned_scalar_div_radix_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
@@ -1308,7 +1290,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_unsigned_scalar_div_radix_kb_64(
pub fn cuda_integer_unsigned_scalar_div_radix_64(
streams: CudaStreamsFFI,
numerator_ct: *mut CudaRadixCiphertextFFI,
mem_ptr: *mut i8,
@@ -1318,7 +1300,7 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn cleanup_cuda_integer_unsigned_scalar_div_radix_kb_64(
pub fn cleanup_cuda_integer_unsigned_scalar_div_radix_64(
streams: CudaStreamsFFI,
mem_ptr_void: *mut *mut i8,
);
@@ -1362,7 +1344,7 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn scratch_cuda_integer_signed_scalar_div_radix_kb_64(
pub fn scratch_cuda_integer_signed_scalar_div_radix_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
@@ -1383,7 +1365,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_signed_scalar_div_radix_kb_64(
pub fn cuda_integer_signed_scalar_div_radix_64(
streams: CudaStreamsFFI,
numerator_ct: *mut CudaRadixCiphertextFFI,
mem_ptr: *mut i8,
@@ -1394,13 +1376,13 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn cleanup_cuda_integer_signed_scalar_div_radix_kb_64(
pub fn cleanup_cuda_integer_signed_scalar_div_radix_64(
streams: CudaStreamsFFI,
mem_ptr_void: *mut *mut i8,
);
}
unsafe extern "C" {
pub fn scratch_integer_unsigned_scalar_div_rem_radix_kb_64(
pub fn scratch_integer_unsigned_scalar_div_rem_radix_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
@@ -1422,7 +1404,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
pub fn cuda_integer_unsigned_scalar_div_rem_radix_64(
streams: CudaStreamsFFI,
quotient_ct: *mut CudaRadixCiphertextFFI,
remainder_ct: *mut CudaRadixCiphertextFFI,
@@ -1439,13 +1421,13 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn cleanup_cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
pub fn cleanup_cuda_integer_unsigned_scalar_div_rem_radix_64(
streams: CudaStreamsFFI,
mem_ptr_void: *mut *mut i8,
);
}
unsafe extern "C" {
pub fn scratch_integer_signed_scalar_div_rem_radix_kb_64(
pub fn scratch_integer_signed_scalar_div_rem_radix_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
@@ -1467,7 +1449,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_signed_scalar_div_rem_radix_kb_64(
pub fn cuda_integer_signed_scalar_div_rem_radix_64(
streams: CudaStreamsFFI,
quotient_ct: *mut CudaRadixCiphertextFFI,
remainder_ct: *mut CudaRadixCiphertextFFI,
@@ -1482,13 +1464,13 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn cleanup_cuda_integer_signed_scalar_div_rem_radix_kb_64(
pub fn cleanup_cuda_integer_signed_scalar_div_rem_radix_64(
streams: CudaStreamsFFI,
mem_ptr_void: *mut *mut i8,
);
}
unsafe extern "C" {
pub fn scratch_integer_count_of_consecutive_bits_kb_64(
pub fn scratch_integer_count_of_consecutive_bits_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
@@ -1511,7 +1493,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_count_of_consecutive_bits_kb_64(
pub fn cuda_integer_count_of_consecutive_bits_64(
streams: CudaStreamsFFI,
output_ct: *mut CudaRadixCiphertextFFI,
input_ct: *const CudaRadixCiphertextFFI,
@@ -1521,7 +1503,7 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn cleanup_cuda_integer_count_of_consecutive_bits_kb_64(
pub fn cleanup_cuda_integer_count_of_consecutive_bits_64(
streams: CudaStreamsFFI,
mem_ptr_void: *mut *mut i8,
);
@@ -1549,7 +1531,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_grouped_oprf_async_64(
pub fn cuda_integer_grouped_oprf_64(
streams: CudaStreamsFFI,
radix_lwe_out: *mut CudaRadixCiphertextFFI,
seeded_lwe_input: *const ffi::c_void,
@@ -1565,7 +1547,7 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn scratch_integer_ilog2_kb_64(
pub fn scratch_integer_ilog2_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
@@ -1587,7 +1569,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_ilog2_kb_64(
pub fn cuda_integer_ilog2_64(
streams: CudaStreamsFFI,
output_ct: *mut CudaRadixCiphertextFFI,
input_ct: *const CudaRadixCiphertextFFI,
@@ -1600,7 +1582,7 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn cleanup_cuda_integer_ilog2_kb_64(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
pub fn cleanup_cuda_integer_ilog2_64(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
}
unsafe extern "C" {
pub fn scratch_cuda_integer_compress_radix_ciphertext_64(

View File

@@ -11,7 +11,7 @@ use crate::integer::gpu::ciphertext::info::{CudaBlockInfo, CudaRadixCiphertextIn
use crate::integer::gpu::ciphertext::{CudaRadixCiphertext, CudaVec, KsType, LweDimension};
use crate::integer::gpu::key_switching_key::CudaKeySwitchingKey;
use crate::integer::gpu::server_key::CudaBootstrappingKey;
use crate::integer::gpu::{expand_async, PBSType};
use crate::integer::gpu::{cuda_backend_expand, PBSType};
use crate::shortint::ciphertext::CompactCiphertextList;
use crate::shortint::parameters::{
CompactCiphertextListExpansionKind, Degree, LweBskGroupingFactor, NoiseLevel,
@@ -409,7 +409,7 @@ impl CudaFlattenedVecCompactCiphertextList {
match &sks.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
expand_async(
cuda_backend_expand(
streams,
&mut d_output,
d_input,
@@ -444,7 +444,7 @@ impl CudaFlattenedVecCompactCiphertextList {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
expand_async(
cuda_backend_expand(
streams,
&mut d_output,
d_input,
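
Note: every call site in this family follows the same shape: match on the bootstrapping-key flavour, then route both arms to the one renamed cuda_backend_* function with flavour-specific key material. A minimal model of that dispatch (types and payloads are illustrative, not the crate's):

enum Bsk { Classic(Vec<u64>), MultiBit(Vec<u64>) }

fn cuda_backend_op(key: &[u64], multibit: bool) {
    println!("dispatched with {} key words (multibit: {multibit})", key.len());
}

fn run(bsk: &Bsk) {
    match bsk {
        Bsk::Classic(k) => cuda_backend_op(k, false),
        Bsk::MultiBit(k) => cuda_backend_op(k, true),
    }
}

fn main() {
    run(&Bsk::Classic(vec![0; 4]));
}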

View File

@@ -12,7 +12,7 @@ use crate::integer::gpu::ciphertext::squashed_noise::{
CudaSquashedNoiseBooleanBlock, CudaSquashedNoiseRadixCiphertext,
CudaSquashedNoiseSignedRadixCiphertext,
};
use crate::integer::gpu::decompress_integer_radix_async_128;
use crate::integer::gpu::cuda_backend_decompress_128;
use crate::integer::gpu::list_compression::server_keys::{
CudaNoiseSquashingCompressionKey, CudaPackedGlweCiphertextList,
};
@@ -345,7 +345,7 @@ impl CudaCompressedSquashedNoiseCiphertextList {
);
unsafe {
decompress_integer_radix_async_128(
cuda_backend_decompress_128(
streams,
&mut output_lwe,
&self.packed_list,

View File

@@ -15,8 +15,8 @@ use crate::integer::gpu::ciphertext::squashed_noise::CudaSquashedNoiseRadixCiphe
use crate::integer::gpu::ciphertext::CudaRadixCiphertext;
use crate::integer::gpu::server_key::CudaBootstrappingKey;
use crate::integer::gpu::{
compress_integer_radix_async, cuda_memcpy_async_gpu_to_gpu, decompress_integer_radix_async_64,
get_compression_size_on_gpu, get_decompression_size_on_gpu,
cuda_backend_compress, cuda_backend_decompress, cuda_backend_get_compression_size_on_gpu,
cuda_backend_get_decompression_size_on_gpu, cuda_memcpy_async_gpu_to_gpu,
};
use crate::prelude::CastInto;
use crate::shortint::ciphertext::{
@@ -322,7 +322,7 @@ impl CudaCompressionKey {
unsafe {
let input_lwes = Self::flatten_async(ciphertexts, streams);
compress_integer_radix_async(
cuda_backend_compress(
streams,
&mut glwe_array_out,
&input_lwes,
@@ -355,7 +355,7 @@ impl CudaCompressionKey {
let compressed_polynomial_size = lwe_pksk.output_polynomial_size();
let compressed_glwe_size = lwe_pksk.output_glwe_size();
get_compression_size_on_gpu(
cuda_backend_get_compression_size_on_gpu(
streams,
message_modulus,
carry_modulus,
@@ -430,7 +430,7 @@ impl CudaDecompressionKey {
);
unsafe {
decompress_integer_radix_async_64(
cuda_backend_decompress(
streams,
&mut output_lwe,
packed_list,
@@ -515,7 +515,7 @@ impl CudaDecompressionKey {
);
let lwe_dimension = bsk.output_lwe_dimension();
get_decompression_size_on_gpu(
cuda_backend_get_decompression_size_on_gpu(
streams,
message_modulus,
carry_modulus,
@@ -570,7 +570,7 @@ impl CudaDecompressionKey {
);
let lwe_dimension = bsk.output_lwe_dimension();
get_decompression_size_on_gpu(
cuda_backend_get_decompression_size_on_gpu(
streams,
message_modulus,
carry_modulus,
@@ -712,7 +712,7 @@ impl CudaNoiseSquashingCompressionKey {
unsafe {
let input_lwes = Self::flatten_async(ciphertexts, streams);
compress_integer_radix_async(
cuda_backend_compress(
streams,
&mut glwe_array_out,
&input_lwes,

File diff suppressed because it is too large

View File

@@ -2,7 +2,7 @@ use crate::core_crypto::gpu::CudaStreams;
use crate::core_crypto::prelude::LweBskGroupingFactor;
use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext;
use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey};
use crate::integer::gpu::{unchecked_signed_abs_radix_kb_assign_async, PBSType};
use crate::integer::gpu::{cuda_backend_unchecked_signed_abs_assign, PBSType};
impl CudaServerKey {
/// # Safety
@@ -18,7 +18,7 @@ impl CudaServerKey {
unsafe {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_signed_abs_radix_kb_assign_async(
cuda_backend_unchecked_signed_abs_assign(
streams,
ct.as_mut(),
&d_bsk.d_vec,
@@ -44,7 +44,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_signed_abs_radix_kb_assign_async(
cuda_backend_unchecked_signed_abs_assign(
streams,
ct.as_mut(),
&d_multibit_bsk.d_vec,

View File

@@ -7,10 +7,10 @@ use crate::integer::gpu::ciphertext::{
};
use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey};
use crate::integer::gpu::{
add_and_propagate_single_carry_assign_async,
get_add_and_propagate_single_carry_assign_async_size_on_gpu,
get_full_propagate_assign_size_on_gpu, unchecked_add_integer_radix_assign_async,
unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async, PBSType,
cuda_backend_add_and_propagate_single_carry_assign,
cuda_backend_get_add_and_propagate_single_carry_assign_size_on_gpu,
cuda_backend_get_full_propagate_assign_size_on_gpu, cuda_backend_unchecked_add_assign,
cuda_backend_unchecked_partial_sum_ciphertexts_assign, PBSType,
};
use crate::integer::server_key::radix_parallel::OutputFlag;
use crate::shortint::ciphertext::NoiseLevel;
@@ -153,23 +153,25 @@ impl CudaServerKey {
ct_right.as_ref().d_blocks.lwe_ciphertext_count().0
);
let full_prop_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::Classic(d_bsk) => {
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_full_propagate_assign_size_on_gpu(
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_multibit_bsk.input_lwe_dimension(),
d_multibit_bsk.glwe_dimension(),
@@ -199,7 +201,7 @@ impl CudaServerKey {
let num_blocks = ct_left.as_ref().d_blocks.lwe_ciphertext_count().0 as u32;
let add_assign_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
get_add_and_propagate_single_carry_assign_async_size_on_gpu(
cuda_backend_get_add_and_propagate_single_carry_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
@@ -218,7 +220,7 @@ impl CudaServerKey {
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_add_and_propagate_single_carry_assign_async_size_on_gpu(
cuda_backend_get_add_and_propagate_single_carry_assign_size_on_gpu(
streams,
d_multibit_bsk.input_lwe_dimension(),
d_multibit_bsk.glwe_dimension(),
@@ -313,7 +315,7 @@ impl CudaServerKey {
);
unsafe {
unchecked_add_integer_radix_assign_async(streams, ciphertext_left, ciphertext_right);
cuda_backend_unchecked_add_assign(streams, ciphertext_left, ciphertext_right);
}
}
@@ -373,7 +375,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async(
cuda_backend_unchecked_partial_sum_ciphertexts_assign(
streams,
result.as_mut(),
&mut terms,
@@ -399,7 +401,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async(
cuda_backend_unchecked_partial_sum_ciphertexts_assign(
streams,
result.as_mut(),
&mut terms,
@@ -833,7 +835,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
add_and_propagate_single_carry_assign_async(
cuda_backend_add_and_propagate_single_carry_assign(
streams,
lhs.as_mut(),
rhs.as_ref(),
@@ -859,7 +861,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
add_and_propagate_single_carry_assign_async(
cuda_backend_add_and_propagate_single_carry_assign(
streams,
lhs.as_mut(),
rhs.as_ref(),

View File

@@ -6,9 +6,9 @@ use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey};
use crate::core_crypto::prelude::LweBskGroupingFactor;
use crate::integer::gpu::{
get_aes_ctr_encrypt_integer_radix_size_on_gpu, get_key_expansion_integer_radix_size_on_gpu,
unchecked_aes_ctr_encrypt_integer_radix_kb_assign_async,
unchecked_key_expansion_integer_radix_kb_assign_async, PBSType,
cuda_backend_aes_key_expansion, cuda_backend_get_aes_ctr_encrypt_size_on_gpu,
cuda_backend_get_aes_key_expansion_size_on_gpu, cuda_backend_unchecked_aes_ctr_encrypt,
PBSType,
};
use crate::integer::{RadixCiphertext, RadixClientKey};
use crate::shortint::Ciphertext;
@@ -231,7 +231,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_aes_ctr_encrypt_integer_radix_kb_assign_async(
cuda_backend_unchecked_aes_ctr_encrypt(
streams,
result.as_mut(),
iv.as_ref(),
@@ -256,7 +256,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_aes_ctr_encrypt_integer_radix_kb_assign_async(
cuda_backend_unchecked_aes_ctr_encrypt(
streams,
result.as_mut(),
iv.as_ref(),
@@ -308,7 +308,7 @@ impl CudaServerKey {
streams: &CudaStreams,
) -> u64 {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_aes_ctr_encrypt_integer_radix_size_on_gpu(
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_aes_ctr_encrypt_size_on_gpu(
streams,
num_aes_inputs as u32,
sbox_parallelism as u32,
@@ -326,7 +326,7 @@ impl CudaServerKey {
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_aes_ctr_encrypt_integer_radix_size_on_gpu(
cuda_backend_get_aes_ctr_encrypt_size_on_gpu(
streams,
num_aes_inputs as u32,
sbox_parallelism as u32,
@@ -371,7 +371,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_key_expansion_integer_radix_kb_assign_async(
cuda_backend_aes_key_expansion(
streams,
expanded_keys.as_mut(),
key.as_ref(),
@@ -392,7 +392,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_key_expansion_integer_radix_kb_assign_async(
cuda_backend_aes_key_expansion(
streams,
expanded_keys.as_mut(),
key.as_ref(),
@@ -428,7 +428,7 @@ impl CudaServerKey {
/// synchronization is required
unsafe fn get_key_expansion_size_on_gpu_async(&self, streams: &CudaStreams) -> u64 {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_key_expansion_integer_radix_size_on_gpu(
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_aes_key_expansion_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
@@ -444,7 +444,7 @@ impl CudaServerKey {
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_key_expansion_integer_radix_size_on_gpu(
cuda_backend_get_aes_key_expansion_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,

View File

@@ -9,8 +9,8 @@ use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext;
use crate::integer::gpu::server_key::CudaBootstrappingKey;
use crate::integer::gpu::{
get_bitop_integer_radix_kb_size_on_gpu, get_full_propagate_assign_size_on_gpu,
unchecked_bitop_integer_radix_kb_assign_async, BitOpType, CudaServerKey, PBSType,
cuda_backend_get_bitop_size_on_gpu, cuda_backend_get_full_propagate_assign_size_on_gpu,
cuda_backend_unchecked_bitop_assign, BitOpType, CudaServerKey, PBSType,
};
impl CudaServerKey {
@@ -209,7 +209,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_bitop_integer_radix_kb_assign_async(
cuda_backend_unchecked_bitop_assign(
streams,
ct_left.as_mut(),
ct_right.as_ref(),
@@ -237,7 +237,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_bitop_integer_radix_kb_assign_async(
cuda_backend_unchecked_bitop_assign(
streams,
ct_left.as_mut(),
ct_right.as_ref(),
@@ -283,23 +283,25 @@ impl CudaServerKey {
ct_right.as_ref().d_blocks.lwe_ciphertext_count()
);
let full_prop_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::Classic(d_bsk) => {
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_full_propagate_assign_size_on_gpu(
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_multibit_bsk.input_lwe_dimension(),
d_multibit_bsk.glwe_dimension(),
@@ -329,7 +331,7 @@ impl CudaServerKey {
let lwe_ciphertext_count = ct_left.as_ref().d_blocks.lwe_ciphertext_count();
let bitop_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_bitop_integer_radix_kb_size_on_gpu(
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_bitop_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
@@ -351,30 +353,28 @@ impl CudaServerKey {
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_bitop_integer_radix_kb_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
d_multibit_bsk.glwe_dimension,
d_multibit_bsk.polynomial_size,
self.key_switching_key
.input_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key
.output_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_multibit_bsk.decomp_level_count,
d_multibit_bsk.decomp_base_log,
op,
lwe_ciphertext_count.0 as u32,
PBSType::MultiBit,
d_multibit_bsk.grouping_factor,
None,
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => cuda_backend_get_bitop_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
d_multibit_bsk.glwe_dimension,
d_multibit_bsk.polynomial_size,
self.key_switching_key
.input_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key
.output_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_multibit_bsk.decomp_level_count,
d_multibit_bsk.decomp_base_log,
op,
lwe_ciphertext_count.0 as u32,
PBSType::MultiBit,
d_multibit_bsk.grouping_factor,
None,
),
};
actual_full_prop_mem.max(bitop_mem)
}
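
Note: the size estimate returned above is the max, not the sum, of the full-propagation scratch and the operation's own scratch, presumably because the two phases run sequentially over the same allocation. The pattern, reduced to its arithmetic:

fn size_on_gpu(full_prop_mem: u64, op_mem: u64) -> u64 {
    full_prop_mem.max(op_mem) // sequential phases: peak usage, not total
}

fn main() {
    assert_eq!(size_on_gpu(1 << 20, 3 << 20), 3 << 20);
}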
@@ -938,23 +938,25 @@ impl CudaServerKey {
0
} else {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::Classic(d_bsk) => {
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_full_propagate_assign_size_on_gpu(
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_multibit_bsk.input_lwe_dimension(),
d_multibit_bsk.glwe_dimension(),

View File

@@ -4,8 +4,8 @@ use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext;
use crate::integer::gpu::server_key::CudaBootstrappingKey;
use crate::integer::gpu::{
get_cmux_integer_radix_kb_size_on_gpu, get_full_propagate_assign_size_on_gpu,
unchecked_cmux_integer_radix_kb_async, CudaServerKey, PBSType,
cuda_backend_get_cmux_size_on_gpu, cuda_backend_get_full_propagate_assign_size_on_gpu,
cuda_backend_unchecked_cmux, CudaServerKey, PBSType,
};
impl CudaServerKey {
@@ -27,7 +27,7 @@ impl CudaServerKey {
unsafe {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_cmux_integer_radix_kb_async(
cuda_backend_unchecked_cmux(
stream,
result.as_mut(),
condition,
@@ -56,7 +56,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_cmux_integer_radix_kb_async(
cuda_backend_unchecked_cmux(
stream,
result.as_mut(),
condition,
@@ -150,23 +150,25 @@ impl CudaServerKey {
false_ct.as_ref().d_blocks.lwe_ciphertext_count()
);
let full_prop_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::Classic(d_bsk) => {
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_full_propagate_assign_size_on_gpu(
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_multibit_bsk.input_lwe_dimension(),
d_multibit_bsk.glwe_dimension(),
@@ -196,7 +198,7 @@ impl CudaServerKey {
let lwe_ciphertext_count = true_ct.as_ref().d_blocks.lwe_ciphertext_count();
let cmux_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_cmux_integer_radix_kb_size_on_gpu(
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_cmux_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
@@ -217,29 +219,27 @@ impl CudaServerKey {
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_cmux_integer_radix_kb_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
d_multibit_bsk.glwe_dimension,
d_multibit_bsk.polynomial_size,
self.key_switching_key
.input_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key
.output_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_multibit_bsk.decomp_level_count,
d_multibit_bsk.decomp_base_log,
lwe_ciphertext_count.0 as u32,
PBSType::MultiBit,
d_multibit_bsk.grouping_factor,
None,
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => cuda_backend_get_cmux_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
d_multibit_bsk.glwe_dimension,
d_multibit_bsk.polynomial_size,
self.key_switching_key
.input_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key
.output_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_multibit_bsk.decomp_level_count,
d_multibit_bsk.decomp_base_log,
lwe_ciphertext_count.0 as u32,
PBSType::MultiBit,
d_multibit_bsk.grouping_factor,
None,
),
};
actual_full_prop_mem.max(cmux_mem)
}
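
Note: behind the renamed entry points, cmux keeps its usual semantics: select true_ct or false_ct under a boolean condition. Arithmetically (plain integers, illustrative only):

fn cmux(condition: u64, ct_true: u64, ct_false: u64) -> u64 {
    // condition ∈ {0, 1}: out = false + condition * (true - false)
    ct_false.wrapping_add(condition.wrapping_mul(ct_true.wrapping_sub(ct_false)))
}

fn main() {
    assert_eq!(cmux(1, 7, 3), 7);
    assert_eq!(cmux(0, 7, 3), 3);
}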

View File

@@ -6,8 +6,8 @@ use crate::integer::gpu::ciphertext::info::CudaRadixCiphertextInfo;
use crate::integer::gpu::ciphertext::{CudaIntegerRadixCiphertext, CudaRadixCiphertext};
use crate::integer::gpu::server_key::CudaBootstrappingKey;
use crate::integer::gpu::{
get_comparison_integer_radix_kb_size_on_gpu, get_full_propagate_assign_size_on_gpu,
unchecked_comparison_integer_radix_kb_async, ComparisonType, CudaServerKey, PBSType,
cuda_backend_get_comparison_size_on_gpu, cuda_backend_get_full_propagate_assign_size_on_gpu,
cuda_backend_unchecked_comparison, ComparisonType, CudaServerKey, PBSType,
};
use crate::shortint::ciphertext::Degree;
@@ -51,7 +51,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_comparison_integer_radix_kb_async(
cuda_backend_unchecked_comparison(
streams,
result.as_mut().as_mut(),
ct_left.as_ref(),
@@ -80,7 +80,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_comparison_integer_radix_kb_async(
cuda_backend_unchecked_comparison(
streams,
result.as_mut().as_mut(),
ct_left.as_ref(),
@@ -365,23 +365,25 @@ impl CudaServerKey {
ct_right.as_ref().d_blocks.lwe_ciphertext_count()
);
let full_prop_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::Classic(d_bsk) => {
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_full_propagate_assign_size_on_gpu(
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_multibit_bsk.input_lwe_dimension(),
d_multibit_bsk.glwe_dimension(),
@@ -411,7 +413,7 @@ impl CudaServerKey {
let lwe_ciphertext_count = ct_left.as_ref().d_blocks.lwe_ciphertext_count();
let comparison_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_comparison_integer_radix_kb_size_on_gpu(
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_comparison_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
@@ -435,7 +437,7 @@ impl CudaServerKey {
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_comparison_integer_radix_kb_size_on_gpu(
cuda_backend_get_comparison_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
@@ -1131,7 +1133,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_comparison_integer_radix_kb_async(
cuda_backend_unchecked_comparison(
streams,
result.as_mut(),
ct_left.as_ref(),
@@ -1160,7 +1162,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_comparison_integer_radix_kb_async(
cuda_backend_unchecked_comparison(
streams,
result.as_mut(),
ct_left.as_ref(),
@@ -1227,7 +1229,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_comparison_integer_radix_kb_async(
cuda_backend_unchecked_comparison(
streams,
result.as_mut(),
ct_left.as_ref(),
@@ -1256,7 +1258,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_comparison_integer_radix_kb_async(
cuda_backend_unchecked_comparison(
streams,
result.as_mut(),
ct_left.as_ref(),

View File

@@ -3,8 +3,8 @@ use crate::core_crypto::prelude::LweBskGroupingFactor;
use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext;
use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey};
use crate::integer::gpu::{
get_div_rem_integer_radix_kb_size_on_gpu, get_full_propagate_assign_size_on_gpu,
unchecked_div_rem_integer_radix_kb_assign_async, PBSType,
cuda_backend_get_div_rem_size_on_gpu, cuda_backend_get_full_propagate_assign_size_on_gpu,
cuda_backend_unchecked_div_rem_assign, PBSType,
};
impl CudaServerKey {
@@ -26,7 +26,7 @@ impl CudaServerKey {
let num_blocks = divisor.as_ref().d_blocks.lwe_ciphertext_count().0 as u32;
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_div_rem_integer_radix_kb_assign_async(
cuda_backend_unchecked_div_rem_assign(
streams,
quotient.as_mut(),
remainder.as_mut(),
@@ -56,7 +56,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_div_rem_integer_radix_kb_assign_async(
cuda_backend_unchecked_div_rem_assign(
streams,
quotient.as_mut(),
remainder.as_mut(),
@@ -258,23 +258,25 @@ impl CudaServerKey {
divisor.as_ref().d_blocks.lwe_ciphertext_count()
);
let full_prop_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::Classic(d_bsk) => {
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_full_propagate_assign_size_on_gpu(
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_multibit_bsk.input_lwe_dimension(),
d_multibit_bsk.glwe_dimension(),
@@ -304,7 +306,7 @@ impl CudaServerKey {
let lwe_ciphertext_count = numerator.as_ref().d_blocks.lwe_ciphertext_count();
let mul_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_div_rem_integer_radix_kb_size_on_gpu(
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_div_rem_size_on_gpu(
streams,
T::IS_SIGNED,
self.message_modulus,
@@ -326,30 +328,28 @@ impl CudaServerKey {
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_div_rem_integer_radix_kb_size_on_gpu(
streams,
T::IS_SIGNED,
self.message_modulus,
self.carry_modulus,
d_multibit_bsk.glwe_dimension,
d_multibit_bsk.polynomial_size,
self.key_switching_key
.input_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key
.output_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_multibit_bsk.decomp_level_count,
d_multibit_bsk.decomp_base_log,
lwe_ciphertext_count.0 as u32,
PBSType::MultiBit,
d_multibit_bsk.grouping_factor,
None,
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => cuda_backend_get_div_rem_size_on_gpu(
streams,
T::IS_SIGNED,
self.message_modulus,
self.carry_modulus,
d_multibit_bsk.glwe_dimension,
d_multibit_bsk.polynomial_size,
self.key_switching_key
.input_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key
.output_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_multibit_bsk.decomp_level_count,
d_multibit_bsk.decomp_base_log,
lwe_ciphertext_count.0 as u32,
PBSType::MultiBit,
d_multibit_bsk.grouping_factor,
None,
),
};
actual_full_prop_mem.max(mul_mem)
}

View File

@@ -5,7 +5,7 @@ use crate::integer::gpu::ciphertext::{
CudaIntegerRadixCiphertext, CudaSignedRadixCiphertext, CudaUnsignedRadixCiphertext,
};
use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey};
use crate::integer::gpu::{count_of_consecutive_bits_async, ilog2_async, PBSType};
use crate::integer::gpu::{cuda_backend_count_of_consecutive_bits, cuda_backend_ilog2, PBSType};
use crate::integer::server_key::radix_parallel::ilog2::{BitValue, Direction};
impl CudaServerKey {
@@ -40,7 +40,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
count_of_consecutive_bits_async(
cuda_backend_count_of_consecutive_bits(
streams,
result.as_mut(),
ct.as_ref(),
@@ -63,7 +63,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
count_of_consecutive_bits_async(
cuda_backend_count_of_consecutive_bits(
streams,
result.as_mut(),
ct.as_ref(),
@@ -279,7 +279,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
ilog2_async(
cuda_backend_ilog2(
streams,
result.as_mut(),
ct.as_ref(),
@@ -306,7 +306,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
ilog2_async(
cuda_backend_ilog2(
streams,
result.as_mut(),
ct.as_ref(),

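For orientation, a clear-value sketch of the relationship the two renamed entry points (`cuda_backend_count_of_consecutive_bits`, `cuda_backend_ilog2`) compute homomorphically; the helper names below are illustrative, not part of the backend API.

// ilog2 of a nonzero x on `total_bits`-bit values can be derived from the
// count of consecutive leading zero bits: ilog2(x) = total_bits - 1 - clz(x).
fn count_leading_zeros(x: u32, total_bits: u32) -> u32 {
    x.leading_zeros() - (32 - total_bits)
}

fn ilog2(x: u32, total_bits: u32) -> u32 {
    debug_assert!(x != 0, "ilog2 is undefined for 0");
    total_bits - 1 - count_leading_zeros(x, total_bits)
}

fn main() {
    // an 8-bit value holding 20 (0b0001_0100):
    assert_eq!(count_leading_zeros(20, 8), 3);
    assert_eq!(ilog2(20, 8), 4); // floor(log2(20))
}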
View File

@@ -16,11 +16,12 @@ use crate::integer::gpu::ciphertext::{
use crate::integer::gpu::noise_squashing::keys::CudaNoiseSquashingKey;
use crate::integer::gpu::server_key::CudaBootstrappingKey;
use crate::integer::gpu::{
apply_bivariate_lut_kb_async, apply_many_univariate_lut_kb_async,
apply_univariate_lut_kb_async, compute_prefix_sum_hillis_steele_async,
extend_radix_with_sign_msb_async, extend_radix_with_trivial_zero_blocks_msb_async,
full_propagate_assign_async, noise_squashing_async, propagate_single_carry_assign_async,
trim_radix_blocks_lsb_async, CudaServerKey, PBSType,
cuda_backend_apply_bivariate_lut, cuda_backend_apply_many_univariate_lut,
cuda_backend_apply_univariate_lut, cuda_backend_compute_prefix_sum_hillis_steele,
cuda_backend_extend_radix_with_sign_msb,
cuda_backend_extend_radix_with_trivial_zero_blocks_msb, cuda_backend_full_propagate_assign,
cuda_backend_noise_squashing, cuda_backend_propagate_single_carry_assign,
cuda_backend_trim_radix_blocks_lsb, CudaServerKey, PBSType,
};
use crate::integer::server_key::radix_parallel::OutputFlag;
use crate::shortint::ciphertext::{Degree, NoiseLevel};
@@ -239,7 +240,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
propagate_single_carry_assign_async(
cuda_backend_propagate_single_carry_assign(
streams,
ciphertext,
carry_out.as_mut(),
@@ -264,7 +265,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
propagate_single_carry_assign_async(
cuda_backend_propagate_single_carry_assign(
streams,
ciphertext,
carry_out.as_mut(),
@@ -302,7 +303,7 @@ impl CudaServerKey {
unsafe {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
full_propagate_assign_async(
cuda_backend_full_propagate_assign(
streams,
ciphertext,
&d_bsk.d_vec,
@@ -323,7 +324,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
full_propagate_assign_async(
cuda_backend_full_propagate_assign(
streams,
ciphertext,
&d_multibit_bsk.d_vec,
@@ -507,7 +508,11 @@ impl CudaServerKey {
};
unsafe {
extend_radix_with_trivial_zero_blocks_msb_async(output.as_mut(), ct.as_ref(), streams);
cuda_backend_extend_radix_with_trivial_zero_blocks_msb(
output.as_mut(),
ct.as_ref(),
streams,
);
}
output
}
@@ -581,7 +586,7 @@ impl CudaServerKey {
unsafe { self.create_trivial_zero_radix_async(output_num_blocks, streams) };
unsafe {
trim_radix_blocks_lsb_async(output.as_mut(), ct.as_ref(), streams);
cuda_backend_trim_radix_blocks_lsb(output.as_mut(), ct.as_ref(), streams);
}
output
@@ -791,7 +796,7 @@ impl CudaServerKey {
unsafe {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
apply_univariate_lut_kb_async(
cuda_backend_apply_univariate_lut(
streams,
&mut output_slice,
&mut output_degrees,
@@ -819,7 +824,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
apply_univariate_lut_kb_async(
cuda_backend_apply_univariate_lut(
streams,
&mut output_slice,
&mut output_degrees,
@@ -909,7 +914,7 @@ impl CudaServerKey {
unsafe {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
apply_bivariate_lut_kb_async(
cuda_backend_apply_bivariate_lut(
streams,
&mut output_slice,
&mut output_degrees,
@@ -939,7 +944,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
apply_bivariate_lut_kb_async(
cuda_backend_apply_bivariate_lut(
streams,
&mut output_slice,
&mut output_degrees,
@@ -1088,7 +1093,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
apply_many_univariate_lut_kb_async(
cuda_backend_apply_many_univariate_lut(
streams,
&mut output_slice,
&mut output_degrees,
@@ -1118,7 +1123,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
apply_many_univariate_lut_kb_async(
cuda_backend_apply_many_univariate_lut(
streams,
&mut output_slice,
&mut output_degrees,
@@ -1229,7 +1234,7 @@ impl CudaServerKey {
unsafe {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
compute_prefix_sum_hillis_steele_async(
cuda_backend_compute_prefix_sum_hillis_steele(
streams,
&mut output_slice,
&mut output_degrees,
@@ -1259,7 +1264,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
compute_prefix_sum_hillis_steele_async(
cuda_backend_compute_prefix_sum_hillis_steele(
streams,
&mut output_slice,
&mut output_degrees,
@@ -1324,7 +1329,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
extend_radix_with_sign_msb_async(
cuda_backend_extend_radix_with_sign_msb(
streams,
output.as_mut(),
ct.as_ref(),
@@ -1346,7 +1351,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
extend_radix_with_sign_msb_async(
cuda_backend_extend_radix_with_sign_msb(
streams,
output.as_mut(),
ct.as_ref(),
@@ -1638,7 +1643,7 @@ impl CudaServerKey {
unsafe {
match &d_bootstrapping_key {
CudaBootstrappingKey::Classic(bsk) => {
noise_squashing_async(
cuda_backend_noise_squashing(
streams,
&mut output_slice,
&mut output_degrees,
@@ -1667,7 +1672,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(mb_bsk) => {
noise_squashing_async(
cuda_backend_noise_squashing(
streams,
&mut output_slice,
&mut output_degrees,

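As a reference for the prefix-sum entry point renamed above, a plain-integer sketch of the Hillis-Steele scan it is named after; the GPU version interleaves the same passes with keyswitches and PBSes over radix blocks, whereas this sketch assumes simple u64 addition.

// Hillis-Steele inclusive scan: at the step with stride `offset`, every
// element at index i >= offset adds the element `offset` positions behind
// it. O(n log n) additions but only O(log n) dependent steps, which is
// what makes the pattern attractive on GPU.
fn hillis_steele_inclusive_scan(input: &[u64]) -> Vec<u64> {
    let mut cur = input.to_vec();
    let mut offset = 1;
    while offset < cur.len() {
        let prev = cur.clone(); // stands in for the double-buffering on device
        for i in offset..cur.len() {
            cur[i] = prev[i] + prev[i - offset];
        }
        offset *= 2;
    }
    cur
}

fn main() {
    assert_eq!(hillis_steele_inclusive_scan(&[1, 2, 3, 4]), vec![1, 3, 6, 10]);
}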
View File

@@ -3,8 +3,8 @@ use crate::core_crypto::prelude::LweBskGroupingFactor;
use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext;
use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey};
use crate::integer::gpu::{
get_full_propagate_assign_size_on_gpu, get_mul_integer_radix_kb_size_on_gpu,
unchecked_mul_integer_radix_kb_assign_async, PBSType,
cuda_backend_get_full_propagate_assign_size_on_gpu, cuda_backend_get_mul_size_on_gpu,
cuda_backend_unchecked_mul_assign, PBSType,
};
impl CudaServerKey {
@@ -80,7 +80,7 @@ impl CudaServerKey {
let is_boolean_right = ct_right.holds_boolean_value();
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_mul_integer_radix_kb_assign_async(
cuda_backend_unchecked_mul_assign(
streams,
ct_left.as_mut(),
is_boolean_left,
@@ -104,7 +104,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_mul_integer_radix_kb_assign_async(
cuda_backend_unchecked_mul_assign(
streams,
ct_left.as_mut(),
is_boolean_left,
@@ -264,23 +264,25 @@ impl CudaServerKey {
ct_right.as_ref().d_blocks.lwe_ciphertext_count()
);
let full_prop_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::Classic(d_bsk) => {
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_full_propagate_assign_size_on_gpu(
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_multibit_bsk.input_lwe_dimension(),
d_multibit_bsk.glwe_dimension(),
@@ -312,7 +314,7 @@ impl CudaServerKey {
let is_boolean_right = ct_right.holds_boolean_value();
let mul_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_mul_integer_radix_kb_size_on_gpu(
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_mul_size_on_gpu(
streams,
is_boolean_left,
is_boolean_right,
@@ -332,7 +334,7 @@ impl CudaServerKey {
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => get_mul_integer_radix_kb_size_on_gpu(
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => cuda_backend_get_mul_size_on_gpu(
streams,
is_boolean_left,
is_boolean_right,

View File

@@ -3,8 +3,8 @@ use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
use crate::integer::gpu::ciphertext::{
CudaIntegerRadixCiphertext, CudaSignedRadixCiphertext, CudaUnsignedRadixCiphertext,
};
use crate::integer::gpu::cuda_backend_unchecked_negate;
use crate::integer::gpu::server_key::CudaServerKey;
use crate::integer::gpu::unchecked_negate_integer_radix_async;
use crate::integer::server_key::radix_parallel::OutputFlag;
impl CudaServerKey {
@@ -70,7 +70,7 @@ impl CudaServerKey {
let info = ctxt.as_ref().info.blocks.first().unwrap();
unchecked_negate_integer_radix_async(
cuda_backend_unchecked_negate(
streams,
ciphertext_out.as_mut(),
ctxt.as_ref(),

View File

@@ -12,7 +12,9 @@ use crate::shortint::oprf::{create_random_from_seed_modulus_switched, raw_seeded
pub use tfhe_csprng::seeders::{Seed, Seeder};
use crate::integer::gpu::{get_grouped_oprf_size_on_gpu, grouped_oprf_async, CudaVec, PBSType};
use crate::integer::gpu::{
cuda_backend_get_grouped_oprf_size_on_gpu, cuda_backend_grouped_oprf, CudaVec, PBSType,
};
impl CudaServerKey {
/// Generates an encrypted `num_block` blocks unsigned integer
@@ -372,7 +374,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
grouped_oprf_async(
cuda_backend_grouped_oprf(
streams,
result,
&d_seeded_lwe_input,
@@ -395,7 +397,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_bsk) => {
grouped_oprf_async(
cuda_backend_grouped_oprf(
streams,
result,
&d_seeded_lwe_input,
@@ -429,7 +431,7 @@ impl CudaServerKey {
let message_bits = self.message_modulus.0.ilog2();
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_grouped_oprf_size_on_gpu(
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_grouped_oprf_size_on_gpu(
streams,
1,
d_bsk.input_lwe_dimension,
@@ -447,7 +449,7 @@ impl CudaServerKey {
message_bits,
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::MultiBit(d_bsk) => get_grouped_oprf_size_on_gpu(
CudaBootstrappingKey::MultiBit(d_bsk) => cuda_backend_get_grouped_oprf_size_on_gpu(
streams,
1,
d_bsk.input_lwe_dimension,

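A rough clear-value sketch of the block layout behind the OPRF entry points above, assuming each radix block carries `message_modulus.ilog2()` bits of randomness; the generator below is a deliberate stand-in, not the seeded CSPRNG and modulus-switched generation the library actually uses.

// Split pseudo-random bits across radix blocks: num_blocks blocks of
// `message_bits` bits each hold num_blocks * message_bits random bits.
fn split_random_bits(seed: u64, message_bits: u32, num_blocks: usize) -> Vec<u64> {
    let mut state = seed;
    (0..num_blocks)
        .map(|_| {
            // xorshift64 as a placeholder generator only
            state ^= state << 13;
            state ^= state >> 7;
            state ^= state << 17;
            state & ((1u64 << message_bits) - 1)
        })
        .collect()
}

fn main() {
    // 2-bit message blocks (message_modulus = 4), 4 blocks => 8 random bits
    let blocks = split_random_bits(0xdead_beef, 2, 4);
    assert_eq!(blocks.len(), 4);
    assert!(blocks.iter().all(|&b| b < 4));
}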
View File

@@ -3,10 +3,9 @@ use crate::core_crypto::prelude::LweBskGroupingFactor;
use crate::integer::gpu::ciphertext::{CudaIntegerRadixCiphertext, CudaUnsignedRadixCiphertext};
use crate::integer::gpu::server_key::CudaBootstrappingKey;
use crate::integer::gpu::{
get_full_propagate_assign_size_on_gpu, get_rotate_left_integer_radix_kb_size_on_gpu,
get_rotate_right_integer_radix_kb_size_on_gpu,
unchecked_rotate_left_integer_radix_kb_assign_async,
unchecked_rotate_right_integer_radix_kb_assign_async, CudaServerKey, PBSType,
cuda_backend_get_full_propagate_assign_size_on_gpu, cuda_backend_get_rotate_left_size_on_gpu,
cuda_backend_get_rotate_right_size_on_gpu, cuda_backend_unchecked_rotate_left_assign,
cuda_backend_unchecked_rotate_right_assign, CudaServerKey, PBSType,
};
impl CudaServerKey {
@@ -27,7 +26,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_rotate_right_integer_radix_kb_assign_async(
cuda_backend_unchecked_rotate_right_assign(
streams,
ct.as_mut(),
rotate.as_ref(),
@@ -55,7 +54,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_rotate_right_integer_radix_kb_assign_async(
cuda_backend_unchecked_rotate_right_assign(
streams,
ct.as_mut(),
rotate.as_ref(),
@@ -148,7 +147,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_rotate_left_integer_radix_kb_assign_async(
cuda_backend_unchecked_rotate_left_assign(
streams,
ct.as_mut(),
rotate.as_ref(),
@@ -176,7 +175,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_rotate_left_integer_radix_kb_assign_async(
cuda_backend_unchecked_rotate_left_assign(
streams,
ct.as_mut(),
rotate.as_ref(),
@@ -574,23 +573,25 @@ impl CudaServerKey {
ct_right.as_ref().d_blocks.lwe_ciphertext_count()
);
let full_prop_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::Classic(d_bsk) => {
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_full_propagate_assign_size_on_gpu(
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_multibit_bsk.input_lwe_dimension(),
d_multibit_bsk.glwe_dimension(),
@@ -620,7 +621,7 @@ impl CudaServerKey {
let lwe_ciphertext_count = ct_left.as_ref().d_blocks.lwe_ciphertext_count();
let rotate_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_rotate_left_integer_radix_kb_size_on_gpu(
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_rotate_left_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
@@ -643,7 +644,7 @@ impl CudaServerKey {
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_rotate_left_integer_radix_kb_size_on_gpu(
cuda_backend_get_rotate_left_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
@@ -685,23 +686,25 @@ impl CudaServerKey {
ct_right.as_ref().d_blocks.lwe_ciphertext_count()
);
let full_prop_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::Classic(d_bsk) => {
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_full_propagate_assign_size_on_gpu(
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_multibit_bsk.input_lwe_dimension(),
d_multibit_bsk.glwe_dimension(),
@@ -731,7 +734,7 @@ impl CudaServerKey {
let lwe_ciphertext_count = ct_left.as_ref().d_blocks.lwe_ciphertext_count();
let rotate_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_rotate_right_integer_radix_kb_size_on_gpu(
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_rotate_right_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
@@ -754,7 +757,7 @@ impl CudaServerKey {
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_rotate_right_integer_radix_kb_size_on_gpu(
cuda_backend_get_rotate_right_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,

View File

@@ -8,8 +8,9 @@ use crate::integer::gpu::ciphertext::{
};
use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey};
use crate::integer::gpu::{
get_full_propagate_assign_size_on_gpu, get_propagate_single_carry_assign_async_size_on_gpu,
scalar_addition_integer_radix_assign_async, PBSType,
cuda_backend_get_full_propagate_assign_size_on_gpu,
cuda_backend_get_propagate_single_carry_assign_size_on_gpu,
cuda_backend_scalar_addition_assign, PBSType,
};
use crate::integer::server_key::radix_parallel::OutputFlag;
use crate::prelude::CastInto;
@@ -97,7 +98,7 @@ impl CudaServerKey {
// If the scalar is decomposed using less than the number of blocks our ciphertext
// has, we just don't touch ciphertext's last blocks
scalar_addition_integer_radix_assign_async(
cuda_backend_scalar_addition_assign(
streams,
ct.as_mut(),
&d_decomposed_scalar,
@@ -208,23 +209,25 @@ impl CudaServerKey {
0
} else {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::Classic(d_bsk) => {
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_full_propagate_assign_size_on_gpu(
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_multibit_bsk.input_lwe_dimension(),
d_multibit_bsk.glwe_dimension(),
@@ -246,7 +249,7 @@ impl CudaServerKey {
let num_blocks = ct.as_ref().d_blocks.lwe_ciphertext_count().0 as u32;
let single_carry_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
get_propagate_single_carry_assign_async_size_on_gpu(
cuda_backend_get_propagate_single_carry_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
@@ -265,7 +268,7 @@ impl CudaServerKey {
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_propagate_single_carry_assign_async_size_on_gpu(
cuda_backend_get_propagate_single_carry_assign_size_on_gpu(
streams,
d_multibit_bsk.input_lwe_dimension(),
d_multibit_bsk.glwe_dimension(),

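The comment above about leaving the ciphertext's last blocks untouched is easiest to see on clear values; a sketch, with illustrative names, of the little-endian base-`message_modulus` decomposition involved.

// Decompose a scalar into base-message_modulus digits, least significant
// first. If this yields fewer digits than the ciphertext has blocks, the
// remaining blocks would receive an implicit 0, so they are left untouched.
fn decompose_scalar(mut scalar: u64, message_modulus: u64, num_blocks: usize) -> Vec<u64> {
    let mut digits = Vec::with_capacity(num_blocks);
    while scalar != 0 && digits.len() < num_blocks {
        digits.push(scalar % message_modulus);
        scalar /= message_modulus;
    }
    digits
}

fn main() {
    // 13 = 0b1101 with 2-bit blocks (message_modulus = 4): digits [1, 3],
    // so only the first two of, say, four blocks are modified.
    assert_eq!(decompose_scalar(13, 4, 4), vec![1, 3]);
}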
View File

@@ -5,8 +5,8 @@ use crate::integer::block_decomposition::{BlockDecomposer, DecomposableInto};
use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext;
use crate::integer::gpu::server_key::CudaBootstrappingKey;
use crate::integer::gpu::{
get_full_propagate_assign_size_on_gpu, get_scalar_bitop_integer_radix_kb_size_on_gpu,
unchecked_scalar_bitop_integer_radix_kb_assign_async, BitOpType, CudaServerKey, PBSType,
cuda_backend_get_full_propagate_assign_size_on_gpu, cuda_backend_get_scalar_bitop_size_on_gpu,
cuda_backend_unchecked_scalar_bitop_assign, BitOpType, CudaServerKey, PBSType,
};
impl CudaServerKey {
@@ -36,7 +36,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_scalar_bitop_integer_radix_kb_assign_async(
cuda_backend_unchecked_scalar_bitop_assign(
streams,
ct.as_mut(),
&clear_blocks,
@@ -65,7 +65,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_scalar_bitop_integer_radix_kb_assign_async(
cuda_backend_unchecked_scalar_bitop_assign(
streams,
ct.as_mut(),
&clear_blocks,
@@ -315,23 +315,25 @@ impl CudaServerKey {
0
} else {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::Classic(d_bsk) => {
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_full_propagate_assign_size_on_gpu(
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_multibit_bsk.input_lwe_dimension(),
d_multibit_bsk.glwe_dimension(),
@@ -352,7 +354,7 @@ impl CudaServerKey {
let clear_blocks_mem = (lwe_ciphertext_count.0 * size_of::<u64>()) as u64;
let scalar_bitop_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_scalar_bitop_integer_radix_kb_size_on_gpu(
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_scalar_bitop_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
@@ -375,7 +377,7 @@ impl CudaServerKey {
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_scalar_bitop_integer_radix_kb_size_on_gpu(
cuda_backend_get_scalar_bitop_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,

View File

@@ -8,9 +8,9 @@ use crate::integer::gpu::ciphertext::info::CudaRadixCiphertextInfo;
use crate::integer::gpu::ciphertext::{CudaIntegerRadixCiphertext, CudaRadixCiphertext};
use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey};
use crate::integer::gpu::{
unchecked_are_all_comparisons_block_true_integer_radix_kb_async,
unchecked_is_at_least_one_comparisons_block_true_integer_radix_kb_async,
unchecked_scalar_comparison_integer_radix_kb_async, ComparisonType, PBSType,
cuda_backend_unchecked_are_all_comparisons_block_true,
cuda_backend_unchecked_is_at_least_one_comparisons_block_true,
cuda_backend_unchecked_scalar_comparison, ComparisonType, PBSType,
};
use crate::shortint::ciphertext::Degree;
@@ -124,7 +124,7 @@ impl CudaServerKey {
ComparisonType::GT | ComparisonType::GE | ComparisonType::NE => 1,
_ => 0,
};
let ct_res: T = self.create_trivial_radix(value, 1, streams);
let ct_res: T = self.create_trivial_radix_async(value, 1, streams);
return CudaBooleanBlock::from_cuda_radix_ciphertext(ct_res.into_inner());
}
@@ -146,7 +146,7 @@ impl CudaServerKey {
ComparisonType::LT | ComparisonType::LE | ComparisonType::NE => 1,
_ => 0,
};
let ct_res: T = self.create_trivial_radix(value, 1, streams);
let ct_res: T = self.create_trivial_radix_async(value, 1, streams);
return CudaBooleanBlock::from_cuda_radix_ciphertext(ct_res.into_inner());
}
@@ -173,7 +173,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_scalar_comparison_integer_radix_kb_async(
cuda_backend_unchecked_scalar_comparison(
streams,
result.as_mut().as_mut(),
ct.as_ref(),
@@ -204,7 +204,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_scalar_comparison_integer_radix_kb_async(
cuda_backend_unchecked_scalar_comparison(
streams,
result.as_mut().as_mut(),
ct.as_ref(),
@@ -261,9 +261,9 @@ impl CudaServerKey {
// Scalar is greater than the bounds, so ciphertext is smaller
let result: T = match op {
ComparisonType::LT | ComparisonType::LE => {
self.create_trivial_radix(1, num_blocks, streams)
self.create_trivial_radix_async(1, num_blocks, streams)
}
_ => self.create_trivial_radix(
_ => self.create_trivial_radix_async(
0,
ct.as_ref().d_blocks.lwe_ciphertext_count().0,
streams,
@@ -275,9 +275,9 @@ impl CudaServerKey {
// Scalar is smaller than the bounds, so ciphertext is bigger
let result: T = match op {
ComparisonType::GT | ComparisonType::GE => {
self.create_trivial_radix(1, num_blocks, streams)
self.create_trivial_radix_async(1, num_blocks, streams)
}
_ => self.create_trivial_radix(
_ => self.create_trivial_radix_async(
0,
ct.as_ref().d_blocks.lwe_ciphertext_count().0,
streams,
@@ -296,7 +296,8 @@ impl CudaServerKey {
ct, scalar, op, true, streams,
)
} else {
let scalar_as_trivial = self.create_trivial_radix(scalar, num_blocks, streams);
let scalar_as_trivial =
self.create_trivial_radix_async(scalar, num_blocks, streams);
self.unchecked_comparison_async(ct, &scalar_as_trivial, op, streams)
}
} else {
@@ -334,7 +335,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_scalar_comparison_integer_radix_kb_async(
cuda_backend_unchecked_scalar_comparison(
streams,
result.as_mut(),
ct.as_ref(),
@@ -365,7 +366,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_scalar_comparison_integer_radix_kb_async(
cuda_backend_unchecked_scalar_comparison(
streams,
result.as_mut(),
ct.as_ref(),
@@ -412,7 +413,7 @@ impl CudaServerKey {
unsafe {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_are_all_comparisons_block_true_integer_radix_kb_async(
cuda_backend_unchecked_are_all_comparisons_block_true(
streams,
boolean_res.as_mut().as_mut(),
ct.as_ref(),
@@ -438,7 +439,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_are_all_comparisons_block_true_integer_radix_kb_async(
cuda_backend_unchecked_are_all_comparisons_block_true(
streams,
boolean_res.as_mut().as_mut(),
ct.as_ref(),
@@ -482,7 +483,7 @@ impl CudaServerKey {
unsafe {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_is_at_least_one_comparisons_block_true_integer_radix_kb_async(
cuda_backend_unchecked_is_at_least_one_comparisons_block_true(
streams,
boolean_res.as_mut().as_mut(),
ct.as_ref(),
@@ -508,7 +509,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_is_at_least_one_comparisons_block_true_integer_radix_kb_async(
cuda_backend_unchecked_is_at_least_one_comparisons_block_true(
streams,
boolean_res.as_mut().as_mut(),
ct.as_ref(),

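The early exits in this file (scalar outside the ciphertext's representable range) follow a small truth table; a clear-value sketch below, with an `Op` enum mirroring the `ComparisonType` variants matched above.

#[derive(Clone, Copy)]
enum Op { Gt, Ge, Lt, Le, Ne, Eq }

// Scalar above the representable maximum: the ciphertext is necessarily
// smaller, so only <, <= and != hold.
fn result_when_scalar_above_max(op: Op) -> u64 {
    match op {
        Op::Lt | Op::Le | Op::Ne => 1,
        _ => 0,
    }
}

// Scalar below the representable minimum: the ciphertext is necessarily
// bigger, so only >, >= and != hold.
fn result_when_scalar_below_min(op: Op) -> u64 {
    match op {
        Op::Gt | Op::Ge | Op::Ne => 1,
        _ => 0,
    }
}

fn main() {
    assert_eq!(result_when_scalar_above_max(Op::Lt), 1); // ct < huge scalar
    assert_eq!(result_when_scalar_below_min(Op::Ge), 1); // ct >= tiny scalar
    assert_eq!(result_when_scalar_above_max(Op::Eq), 0);
}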
View File

@@ -6,14 +6,14 @@ use crate::integer::gpu::ciphertext::{
};
use crate::integer::gpu::server_key::CudaBootstrappingKey;
use crate::integer::gpu::{
get_full_propagate_assign_size_on_gpu, get_scalar_div_integer_radix_kb_size_on_gpu,
get_scalar_div_rem_integer_radix_kb_size_on_gpu,
get_signed_scalar_div_integer_radix_kb_size_on_gpu,
get_signed_scalar_div_rem_integer_radix_kb_size_on_gpu,
unchecked_signed_scalar_div_integer_radix_kb_assign_async,
unchecked_signed_scalar_div_rem_integer_radix_kb_assign_async,
unchecked_unsigned_scalar_div_integer_radix_kb_assign_async,
unchecked_unsigned_scalar_div_rem_integer_radix_kb_assign_async, CudaServerKey, PBSType,
cuda_backend_get_full_propagate_assign_size_on_gpu,
cuda_backend_get_scalar_div_rem_size_on_gpu, cuda_backend_get_scalar_div_size_on_gpu,
cuda_backend_get_signed_scalar_div_rem_size_on_gpu,
cuda_backend_get_signed_scalar_div_size_on_gpu,
cuda_backend_unchecked_signed_scalar_div_assign,
cuda_backend_unchecked_signed_scalar_div_rem_assign,
cuda_backend_unchecked_unsigned_scalar_div_assign,
cuda_backend_unchecked_unsigned_scalar_div_rem, CudaServerKey, PBSType,
};
use crate::integer::server_key::radix_parallel::scalar_div_mod::SignedReciprocable;
use crate::integer::server_key::radix_parallel::OutputFlag;
@@ -106,7 +106,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_unsigned_scalar_div_integer_radix_kb_assign_async(
cuda_backend_unchecked_unsigned_scalar_div_assign(
streams,
quotient.as_mut(),
divisor,
@@ -127,7 +127,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_unsigned_scalar_div_integer_radix_kb_assign_async(
cuda_backend_unchecked_unsigned_scalar_div_assign(
streams,
quotient.as_mut(),
divisor,
@@ -281,7 +281,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_unsigned_scalar_div_rem_integer_radix_kb_assign_async(
cuda_backend_unchecked_unsigned_scalar_div_rem(
streams,
quotient.as_mut(),
remainder.as_mut(),
@@ -303,7 +303,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_unsigned_scalar_div_rem_integer_radix_kb_assign_async(
cuda_backend_unchecked_unsigned_scalar_div_rem(
streams,
quotient.as_mut(),
remainder.as_mut(),
@@ -549,11 +549,11 @@ impl CudaServerKey {
>= to the number of bits encrypted in the ciphertext"
);
let mut quotient: CudaSignedRadixCiphertext = numerator.duplicate_async(streams);
let mut quotient: CudaSignedRadixCiphertext = numerator.duplicate(streams);
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_signed_scalar_div_integer_radix_kb_assign_async(
cuda_backend_unchecked_signed_scalar_div_assign(
streams,
quotient.as_mut(),
divisor,
@@ -574,7 +574,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_signed_scalar_div_integer_radix_kb_assign_async(
cuda_backend_unchecked_signed_scalar_div_assign(
streams,
quotient.as_mut(),
divisor,
@@ -729,7 +729,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_signed_scalar_div_rem_integer_radix_kb_assign_async(
cuda_backend_unchecked_signed_scalar_div_rem_assign(
streams,
quotient.as_mut(),
remainder.as_mut(),
@@ -751,7 +751,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_signed_scalar_div_rem_integer_radix_kb_assign_async(
cuda_backend_unchecked_signed_scalar_div_rem_assign(
streams,
quotient.as_mut(),
remainder.as_mut(),
@@ -885,8 +885,7 @@ impl CudaServerKey {
Scalar: SignedReciprocable + ScalarMultiplier + DecomposableInto<u8> + CastInto<u64>,
<<Scalar as SignedReciprocable>::Unsigned as Reciprocable>::DoublePrecision: Send,
{
let (_, remainder) =
self.unchecked_signed_scalar_div_rem_async(numerator, divisor, streams);
let (_, remainder) = self.unchecked_signed_scalar_div_rem(numerator, divisor, streams);
remainder
}
@@ -992,23 +991,25 @@ encrypted bits: {numerator_bits}, scalar bits: {}
0
} else {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::Classic(d_bsk) => {
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_full_propagate_assign_size_on_gpu(
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_multibit_bsk.input_lwe_dimension(),
d_multibit_bsk.glwe_dimension(),
@@ -1028,7 +1029,7 @@ encrypted bits: {numerator_bits}, scalar bits: {}
};
let scalar_div_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_scalar_div_integer_radix_kb_size_on_gpu(
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_scalar_div_size_on_gpu(
streams,
divisor,
self.message_modulus,
@@ -1046,7 +1047,7 @@ encrypted bits: {numerator_bits}, scalar bits: {}
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_scalar_div_integer_radix_kb_size_on_gpu(
cuda_backend_get_scalar_div_size_on_gpu(
streams,
divisor,
self.message_modulus,
@@ -1092,46 +1093,42 @@ encrypted bits: {numerator_bits}, scalar bits: {}
Scalar::BITS
);
unsafe {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
get_scalar_div_rem_integer_radix_kb_size_on_gpu(
streams,
divisor,
self.message_modulus,
self.carry_modulus,
d_bsk.glwe_dimension,
d_bsk.polynomial_size,
d_bsk.input_lwe_dimension,
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count,
d_bsk.decomp_base_log,
LweBskGroupingFactor(0),
num_blocks,
PBSType::Classical,
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_scalar_div_rem_integer_radix_kb_size_on_gpu(
streams,
divisor,
self.message_modulus,
self.carry_modulus,
d_multibit_bsk.glwe_dimension,
d_multibit_bsk.polynomial_size,
d_multibit_bsk.input_lwe_dimension,
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_multibit_bsk.decomp_level_count,
d_multibit_bsk.decomp_base_log,
d_multibit_bsk.grouping_factor,
num_blocks,
PBSType::MultiBit,
None,
)
}
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_scalar_div_rem_size_on_gpu(
streams,
divisor,
self.message_modulus,
self.carry_modulus,
d_bsk.glwe_dimension,
d_bsk.polynomial_size,
d_bsk.input_lwe_dimension,
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count,
d_bsk.decomp_base_log,
LweBskGroupingFactor(0),
num_blocks,
PBSType::Classical,
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
cuda_backend_get_scalar_div_rem_size_on_gpu(
streams,
divisor,
self.message_modulus,
self.carry_modulus,
d_multibit_bsk.glwe_dimension,
d_multibit_bsk.polynomial_size,
d_multibit_bsk.input_lwe_dimension,
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_multibit_bsk.decomp_level_count,
d_multibit_bsk.decomp_base_log,
d_multibit_bsk.grouping_factor,
num_blocks,
PBSType::MultiBit,
None,
)
}
}
}
@@ -1174,27 +1171,25 @@ encrypted bits: {numerator_bits}, scalar bits: {}
);
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
get_signed_scalar_div_integer_radix_kb_size_on_gpu(
streams,
divisor,
self.message_modulus,
self.carry_modulus,
d_bsk.glwe_dimension,
d_bsk.polynomial_size,
d_bsk.input_lwe_dimension,
d_bsk.decomp_base_log,
d_bsk.decomp_level_count,
self.key_switching_key.decomposition_base_log(),
self.key_switching_key.decomposition_level_count(),
LweBskGroupingFactor(0),
num_blocks,
PBSType::Classical,
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_signed_scalar_div_size_on_gpu(
streams,
divisor,
self.message_modulus,
self.carry_modulus,
d_bsk.glwe_dimension,
d_bsk.polynomial_size,
d_bsk.input_lwe_dimension,
d_bsk.decomp_base_log,
d_bsk.decomp_level_count,
self.key_switching_key.decomposition_base_log(),
self.key_switching_key.decomposition_level_count(),
LweBskGroupingFactor(0),
num_blocks,
PBSType::Classical,
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_signed_scalar_div_integer_radix_kb_size_on_gpu(
cuda_backend_get_signed_scalar_div_size_on_gpu(
streams,
divisor,
self.message_modulus,
@@ -1236,46 +1231,44 @@ encrypted bits: {numerator_bits}, scalar bits: {}
>= to the number of bits encrypted in the ciphertext"
);
unsafe {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
get_signed_scalar_div_rem_integer_radix_kb_size_on_gpu(
streams,
divisor,
self.message_modulus,
self.carry_modulus,
d_bsk.glwe_dimension,
d_bsk.polynomial_size,
d_bsk.input_lwe_dimension,
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count,
d_bsk.decomp_base_log,
LweBskGroupingFactor(0),
num_blocks,
PBSType::Classical,
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_signed_scalar_div_rem_integer_radix_kb_size_on_gpu(
streams,
divisor,
self.message_modulus,
self.carry_modulus,
d_multibit_bsk.glwe_dimension,
d_multibit_bsk.polynomial_size,
d_multibit_bsk.input_lwe_dimension,
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_multibit_bsk.decomp_level_count,
d_multibit_bsk.decomp_base_log,
d_multibit_bsk.grouping_factor,
num_blocks,
PBSType::MultiBit,
None,
)
}
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
cuda_backend_get_signed_scalar_div_rem_size_on_gpu(
streams,
divisor,
self.message_modulus,
self.carry_modulus,
d_bsk.glwe_dimension,
d_bsk.polynomial_size,
d_bsk.input_lwe_dimension,
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count,
d_bsk.decomp_base_log,
LweBskGroupingFactor(0),
num_blocks,
PBSType::Classical,
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
cuda_backend_get_signed_scalar_div_rem_size_on_gpu(
streams,
divisor,
self.message_modulus,
self.carry_modulus,
d_multibit_bsk.glwe_dimension,
d_multibit_bsk.polynomial_size,
d_multibit_bsk.input_lwe_dimension,
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_multibit_bsk.decomp_level_count,
d_multibit_bsk.decomp_base_log,
d_multibit_bsk.grouping_factor,
num_blocks,
PBSType::MultiBit,
None,
)
}
}
}
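
A toy sketch of the constant-division trick the scalar division entry points above build on; the library derives its parameters through the `Reciprocable` machinery and its `DoublePrecision` type, whereas the constant below is a simplified choice that is exact for u32 numerators.

// Replace n / d by a double-width multiplication with an approximate
// reciprocal m = floor(2^64 / d) + 1; for 1 < d < 2^32 the result is exact
// for every 32-bit numerator, since the error term e*n stays below 2^64.
fn div_by_const_via_reciprocal(numerator: u32, divisor: u32) -> u32 {
    assert!(divisor > 1);
    let m = (u64::MAX / divisor as u64) + 1;
    ((m as u128 * numerator as u128) >> 64) as u32
}

fn main() {
    for n in [0u32, 1, 6, 7, 8, 1_000_000, u32::MAX] {
        assert_eq!(div_by_const_via_reciprocal(n, 7), n / 7);
    }
}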

View File

@@ -4,8 +4,8 @@ use crate::integer::block_decomposition::{BlockDecomposer, DecomposableInto};
use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext;
use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey};
use crate::integer::gpu::{
get_full_propagate_assign_size_on_gpu, get_scalar_mul_integer_radix_kb_size_on_gpu,
unchecked_scalar_mul_integer_radix_kb_async, PBSType,
cuda_backend_get_full_propagate_assign_size_on_gpu, cuda_backend_get_scalar_mul_size_on_gpu,
cuda_backend_unchecked_scalar_mul, PBSType,
};
use crate::integer::server_key::ScalarMultiplier;
use crate::prelude::CastInto;
@@ -114,7 +114,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_scalar_mul_integer_radix_kb_async(
cuda_backend_unchecked_scalar_mul(
streams,
ct.as_mut(),
decomposed_scalar.as_slice(),
@@ -139,7 +139,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_scalar_mul_integer_radix_kb_async(
cuda_backend_unchecked_scalar_mul(
streams,
ct.as_mut(),
decomposed_scalar.as_slice(),
@@ -286,23 +286,25 @@ impl CudaServerKey {
0
} else {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::Classic(d_bsk) => {
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_full_propagate_assign_size_on_gpu(
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_multibit_bsk.input_lwe_dimension(),
d_multibit_bsk.glwe_dimension(),
@@ -328,7 +330,7 @@ impl CudaServerKey {
return 0;
}
let scalar_mul_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_scalar_mul_integer_radix_kb_size_on_gpu(
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_scalar_mul_size_on_gpu(
streams,
decomposed_scalar.as_slice(),
self.message_modulus,
@@ -348,7 +350,7 @@ impl CudaServerKey {
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_scalar_mul_integer_radix_kb_size_on_gpu(
cuda_backend_get_scalar_mul_size_on_gpu(
streams,
decomposed_scalar.as_slice(),
self.message_modulus,

View File

@@ -3,10 +3,11 @@ use crate::core_crypto::prelude::{CastFrom, LweBskGroupingFactor};
use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext;
use crate::integer::gpu::server_key::CudaBootstrappingKey;
use crate::integer::gpu::{
get_full_propagate_assign_size_on_gpu, get_scalar_rotate_left_integer_radix_kb_size_on_gpu,
get_scalar_rotate_right_integer_radix_kb_size_on_gpu,
unchecked_scalar_rotate_left_integer_radix_kb_assign_async,
unchecked_scalar_rotate_right_integer_radix_kb_assign_async, CudaServerKey, PBSType,
cuda_backend_get_full_propagate_assign_size_on_gpu,
cuda_backend_get_scalar_rotate_left_size_on_gpu,
cuda_backend_unchecked_scalar_rotate_left_assign,
cuda_backend_unchecked_scalar_rotate_right_assign, get_scalar_rotate_right_size_on_gpu,
CudaServerKey, PBSType,
};
impl CudaServerKey {
@@ -47,7 +48,7 @@ impl CudaServerKey {
let lwe_ciphertext_count = ct.as_ref().d_blocks.lwe_ciphertext_count();
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_scalar_rotate_left_integer_radix_kb_assign_async(
cuda_backend_unchecked_scalar_rotate_left_assign(
stream,
ct.as_mut(),
u32::cast_from(n),
@@ -74,7 +75,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_scalar_rotate_left_integer_radix_kb_assign_async(
cuda_backend_unchecked_scalar_rotate_left_assign(
stream,
ct.as_mut(),
u32::cast_from(n),
@@ -156,7 +157,7 @@ impl CudaServerKey {
let lwe_ciphertext_count = ct.as_ref().d_blocks.lwe_ciphertext_count();
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_scalar_rotate_right_integer_radix_kb_assign_async(
cuda_backend_unchecked_scalar_rotate_right_assign(
stream,
ct.as_mut(),
u32::cast_from(n),
@@ -183,7 +184,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_scalar_rotate_right_integer_radix_kb_assign_async(
cuda_backend_unchecked_scalar_rotate_right_assign(
stream,
ct.as_mut(),
u32::cast_from(n),
@@ -287,23 +288,25 @@ impl CudaServerKey {
0
} else {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::Classic(d_bsk) => {
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_full_propagate_assign_size_on_gpu(
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_multibit_bsk.input_lwe_dimension(),
d_multibit_bsk.glwe_dimension(),
@@ -323,7 +326,7 @@ impl CudaServerKey {
};
let scalar_shift_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
get_scalar_rotate_left_integer_radix_kb_size_on_gpu(
cuda_backend_get_scalar_rotate_left_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
@@ -346,7 +349,7 @@ impl CudaServerKey {
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_scalar_rotate_left_integer_radix_kb_size_on_gpu(
cuda_backend_get_scalar_rotate_left_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
@@ -382,23 +385,25 @@ impl CudaServerKey {
0
} else {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::Classic(d_bsk) => {
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_full_propagate_assign_size_on_gpu(
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_multibit_bsk.input_lwe_dimension(),
d_multibit_bsk.glwe_dimension(),
@@ -417,52 +422,48 @@ impl CudaServerKey {
}
};
let scalar_shift_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
get_scalar_rotate_right_integer_radix_kb_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
d_bsk.glwe_dimension,
d_bsk.polynomial_size,
self.key_switching_key
.input_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key
.output_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count,
d_bsk.decomp_base_log,
lwe_ciphertext_count.0 as u32,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_scalar_rotate_right_integer_radix_kb_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
d_multibit_bsk.glwe_dimension,
d_multibit_bsk.polynomial_size,
self.key_switching_key
.input_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key
.output_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_multibit_bsk.decomp_level_count,
d_multibit_bsk.decomp_base_log,
lwe_ciphertext_count.0 as u32,
PBSType::MultiBit,
d_multibit_bsk.grouping_factor,
None,
)
}
CudaBootstrappingKey::Classic(d_bsk) => get_scalar_rotate_right_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
d_bsk.glwe_dimension,
d_bsk.polynomial_size,
self.key_switching_key
.input_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key
.output_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count,
d_bsk.decomp_base_log,
lwe_ciphertext_count.0 as u32,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => get_scalar_rotate_right_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
d_multibit_bsk.glwe_dimension,
d_multibit_bsk.polynomial_size,
self.key_switching_key
.input_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key
.output_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_multibit_bsk.decomp_level_count,
d_multibit_bsk.decomp_base_log,
lwe_ciphertext_count.0 as u32,
PBSType::MultiBit,
d_multibit_bsk.grouping_factor,
None,
),
};
full_prop_mem.max(scalar_shift_mem)
}

View File

@@ -3,13 +3,13 @@ use crate::core_crypto::prelude::{CastFrom, LweBskGroupingFactor};
use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext;
use crate::integer::gpu::server_key::CudaBootstrappingKey;
use crate::integer::gpu::{
get_full_propagate_assign_size_on_gpu,
get_scalar_arithmetic_right_shift_integer_radix_kb_size_on_gpu,
get_scalar_left_shift_integer_radix_kb_size_on_gpu,
get_scalar_logical_right_shift_integer_radix_kb_size_on_gpu,
unchecked_scalar_arithmetic_right_shift_integer_radix_kb_assign_async,
unchecked_scalar_left_shift_integer_radix_kb_assign_async,
unchecked_scalar_logical_right_shift_integer_radix_kb_assign_async, CudaServerKey, PBSType,
cuda_backend_get_full_propagate_assign_size_on_gpu,
cuda_backend_get_scalar_arithmetic_right_shift_size_on_gpu,
cuda_backend_get_scalar_left_shift_size_on_gpu,
cuda_backend_get_scalar_logical_right_shift_size_on_gpu,
cuda_backend_unchecked_scalar_arithmetic_right_shift_assign,
cuda_backend_unchecked_scalar_left_shift_assign,
cuda_backend_unchecked_scalar_logical_right_shift_assign, CudaServerKey, PBSType,
};
impl CudaServerKey {
@@ -51,7 +51,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_scalar_left_shift_integer_radix_kb_assign_async(
cuda_backend_unchecked_scalar_left_shift_assign(
streams,
ct.as_mut(),
u32::cast_from(shift),
@@ -78,7 +78,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_scalar_left_shift_integer_radix_kb_assign_async(
cuda_backend_unchecked_scalar_left_shift_assign(
streams,
ct.as_mut(),
u32::cast_from(shift),
@@ -198,7 +198,7 @@ impl CudaServerKey {
if T::IS_SIGNED {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_scalar_arithmetic_right_shift_integer_radix_kb_assign_async(
cuda_backend_unchecked_scalar_arithmetic_right_shift_assign(
streams,
ct.as_mut(),
u32::cast_from(shift),
@@ -224,7 +224,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_scalar_arithmetic_right_shift_integer_radix_kb_assign_async(
cuda_backend_unchecked_scalar_arithmetic_right_shift_assign(
streams,
ct.as_mut(),
u32::cast_from(shift),
@@ -253,7 +253,7 @@ impl CudaServerKey {
} else {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_scalar_logical_right_shift_integer_radix_kb_assign_async(
cuda_backend_unchecked_scalar_logical_right_shift_assign(
streams,
ct.as_mut(),
u32::cast_from(shift),
@@ -280,7 +280,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_scalar_logical_right_shift_integer_radix_kb_assign_async(
cuda_backend_unchecked_scalar_logical_right_shift_assign(
streams,
ct.as_mut(),
u32::cast_from(shift),
@@ -596,7 +596,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_scalar_logical_right_shift_integer_radix_kb_assign_async(
cuda_backend_unchecked_scalar_logical_right_shift_assign(
streams,
ct.as_mut(),
u32::cast_from(shift),
@@ -623,7 +623,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_scalar_logical_right_shift_integer_radix_kb_assign_async(
cuda_backend_unchecked_scalar_logical_right_shift_assign(
streams,
ct.as_mut(),
u32::cast_from(shift),
@@ -662,23 +662,25 @@ impl CudaServerKey {
0
} else {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::Classic(d_bsk) => {
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_full_propagate_assign_size_on_gpu(
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_multibit_bsk.input_lwe_dimension(),
d_multibit_bsk.glwe_dimension(),
@@ -697,31 +699,29 @@ impl CudaServerKey {
}
};
let scalar_shift_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
get_scalar_left_shift_integer_radix_kb_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
d_bsk.glwe_dimension,
d_bsk.polynomial_size,
self.key_switching_key
.input_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key
.output_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count,
d_bsk.decomp_base_log,
lwe_ciphertext_count.0 as u32,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_scalar_left_shift_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
d_bsk.glwe_dimension,
d_bsk.polynomial_size,
self.key_switching_key
.input_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key
.output_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count,
d_bsk.decomp_base_log,
lwe_ciphertext_count.0 as u32,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_scalar_left_shift_integer_radix_kb_size_on_gpu(
cuda_backend_get_scalar_left_shift_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
@@ -757,23 +757,25 @@ impl CudaServerKey {
0
} else {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::Classic(d_bsk) => {
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_full_propagate_assign_size_on_gpu(
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_multibit_bsk.input_lwe_dimension(),
d_multibit_bsk.glwe_dimension(),
@@ -794,7 +796,7 @@ impl CudaServerKey {
let scalar_shift_mem = if T::IS_SIGNED {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
get_scalar_arithmetic_right_shift_integer_radix_kb_size_on_gpu(
cuda_backend_get_scalar_arithmetic_right_shift_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
@@ -817,7 +819,7 @@ impl CudaServerKey {
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_scalar_arithmetic_right_shift_integer_radix_kb_size_on_gpu(
cuda_backend_get_scalar_arithmetic_right_shift_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
@@ -843,7 +845,7 @@ impl CudaServerKey {
} else {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
get_scalar_logical_right_shift_integer_radix_kb_size_on_gpu(
cuda_backend_get_scalar_logical_right_shift_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
@@ -866,7 +868,7 @@ impl CudaServerKey {
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_scalar_logical_right_shift_integer_radix_kb_size_on_gpu(
cuda_backend_get_scalar_logical_right_shift_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,

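Note on the hunks above: every size query follows one dispatch shape. A match on the bootstrapping key passes PBSType::Classical together with LweBskGroupingFactor(0) for classic keys, and PBSType::MultiBit together with the key's own grouping factor for multi-bit keys. The sketch below mirrors that shape with simplified stand-in types; it is not the tfhe-rs API, only an illustration of the pattern the renaming preserves.

    // Simplified, hypothetical stand-ins for the real CUDA key types.
    enum PbsType {
        Classical,
        MultiBit,
    }

    struct ClassicBsk;
    struct MultiBitBsk {
        grouping_factor: u32,
    }

    enum BootstrappingKey {
        Classic(ClassicBsk),
        MultiBit(MultiBitBsk),
    }

    // Hypothetical stand-in for the cuda_backend_get_*_size_on_gpu family.
    fn backend_size_query(pbs_type: PbsType, grouping_factor: u32) -> u64 {
        match pbs_type {
            PbsType::Classical => 1 << 20,
            // Multi-bit PBS scratch grows with the grouping factor here.
            PbsType::MultiBit => u64::from(grouping_factor) * (1 << 20),
        }
    }

    fn size_on_gpu(key: &BootstrappingKey) -> u64 {
        match key {
            // Classic keys always report a grouping factor of 0.
            BootstrappingKey::Classic(_) => backend_size_query(PbsType::Classical, 0),
            // Multi-bit keys forward their own grouping factor.
            BootstrappingKey::MultiBit(k) => {
                backend_size_query(PbsType::MultiBit, k.grouping_factor)
            }
        }
    }

    fn main() {
        let key = BootstrappingKey::MultiBit(MultiBitBsk { grouping_factor: 3 });
        println!("scratch bytes: {}", size_on_gpu(&key));
    }
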
View File

@@ -3,10 +3,9 @@ use crate::core_crypto::prelude::LweBskGroupingFactor;
use crate::integer::gpu::ciphertext::{CudaIntegerRadixCiphertext, CudaUnsignedRadixCiphertext};
use crate::integer::gpu::server_key::CudaBootstrappingKey;
use crate::integer::gpu::{
get_full_propagate_assign_size_on_gpu, get_left_shift_integer_radix_kb_size_on_gpu,
get_right_shift_integer_radix_kb_size_on_gpu,
unchecked_left_shift_integer_radix_kb_assign_async,
unchecked_right_shift_integer_radix_kb_assign_async, CudaServerKey, PBSType,
cuda_backend_get_full_propagate_assign_size_on_gpu, cuda_backend_get_left_shift_size_on_gpu,
cuda_backend_get_right_shift_size_on_gpu, cuda_backend_unchecked_left_shift_assign,
cuda_backend_unchecked_right_shift_assign, CudaServerKey, PBSType,
};
impl CudaServerKey {
@@ -27,7 +26,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_right_shift_integer_radix_kb_assign_async(
cuda_backend_unchecked_right_shift_assign(
streams,
ct.as_mut(),
shift.as_ref(),
@@ -55,7 +54,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_right_shift_integer_radix_kb_assign_async(
cuda_backend_unchecked_right_shift_assign(
streams,
ct.as_mut(),
shift.as_ref(),
@@ -146,7 +145,7 @@ impl CudaServerKey {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_left_shift_integer_radix_kb_assign_async(
cuda_backend_unchecked_left_shift_assign(
streams,
ct.as_mut(),
shift.as_ref(),
@@ -174,7 +173,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_left_shift_integer_radix_kb_assign_async(
cuda_backend_unchecked_left_shift_assign(
streams,
ct.as_mut(),
shift.as_ref(),
@@ -569,23 +568,25 @@ impl CudaServerKey {
ct_right.as_ref().d_blocks.lwe_ciphertext_count()
);
let full_prop_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::Classic(d_bsk) => {
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_full_propagate_assign_size_on_gpu(
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_multibit_bsk.input_lwe_dimension(),
d_multibit_bsk.glwe_dimension(),
@@ -615,7 +616,7 @@ impl CudaServerKey {
let lwe_ciphertext_count = ct_left.as_ref().d_blocks.lwe_ciphertext_count();
let shift_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_left_shift_integer_radix_kb_size_on_gpu(
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_left_shift_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
@@ -638,7 +639,7 @@ impl CudaServerKey {
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_left_shift_integer_radix_kb_size_on_gpu(
cuda_backend_get_left_shift_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
@@ -680,23 +681,25 @@ impl CudaServerKey {
ct_right.as_ref().d_blocks.lwe_ciphertext_count()
);
let full_prop_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::Classic(d_bsk) => {
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_bsk.input_lwe_dimension(),
d_bsk.glwe_dimension(),
d_bsk.polynomial_size(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count(),
d_bsk.decomp_base_log(),
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_full_propagate_assign_size_on_gpu(
cuda_backend_get_full_propagate_assign_size_on_gpu(
streams,
d_multibit_bsk.input_lwe_dimension(),
d_multibit_bsk.glwe_dimension(),
@@ -726,7 +729,7 @@ impl CudaServerKey {
let lwe_ciphertext_count = ct_left.as_ref().d_blocks.lwe_ciphertext_count();
let shift_mem = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => get_right_shift_integer_radix_kb_size_on_gpu(
CudaBootstrappingKey::Classic(d_bsk) => cuda_backend_get_right_shift_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,
@@ -749,7 +752,7 @@ impl CudaServerKey {
d_bsk.ms_noise_reduction_configuration.as_ref(),
),
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
get_right_shift_integer_radix_kb_size_on_gpu(
cuda_backend_get_right_shift_size_on_gpu(
streams,
self.message_modulus,
self.carry_modulus,

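The cuda_backend_get_*_size_on_gpu entry points in the file above let a caller estimate scratch memory before scheduling a shift. Below is a hedged caller-side sketch of that check; query_shift_scratch_size and free_device_memory are hypothetical stand-ins, not tfhe-rs or CUDA functions.

    // Hypothetical stand-in for cuda_backend_get_left_shift_size_on_gpu:
    // scratch grows with the number of radix blocks.
    fn query_shift_scratch_size(num_blocks: u32) -> u64 {
        u64::from(num_blocks) * (1 << 16)
    }

    // Hypothetical stand-in for a cudaMemGetInfo-style free-memory query.
    fn free_device_memory() -> u64 {
        8 * (1 << 30)
    }

    // Fail fast instead of letting the kernel launch run out of memory.
    fn can_run_shift(num_blocks: u32) -> bool {
        query_shift_scratch_size(num_blocks) <= free_device_memory()
    }

    fn main() {
        assert!(can_run_shift(32));
        println!("a 32-block shift fits in device memory");
    }
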
View File

@@ -8,8 +8,8 @@ use crate::integer::gpu::server_key::CudaServerKey;
use crate::integer::gpu::server_key::CudaBootstrappingKey;
use crate::integer::gpu::{
sub_and_propagate_single_carry_assign_async,
unchecked_unsigned_overflowing_sub_integer_radix_kb_assign_async, PBSType,
cuda_backend_sub_and_propagate_single_carry_assign,
cuda_backend_unchecked_unsigned_overflowing_sub_assign, PBSType,
};
use crate::integer::server_key::radix_parallel::OutputFlag;
use crate::shortint::parameters::LweBskGroupingFactor;
@@ -264,7 +264,7 @@ impl CudaServerKey {
) {
(true, true) => (ct_left, ct_right),
(true, false) => {
tmp_rhs = ct_right.duplicate_async(streams);
tmp_rhs = ct_right.duplicate(streams);
self.full_propagate_assign_async(&mut tmp_rhs, streams);
(ct_left, &tmp_rhs)
}
@@ -273,7 +273,7 @@ impl CudaServerKey {
(ct_left, ct_right)
}
(false, false) => {
tmp_rhs = ct_right.duplicate_async(streams);
tmp_rhs = ct_right.duplicate(streams);
self.full_propagate_assign_async(ct_left, streams);
self.full_propagate_assign_async(&mut tmp_rhs, streams);
@@ -281,13 +281,8 @@ impl CudaServerKey {
}
};
let _carry = self.sub_and_propagate_single_carry_assign_async(
lhs,
rhs,
streams,
None,
OutputFlag::None,
);
let _carry =
self.sub_and_propagate_single_carry_assign(lhs, rhs, streams, None, OutputFlag::None);
}
pub fn get_sub_assign_size_on_gpu<T: CudaIntegerRadixCiphertext>(
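Both subtraction paths above prepare operands the same way: an operand whose carries are already propagated is used as-is; otherwise it is duplicated, so the caller's ciphertext survives, and carry-propagated in place. A minimal sketch of that four-way match on plain values, assuming simplified types (full_propagate stands in for full_propagate_assign_async):

    #[derive(Clone)]
    struct Operand {
        blocks: Vec<u8>, // stand-in for the radix blocks
        carries_empty: bool,
    }

    // Stand-in for full_propagate_assign_async: fold carries into messages.
    fn full_propagate(op: &mut Operand) {
        op.carries_empty = true;
    }

    fn prepare<'a>(
        lhs: &'a Operand,
        rhs: &'a Operand,
        tmp_lhs: &'a mut Operand,
        tmp_rhs: &'a mut Operand,
    ) -> (&'a Operand, &'a Operand) {
        match (lhs.carries_empty, rhs.carries_empty) {
            (true, true) => (lhs, rhs),
            (true, false) => {
                // Clone the rhs so the caller's value is untouched.
                *tmp_rhs = rhs.clone();
                full_propagate(tmp_rhs);
                (lhs, tmp_rhs)
            }
            (false, true) => {
                *tmp_lhs = lhs.clone();
                full_propagate(tmp_lhs);
                (tmp_lhs, rhs)
            }
            (false, false) => {
                *tmp_lhs = lhs.clone();
                *tmp_rhs = rhs.clone();
                full_propagate(tmp_lhs);
                full_propagate(tmp_rhs);
                (tmp_lhs, tmp_rhs)
            }
        }
    }

    fn main() {
        let a = Operand { blocks: vec![1, 2], carries_empty: false };
        let b = Operand { blocks: vec![3, 4], carries_empty: true };
        let (mut ta, mut tb) = (b.clone(), b.clone());
        let (lhs, rhs) = prepare(&a, &b, &mut ta, &mut tb);
        assert!(lhs.carries_empty && rhs.carries_empty);
    }
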
@@ -314,22 +309,22 @@ impl CudaServerKey {
(true, true) => (ct_left, ct_right),
(true, false) => {
unsafe {
tmp_rhs = ct_right.duplicate_async(stream);
tmp_rhs = ct_right.duplicate(stream);
self.full_propagate_assign_async(&mut tmp_rhs, stream);
}
(ct_left, &tmp_rhs)
}
(false, true) => {
unsafe {
tmp_lhs = ct_left.duplicate_async(stream);
tmp_lhs = ct_left.duplicate(stream);
self.full_propagate_assign_async(&mut tmp_lhs, stream);
}
(&tmp_lhs, ct_right)
}
(false, false) => {
unsafe {
tmp_lhs = ct_left.duplicate_async(stream);
tmp_rhs = ct_right.duplicate_async(stream);
tmp_lhs = ct_left.duplicate(stream);
tmp_rhs = ct_right.duplicate(stream);
self.full_propagate_assign_async(&mut tmp_lhs, stream);
self.full_propagate_assign_async(&mut tmp_rhs, stream);
@@ -383,17 +378,18 @@ impl CudaServerKey {
const INPUT_BORROW: Option<&CudaBooleanBlock> = None;
let mut overflow_block: CudaUnsignedRadixCiphertext =
self.create_trivial_zero_radix(1, stream);
self.create_trivial_zero_radix_async(1, stream);
let ciphertext = ct_res.as_mut();
let uses_input_borrow = INPUT_BORROW.map_or(0u32, |_block| 1u32);
let aux_block: CudaUnsignedRadixCiphertext = self.create_trivial_zero_radix(1, stream);
let aux_block: CudaUnsignedRadixCiphertext =
self.create_trivial_zero_radix_async(1, stream);
let in_carry_dvec =
INPUT_BORROW.map_or_else(|| aux_block.as_ref(), |block| block.as_ref().as_ref());
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_unsigned_overflowing_sub_integer_radix_kb_assign_async(
cuda_backend_unchecked_unsigned_overflowing_sub_assign(
stream,
ciphertext,
rhs.as_ref(),
@@ -418,7 +414,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
unchecked_unsigned_overflowing_sub_integer_radix_kb_assign_async(
cuda_backend_unchecked_unsigned_overflowing_sub_assign(
stream,
ciphertext,
rhs.as_ref(),
@@ -452,7 +448,7 @@ impl CudaServerKey {
///
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
/// not be dropped until streams is synchronized
pub(crate) unsafe fn sub_and_propagate_single_carry_assign_async<T>(
pub(crate) unsafe fn sub_and_propagate_single_carry_assign<T>(
&self,
lhs: &mut T,
rhs: &T,
@@ -463,17 +459,17 @@ impl CudaServerKey {
where
T: CudaIntegerRadixCiphertext,
{
let mut carry_out: T = self.create_trivial_zero_radix(1, streams);
let mut carry_out: T = self.create_trivial_zero_radix_async(1, streams);
let num_blocks = lhs.as_mut().d_blocks.lwe_ciphertext_count().0 as u32;
let uses_carry = input_carry.map_or(0u32, |_block| 1u32);
let aux_block: T = self.create_trivial_zero_radix(1, streams);
let aux_block: T = self.create_trivial_zero_radix_async(1, streams);
let in_carry: &CudaRadixCiphertext =
input_carry.map_or_else(|| aux_block.as_ref(), |block| block.0.as_ref());
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
sub_and_propagate_single_carry_assign_async(
cuda_backend_sub_and_propagate_single_carry_assign(
streams,
lhs.as_mut(),
rhs.as_ref(),
@@ -499,7 +495,7 @@ impl CudaServerKey {
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
sub_and_propagate_single_carry_assign_async(
cuda_backend_sub_and_propagate_single_carry_assign(
streams,
lhs.as_mut(),
rhs.as_ref(),
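A final note on the optional borrow plumbing above: the backend entry point always receives a carry buffer plus a uses_carry flag, so a None input carry is replaced by a trivially encrypted zero block (the aux_block) and the flag tells the kernel to ignore it. The sketch below reproduces that plumbing on plain integers; every name in it is a simplified stand-in, not the tfhe-rs API.

    struct Block(u64);

    // Stand-in for create_trivial_zero_radix_async(1, streams).
    fn trivial_zero_block() -> Block {
        Block(0)
    }

    // Stand-in for the backend sub entry point: the kernel only reads the
    // carry buffer when the flag is set.
    fn backend_sub_with_carry(lhs: &mut u64, rhs: u64, carry_in: &Block, uses_carry: u32) {
        let c = if uses_carry == 1 { carry_in.0 } else { 0 };
        *lhs = lhs.wrapping_sub(rhs).wrapping_sub(c);
    }

    fn sub_assign(lhs: &mut u64, rhs: u64, input_carry: Option<&Block>) {
        let aux_block = trivial_zero_block();
        let uses_carry = input_carry.map_or(0u32, |_| 1u32);
        // Mirrors input_carry.map_or_else(|| aux_block.as_ref(), ...).
        let in_carry = input_carry.unwrap_or(&aux_block);
        backend_sub_with_carry(lhs, rhs, in_carry, uses_carry);
    }

    fn main() {
        let mut x = 10u64;
        sub_assign(&mut x, 3, None);
        assert_eq!(x, 7);
        let borrow = Block(1);
        sub_assign(&mut x, 3, Some(&borrow));
        assert_eq!(x, 3);
    }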