chore(gpu): remove support for drift noise reduction

This commit is contained in:
Agnes Leroy
2025-10-01 13:59:45 +02:00
committed by Agnès Leroy
parent f3cddb5635
commit f9e876730a
68 changed files with 748 additions and 1900 deletions

View File

@@ -125,9 +125,7 @@ uint64_t scratch_cuda_apply_many_univariate_lut_kb_64(
void cuda_apply_univariate_lut_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks);
void *const *ksks, void *const *bsks);
void cleanup_cuda_apply_univariate_lut_kb_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
@@ -145,9 +143,8 @@ void cuda_apply_bivariate_lut_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI const *input_radix_lwe_1,
CudaRadixCiphertextFFI const *input_radix_lwe_2, int8_t *mem_ptr,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks, uint32_t num_radix_blocks, uint32_t shift);
void *const *ksks, void *const *bsks, uint32_t num_radix_blocks,
uint32_t shift);
void cleanup_cuda_apply_bivariate_lut_kb_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
@@ -155,9 +152,8 @@ void cleanup_cuda_apply_bivariate_lut_kb_64(CudaStreamsFFI streams,
void cuda_apply_many_univariate_lut_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks, uint32_t num_luts, uint32_t lut_stride);
void *const *ksks, void *const *bsks, uint32_t num_luts,
uint32_t lut_stride);
uint64_t scratch_cuda_full_propagation_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t lwe_dimension,
@@ -167,11 +163,10 @@ uint64_t scratch_cuda_full_propagation_64(
PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
void cuda_full_propagation_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *input_blocks,
int8_t *mem_ptr, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks, uint32_t num_blocks);
void cuda_full_propagation_64_inplace(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *input_blocks,
int8_t *mem_ptr, void *const *ksks,
void *const *bsks, uint32_t num_blocks);
void cleanup_cuda_full_propagation(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
@@ -189,9 +184,8 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI const *radix_lwe_left, bool const is_bool_left,
CudaRadixCiphertextFFI const *radix_lwe_right, bool const is_bool_right,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
int8_t *mem_ptr, uint32_t polynomial_size, uint32_t num_blocks);
void *const *bsks, void *const *ksks, int8_t *mem_ptr,
uint32_t polynomial_size, uint32_t num_blocks);
void cleanup_cuda_integer_mult(CudaStreamsFFI streams, int8_t **mem_ptr_void);
@@ -216,8 +210,7 @@ uint64_t scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
void cuda_integer_radix_logical_scalar_shift_kb_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
int8_t *mem_ptr, void *const *bsks, void *const *ksks);
uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
@@ -230,8 +223,7 @@ uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
void cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
int8_t *mem_ptr, void *const *bsks, void *const *ksks);
void cleanup_cuda_integer_radix_logical_scalar_shift(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
@@ -251,8 +243,7 @@ uint64_t scratch_cuda_integer_radix_shift_and_rotate_kb_64(
void cuda_integer_radix_shift_and_rotate_kb_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
CudaRadixCiphertextFFI const *lwe_shift, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
void *const *ksks);
void cleanup_cuda_integer_radix_shift_and_rotate(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
@@ -271,16 +262,13 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_1,
CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
void *const *bsks, void *const *ksks);
void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, void const *scalar_blocks,
void const *h_scalar_blocks, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_scalar_blocks);
void *const *ksks, uint32_t num_scalar_blocks);
void cleanup_cuda_integer_comparison(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
@@ -298,15 +286,13 @@ void cuda_bitop_integer_radix_ciphertext_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_1,
CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
void *const *bsks, void *const *ksks);
void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_input, void const *clear_blocks,
void const *h_clear_blocks, uint32_t num_clear_blocks, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
void *const *bsks, void *const *ksks);
void cleanup_cuda_integer_bitop(CudaStreamsFFI streams, int8_t **mem_ptr_void);
@@ -324,8 +310,7 @@ void cuda_cmux_integer_radix_ciphertext_kb_64(
CudaRadixCiphertextFFI const *lwe_condition,
CudaRadixCiphertextFFI const *lwe_array_true,
CudaRadixCiphertextFFI const *lwe_array_false, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
void *const *bsks, void *const *ksks);
void cleanup_cuda_integer_radix_cmux(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
@@ -341,8 +326,7 @@ uint64_t scratch_cuda_integer_radix_scalar_rotate_kb_64(
void cuda_integer_radix_scalar_rotate_kb_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array, uint32_t n,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
int8_t *mem_ptr, void *const *bsks, void *const *ksks);
void cleanup_cuda_integer_radix_scalar_rotate(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
@@ -369,16 +353,13 @@ void cuda_propagate_single_carry_kb_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
CudaRadixCiphertextFFI *carry_out, const CudaRadixCiphertextFFI *carry_in,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t requested_flag, uint32_t uses_carry);
void cuda_add_and_propagate_single_carry_kb_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lhs_array,
const CudaRadixCiphertextFFI *rhs_array, CudaRadixCiphertextFFI *carry_out,
const CudaRadixCiphertextFFI *carry_in, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t requested_flag, uint32_t uses_carry);
void *const *ksks, uint32_t requested_flag, uint32_t uses_carry);
void cleanup_cuda_propagate_single_carry(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
@@ -400,9 +381,8 @@ void cuda_integer_overflowing_sub_kb_64_inplace(
const CudaRadixCiphertextFFI *rhs_array,
CudaRadixCiphertextFFI *overflow_block,
const CudaRadixCiphertextFFI *input_borrow, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t compute_overflow, uint32_t uses_input_borrow);
void *const *bsks, void *const *ksks, uint32_t compute_overflow,
uint32_t uses_input_borrow);
void cleanup_cuda_integer_overflowing_sub(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
@@ -420,8 +400,7 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI *radix_lwe_vec, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
void *const *ksks);
void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
CudaStreamsFFI streams, int8_t **mem_ptr_void);
@@ -438,7 +417,6 @@ void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
uint64_t const *decomposed_scalar, uint64_t const *has_at_least_one_set,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t polynomial_size, uint32_t message_modulus, uint32_t num_scalars);
void cleanup_cuda_integer_radix_scalar_mul(CudaStreamsFFI streams,
@@ -457,8 +435,7 @@ void cuda_integer_div_rem_radix_ciphertext_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *quotient,
CudaRadixCiphertextFFI *remainder, CudaRadixCiphertextFFI const *numerator,
CudaRadixCiphertextFFI const *divisor, bool is_signed, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
void *const *bsks, void *const *ksks);
void cleanup_cuda_integer_div_rem(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
@@ -475,9 +452,7 @@ uint64_t scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
void cuda_integer_compute_prefix_sum_hillis_steele_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI *generates_or_propagates, int8_t *mem_ptr,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks, uint32_t num_blocks);
void *const *ksks, void *const *bsks, uint32_t num_blocks);
void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
CudaStreamsFFI streams, int8_t **mem_ptr_void);
@@ -496,8 +471,7 @@ uint64_t scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
void cuda_integer_abs_inplace_radix_ciphertext_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *ct, int8_t *mem_ptr,
bool is_signed, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
bool is_signed, void *const *bsks, void *const *ksks);
void cleanup_cuda_integer_abs_inplace(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
@@ -514,9 +488,7 @@ uint64_t scratch_cuda_integer_are_all_comparisons_block_true_kb_64(
void cuda_integer_are_all_comparisons_block_true_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_radix_blocks);
void *const *bsks, void *const *ksks, uint32_t num_radix_blocks);
void cleanup_cuda_integer_are_all_comparisons_block_true(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
@@ -533,9 +505,7 @@ uint64_t scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
void cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_radix_blocks);
void *const *bsks, void *const *ksks, uint32_t num_radix_blocks);
void cleanup_cuda_integer_is_at_least_one_comparisons_block_true(
CudaStreamsFFI streams, int8_t **mem_ptr_void);
@@ -561,9 +531,7 @@ uint64_t scratch_cuda_apply_noise_squashing_kb(
void cuda_apply_noise_squashing_kb(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks);
void *const *ksks, void *const *bsks);
void cleanup_cuda_apply_noise_squashing_kb(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
@@ -581,9 +549,7 @@ void cuda_sub_and_propagate_single_carry_kb_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lhs_array,
const CudaRadixCiphertextFFI *rhs_array, CudaRadixCiphertextFFI *carry_out,
const CudaRadixCiphertextFFI *carry_in, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t requested_flag, uint32_t uses_carry);
void *const *ksks, uint32_t requested_flag, uint32_t uses_carry);
void cleanup_cuda_sub_and_propagate_single_carry(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
@@ -600,7 +566,6 @@ uint64_t scratch_cuda_integer_unsigned_scalar_div_radix_kb_64(
void cuda_integer_unsigned_scalar_div_radix_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *numerator_ct,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
const CudaScalarDivisorFFI *scalar_divisor_ffi);
void cleanup_cuda_integer_unsigned_scalar_div_radix_kb_64(
@@ -615,11 +580,12 @@ uint64_t scratch_cuda_extend_radix_with_sign_msb_64(
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
void cuda_extend_radix_with_sign_msb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input, int8_t *mem_ptr,
uint32_t num_additional_blocks, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
void cuda_extend_radix_with_sign_msb_64(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input,
int8_t *mem_ptr,
uint32_t num_additional_blocks,
void *const *bsks, void *const *ksks);
void cleanup_cuda_extend_radix_with_sign_msb_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
@@ -636,7 +602,6 @@ uint64_t scratch_cuda_integer_signed_scalar_div_radix_kb_64(
void cuda_integer_signed_scalar_div_radix_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *numerator_ct,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
const CudaScalarDivisorFFI *scalar_divisor_ffi, uint32_t numerator_bits);
void cleanup_cuda_integer_signed_scalar_div_radix_kb_64(CudaStreamsFFI streams,
@@ -655,9 +620,7 @@ uint64_t scratch_integer_unsigned_scalar_div_rem_radix_kb_64(
void cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *quotient_ct,
CudaRadixCiphertextFFI *remainder_ct, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
const CudaScalarDivisorFFI *scalar_divisor_ffi,
void *const *ksks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
uint64_t const *divisor_has_at_least_one_set,
uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
void const *clear_blocks, void const *h_clear_blocks,
@@ -679,9 +642,7 @@ uint64_t scratch_integer_signed_scalar_div_rem_radix_kb_64(
void cuda_integer_signed_scalar_div_rem_radix_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *quotient_ct,
CudaRadixCiphertextFFI *remainder_ct, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
const CudaScalarDivisorFFI *scalar_divisor_ffi,
void *const *ksks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
uint64_t const *divisor_has_at_least_one_set,
uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
uint32_t numerator_bits);
@@ -701,8 +662,7 @@ uint64_t scratch_integer_count_of_consecutive_bits_kb_64(
void cuda_integer_count_of_consecutive_bits_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_ct,
CudaRadixCiphertextFFI const *input_ct, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key);
void *const *ksks);
void cleanup_cuda_integer_count_of_consecutive_bits_kb_64(
CudaStreamsFFI streams, int8_t **mem_ptr_void);
@@ -716,11 +676,11 @@ uint64_t scratch_cuda_integer_grouped_oprf_64(
bool allocate_gpu_memory, uint32_t message_bits_per_block,
uint32_t total_random_bits, PBS_MS_REDUCTION_T noise_reduction_type);
void cuda_integer_grouped_oprf_async_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *radix_lwe_out,
const void *seeded_lwe_input, uint32_t num_blocks_to_process, int8_t *mem,
void *const *bsks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
void cuda_integer_grouped_oprf_async_64(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *radix_lwe_out,
const void *seeded_lwe_input,
uint32_t num_blocks_to_process,
int8_t *mem, void *const *bsks);
void cleanup_cuda_integer_grouped_oprf_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
@@ -740,8 +700,7 @@ void cuda_integer_ilog2_kb_64(
CudaRadixCiphertextFFI const *trivial_ct_neg_n,
CudaRadixCiphertextFFI const *trivial_ct_2,
CudaRadixCiphertextFFI const *trivial_ct_m_minus_1_block, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key);
void *const *bsks, void *const *ksks);
void cleanup_cuda_integer_ilog2_kb_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);

View File

@@ -3,16 +3,6 @@
#include <stdint.h>
enum PBS_TYPE { MULTI_BIT = 0, CLASSICAL = 1 };
enum PBS_VARIANT { DEFAULT = 0, CG = 1, TBC = 2 };
enum PBS_MS_REDUCTION_T { NO_REDUCTION = 0, DRIFT = 1, CENTERED = 2 };
// C-linkage description of a modulus-switch noise-reduction key, passed by
// pointer (as `ms_noise_reduction_key`) to the FFI entry points that accept
// one.
// NOTE(review): field meanings below are inferred from the matching argument
// names of cuda_improve_noise_modulus_switch_64 (encrypted_zeros, num_zeros,
// bound, r_sigma, input_variance) — confirm against the callers.
extern "C" {
typedef struct {
// Array of opaque pointers; presumably one key pointer per GPU — TODO confirm.
void *const *ptr;
// Presumably the number of encrypted zeros held by the key — TODO confirm.
uint32_t num_zeros;
// Presumably the `bound` parameter of the modulus switch — TODO confirm.
double ms_bound;
// Presumably the `r_sigma` parameter of the modulus switch — TODO confirm.
double ms_r_sigma;
// Presumably the `input_variance` parameter of the modulus switch — TODO
// confirm.
double ms_input_variance;
} CudaModulusSwitchNoiseReductionKeyFFI;
}
enum PBS_MS_REDUCTION_T { NO_REDUCTION = 0, CENTERED = 1 };
#endif // CUDA_PBS_ENUMS_H

View File

@@ -80,7 +80,6 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
Torus *global_accumulator;
double2 *global_join_buffer;
Torus *temp_lwe_array_in;
PBS_VARIANT pbs_variant;
PBS_MS_REDUCTION_T noise_reduction_type;
@@ -97,10 +96,6 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
this->pbs_variant = pbs_variant;
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
this->temp_lwe_array_in = (Torus *)cuda_malloc_with_size_tracking_async(
(lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(Torus),
stream, gpu_index, size_tracker,
noise_reduction_type == PBS_MS_REDUCTION_T::DRIFT);
switch (pbs_variant) {
case PBS_VARIANT::DEFAULT: {
uint64_t full_sm_step_one =
@@ -234,10 +229,6 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
if (pbs_variant == DEFAULT)
cuda_drop_with_size_tracking_async(global_accumulator, stream, gpu_index,
gpu_memory_allocated);
if (noise_reduction_type == PBS_MS_REDUCTION_T::DRIFT)
cuda_drop_with_size_tracking_async(temp_lwe_array_in, stream, gpu_index,
gpu_memory_allocated);
}
};
@@ -249,8 +240,6 @@ struct pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> {
__uint128_t *global_accumulator;
double *global_join_buffer;
InputTorus *temp_lwe_array_in;
uint64_t *trivial_indexes;
PBS_VARIANT pbs_variant;
PBS_MS_REDUCTION_T noise_reduction_type;
@@ -268,27 +257,6 @@ struct pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> {
cuda_set_device(gpu_index);
this->pbs_variant = pbs_variant;
if (noise_reduction_type == PBS_MS_REDUCTION_T::DRIFT) {
this->temp_lwe_array_in =
(InputTorus *)cuda_malloc_with_size_tracking_async(
(lwe_dimension + 1) * input_lwe_ciphertext_count *
sizeof(InputTorus),
stream, gpu_index, size_tracker, allocate_gpu_memory);
this->trivial_indexes = (uint64_t *)cuda_malloc_with_size_tracking_async(
input_lwe_ciphertext_count * sizeof(uint64_t), stream, gpu_index,
size_tracker, allocate_gpu_memory);
uint64_t *h_trivial_indexes = new uint64_t[input_lwe_ciphertext_count];
for (uint32_t i = 0; i < input_lwe_ciphertext_count; i++)
h_trivial_indexes[i] = i;
cuda_memcpy_with_size_tracking_async_to_gpu(
trivial_indexes, h_trivial_indexes,
input_lwe_ciphertext_count * sizeof(uint64_t), stream, gpu_index,
allocate_gpu_memory);
cuda_synchronize_stream(stream, gpu_index);
delete[] h_trivial_indexes;
}
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
size_t global_join_buffer_size = (glwe_dimension + 1) * level_count *
input_lwe_ciphertext_count *
@@ -424,13 +392,6 @@ struct pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> {
if (pbs_variant == DEFAULT)
cuda_drop_with_size_tracking_async(global_accumulator, stream, gpu_index,
gpu_memory_allocated);
if (noise_reduction_type == PBS_MS_REDUCTION_T::DRIFT) {
cuda_drop_with_size_tracking_async(temp_lwe_array_in, stream, gpu_index,
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(trivial_indexes, stream, gpu_index,
gpu_memory_allocated);
}
}
};

View File

@@ -89,18 +89,14 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *ms_noise_reduction_ptr, int8_t *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride);
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride);
void cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lut_vector, void const *lwe_array_in,
void const *bootstrapping_key,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void const *ms_noise_reduction_ptr, int8_t *buffer, uint32_t lwe_dimension,
void const *bootstrapping_key, int8_t *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples);

View File

@@ -22,8 +22,7 @@ uint64_t scratch_cuda_expand_without_verification_64(
void cuda_expand_without_verification_64(
CudaStreamsFFI streams, void *lwe_array_out,
const void *lwe_flattened_compact_array_in, int8_t *mem_ptr,
void *const *bsks, void *const *computing_ksks, void *const *casting_keys,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
void *const *bsks, void *const *computing_ksks, void *const *casting_keys);
void cleanup_expand_without_verification_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);

View File

@@ -103,23 +103,6 @@ void cuda_centered_modulus_switch_64(void *stream, uint32_t gpu_index,
lwe_dimension, log_modulus);
}
// Test-only entry point; its output always follows trivial ordering.
//
// Converts the type-erased arguments (stream handle and the four buffers) to
// their concrete 64-bit types and forwards every argument, in the same order,
// to host_drift_modulus_switch<uint64_t>.
void cuda_improve_noise_modulus_switch_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_array_in, void const *lwe_array_indexes,
    void const *encrypted_zeros, uint32_t lwe_size, uint32_t num_lwes,
    uint32_t num_zeros, double input_variance, double r_sigma, double bound,
    uint32_t log_modulus) {
  // Recover the concrete types hidden behind the void pointers.
  auto typed_stream = static_cast<cudaStream_t>(stream);
  auto *typed_out = static_cast<uint64_t *>(lwe_array_out);
  auto const *typed_in = static_cast<uint64_t const *>(lwe_array_in);
  auto const *typed_indexes = static_cast<uint64_t const *>(lwe_array_indexes);
  auto const *typed_zeros = static_cast<const uint64_t *>(encrypted_zeros);

  host_drift_modulus_switch<uint64_t>(typed_stream, gpu_index, typed_out,
                                      typed_in, typed_indexes, typed_zeros,
                                      lwe_size, num_lwes, num_zeros,
                                      input_variance, r_sigma, bound,
                                      log_modulus);
}
void cuda_glwe_sample_extract_128(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *glwe_array_in, uint32_t const *nth_array, uint32_t num_nths,

View File

@@ -426,31 +426,4 @@ __global__ void __launch_bounds__(512)
}
}
// Host-side launcher for the improve_noise_modulus_switch<Torus> kernel.
//
// Selects `gpu_index` as the current device, then launches the kernel on
// `stream` with one block per input LWE (`num_lwes` blocks) and 512 threads
// per block, using no dynamic shared memory. `array_out`, `array_in`,
// `indexes`, `zeros`, `lwe_size`, `num_zeros`, `input_variance`, `r_sigma`,
// `bound` and `log_modulus` are handed to the kernel unchanged.
//
// Preconditions, enforced with PANIC_IF_FALSE: 512 <= lwe_size <= 1024.
// Launch errors are reported through check_cuda_error(cudaGetLastError());
// no explicit stream synchronization is performed here.
template <typename Torus>
__host__ void host_drift_modulus_switch(
    cudaStream_t stream, uint32_t gpu_index, Torus *array_out,
    Torus const *array_in, uint64_t const *indexes, const Torus *zeros,
    uint32_t lwe_size, uint32_t num_lwes, const uint32_t num_zeros,
    const double input_variance, const double r_sigma, const double bound,
    uint32_t log_modulus) {

  // Reject LWE sizes outside the supported [512, 1024] window.
  PANIC_IF_FALSE(lwe_size >= 512,
                 "The lwe_size (%d) is less than 512, this is not supported\n",
                 lwe_size);
  PANIC_IF_FALSE(
      lwe_size <= 1024,
      "The lwe_size (%d) is greater than 1024, this is not supported\n",
      lwe_size);

  cuda_set_device(gpu_index);

  // The reduction performed by the kernel needs a power-of-two thread count.
  int threads_per_block = 512;
  // One block per input LWE ciphertext.
  int grid_size = num_lwes;

  improve_noise_modulus_switch<Torus>
      <<<grid_size, threads_per_block, 0, stream>>>(
          array_out, array_in, indexes, zeros, lwe_size, num_zeros,
          input_variance, r_sigma, bound, log_modulus);
  check_cuda_error(cudaGetLastError());
}
#endif // CNCRT_TORUS_H

View File

@@ -21,14 +21,12 @@ uint64_t scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
void cuda_integer_abs_inplace_radix_ciphertext_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *ct, int8_t *mem_ptr,
bool is_signed, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
bool is_signed, void *const *bsks, void *const *ksks) {
auto mem = (int_abs_buffer<uint64_t> *)mem_ptr;
host_integer_abs_kb<uint64_t>(CudaStreams(streams), ct, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key,
mem, is_signed);
(uint64_t **)(ksks), mem, is_signed);
}
void cleanup_cuda_integer_abs_inplace(CudaStreamsFFI streams,

View File

@@ -30,11 +30,10 @@ __host__ uint64_t scratch_cuda_integer_abs_kb(
}
template <typename Torus>
__host__ void host_integer_abs_kb(
CudaStreams streams, CudaRadixCiphertextFFI *ct, void *const *bsks,
uint64_t *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
int_abs_buffer<uint64_t> *mem_ptr, bool is_signed) {
__host__ void
host_integer_abs_kb(CudaStreams streams, CudaRadixCiphertextFFI *ct,
void *const *bsks, uint64_t *const *ksks,
int_abs_buffer<uint64_t> *mem_ptr, bool is_signed) {
if (!is_signed)
return;
@@ -49,19 +48,19 @@ __host__ void host_integer_abs_kb(
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
streams, mask, num_bits_in_ciphertext - 1,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key);
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks);
host_addition<Torus>(streams.stream(0), streams.gpu_index(0), ct, mask, ct,
ct->num_radix_blocks, mem_ptr->params.message_modulus,
mem_ptr->params.carry_modulus);
uint32_t requested_flag = outputFlag::FLAG_NONE;
uint32_t uses_carry = 0;
host_propagate_single_carry<Torus>(
streams, ct, nullptr, nullptr, mem_ptr->scp_mem, bsks, ksks,
ms_noise_reduction_key, requested_flag, uses_carry);
host_propagate_single_carry<Torus>(streams, ct, nullptr, nullptr,
mem_ptr->scp_mem, bsks, ksks,
requested_flag, uses_carry);
host_integer_radix_bitop_kb<Torus>(streams, ct, mask, ct, mem_ptr->bitxor_mem,
bsks, ksks, ms_noise_reduction_key);
bsks, ksks);
}
#endif // TFHE_RS_ABS_CUH

View File

@@ -23,13 +23,11 @@ void cuda_bitop_integer_radix_ciphertext_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_1,
CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
void *const *bsks, void *const *ksks) {
host_integer_radix_bitop_kb<uint64_t>(
CudaStreams(streams), lwe_array_out, lwe_array_1, lwe_array_2,
(int_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key);
(int_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks));
}
void cleanup_cuda_integer_bitop(CudaStreamsFFI streams, int8_t **mem_ptr_void) {

View File

@@ -16,8 +16,7 @@ __host__ void host_integer_radix_bitop_kb(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_1,
CudaRadixCiphertextFFI const *lwe_array_2, int_bitop_buffer<Torus> *mem_ptr,
void *const *bsks, Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
void *const *bsks, Torus *const *ksks) {
PANIC_IF_FALSE(
lwe_array_out->num_radix_blocks == lwe_array_1->num_radix_blocks &&
@@ -45,9 +44,8 @@ __host__ void host_integer_radix_bitop_kb(
}
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, lwe_array_out, lwe_array_1, lwe_array_2, bsks, ksks,
ms_noise_reduction_key, lut, lwe_array_out->num_radix_blocks,
lut->params.message_modulus);
streams, lwe_array_out, lwe_array_1, lwe_array_2, bsks, ksks, lut,
lwe_array_out->num_radix_blocks, lut->params.message_modulus);
memcpy(lwe_array_out->degrees, degrees,
lwe_array_out->num_radix_blocks * sizeof(uint64_t));

View File

@@ -35,16 +35,17 @@ uint64_t scratch_cuda_extend_radix_with_sign_msb_64(
num_blocks, num_additional_blocks, allocate_gpu_memory);
}
void cuda_extend_radix_with_sign_msb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input, int8_t *mem_ptr,
uint32_t num_additional_blocks, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
void cuda_extend_radix_with_sign_msb_64(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input,
int8_t *mem_ptr,
uint32_t num_additional_blocks,
void *const *bsks, void *const *ksks) {
PUSH_RANGE("cast")
host_extend_radix_with_sign_msb<uint64_t>(
CudaStreams(streams), output, input,
(int_extend_radix_with_sign_msb_buffer<uint64_t> *)mem_ptr,
num_additional_blocks, bsks, (uint64_t **)ksks, ms_noise_reduction_key);
num_additional_blocks, bsks, (uint64_t **)ksks);
POP_RANGE()
}

View File

@@ -55,8 +55,7 @@ __host__ void host_extend_radix_with_sign_msb(
CudaStreams streams, CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input,
int_extend_radix_with_sign_msb_buffer<Torus> *mem_ptr,
uint32_t num_additional_blocks, void *const *bsks, Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
uint32_t num_additional_blocks, void *const *bsks, Torus *const *ksks) {
if (num_additional_blocks == 0) {
PUSH_RANGE("cast/extend no addblocks")
@@ -79,8 +78,7 @@ __host__ void host_extend_radix_with_sign_msb(
input_blocks - 1, input_blocks);
host_apply_univariate_lut_kb(streams, mem_ptr->padding_block,
mem_ptr->last_block, mem_ptr->lut, ksks,
ms_noise_reduction_key, bsks);
mem_ptr->last_block, mem_ptr->lut, ksks, bsks);
for (uint32_t i = 0; i < num_additional_blocks; ++i) {
uint32_t dst_block_idx = input_blocks + i;

View File

@@ -29,13 +29,12 @@ void cuda_cmux_integer_radix_ciphertext_kb_64(
CudaRadixCiphertextFFI const *lwe_condition,
CudaRadixCiphertextFFI const *lwe_array_true,
CudaRadixCiphertextFFI const *lwe_array_false, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
void *const *bsks, void *const *ksks) {
PUSH_RANGE("cmux")
host_integer_radix_cmux_kb<uint64_t>(
CudaStreams(streams), lwe_array_out, lwe_condition, lwe_array_true,
lwe_array_false, (int_cmux_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key);
(uint64_t **)(ksks));
POP_RANGE()
}

View File

@@ -5,15 +5,13 @@
#include "radix_ciphertext.cuh"
template <typename Torus>
__host__ void
zero_out_if(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_input,
CudaRadixCiphertextFFI const *lwe_condition,
int_zero_out_if_buffer<Torus> *mem_ptr,
int_radix_lut<Torus> *predicate, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_radix_blocks) {
__host__ void zero_out_if(CudaStreams streams,
CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_input,
CudaRadixCiphertextFFI const *lwe_condition,
int_zero_out_if_buffer<Torus> *mem_ptr,
int_radix_lut<Torus> *predicate, void *const *bsks,
Torus *const *ksks, uint32_t num_radix_blocks) {
PANIC_IF_FALSE(
lwe_array_out->num_radix_blocks >= num_radix_blocks &&
lwe_array_input->num_radix_blocks >= num_radix_blocks,
@@ -38,8 +36,8 @@ zero_out_if(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
num_radix_blocks);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, lwe_array_out, tmp_lwe_array_input, bsks, ksks,
ms_noise_reduction_key, predicate, num_radix_blocks);
streams, lwe_array_out, tmp_lwe_array_input, bsks, ksks, predicate,
num_radix_blocks);
}
template <typename Torus>
@@ -48,8 +46,7 @@ __host__ void host_integer_radix_cmux_kb(
CudaRadixCiphertextFFI const *lwe_condition,
CudaRadixCiphertextFFI const *lwe_array_true,
CudaRadixCiphertextFFI const *lwe_array_false,
int_cmux_buffer<Torus> *mem_ptr, void *const *bsks, Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
int_cmux_buffer<Torus> *mem_ptr, void *const *bsks, Torus *const *ksks) {
if (lwe_array_out->num_radix_blocks != lwe_array_true->num_radix_blocks)
PANIC("Cuda error: input and output num radix blocks must be the same")
@@ -73,8 +70,8 @@ __host__ void host_integer_radix_cmux_kb(
}
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, mem_ptr->buffer_out, mem_ptr->buffer_in,
mem_ptr->condition_array, bsks, ksks, ms_noise_reduction_key,
mem_ptr->predicate_lut, 2 * num_radix_blocks, params.message_modulus);
mem_ptr->condition_array, bsks, ksks, mem_ptr->predicate_lut,
2 * num_radix_blocks, params.message_modulus);
// If the condition was true, true_ct will have kept its value and false_ct
// will be 0. If the condition was false, true_ct will be 0 and false_ct will
@@ -91,7 +88,7 @@ __host__ void host_integer_radix_cmux_kb(
params.message_modulus, params.carry_modulus);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, lwe_array_out, &mem_true, bsks, ksks, ms_noise_reduction_key,
streams, lwe_array_out, &mem_true, bsks, ksks,
mem_ptr->message_extract_lut, num_radix_blocks);
}

View File

@@ -41,8 +41,7 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_1,
CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
void *const *bsks, void *const *ksks) {
PUSH_RANGE("comparison")
if (lwe_array_1->num_radix_blocks != lwe_array_2->num_radix_blocks)
PANIC("Cuda error: input num radix blocks must be the same")
@@ -57,7 +56,7 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
case NE:
host_integer_radix_equality_check_kb<uint64_t>(
CudaStreams(streams), lwe_array_out, lwe_array_1, lwe_array_2, buffer,
bsks, (uint64_t **)(ksks), ms_noise_reduction_key, num_radix_blocks);
bsks, (uint64_t **)(ksks), num_radix_blocks);
break;
case GT:
case GE:
@@ -69,7 +68,7 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
host_integer_radix_difference_check_kb<uint64_t>(
CudaStreams(streams), lwe_array_out, lwe_array_1, lwe_array_2, buffer,
buffer->diff_buffer->operator_f, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, num_radix_blocks);
num_radix_blocks);
break;
case MAX:
case MIN:
@@ -77,7 +76,7 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
PANIC("Cuda error (max/min): the number of radix blocks has to be even.")
host_integer_radix_maxmin_kb<uint64_t>(
CudaStreams(streams), lwe_array_out, lwe_array_1, lwe_array_2, buffer,
bsks, (uint64_t **)(ksks), ms_noise_reduction_key, num_radix_blocks);
bsks, (uint64_t **)(ksks), num_radix_blocks);
break;
default:
PANIC("Cuda error: integer operation not supported")
@@ -118,16 +117,14 @@ uint64_t scratch_cuda_integer_are_all_comparisons_block_true_kb_64(
void cuda_integer_are_all_comparisons_block_true_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_radix_blocks) {
void *const *bsks, void *const *ksks, uint32_t num_radix_blocks) {
int_comparison_buffer<uint64_t> *buffer =
(int_comparison_buffer<uint64_t> *)mem_ptr;
host_integer_are_all_comparisons_block_true_kb<uint64_t>(
CudaStreams(streams), lwe_array_out, lwe_array_in, buffer, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, num_radix_blocks);
(uint64_t **)(ksks), num_radix_blocks);
}
void cleanup_cuda_integer_are_all_comparisons_block_true(
@@ -162,16 +159,14 @@ uint64_t scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
void cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_radix_blocks) {
void *const *bsks, void *const *ksks, uint32_t num_radix_blocks) {
int_comparison_buffer<uint64_t> *buffer =
(int_comparison_buffer<uint64_t> *)mem_ptr;
host_integer_is_at_least_one_comparisons_block_true_kb<uint64_t>(
CudaStreams(streams), lwe_array_out, lwe_array_in, buffer, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, num_radix_blocks);
(uint64_t **)(ksks), num_radix_blocks);
}
void cleanup_cuda_integer_is_at_least_one_comparisons_block_true(

View File

@@ -61,9 +61,7 @@ __host__ void are_all_comparisons_block_true(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_radix_blocks) {
Torus *const *ksks, uint32_t num_radix_blocks) {
if (lwe_array_out->lwe_dimension != lwe_array_in->lwe_dimension)
PANIC("Cuda error: input and output lwe dimensions must be the same")
@@ -158,8 +156,7 @@ __host__ void are_all_comparisons_block_true(
if (remaining_blocks == 1) {
// In the last iteration we copy the output to the final address
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, lwe_array_out, accumulator, bsks, ksks,
ms_noise_reduction_key, lut, 1);
streams, lwe_array_out, accumulator, bsks, ksks, lut, 1);
// Reset max_value_lut_indexes before returning, otherwise if the lut is
// reused the lut indexes will be wrong
memset(is_max_value_lut->h_lut_indexes, 0,
@@ -176,8 +173,7 @@ __host__ void are_all_comparisons_block_true(
return;
} else {
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, tmp_out, accumulator, bsks, ksks, ms_noise_reduction_key,
lut, num_chunks);
streams, tmp_out, accumulator, bsks, ksks, lut, num_chunks);
}
}
}
@@ -193,9 +189,7 @@ __host__ void is_at_least_one_comparisons_block_true(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_radix_blocks) {
Torus *const *ksks, uint32_t num_radix_blocks) {
if (lwe_array_out->lwe_dimension != lwe_array_in->lwe_dimension)
PANIC("Cuda error: input lwe dimensions must be the same")
@@ -249,12 +243,12 @@ __host__ void is_at_least_one_comparisons_block_true(
// In the last iteration we copy the output to the final address
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, lwe_array_out, buffer->tmp_block_accumulated, bsks, ksks,
ms_noise_reduction_key, lut, 1);
lut, 1);
return;
} else {
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, mem_ptr->tmp_lwe_array_out, buffer->tmp_block_accumulated,
bsks, ksks, ms_noise_reduction_key, lut, num_chunks);
bsks, ksks, lut, num_chunks);
}
}
}
@@ -264,9 +258,8 @@ __host__ void host_compare_blocks_with_zero(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
int32_t num_radix_blocks, int_radix_lut<Torus> *zero_comparison) {
Torus *const *ksks, int32_t num_radix_blocks,
int_radix_lut<Torus> *zero_comparison) {
if (num_radix_blocks == 0)
return;
@@ -322,8 +315,7 @@ __host__ void host_compare_blocks_with_zero(
}
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, lwe_array_out, sum, bsks, ksks, ms_noise_reduction_key,
zero_comparison, num_sum_blocks);
streams, lwe_array_out, sum, bsks, ksks, zero_comparison, num_sum_blocks);
reset_radix_ciphertext_blocks(lwe_array_out, num_sum_blocks);
}
@@ -334,9 +326,7 @@ __host__ void host_integer_radix_equality_check_kb(
CudaRadixCiphertextFFI const *lwe_array_1,
CudaRadixCiphertextFFI const *lwe_array_2,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_radix_blocks) {
Torus *const *ksks, uint32_t num_radix_blocks) {
if (lwe_array_out->lwe_dimension != lwe_array_1->lwe_dimension ||
lwe_array_out->lwe_dimension != lwe_array_2->lwe_dimension)
@@ -347,16 +337,15 @@ __host__ void host_integer_radix_equality_check_kb(
auto comparisons = mem_ptr->tmp_block_comparisons;
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, comparisons, lwe_array_1, lwe_array_2, bsks, ksks,
ms_noise_reduction_key, eq_buffer->operator_lut, num_radix_blocks,
eq_buffer->operator_lut, num_radix_blocks,
eq_buffer->operator_lut->params.message_modulus);
// This takes a Vec of blocks, where each block is either 0 or 1.
//
// It returns a block encrypting 1 if all input blocks are 1
// otherwise the block encrypts 0
are_all_comparisons_block_true<Torus>(
streams, lwe_array_out, comparisons, mem_ptr, bsks, ksks,
ms_noise_reduction_key, num_radix_blocks);
are_all_comparisons_block_true<Torus>(streams, lwe_array_out, comparisons,
mem_ptr, bsks, ksks, num_radix_blocks);
}
template <typename Torus>
@@ -365,9 +354,7 @@ __host__ void compare_radix_blocks_kb(
CudaRadixCiphertextFFI const *lwe_array_left,
CudaRadixCiphertextFFI const *lwe_array_right,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_radix_blocks) {
Torus *const *ksks, uint32_t num_radix_blocks) {
if (lwe_array_out->lwe_dimension != lwe_array_left->lwe_dimension ||
lwe_array_out->lwe_dimension != lwe_array_right->lwe_dimension)
@@ -400,8 +387,8 @@ __host__ void compare_radix_blocks_kb(
// Apply LUT to compare to 0
auto is_non_zero_lut = mem_ptr->eq_buffer->is_non_zero_lut;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, lwe_array_out, lwe_array_out, bsks, ksks, ms_noise_reduction_key,
is_non_zero_lut, num_radix_blocks);
streams, lwe_array_out, lwe_array_out, bsks, ksks, is_non_zero_lut,
num_radix_blocks);
// Add one
// Here Lhs can have the following values: (-1) % (message modulus * carry
@@ -414,14 +401,13 @@ __host__ void compare_radix_blocks_kb(
// (inferior, equal, superior) to one single shortint block containing the
// final sign
template <typename Torus>
__host__ void tree_sign_reduction(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI *lwe_block_comparisons,
int_tree_sign_reduction_buffer<Torus> *tree_buffer,
std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_radix_blocks) {
__host__ void
tree_sign_reduction(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI *lwe_block_comparisons,
int_tree_sign_reduction_buffer<Torus> *tree_buffer,
std::function<Torus(Torus)> sign_handler_f,
void *const *bsks, Torus *const *ksks,
uint32_t num_radix_blocks) {
if (lwe_array_out->lwe_dimension != lwe_block_comparisons->lwe_dimension)
PANIC("Cuda error: input lwe dimensions must be the same")
@@ -454,8 +440,7 @@ __host__ void tree_sign_reduction(
partial_block_count, message_modulus);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, x, y, bsks, ksks, ms_noise_reduction_key, inner_tree_leaf,
partial_block_count >> 1);
streams, x, y, bsks, ksks, inner_tree_leaf, partial_block_count >> 1);
if ((partial_block_count % 2) != 0) {
partial_block_count >>= 1;
@@ -501,8 +486,7 @@ __host__ void tree_sign_reduction(
// Last leaf
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, lwe_array_out, y, bsks, ksks, ms_noise_reduction_key, last_lut,
1);
streams, lwe_array_out, y, bsks, ksks, last_lut, 1);
}
template <typename Torus>
@@ -512,9 +496,7 @@ __host__ void host_integer_radix_difference_check_kb(
CudaRadixCiphertextFFI const *lwe_array_right,
int_comparison_buffer<Torus> *mem_ptr,
std::function<Torus(Torus)> reduction_lut_f, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_radix_blocks) {
Torus *const *ksks, uint32_t num_radix_blocks) {
if (lwe_array_out->lwe_dimension != lwe_array_left->lwe_dimension ||
lwe_array_out->lwe_dimension != lwe_array_right->lwe_dimension)
@@ -554,7 +536,7 @@ __host__ void host_integer_radix_difference_check_kb(
auto identity_lut = mem_ptr->identity_lut;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, diff_buffer->tmp_packed, diff_buffer->tmp_packed, bsks, ksks,
ms_noise_reduction_key, identity_lut, 2 * packed_num_radix_blocks);
identity_lut, 2 * packed_num_radix_blocks);
} else {
as_radix_ciphertext_slice<Torus>(&lhs, lwe_array_left, 0,
lwe_array_left->num_radix_blocks);
@@ -572,16 +554,14 @@ __host__ void host_integer_radix_difference_check_kb(
// Compare packed blocks, or simply the total number of radix blocks in the
// inputs
compare_radix_blocks_kb<Torus>(streams, comparisons, &lhs, &rhs, mem_ptr,
bsks, ksks, ms_noise_reduction_key,
packed_num_radix_blocks);
bsks, ksks, packed_num_radix_blocks);
num_comparisons = packed_num_radix_blocks;
} else {
// Packing is possible
if (carry_modulus >= message_modulus) {
// Compare (num_radix_blocks - 2) / 2 packed blocks
compare_radix_blocks_kb<Torus>(streams, comparisons, &lhs, &rhs, mem_ptr,
bsks, ksks, ms_noise_reduction_key,
packed_num_radix_blocks);
bsks, ksks, packed_num_radix_blocks);
// Compare the last block before the sign block separately
auto identity_lut = mem_ptr->identity_lut;
@@ -595,7 +575,7 @@ __host__ void host_integer_radix_difference_check_kb(
num_radix_blocks - 1);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, &last_left_block_before_sign_block, &shifted_lwe_array_left,
bsks, ksks, ms_noise_reduction_key, identity_lut, 1);
bsks, ksks, identity_lut, 1);
CudaRadixCiphertextFFI last_right_block_before_sign_block;
as_radix_ciphertext_slice<Torus>(
@@ -608,8 +588,7 @@ __host__ void host_integer_radix_difference_check_kb(
num_radix_blocks - 1);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, &last_right_block_before_sign_block,
&shifted_lwe_array_right, bsks, ksks, ms_noise_reduction_key,
identity_lut, 1);
&shifted_lwe_array_right, bsks, ksks, identity_lut, 1);
CudaRadixCiphertextFFI shifted_comparisons;
as_radix_ciphertext_slice<Torus>(&shifted_comparisons, comparisons,
@@ -617,8 +596,7 @@ __host__ void host_integer_radix_difference_check_kb(
packed_num_radix_blocks + 1);
compare_radix_blocks_kb<Torus>(
streams, &shifted_comparisons, &last_left_block_before_sign_block,
&last_right_block_before_sign_block, mem_ptr, bsks, ksks,
ms_noise_reduction_key, 1);
&last_right_block_before_sign_block, mem_ptr, bsks, ksks, 1);
// Compare the sign block separately
as_radix_ciphertext_slice<Torus>(&shifted_comparisons, comparisons,
@@ -632,14 +610,14 @@ __host__ void host_integer_radix_difference_check_kb(
num_radix_blocks - 1, num_radix_blocks);
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, &shifted_comparisons, &last_left_block, &last_right_block,
bsks, ksks, ms_noise_reduction_key, mem_ptr->signed_lut, 1,
bsks, ksks, mem_ptr->signed_lut, 1,
mem_ptr->signed_lut->params.message_modulus);
num_comparisons = packed_num_radix_blocks + 2;
} else {
compare_radix_blocks_kb<Torus>(
streams, comparisons, lwe_array_left, lwe_array_right, mem_ptr, bsks,
ksks, ms_noise_reduction_key, num_radix_blocks - 1);
compare_radix_blocks_kb<Torus>(streams, comparisons, lwe_array_left,
lwe_array_right, mem_ptr, bsks, ksks,
num_radix_blocks - 1);
// Compare the sign block separately
CudaRadixCiphertextFFI shifted_comparisons;
as_radix_ciphertext_slice<Torus>(&shifted_comparisons, comparisons,
@@ -652,7 +630,7 @@ __host__ void host_integer_radix_difference_check_kb(
num_radix_blocks - 1, num_radix_blocks);
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, &shifted_comparisons, &last_left_block, &last_right_block,
bsks, ksks, ms_noise_reduction_key, mem_ptr->signed_lut, 1,
bsks, ksks, mem_ptr->signed_lut, 1,
mem_ptr->signed_lut->params.message_modulus);
num_comparisons = num_radix_blocks;
}
@@ -661,9 +639,9 @@ __host__ void host_integer_radix_difference_check_kb(
// Reduces a vec containing radix blocks that encrypts a sign
// (inferior, equal, superior) to one single radix block containing the
// final sign
tree_sign_reduction<Torus>(
streams, lwe_array_out, comparisons, mem_ptr->diff_buffer->tree_buffer,
reduction_lut_f, bsks, ksks, ms_noise_reduction_key, num_comparisons);
tree_sign_reduction<Torus>(streams, lwe_array_out, comparisons,
mem_ptr->diff_buffer->tree_buffer, reduction_lut_f,
bsks, ksks, num_comparisons);
}
template <typename Torus>
@@ -685,9 +663,7 @@ __host__ void host_integer_radix_maxmin_kb(
CudaRadixCiphertextFFI const *lwe_array_left,
CudaRadixCiphertextFFI const *lwe_array_right,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_radix_blocks) {
Torus *const *ksks, uint32_t num_radix_blocks) {
if (lwe_array_out->lwe_dimension != lwe_array_left->lwe_dimension ||
lwe_array_out->lwe_dimension != lwe_array_right->lwe_dimension)
@@ -701,14 +677,12 @@ __host__ void host_integer_radix_maxmin_kb(
// Compute the sign
host_integer_radix_difference_check_kb<Torus>(
streams, mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right,
mem_ptr, mem_ptr->identity_lut_f, bsks, ksks, ms_noise_reduction_key,
num_radix_blocks);
mem_ptr, mem_ptr->identity_lut_f, bsks, ksks, num_radix_blocks);
// Selector
host_integer_radix_cmux_kb<Torus>(streams, lwe_array_out,
mem_ptr->tmp_lwe_array_out, lwe_array_left,
lwe_array_right, mem_ptr->cmux_buffer, bsks,
ksks, ms_noise_reduction_key);
host_integer_radix_cmux_kb<Torus>(
streams, lwe_array_out, mem_ptr->tmp_lwe_array_out, lwe_array_left,
lwe_array_right, mem_ptr->cmux_buffer, bsks, ksks);
}
template <typename Torus>
@@ -716,15 +690,12 @@ __host__ void host_integer_are_all_comparisons_block_true_kb(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_radix_blocks) {
Torus *const *ksks, uint32_t num_radix_blocks) {
// It returns a block encrypting 1 if all input blocks are 1
// otherwise the block encrypts 0
are_all_comparisons_block_true<Torus>(
streams, lwe_array_out, lwe_array_in, mem_ptr, bsks, ksks,
ms_noise_reduction_key, num_radix_blocks);
are_all_comparisons_block_true<Torus>(streams, lwe_array_out, lwe_array_in,
mem_ptr, bsks, ksks, num_radix_blocks);
}
template <typename Torus>
@@ -732,14 +703,12 @@ __host__ void host_integer_is_at_least_one_comparisons_block_true_kb(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_radix_blocks) {
Torus *const *ksks, uint32_t num_radix_blocks) {
// It returns a block encrypting 1 if at least one input block is 1,
// otherwise the block encrypts 0
is_at_least_one_comparisons_block_true<Torus>(
streams, lwe_array_out, lwe_array_in, mem_ptr, bsks, ksks,
ms_noise_reduction_key, num_radix_blocks);
is_at_least_one_comparisons_block_true<Torus>(streams, lwe_array_out,
lwe_array_in, mem_ptr, bsks,
ksks, num_radix_blocks);
}
#endif

View File

@@ -344,7 +344,7 @@ host_integer_decompress(CudaStreams streams,
execute_pbs_async<Torus, Torus>(
active_streams, (Torus *)d_lwe_array_out->ptr, lut->lwe_indexes_out,
lut->lut_vec, lut->lut_indexes_vec, extracted_lwe,
lut->lwe_indexes_in, d_bsks, nullptr, lut->buffer,
lut->lwe_indexes_in, d_bsks, lut->buffer,
encryption_params.glwe_dimension,
compression_params.small_lwe_dimension,
encryption_params.polynomial_size, encryption_params.pbs_base_log,
@@ -378,7 +378,7 @@ host_integer_decompress(CudaStreams streams,
execute_pbs_async<Torus, Torus>(
active_streams, lwe_after_pbs_vec, lwe_trivial_indexes_vec,
lut->lut_vec, lut->lut_indexes_vec, lwe_array_in_vec,
lwe_trivial_indexes_vec, d_bsks, nullptr, lut->buffer,
lwe_trivial_indexes_vec, d_bsks, lut->buffer,
encryption_params.glwe_dimension,
compression_params.small_lwe_dimension,
encryption_params.polynomial_size, encryption_params.pbs_base_log,

View File

@@ -24,14 +24,13 @@ void cuda_integer_div_rem_radix_ciphertext_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *quotient,
CudaRadixCiphertextFFI *remainder, CudaRadixCiphertextFFI const *numerator,
CudaRadixCiphertextFFI const *divisor, bool is_signed, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
void *const *bsks, void *const *ksks) {
PUSH_RANGE("div")
auto mem = (int_div_rem_memory<uint64_t> *)mem_ptr;
host_integer_div_rem_kb<uint64_t>(
CudaStreams(streams), quotient, remainder, numerator, divisor, is_signed,
bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem);
host_integer_div_rem_kb<uint64_t>(CudaStreams(streams), quotient, remainder,
numerator, divisor, is_signed, bsks,
(uint64_t **)(ksks), mem);
POP_RANGE()
}

View File

@@ -37,9 +37,7 @@ __host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
CudaStreams streams, CudaRadixCiphertextFFI *quotient,
CudaRadixCiphertextFFI *remainder, CudaRadixCiphertextFFI const *numerator,
CudaRadixCiphertextFFI const *divisor, void *const *bsks,
uint64_t *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
unsigned_int_div_rem_2_2_memory<uint64_t> *mem_ptr) {
uint64_t *const *ksks, unsigned_int_div_rem_2_2_memory<uint64_t> *mem_ptr) {
if (streams.count() < 4) {
PANIC("GPU count should be greater than 4 when using div_rem_2_2");
@@ -61,31 +59,6 @@ __host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
auto divisor_gpu_1 = mem_ptr->divisor_gpu_1;
auto divisor_gpu_2 = mem_ptr->divisor_gpu_2;
auto make_view = [](CudaModulusSwitchNoiseReductionKeyFFI const *src,
size_t i) {
CudaModulusSwitchNoiseReductionKeyFFI v;
v.ptr = (src == nullptr) ? nullptr
: (src->ptr == nullptr) ? nullptr
: src->ptr + i;
v.num_zeros = (src == nullptr) ? 0 : src->num_zeros;
v.ms_bound = (src == nullptr) ? 0. : src->ms_bound;
v.ms_r_sigma = (src == nullptr) ? 0. : src->ms_r_sigma;
v.ms_input_variance = (src == nullptr) ? 0. : src->ms_input_variance;
return v;
};
CudaModulusSwitchNoiseReductionKeyFFI nrk0 =
make_view(ms_noise_reduction_key, 0);
CudaModulusSwitchNoiseReductionKeyFFI nrk1 =
make_view(ms_noise_reduction_key, 1);
CudaModulusSwitchNoiseReductionKeyFFI nrk2 =
make_view(ms_noise_reduction_key, 2);
CudaModulusSwitchNoiseReductionKeyFFI nrk3 =
make_view(ms_noise_reduction_key, 3);
CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_keys[4] = {
&nrk0, &nrk1, &nrk2, &nrk3};
// gpu[0] -> gpu[0]
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
remainder_gpu_0, numerator);
@@ -121,7 +94,7 @@ __host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
mem_ptr->d2, divisor_gpu_1, streams.get_ith(1));
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams.get_ith(1), mem_ptr->d2, 1, mem_ptr->shift_mem, &bsks[1],
&ksks[1], ms_noise_reduction_keys[1], mem_ptr->d2->num_radix_blocks);
&ksks[1], mem_ptr->d2->num_radix_blocks);
// Computes 3*d = 4*d - d using block shift and subtraction on gpu[0]
host_extend_radix_with_trivial_zero_blocks_msb<Torus>(
@@ -131,10 +104,10 @@ __host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
mem_ptr->tmp_gpu_0->num_radix_blocks);
set_zero_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), mem_ptr->d3, 0, 1);
host_sub_and_propagate_single_carry(
streams.get_ith(0), mem_ptr->d3, mem_ptr->tmp_gpu_0, nullptr, nullptr,
mem_ptr->sub_and_propagate_mem, &bsks[0], &ksks[0],
ms_noise_reduction_keys[0], outputFlag::FLAG_NONE, 0);
host_sub_and_propagate_single_carry(streams.get_ith(0), mem_ptr->d3,
mem_ptr->tmp_gpu_0, nullptr, nullptr,
mem_ptr->sub_and_propagate_mem, &bsks[0],
&ksks[0], outputFlag::FLAG_NONE, 0);
// +-----------------+-----------------+-----------------+-----------------+
// | GPU[0] | GPU[1] | GPU[2] | GPU[3] |
@@ -194,8 +167,7 @@ __host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
host_integer_overflowing_sub<uint64_t>(
streams.get_ith(gpu_index), sub_result, rem, low, sub_overflowed,
(const CudaRadixCiphertextFFI *)nullptr, overflow_sub_mem,
&bsks[gpu_index], &ksks[gpu_index],
ms_noise_reduction_keys[gpu_index], compute_overflow,
&bsks[gpu_index], &ksks[gpu_index], compute_overflow,
uses_input_borrow);
};
@@ -218,12 +190,10 @@ __host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
host_compare_blocks_with_zero<Torus>(
streams.get_ith(gpu_index), comparison_blocks, d_msb,
comparison_buffer, &bsks[gpu_index], &ksks[gpu_index],
ms_noise_reduction_keys[gpu_index], d_msb->num_radix_blocks,
comparison_buffer->is_zero_lut);
d_msb->num_radix_blocks, comparison_buffer->is_zero_lut);
are_all_comparisons_block_true(
streams.get_ith(gpu_index), out_boolean_block, comparison_blocks,
comparison_buffer, &bsks[gpu_index], &ksks[gpu_index],
ms_noise_reduction_keys[gpu_index],
comparison_blocks->num_radix_blocks);
host_negation<Torus>(
@@ -288,16 +258,13 @@ __host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
// used as a bitor
host_integer_radix_bitop_kb(streams.get_ith(0), o3, o3, mem_ptr->cmp_1,
mem_ptr->bitor_mem_1, &bsks[0], &ksks[0],
ms_noise_reduction_keys[0]);
mem_ptr->bitor_mem_1, &bsks[0], &ksks[0]);
// used as a bitor
host_integer_radix_bitop_kb(streams.get_ith(1), o2, o2, mem_ptr->cmp_2,
mem_ptr->bitor_mem_2, &bsks[1], &ksks[1],
ms_noise_reduction_keys[1]);
mem_ptr->bitor_mem_2, &bsks[1], &ksks[1]);
// used as a bitor
host_integer_radix_bitop_kb(streams.get_ith(2), o1, o1, mem_ptr->cmp_3,
mem_ptr->bitor_mem_3, &bsks[2], &ksks[2],
ms_noise_reduction_keys[2]);
mem_ptr->bitor_mem_3, &bsks[2], &ksks[2]);
// cmp_1, cmp_2, cmp_3 are not needed anymore, we can reuse them as c3,
// c2, c1. c0 is allocated on gpu[3], we take it from mem_ptr.
@@ -379,8 +346,7 @@ __host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
rx, rx, cx, 4, 4);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams.get_ith(gpu_index), rx, rx, &bsks[gpu_index],
&ksks[gpu_index], ms_noise_reduction_keys[gpu_index], lut,
rx->num_radix_blocks);
&ksks[gpu_index], lut, rx->num_radix_blocks);
};
for (uint j = 0; j < 4; j++) {
@@ -398,15 +364,15 @@ __host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
// calculate quotient bits GPU[2]
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem_ptr->sub_streams_1.get_ith(2), mem_ptr->q1, c1, &bsks[2], &ksks[2],
ms_noise_reduction_keys[2], mem_ptr->quotient_lut_1, 1);
mem_ptr->quotient_lut_1, 1);
// calculate quotient bits GPU[1]
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem_ptr->sub_streams_1.get_ith(1), mem_ptr->q2, c2, &bsks[1], &ksks[1],
ms_noise_reduction_keys[1], mem_ptr->quotient_lut_2, 1);
mem_ptr->quotient_lut_2, 1);
// calculate quotient bits GPU[0]
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem_ptr->sub_streams_1.get_ith(0), mem_ptr->q3, c3, &bsks[0], &ksks[0],
ms_noise_reduction_keys[0], mem_ptr->quotient_lut_3, 1);
mem_ptr->quotient_lut_3, 1);
for (uint j = 0; j < 4; j++) {
cuda_synchronize_stream(streams.stream(j), streams.gpu_index(j));
@@ -469,11 +435,11 @@ __host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
streams.synchronize();
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, rem_gpu_0, rem_gpu_0, bsks, ksks, ms_noise_reduction_key,
streams, rem_gpu_0, rem_gpu_0, bsks, ksks,
mem_ptr->message_extract_lut_1, rem_gpu_0->num_radix_blocks);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem_ptr->sub_streams_1, q3_gpu_0, q3_gpu_0, bsks, ksks,
ms_noise_reduction_key, mem_ptr->message_extract_lut_2, 1);
mem_ptr->message_extract_lut_2, 1);
streams.synchronize();
mem_ptr->sub_streams_1.synchronize();
@@ -514,9 +480,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
CudaStreams streams, CudaRadixCiphertextFFI *quotient,
CudaRadixCiphertextFFI *remainder, CudaRadixCiphertextFFI const *numerator,
CudaRadixCiphertextFFI const *divisor, void *const *bsks,
uint64_t *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
unsigned_int_div_rem_memory<uint64_t> *mem_ptr) {
uint64_t *const *ksks, unsigned_int_div_rem_memory<uint64_t> *mem_ptr) {
if (remainder->num_radix_blocks != numerator->num_radix_blocks ||
remainder->num_radix_blocks != divisor->num_radix_blocks ||
@@ -531,7 +495,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
mem_ptr->params.carry_modulus == 4 && streams.count() >= 4) {
host_unsigned_integer_div_rem_kb_block_by_block_2_2<Torus>(
streams, quotient, remainder, numerator, divisor, bsks, ksks,
ms_noise_reduction_key, mem_ptr->div_rem_2_2_mem);
mem_ptr->div_rem_2_2_mem);
return;
}
auto radix_params = mem_ptr->params;
@@ -632,7 +596,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
interesting_divisor->num_radix_blocks);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, &last_interesting_divisor_block,
&last_interesting_divisor_block, bsks, ksks, ms_noise_reduction_key,
&last_interesting_divisor_block, bsks, ksks,
mem_ptr->masking_luts_1[shifted_mask], 1);
}; // trim_last_interesting_divisor_bits
@@ -659,7 +623,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, divisor_ms_blocks, divisor_ms_blocks, bsks, ksks,
ms_noise_reduction_key, mem_ptr->masking_luts_2[shifted_mask], 1);
mem_ptr->masking_luts_2[shifted_mask], 1);
}; // trim_first_divisor_ms_bits
// This does
@@ -681,7 +645,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, interesting_remainder1, 1, mem_ptr->shift_mem_1, bsks, ksks,
ms_noise_reduction_key, interesting_remainder1->num_radix_blocks);
interesting_remainder1->num_radix_blocks);
reset_radix_ciphertext_blocks(mem_ptr->tmp_radix,
interesting_remainder1->num_radix_blocks);
@@ -710,7 +674,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
auto left_shift_interesting_remainder2 = [&](CudaStreams streams) {
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, interesting_remainder2, 1, mem_ptr->shift_mem_2, bsks, ksks,
ms_noise_reduction_key, interesting_remainder2->num_radix_blocks);
interesting_remainder2->num_radix_blocks);
}; // left_shift_interesting_remainder2
streams.synchronize();
@@ -783,8 +747,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
streams, new_remainder, merged_interesting_remainder,
interesting_divisor, subtraction_overflowed,
(const CudaRadixCiphertextFFI *)nullptr, mem_ptr->overflow_sub_mem,
bsks, ksks, ms_noise_reduction_key, compute_borrow,
uses_input_borrow);
bsks, ksks, compute_borrow, uses_input_borrow);
};
// fills:
@@ -802,13 +765,12 @@ __host__ void host_unsigned_integer_div_rem_kb(
// So we can skip some stuff
host_compare_blocks_with_zero<Torus>(
streams, mem_ptr->tmp_1, trivial_blocks, mem_ptr->comparison_buffer,
bsks, ksks, ms_noise_reduction_key,
trivial_blocks->num_radix_blocks,
bsks, ksks, trivial_blocks->num_radix_blocks,
mem_ptr->comparison_buffer->eq_buffer->is_non_zero_lut);
is_at_least_one_comparisons_block_true<Torus>(
streams, at_least_one_upper_block_is_non_zero, mem_ptr->tmp_1,
mem_ptr->comparison_buffer, bsks, ksks, ms_noise_reduction_key,
mem_ptr->comparison_buffer, bsks, ksks,
mem_ptr->tmp_1->num_radix_blocks);
}
};
@@ -821,7 +783,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, cleaned_merged_interesting_remainder,
cleaned_merged_interesting_remainder, bsks, ksks,
ms_noise_reduction_key, mem_ptr->message_extract_lut_1,
mem_ptr->message_extract_lut_1,
cleaned_merged_interesting_remainder->num_radix_blocks);
};
@@ -859,8 +821,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, cleaned_merged_interesting_remainder,
cleaned_merged_interesting_remainder, overflow_sum_radix, bsks,
ksks, ms_noise_reduction_key,
mem_ptr->zero_out_if_overflow_did_not_happen[factor_lut_id],
ksks, mem_ptr->zero_out_if_overflow_did_not_happen[factor_lut_id],
cleaned_merged_interesting_remainder->num_radix_blocks, factor);
};
@@ -868,8 +829,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
[&](CudaStreams streams) {
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, new_remainder, new_remainder, overflow_sum_radix, bsks,
ksks, ms_noise_reduction_key,
mem_ptr->zero_out_if_overflow_happened[factor_lut_id],
ksks, mem_ptr->zero_out_if_overflow_happened[factor_lut_id],
new_remainder->num_radix_blocks, factor);
};
@@ -878,7 +838,6 @@ __host__ void host_unsigned_integer_div_rem_kb(
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, mem_ptr->did_not_overflow, subtraction_overflowed,
at_least_one_upper_block_is_non_zero, bsks, ksks,
ms_noise_reduction_key,
mem_ptr->merge_overflow_flags_luts[pos_in_block], 1,
mem_ptr->merge_overflow_flags_luts[pos_in_block]
->params.message_modulus);
@@ -937,10 +896,10 @@ __host__ void host_unsigned_integer_div_rem_kb(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem_ptr->sub_streams_1, remainder, remainder, bsks, ksks,
ms_noise_reduction_key, mem_ptr->message_extract_lut_1, num_blocks);
mem_ptr->message_extract_lut_1, num_blocks);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem_ptr->sub_streams_2, quotient, quotient, bsks, ksks,
ms_noise_reduction_key, mem_ptr->message_extract_lut_2, num_blocks);
mem_ptr->message_extract_lut_2, num_blocks);
mem_ptr->sub_streams_1.synchronize();
mem_ptr->sub_streams_2.synchronize();
@@ -951,9 +910,7 @@ __host__ void host_integer_div_rem_kb(
CudaStreams streams, CudaRadixCiphertextFFI *quotient,
CudaRadixCiphertextFFI *remainder, CudaRadixCiphertextFFI const *numerator,
CudaRadixCiphertextFFI const *divisor, bool is_signed, void *const *bsks,
uint64_t *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
int_div_rem_memory<uint64_t> *int_mem_ptr) {
uint64_t *const *ksks, int_div_rem_memory<uint64_t> *int_mem_ptr) {
if (remainder->num_radix_blocks != numerator->num_radix_blocks ||
remainder->num_radix_blocks != divisor->num_radix_blocks ||
remainder->num_radix_blocks != quotient->num_radix_blocks)
@@ -978,19 +935,16 @@ __host__ void host_integer_div_rem_kb(
streams.synchronize();
host_integer_abs_kb<Torus>(int_mem_ptr->sub_streams_1, positive_numerator,
bsks, ksks, ms_noise_reduction_key,
int_mem_ptr->abs_mem_1, true);
bsks, ksks, int_mem_ptr->abs_mem_1, true);
host_integer_abs_kb<Torus>(int_mem_ptr->sub_streams_2, positive_divisor,
bsks, ksks, ms_noise_reduction_key,
int_mem_ptr->abs_mem_2, true);
bsks, ksks, int_mem_ptr->abs_mem_2, true);
int_mem_ptr->sub_streams_1.synchronize();
int_mem_ptr->sub_streams_2.synchronize();
host_unsigned_integer_div_rem_kb<Torus>(
int_mem_ptr->sub_streams_1, quotient, remainder, positive_numerator,
positive_divisor, bsks, ksks, ms_noise_reduction_key,
int_mem_ptr->unsigned_mem);
positive_divisor, bsks, ksks, int_mem_ptr->unsigned_mem);
CudaRadixCiphertextFFI numerator_sign;
as_radix_ciphertext_slice<Torus>(&numerator_sign, numerator, num_blocks - 1,
@@ -1000,7 +954,7 @@ __host__ void host_integer_div_rem_kb(
num_blocks);
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
int_mem_ptr->sub_streams_2, int_mem_ptr->sign_bits_are_different,
&numerator_sign, &divisor_sign, bsks, ksks, ms_noise_reduction_key,
&numerator_sign, &divisor_sign, bsks, ksks,
int_mem_ptr->compare_signed_bits_lut, 1,
int_mem_ptr->compare_signed_bits_lut->params.message_modulus);
@@ -1013,37 +967,36 @@ __host__ void host_integer_div_rem_kb(
uint32_t requested_flag = outputFlag::FLAG_NONE;
uint32_t uses_carry = 0;
host_propagate_single_carry<Torus>(
int_mem_ptr->sub_streams_1, int_mem_ptr->negated_quotient, nullptr,
nullptr, int_mem_ptr->scp_mem_1, bsks, ksks, ms_noise_reduction_key,
requested_flag, uses_carry);
host_propagate_single_carry<Torus>(int_mem_ptr->sub_streams_1,
int_mem_ptr->negated_quotient, nullptr,
nullptr, int_mem_ptr->scp_mem_1, bsks,
ksks, requested_flag, uses_carry);
host_integer_radix_negation<Torus>(
int_mem_ptr->sub_streams_2, int_mem_ptr->negated_remainder, remainder,
radix_params.message_modulus, radix_params.carry_modulus, num_blocks);
host_propagate_single_carry<Torus>(
int_mem_ptr->sub_streams_2, int_mem_ptr->negated_remainder, nullptr,
nullptr, int_mem_ptr->scp_mem_2, bsks, ksks, ms_noise_reduction_key,
requested_flag, uses_carry);
host_propagate_single_carry<Torus>(int_mem_ptr->sub_streams_2,
int_mem_ptr->negated_remainder, nullptr,
nullptr, int_mem_ptr->scp_mem_2, bsks,
ksks, requested_flag, uses_carry);
host_integer_radix_cmux_kb<Torus>(int_mem_ptr->sub_streams_1, quotient,
int_mem_ptr->sign_bits_are_different,
int_mem_ptr->negated_quotient, quotient,
int_mem_ptr->cmux_quotient_mem, bsks,
ksks, ms_noise_reduction_key);
host_integer_radix_cmux_kb<Torus>(
int_mem_ptr->sub_streams_1, quotient,
int_mem_ptr->sign_bits_are_different, int_mem_ptr->negated_quotient,
quotient, int_mem_ptr->cmux_quotient_mem, bsks, ksks);
host_integer_radix_cmux_kb<Torus>(
int_mem_ptr->sub_streams_2, remainder, &numerator_sign,
int_mem_ptr->negated_remainder, remainder,
int_mem_ptr->cmux_remainder_mem, bsks, ksks, ms_noise_reduction_key);
int_mem_ptr->cmux_remainder_mem, bsks, ksks);
int_mem_ptr->sub_streams_1.synchronize();
int_mem_ptr->sub_streams_2.synchronize();
} else {
host_unsigned_integer_div_rem_kb<Torus>(
streams, quotient, remainder, numerator, divisor, bsks, ksks,
ms_noise_reduction_key, int_mem_ptr->unsigned_mem);
host_unsigned_integer_div_rem_kb<Torus>(streams, quotient, remainder,
numerator, divisor, bsks, ksks,
int_mem_ptr->unsigned_mem);
}
}

View File

@@ -29,13 +29,12 @@ uint64_t scratch_integer_count_of_consecutive_bits_kb_64(
// C entry point for the 64-bit count-of-consecutive-bits operation.
// Wraps `streams` in a CudaStreams object, casts `mem_ptr` to
// int_count_of_consecutive_bits_buffer<uint64_t> * and `ksks` to uint64_t **,
// then forwards everything to host_integer_count_of_consecutive_bits<uint64_t>.
// NOTE(review): this text is a diff hunk whose +/- markers were stripped. The
// lines mentioning ms_noise_reduction_key appear to be the pre-change version
// and the line right after each is its replacement -- confirm against the
// commit before treating this as compilable code.
void cuda_integer_count_of_consecutive_bits_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_ct,
CudaRadixCiphertextFFI const *input_ct, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key) {
void *const *ksks) {
host_integer_count_of_consecutive_bits<uint64_t>(
CudaStreams(streams), output_ct, input_ct,
(int_count_of_consecutive_bits_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)ksks, ms_noise_reduction_key);
(uint64_t **)ksks);
}
void cleanup_cuda_integer_count_of_consecutive_bits_kb_64(
@@ -81,13 +80,12 @@ void cuda_integer_ilog2_kb_64(
CudaRadixCiphertextFFI const *trivial_ct_neg_n,
CudaRadixCiphertextFFI const *trivial_ct_2,
CudaRadixCiphertextFFI const *trivial_ct_m_minus_1_block, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key) {
void *const *bsks, void *const *ksks) {
host_integer_ilog2<uint64_t>(
CudaStreams(streams), output_ct, input_ct, trivial_ct_neg_n, trivial_ct_2,
trivial_ct_m_minus_1_block, (int_ilog2_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)ksks, ms_noise_reduction_key);
(uint64_t **)ksks);
}
void cleanup_cuda_integer_ilog2_kb_64(CudaStreamsFFI streams,

View File

@@ -9,14 +9,12 @@ template <typename Torus>
__host__ void host_integer_prepare_count_of_consecutive_bits(
CudaStreams streams, CudaRadixCiphertextFFI *ciphertext,
int_prepare_count_of_consecutive_bits_buffer<Torus> *mem_ptr,
void *const *bsks, Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
void *const *bsks, Torus *const *ksks) {
auto tmp = mem_ptr->tmp_ct;
host_apply_univariate_lut_kb<Torus>(streams, tmp, ciphertext,
mem_ptr->univ_lut_mem, ksks,
ms_noise_reduction_key, bsks);
mem_ptr->univ_lut_mem, ksks, bsks);
if (mem_ptr->direction == Leading) {
host_radix_blocks_reverse_inplace<Torus>(streams, tmp);
@@ -24,7 +22,7 @@ __host__ void host_integer_prepare_count_of_consecutive_bits(
host_compute_prefix_sum_hillis_steele<uint64_t>(
streams, ciphertext, tmp, mem_ptr->biv_lut_mem, bsks, ksks,
ms_noise_reduction_key, ciphertext->num_radix_blocks);
ciphertext->num_radix_blocks);
}
template <typename Torus>
@@ -48,8 +46,7 @@ __host__ void host_integer_count_of_consecutive_bits(
CudaStreams streams, CudaRadixCiphertextFFI *output_ct,
CudaRadixCiphertextFFI const *input_ct,
int_count_of_consecutive_bits_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
Torus *const *ksks) {
auto params = mem_ptr->params;
auto ct_prepared = mem_ptr->ct_prepared;
@@ -60,9 +57,8 @@ __host__ void host_integer_count_of_consecutive_bits(
// Prepare count of consecutive bits
//
host_integer_prepare_count_of_consecutive_bits(streams, ct_prepared,
mem_ptr->prepare_mem, bsks,
ksks, ms_noise_reduction_key);
host_integer_prepare_count_of_consecutive_bits(
streams, ct_prepared, mem_ptr->prepare_mem, bsks, ksks);
// Perform addition and propagation of prepared cts
//
@@ -76,12 +72,11 @@ __host__ void host_integer_count_of_consecutive_bits(
}
host_integer_partial_sum_ciphertexts_vec_kb<Torus>(
streams, output_ct, cts, bsks, ksks, ms_noise_reduction_key,
mem_ptr->sum_mem, counter_num_blocks, ct_prepared->num_radix_blocks);
streams, output_ct, cts, bsks, ksks, mem_ptr->sum_mem, counter_num_blocks,
ct_prepared->num_radix_blocks);
host_propagate_single_carry<Torus>(streams, output_ct, nullptr, nullptr,
mem_ptr->propagate_mem, bsks, ksks,
ms_noise_reduction_key, 0, 0);
mem_ptr->propagate_mem, bsks, ksks, 0, 0);
}
template <typename Torus>
@@ -103,14 +98,14 @@ __host__ uint64_t scratch_integer_ilog2(CudaStreams streams,
}
template <typename Torus>
__host__ void host_integer_ilog2(
CudaStreams streams, CudaRadixCiphertextFFI *output_ct,
CudaRadixCiphertextFFI const *input_ct,
CudaRadixCiphertextFFI const *trivial_ct_neg_n,
CudaRadixCiphertextFFI const *trivial_ct_2,
CudaRadixCiphertextFFI const *trivial_ct_m_minus_1_block,
int_ilog2_buffer<Torus> *mem_ptr, void *const *bsks, Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
__host__ void
host_integer_ilog2(CudaStreams streams, CudaRadixCiphertextFFI *output_ct,
CudaRadixCiphertextFFI const *input_ct,
CudaRadixCiphertextFFI const *trivial_ct_neg_n,
CudaRadixCiphertextFFI const *trivial_ct_2,
CudaRadixCiphertextFFI const *trivial_ct_m_minus_1_block,
int_ilog2_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks) {
// Prepare the input ciphertext by computing the number of consecutive
// leading zeros for each of its blocks.
@@ -118,8 +113,7 @@ __host__ void host_integer_ilog2(
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
mem_ptr->ct_in_buffer, input_ct);
host_integer_prepare_count_of_consecutive_bits<Torus>(
streams, mem_ptr->ct_in_buffer, mem_ptr->prepare_mem, bsks, ksks,
ms_noise_reduction_key);
streams, mem_ptr->ct_in_buffer, mem_ptr->prepare_mem, bsks, ksks);
// Build the input for the sum by taking each block's leading zero count
// and placing it into a separate, zero-padded ct slot.
@@ -148,17 +142,17 @@ __host__ void host_integer_ilog2(
//
host_integer_partial_sum_ciphertexts_vec_kb<Torus>(
streams, mem_ptr->sum_output_not_propagated, mem_ptr->sum_input_cts, bsks,
ksks, ms_noise_reduction_key, mem_ptr->sum_mem,
mem_ptr->counter_num_blocks, mem_ptr->input_num_blocks + 1);
ksks, mem_ptr->sum_mem, mem_ptr->counter_num_blocks,
mem_ptr->input_num_blocks + 1);
// Apply luts to the partial sum.
//
host_apply_univariate_lut_kb<Torus>(
streams, mem_ptr->message_blocks_not, mem_ptr->sum_output_not_propagated,
mem_ptr->lut_message_not, ksks, ms_noise_reduction_key, bsks);
host_apply_univariate_lut_kb<Torus>(
streams, mem_ptr->carry_blocks_not, mem_ptr->sum_output_not_propagated,
mem_ptr->lut_carry_not, ksks, ms_noise_reduction_key, bsks);
host_apply_univariate_lut_kb<Torus>(streams, mem_ptr->message_blocks_not,
mem_ptr->sum_output_not_propagated,
mem_ptr->lut_message_not, ksks, bsks);
host_apply_univariate_lut_kb<Torus>(streams, mem_ptr->carry_blocks_not,
mem_ptr->sum_output_not_propagated,
mem_ptr->lut_carry_not, ksks, bsks);
// Left-shift the bitwise-negated carry blocks by one position.
//
@@ -196,12 +190,12 @@ __host__ void host_integer_ilog2(
trivial_ct_2, 0, mem_ptr->counter_num_blocks);
host_integer_partial_sum_ciphertexts_vec_kb<Torus>(
streams, output_ct, mem_ptr->sum_input_cts, bsks, ksks,
ms_noise_reduction_key, mem_ptr->sum_mem, mem_ptr->counter_num_blocks, 3);
streams, output_ct, mem_ptr->sum_input_cts, bsks, ksks, mem_ptr->sum_mem,
mem_ptr->counter_num_blocks, 3);
host_full_propagate_inplace<Torus>(
streams, output_ct, mem_ptr->final_propagate_mem, ksks,
ms_noise_reduction_key, bsks, mem_ptr->counter_num_blocks);
host_full_propagate_inplace<Torus>(streams, output_ct,
mem_ptr->final_propagate_mem, ksks, bsks,
mem_ptr->counter_num_blocks);
}
#endif

View File

@@ -2,18 +2,17 @@
#include "integer/negation.cuh"
#include <linear_algebra.h>
// C entry point for in-place full propagation on 64-bit radix blocks.
// Casts `mem_ptr` to int_fullprop_buffer<uint64_t> * and `ksks` to
// uint64_t **, wraps `streams` in CudaStreams, and forwards to
// host_full_propagate_inplace<uint64_t> together with `input_blocks`, `bsks`
// and `num_blocks`.
// NOTE(review): this text is a diff hunk whose +/- markers were stripped, so
// the signature and the host call each appear twice. The copies that mention
// ms_noise_reduction_key appear to be the pre-change version; the reformatted
// copies after them appear to be the replacement -- confirm against the commit.
void cuda_full_propagation_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *input_blocks,
int8_t *mem_ptr, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks, uint32_t num_blocks) {
void cuda_full_propagation_64_inplace(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *input_blocks,
int8_t *mem_ptr, void *const *ksks,
void *const *bsks, uint32_t num_blocks) {
int_fullprop_buffer<uint64_t> *buffer =
(int_fullprop_buffer<uint64_t> *)mem_ptr;
host_full_propagate_inplace<uint64_t>(
CudaStreams(streams), input_blocks, buffer, (uint64_t **)(ksks),
ms_noise_reduction_key, bsks, num_blocks);
host_full_propagate_inplace<uint64_t>(CudaStreams(streams), input_blocks,
buffer, (uint64_t **)(ksks), bsks,
num_blocks);
}
uint64_t scratch_cuda_full_propagation_64(
@@ -103,27 +102,24 @@ void cuda_propagate_single_carry_kb_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
CudaRadixCiphertextFFI *carry_out, const CudaRadixCiphertextFFI *carry_in,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t requested_flag, uint32_t uses_carry) {
host_propagate_single_carry<uint64_t>(
CudaStreams(streams), lwe_array, carry_out, carry_in,
(int_sc_prop_memory<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, requested_flag, uses_carry);
requested_flag, uses_carry);
}
// C entry point for the 64-bit in-place add-and-propagate-single-carry
// operation. Wraps `streams` in CudaStreams, casts `mem_ptr` to
// int_sc_prop_memory<uint64_t> * and `ksks` to uint64_t **, then forwards
// `lhs_array`, `rhs_array`, `carry_out`, `carry_in`, `bsks`, `requested_flag`
// and `uses_carry` to host_add_and_propagate_single_carry<uint64_t>.
// NOTE(review): this text is a diff hunk whose +/- markers were stripped. The
// lines mentioning ms_noise_reduction_key (and the `void *const *ksks,` /
// `uint32_t requested_flag, ...` lines around them) appear to be the
// pre-change version, each followed by its replacement -- confirm against the
// commit.
void cuda_add_and_propagate_single_carry_kb_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lhs_array,
const CudaRadixCiphertextFFI *rhs_array, CudaRadixCiphertextFFI *carry_out,
const CudaRadixCiphertextFFI *carry_in, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t requested_flag, uint32_t uses_carry) {
void *const *ksks, uint32_t requested_flag, uint32_t uses_carry) {
host_add_and_propagate_single_carry<uint64_t>(
CudaStreams(streams), lhs_array, rhs_array, carry_out, carry_in,
(int_sc_prop_memory<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, requested_flag, uses_carry);
requested_flag, uses_carry);
}
void cuda_integer_overflowing_sub_kb_64_inplace(
@@ -131,15 +127,13 @@ void cuda_integer_overflowing_sub_kb_64_inplace(
const CudaRadixCiphertextFFI *rhs_array,
CudaRadixCiphertextFFI *overflow_block,
const CudaRadixCiphertextFFI *input_borrow, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t compute_overflow, uint32_t uses_input_borrow) {
void *const *bsks, void *const *ksks, uint32_t compute_overflow,
uint32_t uses_input_borrow) {
PUSH_RANGE("overflow sub")
host_integer_overflowing_sub<uint64_t>(
CudaStreams(streams), lhs_array, lhs_array, rhs_array, overflow_block,
input_borrow, (int_borrow_prop_memory<uint64_t> *)mem_ptr, bsks,
(uint64_t **)ksks, ms_noise_reduction_key, compute_overflow,
uses_input_borrow);
(uint64_t **)ksks, compute_overflow, uses_input_borrow);
POP_RANGE()
}
@@ -218,14 +212,11 @@ uint64_t scratch_cuda_apply_many_univariate_lut_kb_64(
// C entry point for applying a univariate LUT to 64-bit radix blocks.
// Wraps `streams` in CudaStreams, casts `mem_ptr` to int_radix_lut<uint64_t> *
// and `ksks` to uint64_t **, then forwards to
// host_apply_univariate_lut_kb<uint64_t> with `output_radix_lwe`,
// `input_radix_lwe` and `bsks`.
// NOTE(review): this text is a diff hunk whose +/- markers were stripped. The
// three-line parameter tail and the call line that mention
// ms_noise_reduction_key appear to be the pre-change version, each followed
// by its replacement -- confirm against the commit.
void cuda_apply_univariate_lut_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks) {
void *const *ksks, void *const *bsks) {
host_apply_univariate_lut_kb<uint64_t>(
CudaStreams(streams), output_radix_lwe, input_radix_lwe,
(int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks),
ms_noise_reduction_key, bsks);
(int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks), bsks);
}
void cleanup_cuda_apply_univariate_lut_kb_64(CudaStreamsFFI streams,
@@ -241,14 +232,13 @@ void cleanup_cuda_apply_univariate_lut_kb_64(CudaStreamsFFI streams,
// C entry point for the "many LUT" variant of the univariate LUT application
// on 64-bit radix blocks. Wraps `streams` in CudaStreams, casts `mem_ptr` to
// int_radix_lut<uint64_t> * and `ksks` to uint64_t **, then forwards to
// host_apply_many_univariate_lut_kb<uint64_t> along with `bsks`,
// `num_many_lut` and `lut_stride`.
// NOTE(review): this text is a diff hunk whose +/- markers were stripped. The
// parameter and argument lines that mention ms_noise_reduction_key appear to
// be the pre-change version, each group followed by its replacement --
// confirm against the commit.
void cuda_apply_many_univariate_lut_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks, uint32_t num_many_lut, uint32_t lut_stride) {
void *const *ksks, void *const *bsks, uint32_t num_many_lut,
uint32_t lut_stride) {
host_apply_many_univariate_lut_kb<uint64_t>(
CudaStreams(streams), output_radix_lwe, input_radix_lwe,
(int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks),
ms_noise_reduction_key, bsks, num_many_lut, lut_stride);
(int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks), bsks,
num_many_lut, lut_stride);
}
uint64_t scratch_cuda_apply_bivariate_lut_kb_64(
@@ -275,15 +265,13 @@ void cuda_apply_bivariate_lut_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI const *input_radix_lwe_1,
CudaRadixCiphertextFFI const *input_radix_lwe_2, int8_t *mem_ptr,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks, uint32_t num_radix_blocks, uint32_t shift) {
void *const *ksks, void *const *bsks, uint32_t num_radix_blocks,
uint32_t shift) {
host_apply_bivariate_lut_kb<uint64_t>(
CudaStreams(streams), output_radix_lwe, input_radix_lwe_1,
input_radix_lwe_2, (int_radix_lut<uint64_t> *)mem_ptr,
(uint64_t **)(ksks), ms_noise_reduction_key, bsks, num_radix_blocks,
shift);
(uint64_t **)(ksks), bsks, num_radix_blocks, shift);
}
void cleanup_cuda_apply_bivariate_lut_kb_64(CudaStreamsFFI streams,
@@ -320,14 +308,12 @@ uint64_t scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
// C entry point for the 64-bit Hillis-Steele prefix-sum helper.
// Wraps `streams` in CudaStreams, casts `mem_ptr` to int_radix_lut<uint64_t> *
// and `ksks` to uint64_t **, then forwards to
// host_compute_prefix_sum_hillis_steele<uint64_t> with `output_radix_lwe`,
// `generates_or_propagates`, `bsks` and `num_radix_blocks`.
// Note the argument order differs between this wrapper (ksks, bsks) and the
// host call (bsks, ksks).
// NOTE(review): this text is a diff hunk whose +/- markers were stripped. The
// lines mentioning ms_noise_reduction_key appear to be the pre-change
// version, each group followed by its replacement -- confirm against the
// commit.
void cuda_integer_compute_prefix_sum_hillis_steele_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI *generates_or_propagates, int8_t *mem_ptr,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks, uint32_t num_radix_blocks) {
void *const *ksks, void *const *bsks, uint32_t num_radix_blocks) {
host_compute_prefix_sum_hillis_steele<uint64_t>(
CudaStreams(streams), output_radix_lwe, generates_or_propagates,
(int_radix_lut<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, num_radix_blocks);
num_radix_blocks);
}
void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
@@ -399,15 +385,12 @@ uint64_t scratch_cuda_apply_noise_squashing_kb(
// C entry point for noise squashing on radix blocks.
// Brackets the work with PUSH_RANGE("apply noise squashing") / POP_RANGE(),
// wraps `streams` in CudaStreams, casts `mem_ptr` to
// int_noise_squashing_lut<uint64_t> * and `ksks` to uint64_t **, then forwards
// to integer_radix_apply_noise_squashing_kb<uint64_t> with `output_radix_lwe`,
// `input_radix_lwe` and `bsks`.
// NOTE(review): this text is a diff hunk whose +/- markers were stripped. The
// lines mentioning ms_noise_reduction_key appear to be the pre-change
// version, each group followed by its replacement -- confirm against the
// commit.
void cuda_apply_noise_squashing_kb(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks) {
void *const *ksks, void *const *bsks) {
PUSH_RANGE("apply noise squashing")
integer_radix_apply_noise_squashing_kb<uint64_t>(
CudaStreams(streams), output_radix_lwe, input_radix_lwe,
(int_noise_squashing_lut<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks,
ms_noise_reduction_key);
(int_noise_squashing_lut<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks);
POP_RANGE()
}

View File

@@ -507,9 +507,7 @@ template <typename Torus>
__host__ void integer_radix_apply_univariate_lookup_table_kb(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
int_radix_lut<Torus> *lut, uint32_t num_radix_blocks) {
Torus *const *ksks, int_radix_lut<Torus> *lut, uint32_t num_radix_blocks) {
PUSH_RANGE("apply lut")
// apply_lookup_table
auto params = lut->params;
@@ -557,10 +555,9 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
execute_pbs_async<Torus, Torus>(
streams.get_ith(0), (Torus *)lwe_array_out->ptr, lut->lwe_indexes_out,
lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
lwe_trivial_indexes_vec[0], bsks, ms_noise_reduction_key, lut->buffer,
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
pbs_level, grouping_factor, num_radix_blocks, pbs_type, num_many_lut,
lut_stride);
lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride);
} else {
/// Make sure all data that should be on GPU 0 is indeed there
cuda_event_record(lut->event_scatter_in, streams.stream(0),
@@ -590,10 +587,9 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
execute_pbs_async<Torus, Torus>(
active_streams, lwe_after_pbs_vec, lwe_trivial_indexes_vec,
lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec,
lwe_trivial_indexes_vec, bsks, ms_noise_reduction_key, lut->buffer,
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
pbs_level, grouping_factor, num_radix_blocks, pbs_type, num_many_lut,
lut_stride);
lwe_trivial_indexes_vec, bsks, lut->buffer, glwe_dimension,
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride);
/// Copy data back to GPU 0 and release vecs
PUSH_RANGE("gather")
@@ -627,9 +623,8 @@ template <typename Torus>
__host__ void integer_radix_apply_many_univariate_lookup_table_kb(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
int_radix_lut<Torus> *lut, uint32_t num_many_lut, uint32_t lut_stride) {
Torus *const *ksks, int_radix_lut<Torus> *lut, uint32_t num_many_lut,
uint32_t lut_stride) {
PUSH_RANGE("apply many lut")
// apply_lookup_table
auto params = lut->params;
@@ -674,10 +669,9 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
execute_pbs_async<Torus, Torus>(
streams.get_ith(0), (Torus *)lwe_array_out->ptr, lut->lwe_indexes_out,
lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
lwe_trivial_indexes_vec[0], bsks, ms_noise_reduction_key, lut->buffer,
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
pbs_level, grouping_factor, num_radix_blocks, pbs_type, num_many_lut,
lut_stride);
lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride);
} else {
/// Make sure all data that should be on GPU 0 is indeed there
cuda_event_record(lut->event_scatter_in, streams.stream(0),
@@ -706,10 +700,9 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
execute_pbs_async<Torus, Torus>(
active_streams, lwe_after_pbs_vec, lwe_trivial_indexes_vec,
lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec,
lwe_trivial_indexes_vec, bsks, ms_noise_reduction_key, lut->buffer,
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
pbs_level, grouping_factor, num_radix_blocks, pbs_type, num_many_lut,
lut_stride);
lwe_trivial_indexes_vec, bsks, lut->buffer, glwe_dimension,
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride);
/// Copy data back to GPU 0 and release vecs
PUSH_RANGE("gather")
@@ -745,9 +738,8 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_1,
CudaRadixCiphertextFFI const *lwe_array_2, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
int_radix_lut<Torus> *lut, uint32_t num_radix_blocks, uint32_t shift) {
Torus *const *ksks, int_radix_lut<Torus> *lut, uint32_t num_radix_blocks,
uint32_t shift) {
PUSH_RANGE("apply bivar lut")
if (lwe_array_out->lwe_dimension != lwe_array_1->lwe_dimension ||
lwe_array_out->lwe_dimension != lwe_array_2->lwe_dimension)
@@ -806,10 +798,9 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
execute_pbs_async<Torus, Torus>(
streams.get_ith(0), (Torus *)(lwe_array_out->ptr), lut->lwe_indexes_out,
lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
lwe_trivial_indexes_vec[0], bsks, ms_noise_reduction_key, lut->buffer,
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
pbs_level, grouping_factor, num_radix_blocks, pbs_type, num_many_lut,
lut_stride);
lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride);
} else {
cuda_event_record(lut->event_scatter_in, streams.stream(0),
streams.gpu_index(0));
@@ -835,10 +826,9 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
execute_pbs_async<Torus, Torus>(
active_streams, lwe_after_pbs_vec, lwe_trivial_indexes_vec,
lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec,
lwe_trivial_indexes_vec, bsks, ms_noise_reduction_key, lut->buffer,
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
pbs_level, grouping_factor, num_radix_blocks, pbs_type, num_many_lut,
lut_stride);
lwe_trivial_indexes_vec, bsks, lut->buffer, glwe_dimension,
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride);
/// Copy data back to GPU 0 and release vecs
PUSH_RANGE("gather")
@@ -1317,9 +1307,7 @@ template <typename Torus>
void host_compute_shifted_blocks_and_states(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array,
int_shifted_blocks_and_states_memory<Torus> *mem, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t lut_stride, uint32_t num_many_lut) {
Torus *const *ksks, uint32_t lut_stride, uint32_t num_many_lut) {
auto num_radix_blocks = lwe_array->num_radix_blocks;
@@ -1328,7 +1316,7 @@ void host_compute_shifted_blocks_and_states(
integer_radix_apply_many_univariate_lookup_table_kb<Torus>(
streams, shifted_blocks_and_states, lwe_array, bsks, ksks,
ms_noise_reduction_key, luts_array_first_step, num_many_lut, lut_stride);
luts_array_first_step, num_many_lut, lut_stride);
auto shifted_blocks = mem->shifted_blocks;
auto block_states = mem->block_states;
@@ -1347,9 +1335,7 @@ void host_resolve_group_carries_sequentially(
CudaStreams streams, CudaRadixCiphertextFFI *resolved_carries,
CudaRadixCiphertextFFI *grouping_pgns, int_radix_params params,
int_seq_group_prop_memory<Torus> *mem, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_groups) {
Torus *const *ksks, uint32_t num_groups) {
auto group_resolved_carries = mem->group_resolved_carries;
if (num_groups > 1) {
@@ -1398,8 +1384,8 @@ void host_resolve_group_carries_sequentially(
blocks_to_solve + 1);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, &shifted_group_resolved_carries,
&shifted_group_resolved_carries, bsks, ksks, ms_noise_reduction_key,
luts_sequential, blocks_to_solve);
&shifted_group_resolved_carries, bsks, ksks, luts_sequential,
blocks_to_solve);
// Copy the result to the resolved carries array
copy_radix_ciphertext_slice_async<Torus>(
@@ -1416,9 +1402,7 @@ template <typename Torus>
void host_compute_prefix_sum_hillis_steele(
CudaStreams streams, CudaRadixCiphertextFFI *step_output,
CudaRadixCiphertextFFI *generates_or_propagates, int_radix_lut<Torus> *luts,
void *const *bsks, Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_radix_blocks) {
void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks) {
if (step_output->lwe_dimension != generates_or_propagates->lwe_dimension)
PANIC("Cuda error: input lwe dimensions must be the same")
@@ -1440,9 +1424,8 @@ void host_compute_prefix_sum_hillis_steele(
int cur_total_blocks = num_radix_blocks - space;
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, &cur_blocks, &cur_blocks, prev_blocks, bsks, ksks,
ms_noise_reduction_key, luts, cur_total_blocks,
luts->params.message_modulus);
streams, &cur_blocks, &cur_blocks, prev_blocks, bsks, ksks, luts,
cur_total_blocks, luts->params.message_modulus);
copy_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), generates_or_propagates, space,
@@ -1462,9 +1445,8 @@ template <typename Torus>
void host_compute_propagation_simulators_and_group_carries(
CudaStreams streams, CudaRadixCiphertextFFI *block_states,
int_radix_params params, int_prop_simu_group_carries_memory<Torus> *mem,
void *const *bsks, Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_radix_blocks, uint32_t num_groups) {
void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks,
uint32_t num_groups) {
if (num_radix_blocks > block_states->num_radix_blocks)
PANIC("Cuda error: input does not have enough radix blocks")
@@ -1481,7 +1463,7 @@ void host_compute_propagation_simulators_and_group_carries(
auto luts_array_second_step = mem->luts_array_second_step;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, propagation_cum_sums, propagation_cum_sums, bsks, ksks,
ms_noise_reduction_key, luts_array_second_step, num_radix_blocks);
luts_array_second_step, num_radix_blocks);
host_integer_radix_scalar_addition_inplace<Torus>(
streams, propagation_cum_sums, mem->scalar_array_cum_sum,
@@ -1500,10 +1482,9 @@ void host_compute_propagation_simulators_and_group_carries(
auto resolved_carries = mem->resolved_carries;
if (mem->use_sequential_algorithm_to_resolve_group_carries) {
// Resolve group carries sequentially
host_resolve_group_carries_sequentially(streams, resolved_carries,
grouping_pgns, params,
mem->seq_group_prop_mem, bsks, ksks,
ms_noise_reduction_key, num_groups);
host_resolve_group_carries_sequentially(
streams, resolved_carries, grouping_pgns, params,
mem->seq_group_prop_mem, bsks, ksks, num_groups);
} else {
// Resolve group carries with hillis steele
auto luts_carry_propagation_sum = mem->hs_group_prop_mem->lut_hillis_steele;
@@ -1512,8 +1493,7 @@ void host_compute_propagation_simulators_and_group_carries(
resolved_carries, 1, num_groups);
host_compute_prefix_sum_hillis_steele<Torus>(
streams, &shifted_resolved_carries, grouping_pgns,
luts_carry_propagation_sum, bsks, ksks, ms_noise_reduction_key,
num_groups - 1);
luts_carry_propagation_sum, bsks, ksks, num_groups - 1);
}
}
@@ -1527,9 +1507,7 @@ template <typename Torus>
void host_compute_shifted_blocks_and_borrow_states(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array,
int_shifted_blocks_and_borrow_states_memory<Torus> *mem, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t lut_stride, uint32_t num_many_lut) {
Torus *const *ksks, uint32_t lut_stride, uint32_t num_many_lut) {
auto num_radix_blocks = lwe_array->num_radix_blocks;
auto shifted_blocks_and_borrow_states = mem->shifted_blocks_and_borrow_states;
@@ -1537,7 +1515,7 @@ void host_compute_shifted_blocks_and_borrow_states(
integer_radix_apply_many_univariate_lookup_table_kb<Torus>(
streams, shifted_blocks_and_borrow_states, lwe_array, bsks, ksks,
ms_noise_reduction_key, luts_array_first_step, num_many_lut, lut_stride);
luts_array_first_step, num_many_lut, lut_stride);
auto shifted_blocks = mem->shifted_blocks;
auto borrow_states = mem->borrow_states;
@@ -1559,11 +1537,11 @@ void host_compute_shifted_blocks_and_borrow_states(
* have size = 2 * (glwe_dimension * polynomial_size + 1) * sizeof(Torus)
*/
template <typename Torus>
void host_full_propagate_inplace(
CudaStreams streams, CudaRadixCiphertextFFI *input_blocks,
int_fullprop_buffer<Torus> *mem_ptr, Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks, uint32_t num_blocks) {
void host_full_propagate_inplace(CudaStreams streams,
CudaRadixCiphertextFFI *input_blocks,
int_fullprop_buffer<Torus> *mem_ptr,
Torus *const *ksks, void *const *bsks,
uint32_t num_blocks) {
auto params = mem_ptr->lut->params;
// In the case of extracting a single LWE these parameters are dummy
@@ -1589,8 +1567,8 @@ void host_full_propagate_inplace(
mem_ptr->lut->lwe_trivial_indexes, mem_ptr->lut->lut_vec,
mem_ptr->lut->lut_indexes_vec,
(Torus *)mem_ptr->tmp_small_lwe_vector->ptr,
mem_ptr->lut->lwe_trivial_indexes, bsks, ms_noise_reduction_key,
mem_ptr->lut->buffer, params.glwe_dimension, params.small_lwe_dimension,
mem_ptr->lut->lwe_trivial_indexes, bsks, mem_ptr->lut->buffer,
params.glwe_dimension, params.small_lwe_dimension,
params.polynomial_size, params.pbs_base_log, params.pbs_level,
params.grouping_factor, 2, params.pbs_type, num_many_lut, lut_stride);
@@ -1721,13 +1699,12 @@ __host__ void scalar_pack_blocks(cudaStream_t stream, uint32_t gpu_index,
* * (lwe_dimension+1) * sizeof(Torus) bytes
*/
template <typename Torus>
__host__ void extract_n_bits(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
const CudaRadixCiphertextFFI *lwe_array_in, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t effective_num_radix_blocks, uint32_t num_radix_blocks,
int_bit_extract_luts_buffer<Torus> *bit_extract) {
__host__ void
extract_n_bits(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
const CudaRadixCiphertextFFI *lwe_array_in, void *const *bsks,
Torus *const *ksks, uint32_t effective_num_radix_blocks,
uint32_t num_radix_blocks,
int_bit_extract_luts_buffer<Torus> *bit_extract) {
copy_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), lwe_array_out, 0,
@@ -1741,19 +1718,17 @@ __host__ void extract_n_bits(
}
}
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, lwe_array_out, lwe_array_out, bsks, ksks, ms_noise_reduction_key,
bit_extract->lut, effective_num_radix_blocks);
streams, lwe_array_out, lwe_array_out, bsks, ksks, bit_extract->lut,
effective_num_radix_blocks);
}
template <typename Torus>
__host__ void reduce_signs(
CudaStreams streams, CudaRadixCiphertextFFI *signs_array_out,
CudaRadixCiphertextFFI *signs_array_in,
int_comparison_buffer<Torus> *mem_ptr,
std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_sign_blocks) {
__host__ void
reduce_signs(CudaStreams streams, CudaRadixCiphertextFFI *signs_array_out,
CudaRadixCiphertextFFI *signs_array_in,
int_comparison_buffer<Torus> *mem_ptr,
std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
Torus *const *ksks, uint32_t num_sign_blocks) {
if (signs_array_out->lwe_dimension != signs_array_in->lwe_dimension)
PANIC("Cuda error: input lwe dimensions must be the same")
@@ -1799,8 +1774,7 @@ __host__ void reduce_signs(
pack_blocks<Torus>(streams.stream(0), streams.gpu_index(0), signs_b,
signs_a, num_sign_blocks, message_modulus);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, signs_a, signs_b, bsks, ksks, ms_noise_reduction_key, lut,
num_sign_blocks / 2);
streams, signs_a, signs_b, bsks, ksks, lut, num_sign_blocks / 2);
if (num_sign_blocks % 2 == 1)
copy_radix_ciphertext_slice_async<Torus>(
@@ -1830,8 +1804,7 @@ __host__ void reduce_signs(
pack_blocks<Torus>(streams.stream(0), streams.gpu_index(0), signs_b,
signs_a, num_sign_blocks, message_modulus);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, signs_array_out, signs_b, bsks, ksks, ms_noise_reduction_key,
lut, 1);
streams, signs_array_out, signs_b, bsks, ksks, lut, 1);
} else {
@@ -1849,8 +1822,7 @@ __host__ void reduce_signs(
lut->broadcast_lut(lut->active_streams);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, signs_array_out, signs_a, bsks, ksks, ms_noise_reduction_key,
lut, 1);
streams, signs_array_out, signs_a, bsks, ksks, lut, 1);
}
}
@@ -1877,16 +1849,15 @@ uint64_t scratch_cuda_apply_univariate_lut_kb(
}
template <typename Torus>
void host_apply_univariate_lut_kb(
CudaStreams streams, CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI const *radix_lwe_in, int_radix_lut<Torus> *mem,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks) {
void host_apply_univariate_lut_kb(CudaStreams streams,
CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI const *radix_lwe_in,
int_radix_lut<Torus> *mem, Torus *const *ksks,
void *const *bsks) {
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, radix_lwe_out, radix_lwe_in, bsks, ksks, ms_noise_reduction_key,
mem, radix_lwe_out->num_radix_blocks);
streams, radix_lwe_out, radix_lwe_in, bsks, ksks, mem,
radix_lwe_out->num_radix_blocks);
}
template <typename Torus>
@@ -1916,13 +1887,12 @@ template <typename Torus>
void host_apply_many_univariate_lut_kb(
CudaStreams streams, CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI const *radix_lwe_in, int_radix_lut<Torus> *mem,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks, uint32_t num_many_lut, uint32_t lut_stride) {
Torus *const *ksks, void *const *bsks, uint32_t num_many_lut,
uint32_t lut_stride) {
integer_radix_apply_many_univariate_lookup_table_kb<Torus>(
streams, radix_lwe_out, radix_lwe_in, bsks, ksks, ms_noise_reduction_key,
mem, num_many_lut, lut_stride);
streams, radix_lwe_out, radix_lwe_in, bsks, ksks, mem, num_many_lut,
lut_stride);
}
template <typename Torus>
@@ -1948,17 +1918,17 @@ uint64_t scratch_cuda_apply_bivariate_lut_kb(
}
template <typename Torus>
void host_apply_bivariate_lut_kb(
CudaStreams streams, CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI const *radix_lwe_in_1,
CudaRadixCiphertextFFI const *radix_lwe_in_2, int_radix_lut<Torus> *mem,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks, uint32_t num_radix_blocks, uint32_t shift) {
void host_apply_bivariate_lut_kb(CudaStreams streams,
CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI const *radix_lwe_in_1,
CudaRadixCiphertextFFI const *radix_lwe_in_2,
int_radix_lut<Torus> *mem, Torus *const *ksks,
void *const *bsks, uint32_t num_radix_blocks,
uint32_t shift) {
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, radix_lwe_out, radix_lwe_in_1, radix_lwe_in_2, bsks, ksks,
ms_noise_reduction_key, mem, num_radix_blocks, shift);
streams, radix_lwe_out, radix_lwe_in_1, radix_lwe_in_2, bsks, ksks, mem,
num_radix_blocks, shift);
}
template <typename Torus>
@@ -1977,13 +1947,13 @@ uint64_t scratch_cuda_propagate_single_carry_kb_inplace(
// This function performs the three steps of Thomas' new carry propagation and
// includes the logic to extract overflow when requested
template <typename Torus>
void host_propagate_single_carry(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array,
CudaRadixCiphertextFFI *carry_out,
const CudaRadixCiphertextFFI *input_carries, int_sc_prop_memory<Torus> *mem,
void *const *bsks, Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t requested_flag, uint32_t uses_carry) {
void host_propagate_single_carry(CudaStreams streams,
CudaRadixCiphertextFFI *lwe_array,
CudaRadixCiphertextFFI *carry_out,
const CudaRadixCiphertextFFI *input_carries,
int_sc_prop_memory<Torus> *mem,
void *const *bsks, Torus *const *ksks,
uint32_t requested_flag, uint32_t uses_carry) {
PUSH_RANGE("propagate sc")
auto num_radix_blocks = lwe_array->num_radix_blocks;
auto params = mem->params;
@@ -2006,8 +1976,8 @@ void host_propagate_single_carry(
// Step 1
host_compute_shifted_blocks_and_states<Torus>(
streams, lwe_array, mem->shifted_blocks_state_mem, bsks, ksks,
ms_noise_reduction_key, lut_stride, num_many_lut);
streams, lwe_array, mem->shifted_blocks_state_mem, bsks, ksks, lut_stride,
num_many_lut);
auto block_states = mem->shifted_blocks_state_mem->block_states;
if (requested_flag == outputFlag::FLAG_CARRY) {
@@ -2018,7 +1988,7 @@ void host_propagate_single_carry(
// Step 2
host_compute_propagation_simulators_and_group_carries<Torus>(
streams, block_states, params, mem->prop_simu_group_carries_mem, bsks,
ksks, ms_noise_reduction_key, num_radix_blocks, mem->num_groups);
ksks, num_radix_blocks, mem->num_groups);
auto group_size = mem->prop_simu_group_carries_mem->group_size;
@@ -2059,7 +2029,7 @@ void host_propagate_single_carry(
num_radix_blocks, num_radix_blocks + 1, &output_flag, 0, 1);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, mem->output_flag, prepared_blocks, bsks, ksks,
ms_noise_reduction_key, mem->lut_message_extract, num_radix_blocks + 1);
mem->lut_message_extract, num_radix_blocks + 1);
copy_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), lwe_array, 0, num_radix_blocks,
@@ -2070,8 +2040,8 @@ void host_propagate_single_carry(
} else {
auto message_extract = mem->lut_message_extract;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, lwe_array, prepared_blocks, bsks, ksks, ms_noise_reduction_key,
message_extract, num_radix_blocks);
streams, lwe_array, prepared_blocks, bsks, ksks, message_extract,
num_radix_blocks);
}
POP_RANGE()
}
@@ -2083,9 +2053,8 @@ void host_add_and_propagate_single_carry(
CudaStreams streams, CudaRadixCiphertextFFI *lhs_array,
const CudaRadixCiphertextFFI *rhs_array, CudaRadixCiphertextFFI *carry_out,
const CudaRadixCiphertextFFI *input_carries, int_sc_prop_memory<Torus> *mem,
void *const *bsks, Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t requested_flag, uint32_t uses_carry) {
void *const *bsks, Torus *const *ksks, uint32_t requested_flag,
uint32_t uses_carry) {
PUSH_RANGE("add & propagate sc")
if (lhs_array->num_radix_blocks != rhs_array->num_radix_blocks)
PANIC("Cuda error: input and output num radix blocks must be the same")
@@ -2138,15 +2107,14 @@ void host_add_and_propagate_single_carry(
}
// Step 1
host_compute_shifted_blocks_and_states<Torus>(
streams, lhs_array, mem->shifted_blocks_state_mem, bsks, ksks,
ms_noise_reduction_key, lut_stride, num_many_lut);
streams, lhs_array, mem->shifted_blocks_state_mem, bsks, ksks, lut_stride,
num_many_lut);
auto block_states = mem->shifted_blocks_state_mem->block_states;
if (requested_flag == outputFlag::FLAG_OVERFLOW) {
auto lut_overflow_prep = mem->lut_overflow_flag_prep;
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, &output_flag, mem->last_lhs, mem->last_rhs, bsks, ksks,
ms_noise_reduction_key, lut_overflow_prep, 1,
lut_overflow_prep->params.message_modulus);
lut_overflow_prep, 1, lut_overflow_prep->params.message_modulus);
} else if (requested_flag == outputFlag::FLAG_CARRY) {
copy_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), &output_flag, 0, 1,
@@ -2156,7 +2124,7 @@ void host_add_and_propagate_single_carry(
// Step 2
host_compute_propagation_simulators_and_group_carries<Torus>(
streams, block_states, params, mem->prop_simu_group_carries_mem, bsks,
ksks, ms_noise_reduction_key, num_radix_blocks, mem->num_groups);
ksks, num_radix_blocks, mem->num_groups);
auto group_size = mem->prop_simu_group_carries_mem->group_size;
@@ -2209,7 +2177,7 @@ void host_add_and_propagate_single_carry(
num_radix_blocks, num_radix_blocks + 1, &output_flag, 0, 1);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, mem->output_flag, prepared_blocks, bsks, ksks,
ms_noise_reduction_key, mem->lut_message_extract, num_radix_blocks + 1);
mem->lut_message_extract, num_radix_blocks + 1);
copy_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), lhs_array, 0, num_radix_blocks,
@@ -2220,7 +2188,7 @@ void host_add_and_propagate_single_carry(
mem->output_flag, num_radix_blocks, num_radix_blocks + 1);
} else {
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, lhs_array, prepared_blocks, bsks, ksks, ms_noise_reduction_key,
streams, lhs_array, prepared_blocks, bsks, ksks,
mem->lut_message_extract, num_radix_blocks);
}
POP_RANGE()
@@ -2243,14 +2211,15 @@ uint64_t scratch_cuda_integer_overflowing_sub(
// This function performs the three steps of Thomas' new borrow propagation and
// includes the logic to extract overflow when requested
template <typename Torus>
void host_single_borrow_propagate(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array,
CudaRadixCiphertextFFI *overflow_block,
const CudaRadixCiphertextFFI *input_borrow,
int_borrow_prop_memory<Torus> *mem, void *const *bsks, Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_groups, uint32_t compute_overflow,
uint32_t uses_input_borrow) {
void host_single_borrow_propagate(CudaStreams streams,
CudaRadixCiphertextFFI *lwe_array,
CudaRadixCiphertextFFI *overflow_block,
const CudaRadixCiphertextFFI *input_borrow,
int_borrow_prop_memory<Torus> *mem,
void *const *bsks, Torus *const *ksks,
uint32_t num_groups,
uint32_t compute_overflow,
uint32_t uses_input_borrow) {
auto num_radix_blocks = lwe_array->num_radix_blocks;
auto params = mem->params;
auto glwe_dimension = params.glwe_dimension;
@@ -2272,7 +2241,7 @@ void host_single_borrow_propagate(
// Step 1
host_compute_shifted_blocks_and_borrow_states<Torus>(
streams, lwe_array, mem->shifted_blocks_borrow_state_mem, bsks, ksks,
ms_noise_reduction_key, lut_stride, num_many_lut);
lut_stride, num_many_lut);
auto borrow_states = mem->shifted_blocks_borrow_state_mem->borrow_states;
copy_radix_ciphertext_slice_async<Torus>(
@@ -2282,7 +2251,7 @@ void host_single_borrow_propagate(
// Step 2
host_compute_propagation_simulators_and_group_carries<Torus>(
streams, borrow_states, params, mem->prop_simu_group_carries_mem, bsks,
ksks, ms_noise_reduction_key, num_radix_blocks, num_groups);
ksks, num_radix_blocks, num_groups);
auto shifted_blocks =
(Torus *)mem->shifted_blocks_borrow_state_mem->shifted_blocks->ptr;
@@ -2336,7 +2305,7 @@ void host_single_borrow_propagate(
auto borrow_flag = mem->lut_borrow_flag;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem->sub_streams_1, overflow_block, mem->overflow_block, bsks, ksks,
ms_noise_reduction_key, borrow_flag, 1);
borrow_flag, 1);
}
for (int j = 0; j < mem->active_streams.count(); j++) {
cuda_event_record(mem->outgoing_events1[j], mem->sub_streams_1.stream(j),
@@ -2358,7 +2327,7 @@ void host_single_borrow_propagate(
auto message_extract = mem->lut_message_extract;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem->sub_streams_2, lwe_array, prepared_blocks, bsks, ksks,
ms_noise_reduction_key, message_extract, num_radix_blocks);
message_extract, num_radix_blocks);
for (int j = 0; j < mem->active_streams.count(); j++) {
cuda_event_record(mem->outgoing_events2[j], mem->sub_streams_2.stream(j),
@@ -2378,8 +2347,7 @@ __host__ void integer_radix_apply_noise_squashing_kb(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in,
int_noise_squashing_lut<InputTorus> *lut, void *const *bsks,
InputTorus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
InputTorus *const *ksks) {
PUSH_RANGE("apply noise squashing")
auto params = lut->params;
@@ -2431,11 +2399,10 @@ __host__ void integer_radix_apply_noise_squashing_kb(
execute_pbs_async<uint64_t, __uint128_t>(
streams.get_ith(0), (__uint128_t *)lwe_array_out->ptr,
lwe_trivial_indexes_vec[0], lut->lut_vec, lwe_trivial_indexes_vec,
lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks,
ms_noise_reduction_key, lut->pbs_buffer, glwe_dimension,
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
grouping_factor, lwe_array_out->num_radix_blocks, params.pbs_type, 0,
0);
lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks, lut->pbs_buffer,
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
pbs_level, grouping_factor, lwe_array_out->num_radix_blocks,
params.pbs_type, 0, 0);
} else {
/// Make sure all data that should be on GPU 0 is indeed there
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
@@ -2459,10 +2426,10 @@ __host__ void integer_radix_apply_noise_squashing_kb(
execute_pbs_async<uint64_t, __uint128_t>(
active_streams, lwe_after_pbs_vec, lwe_trivial_indexes_vec,
lut->lut_vec, lwe_trivial_indexes_vec, lwe_after_ks_vec,
lwe_trivial_indexes_vec, bsks, ms_noise_reduction_key, lut->pbs_buffer,
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
pbs_level, grouping_factor, lwe_array_out->num_radix_blocks,
params.pbs_type, 0, 0);
lwe_trivial_indexes_vec, bsks, lut->pbs_buffer, glwe_dimension,
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
grouping_factor, lwe_array_out->num_radix_blocks, params.pbs_type, 0,
0);
/// Copy data back to GPU 0 and release vecs
/// In apply noise squashing we always use trivial indexes

View File

@@ -128,59 +128,51 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI const *radix_lwe_left, bool const is_bool_left,
CudaRadixCiphertextFFI const *radix_lwe_right, bool const is_bool_right,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
int8_t *mem_ptr, uint32_t polynomial_size, uint32_t num_blocks) {
void *const *bsks, void *const *ksks, int8_t *mem_ptr,
uint32_t polynomial_size, uint32_t num_blocks) {
PUSH_RANGE("mul")
switch (polynomial_size) {
case 256:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<256>>(
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, (int_mul_memory<uint64_t> *)mem_ptr,
num_blocks);
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
break;
case 512:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<512>>(
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, (int_mul_memory<uint64_t> *)mem_ptr,
num_blocks);
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
break;
case 1024:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<1024>>(
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, (int_mul_memory<uint64_t> *)mem_ptr,
num_blocks);
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
break;
case 2048:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<2048>>(
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, (int_mul_memory<uint64_t> *)mem_ptr,
num_blocks);
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
break;
case 4096:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<4096>>(
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, (int_mul_memory<uint64_t> *)mem_ptr,
num_blocks);
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
break;
case 8192:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<8192>>(
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, (int_mul_memory<uint64_t> *)mem_ptr,
num_blocks);
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
break;
case 16384:
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<16384>>(
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, (int_mul_memory<uint64_t> *)mem_ptr,
num_blocks);
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
break;
default:
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
@@ -225,8 +217,7 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI *radix_lwe_vec, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
void *const *ksks) {
auto mem = (int_sum_ciphertexts_vec_memory<uint64_t> *)mem_ptr;
if (radix_lwe_vec->num_radix_blocks % radix_lwe_out->num_radix_blocks != 0)
@@ -234,8 +225,7 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
"output's number of radix blocks")
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t>(
CudaStreams(streams), radix_lwe_out, radix_lwe_vec, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, mem,
radix_lwe_out->num_radix_blocks,
(uint64_t **)(ksks), mem, radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
}

View File

@@ -291,7 +291,6 @@ template <typename Torus>
__host__ void host_integer_partial_sum_ciphertexts_vec_kb(
CudaStreams streams, CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI *terms, void *const *bsks, uint64_t *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr,
uint32_t num_radix_blocks, uint32_t num_radix_in_vec) {
auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension;
@@ -407,8 +406,8 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
streams.get_ith(0), (Torus *)current_blocks->ptr, d_pbs_indexes_out,
luts_message_carry->lut_vec, luts_message_carry->lut_indexes_vec,
(Torus *)small_lwe_vector->ptr, d_pbs_indexes_in, bsks,
ms_noise_reduction_key, luts_message_carry->buffer, glwe_dimension,
small_lwe_dimension, polynomial_size, mem_ptr->params.pbs_base_log,
luts_message_carry->buffer, glwe_dimension, small_lwe_dimension,
polynomial_size, mem_ptr->params.pbs_base_log,
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
total_ciphertexts, mem_ptr->params.pbs_type, num_many_lut,
lut_stride);
@@ -420,7 +419,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, current_blocks, current_blocks, bsks, ksks,
ms_noise_reduction_key, luts_message_carry, total_ciphertexts);
luts_message_carry, total_ciphertexts);
}
cuda_set_device(streams.gpu_index(0));
std::swap(d_columns, d_new_columns);
@@ -458,8 +457,8 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
streams.get_ith(0), (Torus *)current_blocks->ptr, d_pbs_indexes_out,
luts_message_carry->lut_vec, luts_message_carry->lut_indexes_vec,
(Torus *)small_lwe_vector->ptr, d_pbs_indexes_in, bsks,
ms_noise_reduction_key, luts_message_carry->buffer, glwe_dimension,
small_lwe_dimension, polynomial_size, mem_ptr->params.pbs_base_log,
luts_message_carry->buffer, glwe_dimension, small_lwe_dimension,
polynomial_size, mem_ptr->params.pbs_base_log,
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
2 * num_radix_blocks, mem_ptr->params.pbs_type, num_many_lut,
lut_stride);
@@ -471,7 +470,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
active_streams, current_blocks, radix_lwe_out, bsks, ksks,
ms_noise_reduction_key, luts_message_carry, num_blocks_in_apply_lut);
luts_message_carry, num_blocks_in_apply_lut);
}
calculate_final_degrees(radix_lwe_out->degrees, terms->degrees,
num_radix_blocks, num_radix_in_vec, chunk_size,
@@ -493,9 +492,8 @@ __host__ void host_integer_mult_radix_kb(
CudaStreams streams, CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI const *radix_lwe_left, bool const is_bool_left,
CudaRadixCiphertextFFI const *radix_lwe_right, bool const is_bool_right,
void *const *bsks, uint64_t *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
int_mul_memory<Torus> *mem_ptr, uint32_t num_blocks) {
void *const *bsks, uint64_t *const *ksks, int_mul_memory<Torus> *mem_ptr,
uint32_t num_blocks) {
if (radix_lwe_out->lwe_dimension != radix_lwe_left->lwe_dimension ||
radix_lwe_right->lwe_dimension != radix_lwe_left->lwe_dimension)
@@ -513,14 +511,14 @@ __host__ void host_integer_mult_radix_kb(
if (is_bool_right) {
zero_out_if<Torus>(streams, radix_lwe_out, radix_lwe_left, radix_lwe_right,
mem_ptr->zero_out_mem, mem_ptr->zero_out_predicate_lut,
bsks, ksks, ms_noise_reduction_key, num_blocks);
bsks, ksks, num_blocks);
return;
}
if (is_bool_left) {
zero_out_if<Torus>(streams, radix_lwe_out, radix_lwe_right, radix_lwe_left,
mem_ptr->zero_out_mem, mem_ptr->zero_out_predicate_lut,
bsks, ksks, ms_noise_reduction_key, num_blocks);
bsks, ksks, num_blocks);
return;
}
@@ -589,8 +587,7 @@ __host__ void host_integer_mult_radix_kb(
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, block_mul_res, block_mul_res, vector_result_sb, bsks, ksks,
ms_noise_reduction_key, luts_array, total_block_count,
luts_array->params.message_modulus);
luts_array, total_block_count, luts_array->params.message_modulus);
vector_result_lsb = block_mul_res;
as_radix_ciphertext_slice<Torus>(&vector_result_msb, block_mul_res,
@@ -618,15 +615,14 @@ __host__ void host_integer_mult_radix_kb(
}
host_integer_partial_sum_ciphertexts_vec_kb<Torus>(
streams, radix_lwe_out, vector_result_sb, bsks, ksks,
ms_noise_reduction_key, mem_ptr->sum_ciphertexts_mem, num_blocks,
2 * num_blocks);
mem_ptr->sum_ciphertexts_mem, num_blocks, 2 * num_blocks);
auto scp_mem_ptr = mem_ptr->sc_prop_mem;
uint32_t requested_flag = outputFlag::FLAG_NONE;
uint32_t uses_carry = 0;
host_propagate_single_carry<Torus>(
streams, radix_lwe_out, nullptr, nullptr, scp_mem_ptr, bsks, ksks,
ms_noise_reduction_key, requested_flag, uses_carry);
host_propagate_single_carry<Torus>(streams, radix_lwe_out, nullptr, nullptr,
scp_mem_ptr, bsks, ksks, requested_flag,
uses_carry);
}
template <typename Torus>

View File

@@ -134,9 +134,7 @@ __host__ void host_integer_overflowing_sub(
CudaRadixCiphertextFFI *overflow_block,
const CudaRadixCiphertextFFI *input_borrow,
int_borrow_prop_memory<uint64_t> *mem_ptr, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t compute_overflow, uint32_t uses_input_borrow) {
Torus *const *ksks, uint32_t compute_overflow, uint32_t uses_input_borrow) {
PUSH_RANGE("overflowing sub")
if (output->num_radix_blocks != input_left->num_radix_blocks ||
output->num_radix_blocks != input_right->num_radix_blocks)
@@ -166,7 +164,7 @@ __host__ void host_integer_overflowing_sub(
host_single_borrow_propagate<Torus>(
streams, output, overflow_block, input_borrow,
(int_borrow_prop_memory<Torus> *)mem_ptr, bsks, (Torus **)(ksks),
ms_noise_reduction_key, num_groups, compute_overflow, uses_input_borrow);
num_groups, compute_overflow, uses_input_borrow);
POP_RANGE()
}

View File

@@ -21,16 +21,15 @@ uint64_t scratch_cuda_integer_grouped_oprf_64(
allocate_gpu_memory);
}
void cuda_integer_grouped_oprf_async_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *radix_lwe_out,
const void *seeded_lwe_input, uint32_t num_blocks_to_process, int8_t *mem,
void *const *bsks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
void cuda_integer_grouped_oprf_async_64(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *radix_lwe_out,
const void *seeded_lwe_input,
uint32_t num_blocks_to_process,
int8_t *mem, void *const *bsks) {
host_integer_grouped_oprf<uint64_t>(
CudaStreams(streams), radix_lwe_out, (const uint64_t *)seeded_lwe_input,
num_blocks_to_process, (int_grouped_oprf_memory<uint64_t> *)mem, bsks,
ms_noise_reduction_key);
num_blocks_to_process, (int_grouped_oprf_memory<uint64_t> *)mem, bsks);
}
void cleanup_cuda_integer_grouped_oprf_64(CudaStreamsFFI streams,

View File

@@ -20,11 +20,12 @@ uint64_t scratch_cuda_integer_grouped_oprf(
}
template <typename Torus>
void host_integer_grouped_oprf(
CudaStreams streams, CudaRadixCiphertextFFI *radix_lwe_out,
const Torus *seeded_lwe_input, uint32_t num_blocks_to_process,
int_grouped_oprf_memory<Torus> *mem_ptr, void *const *bsks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
void host_integer_grouped_oprf(CudaStreams streams,
CudaRadixCiphertextFFI *radix_lwe_out,
const Torus *seeded_lwe_input,
uint32_t num_blocks_to_process,
int_grouped_oprf_memory<Torus> *mem_ptr,
void *const *bsks) {
auto active_streams = streams.active_gpu_subset(num_blocks_to_process);
auto lut = mem_ptr->luts;
@@ -34,7 +35,7 @@ void host_integer_grouped_oprf(
streams.get_ith(0), (Torus *)(radix_lwe_out->ptr), lut->lwe_indexes_out,
lut->lut_vec, lut->lut_indexes_vec,
const_cast<Torus *>(seeded_lwe_input), lut->lwe_indexes_in, bsks,
ms_noise_reduction_key, lut->buffer, mem_ptr->params.glwe_dimension,
lut->buffer, mem_ptr->params.glwe_dimension,
mem_ptr->params.small_lwe_dimension, mem_ptr->params.polynomial_size,
mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
mem_ptr->params.grouping_factor, num_blocks_to_process,
@@ -62,7 +63,7 @@ void host_integer_grouped_oprf(
execute_pbs_async<Torus, Torus>(
active_streams, lwe_after_pbs_vec, lwe_trivial_indexes_vec,
lut->lut_vec, lut->lut_indexes_vec, lwe_array_in_vec,
lwe_trivial_indexes_vec, bsks, ms_noise_reduction_key, lut->buffer,
lwe_trivial_indexes_vec, bsks, lut->buffer,
mem_ptr->params.glwe_dimension, mem_ptr->params.small_lwe_dimension,
mem_ptr->params.polynomial_size, mem_ptr->params.pbs_base_log,
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,

View File

@@ -4,15 +4,13 @@ void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_input, void const *clear_blocks,
void const *h_clear_blocks, uint32_t num_clear_blocks, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
void *const *bsks, void *const *ksks) {
host_integer_radix_scalar_bitop_kb<uint64_t>(
CudaStreams(streams), lwe_array_out, lwe_array_input,
static_cast<const uint64_t *>(clear_blocks),
static_cast<const uint64_t *>(h_clear_blocks), num_clear_blocks,
(int_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key);
(int_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks));
}
void update_degrees_after_scalar_bitand(uint64_t *output_degrees,

View File

@@ -9,8 +9,7 @@ __host__ void host_integer_radix_scalar_bitop_kb(
CudaStreams streams, CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input, Torus const *clear_blocks,
Torus const *h_clear_blocks, uint32_t num_clear_blocks,
int_bitop_buffer<Torus> *mem_ptr, void *const *bsks, Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
int_bitop_buffer<Torus> *mem_ptr, void *const *bsks, Torus *const *ksks) {
if (output->num_radix_blocks != input->num_radix_blocks)
PANIC("Cuda error: input and output num radix blocks must be equal")
@@ -50,8 +49,7 @@ __host__ void host_integer_radix_scalar_bitop_kb(
lut->broadcast_lut(active_streams, false);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, output, input, bsks, ksks, ms_noise_reduction_key, lut,
num_clear_blocks);
streams, output, input, bsks, ksks, lut, num_clear_blocks);
memcpy(output->degrees, degrees, num_clear_blocks * sizeof(uint64_t));
if (op == SCALAR_BITAND && num_clear_blocks < num_radix_blocks) {

View File

@@ -35,9 +35,7 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, void const *scalar_blocks,
void const *h_scalar_blocks, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_scalar_blocks) {
void *const *ksks, uint32_t num_scalar_blocks) {
// The output ciphertext might be a boolean block or a radix ciphertext
// depending on the case (eq/gt vs max/min) so the amount of blocks to
@@ -51,8 +49,7 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
host_integer_radix_scalar_equality_check_kb<uint64_t>(
CudaStreams(streams), lwe_array_out, lwe_array_in,
static_cast<const uint64_t *>(scalar_blocks), buffer, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, num_radix_blocks,
num_scalar_blocks);
(uint64_t **)(ksks), num_radix_blocks, num_scalar_blocks);
break;
case GT:
case GE:
@@ -66,7 +63,7 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
static_cast<const uint64_t *>(scalar_blocks),
static_cast<const uint64_t *>(h_scalar_blocks), buffer,
buffer->diff_buffer->operator_f, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, num_radix_blocks, num_scalar_blocks);
num_radix_blocks, num_scalar_blocks);
break;
case MAX:
case MIN:
@@ -77,8 +74,7 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
CudaStreams(streams), lwe_array_out, lwe_array_in,
static_cast<const uint64_t *>(scalar_blocks),
static_cast<const uint64_t *>(h_scalar_blocks), buffer, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, num_radix_blocks,
num_scalar_blocks);
(uint64_t **)(ksks), num_radix_blocks, num_scalar_blocks);
break;
default:
PANIC("Cuda error: integer operation not supported")

View File

@@ -29,9 +29,7 @@ __host__ void scalar_compare_radix_blocks_kb(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI *lwe_array_in, Torus *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_radix_blocks) {
Torus *const *ksks, uint32_t num_radix_blocks) {
if (num_radix_blocks == 0)
return;
@@ -71,8 +69,8 @@ __host__ void scalar_compare_radix_blocks_kb(
// Apply LUT to compare to 0
auto sign_lut = mem_ptr->eq_buffer->is_non_zero_lut;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, lwe_array_out, subtracted_blocks, bsks, ksks,
ms_noise_reduction_key, sign_lut, num_radix_blocks);
streams, lwe_array_out, subtracted_blocks, bsks, ksks, sign_lut,
num_radix_blocks);
// FIXME: without this sync signed scalar eq tests fail, I don't understand
// the reason
@@ -90,9 +88,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
Torus const *h_scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {
Torus *const *ksks, uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {
if (lwe_array_out->lwe_dimension != lwe_array_in->lwe_dimension)
PANIC("Cuda error: input lwe dimensions must be the same")
if (lwe_array_in->num_radix_blocks < num_radix_blocks)
@@ -132,11 +128,10 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
// means scalar is zero
host_compare_blocks_with_zero<Torus>(
streams, mem_ptr->tmp_lwe_array_out, lwe_array_in, mem_ptr, bsks, ksks,
ms_noise_reduction_key, num_radix_blocks, mem_ptr->is_zero_lut);
num_radix_blocks, mem_ptr->is_zero_lut);
are_all_comparisons_block_true<Torus>(
streams, mem_ptr->tmp_lwe_array_out, mem_ptr->tmp_lwe_array_out,
mem_ptr, bsks, ksks, ms_noise_reduction_key,
mem_ptr->tmp_lwe_array_out->num_radix_blocks);
mem_ptr, bsks, ksks, mem_ptr->tmp_lwe_array_out->num_radix_blocks);
auto scalar_last_leaf_lut_f = [sign_handler_f](Torus x) -> Torus {
x = (x == 1 ? IS_EQUAL : IS_SUPERIOR);
@@ -154,8 +149,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
lut->broadcast_lut(active_streams);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, lwe_array_out, mem_ptr->tmp_lwe_array_out, bsks, ksks,
ms_noise_reduction_key, lut, 1);
streams, lwe_array_out, mem_ptr->tmp_lwe_array_out, bsks, ksks, lut, 1);
} else if (num_scalar_blocks < num_radix_blocks) {
// We have to handle both part of the work described above
@@ -207,7 +201,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
auto comparisons = mem_ptr->tmp_block_comparisons;
scalar_compare_radix_blocks_kb<Torus>(
lsb_streams, comparisons, diff_buffer->tmp_packed, (Torus *)rhs.ptr,
mem_ptr, bsks, ksks, ms_noise_reduction_key, num_lsb_radix_blocks);
mem_ptr, bsks, ksks, num_lsb_radix_blocks);
// Reduces a vec containing radix blocks that encrypts a sign
// (inferior, equal, superior) to one single radix block containing the
@@ -215,15 +209,15 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
tree_sign_reduction<Torus>(lsb_streams, lwe_array_lsb_out, comparisons,
mem_ptr->diff_buffer->tree_buffer,
mem_ptr->identity_lut_f, bsks, ksks,
ms_noise_reduction_key, num_lsb_radix_blocks);
num_lsb_radix_blocks);
//////////////
// msb
host_compare_blocks_with_zero<Torus>(
msb_streams, &lwe_array_msb_out, &msb, mem_ptr, bsks, ksks,
ms_noise_reduction_key, num_msb_radix_blocks, mem_ptr->is_zero_lut);
num_msb_radix_blocks, mem_ptr->is_zero_lut);
are_all_comparisons_block_true<Torus>(
msb_streams, &lwe_array_msb_out, &lwe_array_msb_out, mem_ptr, bsks,
ksks, ms_noise_reduction_key, lwe_array_msb_out.num_radix_blocks);
ksks, lwe_array_msb_out.num_radix_blocks);
lsb_streams.synchronize();
msb_streams.synchronize();
@@ -250,7 +244,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, lwe_array_out, lwe_array_lsb_out, &lwe_array_msb_out, bsks,
ksks, ms_noise_reduction_key, lut, 1, lut->params.message_modulus);
ksks, lut, 1, lut->params.message_modulus);
} else {
if (num_radix_blocks == 1) {
@@ -283,8 +277,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
one_block_lut->broadcast_lut(active_streams);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, lwe_array_out, lwe_array_in, bsks, ksks,
ms_noise_reduction_key, one_block_lut, 1);
streams, lwe_array_out, lwe_array_in, bsks, ksks, one_block_lut, 1);
one_block_lut->release(streams);
delete one_block_lut;
} else {
@@ -314,7 +307,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
auto comparisons = mem_ptr->tmp_lwe_array_out;
scalar_compare_radix_blocks_kb<Torus>(
streams, comparisons, diff_buffer->tmp_packed, (Torus *)rhs.ptr,
mem_ptr, bsks, ksks, ms_noise_reduction_key, num_lsb_radix_blocks);
mem_ptr, bsks, ksks, num_lsb_radix_blocks);
// Reduces a vec containing radix blocks that encrypts a sign
// (inferior, equal, superior) to one single radix block containing the
@@ -322,7 +315,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
tree_sign_reduction<Torus>(streams, lwe_array_out, comparisons,
mem_ptr->diff_buffer->tree_buffer,
sign_handler_f, bsks, ksks,
ms_noise_reduction_key, num_lsb_radix_blocks);
num_lsb_radix_blocks);
}
}
}
@@ -333,9 +326,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
Torus const *h_scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {
Torus *const *ksks, uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {
if (lwe_array_out->lwe_dimension != lwe_array_in->lwe_dimension)
PANIC("Cuda error: input lwe dimensions must be the same")
@@ -376,10 +367,10 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
auto are_all_msb_zeros = mem_ptr->tmp_lwe_array_out;
host_compare_blocks_with_zero<Torus>(
streams, are_all_msb_zeros, lwe_array_in, mem_ptr, bsks, ksks,
ms_noise_reduction_key, num_radix_blocks, mem_ptr->is_zero_lut);
num_radix_blocks, mem_ptr->is_zero_lut);
are_all_comparisons_block_true<Torus>(
streams, are_all_msb_zeros, are_all_msb_zeros, mem_ptr, bsks, ksks,
ms_noise_reduction_key, are_all_msb_zeros->num_radix_blocks);
are_all_msb_zeros->num_radix_blocks);
CudaRadixCiphertextFFI sign_block;
as_radix_ciphertext_slice<Torus>(&sign_block, lwe_array_in,
num_radix_blocks - 1, num_radix_blocks);
@@ -430,8 +421,8 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
lut->broadcast_lut(active_streams);
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, lwe_array_out, are_all_msb_zeros, &sign_block, bsks, ksks,
ms_noise_reduction_key, lut, 1, lut->params.message_modulus);
streams, lwe_array_out, are_all_msb_zeros, &sign_block, bsks, ksks, lut,
1, lut->params.message_modulus);
} else if (num_scalar_blocks < num_radix_blocks) {
// We have to handle both part of the work described above
@@ -477,7 +468,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
auto comparisons = mem_ptr->tmp_block_comparisons;
scalar_compare_radix_blocks_kb<Torus>(
lsb_streams, comparisons, diff_buffer->tmp_packed, (Torus *)rhs.ptr,
mem_ptr, bsks, ksks, ms_noise_reduction_key, num_lsb_radix_blocks);
mem_ptr, bsks, ksks, num_lsb_radix_blocks);
// Reduces a vec containing radix blocks that encrypts a sign
// (inferior, equal, superior) to one single radix block containing the
@@ -485,17 +476,17 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
tree_sign_reduction<Torus>(lsb_streams, lwe_array_lsb_out, comparisons,
mem_ptr->diff_buffer->tree_buffer,
mem_ptr->identity_lut_f, bsks, ksks,
ms_noise_reduction_key, num_lsb_radix_blocks);
num_lsb_radix_blocks);
//////////////
// msb
// We remove the last block (which is the sign)
auto are_all_msb_zeros = lwe_array_msb_out;
host_compare_blocks_with_zero<Torus>(
msb_streams, &are_all_msb_zeros, &msb, mem_ptr, bsks, ksks,
ms_noise_reduction_key, num_msb_radix_blocks, mem_ptr->is_zero_lut);
num_msb_radix_blocks, mem_ptr->is_zero_lut);
are_all_comparisons_block_true<Torus>(
msb_streams, &are_all_msb_zeros, &are_all_msb_zeros, mem_ptr, bsks,
ksks, ms_noise_reduction_key, are_all_msb_zeros.num_radix_blocks);
ksks, are_all_msb_zeros.num_radix_blocks);
auto sign_bit_pos = (int)log2(message_modulus) - 1;
@@ -536,15 +527,14 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
&sign_block, &msb, num_msb_radix_blocks - 1, num_msb_radix_blocks);
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
msb_streams, &lwe_array_msb_out, &sign_block, &are_all_msb_zeros, bsks,
ksks, ms_noise_reduction_key, signed_msb_lut, 1,
signed_msb_lut->params.message_modulus);
ksks, signed_msb_lut, 1, signed_msb_lut->params.message_modulus);
lsb_streams.synchronize();
msb_streams.synchronize();
//////////////
// Reduce the two blocks into one final
reduce_signs<Torus>(streams, lwe_array_out, lwe_array_lsb_out, mem_ptr,
sign_handler_f, bsks, ksks, ms_noise_reduction_key, 2);
sign_handler_f, bsks, ksks, 2);
} else {
if (num_radix_blocks == 1) {
@@ -579,8 +569,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
one_block_lut->broadcast_lut(active_streams);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, lwe_array_out, lwe_array_in, bsks, ksks,
ms_noise_reduction_key, one_block_lut, 1);
streams, lwe_array_out, lwe_array_in, bsks, ksks, one_block_lut, 1);
one_block_lut->release(streams);
delete one_block_lut;
} else {
@@ -619,8 +608,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
// - 2 if lhs > rhs
scalar_compare_radix_blocks_kb<Torus>(
lsb_streams, lwe_array_ct_out, diff_buffer->tmp_packed,
(Torus *)rhs.ptr, mem_ptr, bsks, ksks, ms_noise_reduction_key,
num_lsb_radix_blocks);
(Torus *)rhs.ptr, mem_ptr, bsks, ksks, num_lsb_radix_blocks);
CudaRadixCiphertextFFI encrypted_sign_block;
as_radix_ciphertext_slice<Torus>(&encrypted_sign_block, lwe_array_in,
num_radix_blocks - 1, num_radix_blocks);
@@ -636,8 +624,8 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
msb_streams, &lwe_array_sign_out, &encrypted_sign_block,
trivial_sign_block, bsks, ksks, ms_noise_reduction_key,
mem_ptr->signed_lut, 1, mem_ptr->signed_lut->params.message_modulus);
trivial_sign_block, bsks, ksks, mem_ptr->signed_lut, 1,
mem_ptr->signed_lut->params.message_modulus);
lsb_streams.synchronize();
msb_streams.synchronize();
@@ -645,8 +633,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
// (inferior, equal, superior) to one single radix block containing the
// final sign
reduce_signs<Torus>(streams, lwe_array_out, lwe_array_ct_out, mem_ptr,
sign_handler_f, bsks, ksks, ms_noise_reduction_key,
num_lsb_radix_blocks + 1);
sign_handler_f, bsks, ksks, num_lsb_radix_blocks + 1);
}
}
}
@@ -657,9 +644,7 @@ __host__ void host_integer_radix_scalar_difference_check_kb(
CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
Torus const *h_scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {
Torus *const *ksks, uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {
if (lwe_array_out->lwe_dimension != lwe_array_in->lwe_dimension)
PANIC("Cuda error: input lwe dimensions must be the same")
@@ -671,13 +656,13 @@ __host__ void host_integer_radix_scalar_difference_check_kb(
// is signed and scalar is positive
integer_radix_signed_scalar_difference_check_kb<Torus>(
streams, lwe_array_out, lwe_array_in, scalar_blocks, h_scalar_blocks,
mem_ptr, sign_handler_f, bsks, ksks, ms_noise_reduction_key,
num_radix_blocks, num_scalar_blocks);
mem_ptr, sign_handler_f, bsks, ksks, num_radix_blocks,
num_scalar_blocks);
} else {
integer_radix_unsigned_scalar_difference_check_kb<Torus>(
streams, lwe_array_out, lwe_array_in, scalar_blocks, h_scalar_blocks,
mem_ptr, sign_handler_f, bsks, ksks, ms_noise_reduction_key,
num_radix_blocks, num_scalar_blocks);
mem_ptr, sign_handler_f, bsks, ksks, num_radix_blocks,
num_scalar_blocks);
}
}
@@ -686,9 +671,8 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
Torus const *h_scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
void *const *bsks, Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {
void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks,
uint32_t num_scalar_blocks) {
if (lwe_array_out->lwe_dimension != lwe_array_in->lwe_dimension)
PANIC("Cuda error: input and output lwe dimensions must be the same")
@@ -706,8 +690,7 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
auto sign = mem_ptr->tmp_lwe_array_out;
host_integer_radix_scalar_difference_check_kb<Torus>(
streams, sign, lwe_array_in, scalar_blocks, h_scalar_blocks, mem_ptr,
mem_ptr->identity_lut_f, bsks, ksks, ms_noise_reduction_key,
num_radix_blocks, num_scalar_blocks);
mem_ptr->identity_lut_f, bsks, ksks, num_radix_blocks, num_scalar_blocks);
// There is no optimized CMUX for scalars, so we convert to a trivial
// ciphertext
@@ -721,10 +704,9 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
// Selector
// CMUX for Max or Min
host_integer_radix_cmux_kb<Torus>(streams, lwe_array_out,
mem_ptr->tmp_lwe_array_out, lwe_array_left,
lwe_array_right, mem_ptr->cmux_buffer, bsks,
ksks, ms_noise_reduction_key);
host_integer_radix_cmux_kb<Torus>(
streams, lwe_array_out, mem_ptr->tmp_lwe_array_out, lwe_array_left,
lwe_array_right, mem_ptr->cmux_buffer, bsks, ksks);
}
template <typename Torus>
@@ -732,9 +714,7 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {
Torus *const *ksks, uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {
if (lwe_array_out->lwe_dimension != lwe_array_in->lwe_dimension)
PANIC("Cuda error: input and output lwe dimensions must be the same")
@@ -807,8 +787,7 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
lsb_streams, mem_ptr->tmp_lwe_array_out, mem_ptr->tmp_packed_input,
bsks, ksks, ms_noise_reduction_key, scalar_comparison_luts,
num_halved_lsb_radix_blocks);
bsks, ksks, scalar_comparison_luts, num_halved_lsb_radix_blocks);
}
//////////////
// msb_in
@@ -825,12 +804,12 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
PANIC("Cuda error: integer operation not supported")
}
host_compare_blocks_with_zero<Torus>(
msb_streams, &msb_out, &msb_in, mem_ptr, bsks, ksks,
ms_noise_reduction_key, num_msb_radix_blocks, msb_lut);
are_all_comparisons_block_true<Torus>(
msb_streams, &msb_out, &msb_out, mem_ptr, bsks, ksks,
ms_noise_reduction_key, msb_out.num_radix_blocks);
host_compare_blocks_with_zero<Torus>(msb_streams, &msb_out, &msb_in,
mem_ptr, bsks, ksks,
num_msb_radix_blocks, msb_lut);
are_all_comparisons_block_true<Torus>(msb_streams, &msb_out, &msb_out,
mem_ptr, bsks, ksks,
msb_out.num_radix_blocks);
}
lsb_streams.synchronize();
@@ -840,13 +819,11 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
case COMPARISON_TYPE::EQ:
are_all_comparisons_block_true<Torus>(
streams, lwe_array_out, mem_ptr->tmp_lwe_array_out, mem_ptr, bsks, ksks,
ms_noise_reduction_key,
num_halved_scalar_blocks + (num_msb_radix_blocks > 0));
break;
case COMPARISON_TYPE::NE:
is_at_least_one_comparisons_block_true<Torus>(
streams, lwe_array_out, mem_ptr->tmp_lwe_array_out, mem_ptr, bsks, ksks,
ms_noise_reduction_key,
num_halved_scalar_blocks + (num_msb_radix_blocks > 0));
break;
default:

View File

@@ -24,13 +24,12 @@ uint64_t scratch_cuda_integer_unsigned_scalar_div_radix_kb_64(
void cuda_integer_unsigned_scalar_div_radix_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *numerator_ct,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
const CudaScalarDivisorFFI *scalar_divisor_ffi) {
host_integer_unsigned_scalar_div_radix<uint64_t>(
CudaStreams(streams), numerator_ct,
(int_unsigned_scalar_div_mem<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks,
ms_noise_reduction_key, scalar_divisor_ffi);
scalar_divisor_ffi);
}
void cleanup_cuda_integer_unsigned_scalar_div_radix_kb_64(
@@ -69,13 +68,12 @@ uint64_t scratch_cuda_integer_signed_scalar_div_radix_kb_64(
void cuda_integer_signed_scalar_div_radix_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *numerator_ct,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
const CudaScalarDivisorFFI *scalar_divisor_ffi, uint32_t numerator_bits) {
host_integer_signed_scalar_div_radix_kb<uint64_t>(
CudaStreams(streams), numerator_ct,
(int_signed_scalar_div_mem<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks,
ms_noise_reduction_key, scalar_divisor_ffi, numerator_bits);
scalar_divisor_ffi, numerator_bits);
}
void cleanup_cuda_integer_signed_scalar_div_radix_kb_64(CudaStreamsFFI streams,
@@ -115,9 +113,7 @@ uint64_t scratch_integer_unsigned_scalar_div_rem_radix_kb_64(
void cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *quotient_ct,
CudaRadixCiphertextFFI *remainder_ct, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
const CudaScalarDivisorFFI *scalar_divisor_ffi,
void *const *ksks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
uint64_t const *divisor_has_at_least_one_set,
uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
void const *clear_blocks, void const *h_clear_blocks,
@@ -126,9 +122,9 @@ void cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
host_integer_unsigned_scalar_div_rem_radix<uint64_t>(
CudaStreams(streams), quotient_ct, remainder_ct,
(int_unsigned_scalar_div_rem_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)ksks, ms_noise_reduction_key, scalar_divisor_ffi,
divisor_has_at_least_one_set, decomposed_divisor, num_scalars_divisor,
(uint64_t *)clear_blocks, (uint64_t *)h_clear_blocks, num_clear_blocks);
(uint64_t **)ksks, scalar_divisor_ffi, divisor_has_at_least_one_set,
decomposed_divisor, num_scalars_divisor, (uint64_t *)clear_blocks,
(uint64_t *)h_clear_blocks, num_clear_blocks);
}
void cleanup_cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
@@ -168,9 +164,7 @@ uint64_t scratch_integer_signed_scalar_div_rem_radix_kb_64(
void cuda_integer_signed_scalar_div_rem_radix_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *quotient_ct,
CudaRadixCiphertextFFI *remainder_ct, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
const CudaScalarDivisorFFI *scalar_divisor_ffi,
void *const *ksks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
uint64_t const *divisor_has_at_least_one_set,
uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
uint32_t numerator_bits) {
@@ -178,9 +172,8 @@ void cuda_integer_signed_scalar_div_rem_radix_kb_64(
host_integer_signed_scalar_div_rem_radix<uint64_t>(
CudaStreams(streams), quotient_ct, remainder_ct,
(int_signed_scalar_div_rem_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)ksks, ms_noise_reduction_key, scalar_divisor_ffi,
divisor_has_at_least_one_set, decomposed_divisor, num_scalars_divisor,
numerator_bits);
(uint64_t **)ksks, scalar_divisor_ffi, divisor_has_at_least_one_set,
decomposed_divisor, num_scalars_divisor, numerator_bits);
}
void cleanup_cuda_integer_signed_scalar_div_rem_radix_kb_64(

View File

@@ -27,9 +27,7 @@ template <typename Torus>
__host__ void host_integer_unsigned_scalar_div_radix(
CudaStreams streams, CudaRadixCiphertextFFI *numerator_ct,
int_unsigned_scalar_div_mem<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
const CudaScalarDivisorFFI *scalar_divisor_ffi) {
Torus *const *ksks, const CudaScalarDivisorFFI *scalar_divisor_ffi) {
if (scalar_divisor_ffi->is_abs_divisor_one) {
return;
@@ -38,7 +36,7 @@ __host__ void host_integer_unsigned_scalar_div_radix(
if (scalar_divisor_ffi->is_divisor_pow2) {
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, numerator_ct, scalar_divisor_ffi->ilog2_divisor,
mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
mem_ptr->logical_scalar_shift_mem, bsks, ksks,
numerator_ct->num_radix_blocks);
return;
}
@@ -65,26 +63,24 @@ __host__ void host_integer_unsigned_scalar_div_radix(
numerator_cpy, numerator_ct);
host_integer_radix_scalar_mul_high_kb<Torus>(
streams, numerator_cpy, mem_ptr->scalar_mul_high_mem, ksks,
ms_noise_reduction_key, bsks, scalar_divisor_ffi);
streams, numerator_cpy, mem_ptr->scalar_mul_high_mem, ksks, bsks,
scalar_divisor_ffi);
host_sub_and_propagate_single_carry<Torus>(
streams, numerator_ct, numerator_cpy, nullptr, nullptr,
mem_ptr->sub_and_propagate_mem, bsks, ksks, ms_noise_reduction_key,
FLAG_NONE, (uint32_t)0);
mem_ptr->sub_and_propagate_mem, bsks, ksks, FLAG_NONE, (uint32_t)0);
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, numerator_ct, (uint32_t)1, mem_ptr->logical_scalar_shift_mem,
bsks, ksks, ms_noise_reduction_key, numerator_ct->num_radix_blocks);
bsks, ksks, numerator_ct->num_radix_blocks);
host_add_and_propagate_single_carry<Torus>(
streams, numerator_ct, numerator_cpy, nullptr, nullptr,
mem_ptr->scp_mem, bsks, ksks, ms_noise_reduction_key, FLAG_NONE,
(uint32_t)0);
mem_ptr->scp_mem, bsks, ksks, FLAG_NONE, (uint32_t)0);
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, numerator_ct, scalar_divisor_ffi->shift_post - (uint32_t)1,
mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
mem_ptr->logical_scalar_shift_mem, bsks, ksks,
numerator_ct->num_radix_blocks);
return;
@@ -92,16 +88,16 @@ __host__ void host_integer_unsigned_scalar_div_radix(
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, numerator_ct, scalar_divisor_ffi->shift_pre,
mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
mem_ptr->logical_scalar_shift_mem, bsks, ksks,
numerator_ct->num_radix_blocks);
host_integer_radix_scalar_mul_high_kb<Torus>(
streams, numerator_ct, mem_ptr->scalar_mul_high_mem, ksks,
ms_noise_reduction_key, bsks, scalar_divisor_ffi);
host_integer_radix_scalar_mul_high_kb<Torus>(streams, numerator_ct,
mem_ptr->scalar_mul_high_mem,
ksks, bsks, scalar_divisor_ffi);
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, numerator_ct, scalar_divisor_ffi->shift_post,
mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
mem_ptr->logical_scalar_shift_mem, bsks, ksks,
numerator_ct->num_radix_blocks);
}
@@ -125,9 +121,8 @@ template <typename Torus>
__host__ void host_integer_signed_scalar_div_radix_kb(
CudaStreams streams, CudaRadixCiphertextFFI *numerator_ct,
int_signed_scalar_div_mem<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
const CudaScalarDivisorFFI *scalar_divisor_ffi, uint32_t numerator_bits) {
Torus *const *ksks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
uint32_t numerator_bits) {
if (scalar_divisor_ffi->is_abs_divisor_one) {
if (scalar_divisor_ffi->is_divisor_negative) {
@@ -158,23 +153,20 @@ __host__ void host_integer_signed_scalar_div_radix_kb(
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
streams, tmp, scalar_divisor_ffi->chosen_multiplier_num_bits - 1,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
ms_noise_reduction_key);
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks);
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, tmp,
numerator_bits - scalar_divisor_ffi->chosen_multiplier_num_bits,
mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
tmp->num_radix_blocks);
mem_ptr->logical_scalar_shift_mem, bsks, ksks, tmp->num_radix_blocks);
host_add_and_propagate_single_carry<Torus>(
streams, tmp, numerator_ct, nullptr, nullptr, mem_ptr->scp_mem, bsks,
ksks, ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);
ksks, FLAG_NONE, (uint32_t)0);
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
streams, tmp, scalar_divisor_ffi->chosen_multiplier_num_bits,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
ms_noise_reduction_key);
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks);
} else if (!scalar_divisor_ffi->is_chosen_multiplier_geq_two_pow_numerator) {
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
@@ -182,12 +174,11 @@ __host__ void host_integer_signed_scalar_div_radix_kb(
host_integer_radix_signed_scalar_mul_high_kb<Torus>(
streams, tmp, mem_ptr->scalar_mul_high_mem, ksks, scalar_divisor_ffi,
ms_noise_reduction_key, bsks);
bsks);
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
streams, tmp, scalar_divisor_ffi->shift_post,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
ms_noise_reduction_key);
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks);
CudaRadixCiphertextFFI *xsign = mem_ptr->xsign_ffi;
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
@@ -195,12 +186,11 @@ __host__ void host_integer_signed_scalar_div_radix_kb(
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
streams, xsign, numerator_bits - 1,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
ms_noise_reduction_key);
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks);
host_sub_and_propagate_single_carry<Torus>(
streams, tmp, xsign, nullptr, nullptr, mem_ptr->sub_and_propagate_mem,
bsks, ksks, ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);
bsks, ksks, FLAG_NONE, (uint32_t)0);
} else {
@@ -209,16 +199,15 @@ __host__ void host_integer_signed_scalar_div_radix_kb(
host_integer_radix_signed_scalar_mul_high_kb<Torus>(
streams, tmp, mem_ptr->scalar_mul_high_mem, ksks, scalar_divisor_ffi,
ms_noise_reduction_key, bsks);
bsks);
host_add_and_propagate_single_carry<Torus>(
streams, tmp, numerator_ct, nullptr, nullptr, mem_ptr->scp_mem, bsks,
ksks, ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);
ksks, FLAG_NONE, (uint32_t)0);
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
streams, tmp, scalar_divisor_ffi->shift_post,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
ms_noise_reduction_key);
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks);
CudaRadixCiphertextFFI *xsign = mem_ptr->xsign_ffi;
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
@@ -226,12 +215,11 @@ __host__ void host_integer_signed_scalar_div_radix_kb(
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
streams, xsign, numerator_bits - 1,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
ms_noise_reduction_key);
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks);
host_sub_and_propagate_single_carry<Torus>(
streams, tmp, xsign, nullptr, nullptr, mem_ptr->sub_and_propagate_mem,
bsks, ksks, ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);
bsks, ksks, FLAG_NONE, (uint32_t)0);
}
if (scalar_divisor_ffi->is_divisor_negative) {
@@ -263,9 +251,7 @@ __host__ void host_integer_unsigned_scalar_div_rem_radix(
CudaStreams streams, CudaRadixCiphertextFFI *quotient_ct,
CudaRadixCiphertextFFI *remainder_ct,
int_unsigned_scalar_div_rem_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
const CudaScalarDivisorFFI *scalar_divisor_ffi,
Torus *const *ksks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
uint64_t const *divisor_has_at_least_one_set,
uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
Torus const *clear_blocks, Torus const *h_clear_blocks,
@@ -275,18 +261,17 @@ __host__ void host_integer_unsigned_scalar_div_rem_radix(
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
numerator_ct, quotient_ct);
host_integer_unsigned_scalar_div_radix(
streams, quotient_ct, mem_ptr->unsigned_div_mem, bsks, ksks,
ms_noise_reduction_key, scalar_divisor_ffi);
host_integer_unsigned_scalar_div_radix(streams, quotient_ct,
mem_ptr->unsigned_div_mem, bsks, ksks,
scalar_divisor_ffi);
if (scalar_divisor_ffi->is_divisor_pow2) {
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
remainder_ct, numerator_ct);
host_integer_radix_scalar_bitop_kb(streams, remainder_ct, remainder_ct,
clear_blocks, h_clear_blocks,
num_clear_blocks, mem_ptr->bitop_mem,
bsks, ksks, ms_noise_reduction_key);
host_integer_radix_scalar_bitop_kb(
streams, remainder_ct, remainder_ct, clear_blocks, h_clear_blocks,
num_clear_blocks, mem_ptr->bitop_mem, bsks, ksks);
} else {
if (!scalar_divisor_ffi->is_divisor_zero) {
@@ -299,15 +284,13 @@ __host__ void host_integer_unsigned_scalar_div_rem_radix(
host_integer_scalar_mul_radix<Torus>(
streams, remainder_ct, decomposed_divisor,
divisor_has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks, ksks,
ms_noise_reduction_key, mem_ptr->params.message_modulus,
num_scalars_divisor);
mem_ptr->params.message_modulus, num_scalars_divisor);
}
}
host_sub_and_propagate_single_carry(
streams, numerator_ct, remainder_ct, nullptr, nullptr,
mem_ptr->sub_and_propagate_mem, bsks, ksks, ms_noise_reduction_key,
FLAG_NONE, (uint32_t)0);
mem_ptr->sub_and_propagate_mem, bsks, ksks, FLAG_NONE, (uint32_t)0);
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
remainder_ct, numerator_ct);
@@ -335,9 +318,7 @@ __host__ void host_integer_signed_scalar_div_rem_radix(
CudaStreams streams, CudaRadixCiphertextFFI *quotient_ct,
CudaRadixCiphertextFFI *remainder_ct,
int_signed_scalar_div_rem_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
const CudaScalarDivisorFFI *scalar_divisor_ffi,
Torus *const *ksks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
uint64_t const *divisor_has_at_least_one_set,
uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
uint32_t numerator_bits) {
@@ -346,13 +327,13 @@ __host__ void host_integer_signed_scalar_div_rem_radix(
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
numerator_ct, quotient_ct);
host_integer_signed_scalar_div_radix_kb(
streams, quotient_ct, mem_ptr->signed_div_mem, bsks, ksks,
ms_noise_reduction_key, scalar_divisor_ffi, numerator_bits);
host_integer_signed_scalar_div_radix_kb(streams, quotient_ct,
mem_ptr->signed_div_mem, bsks, ksks,
scalar_divisor_ffi, numerator_bits);
host_propagate_single_carry<Torus>(
streams, quotient_ct, nullptr, nullptr, mem_ptr->scp_mem, bsks, ksks,
ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);
host_propagate_single_carry<Torus>(streams, quotient_ct, nullptr, nullptr,
mem_ptr->scp_mem, bsks, ksks, FLAG_NONE,
(uint32_t)0);
if (!scalar_divisor_ffi->is_divisor_negative &&
scalar_divisor_ffi->is_divisor_pow2) {
@@ -361,7 +342,7 @@ __host__ void host_integer_signed_scalar_div_rem_radix(
host_integer_radix_logical_scalar_shift_kb_inplace(
streams, remainder_ct, scalar_divisor_ffi->ilog2_divisor,
mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
mem_ptr->logical_scalar_shift_mem, bsks, ksks,
remainder_ct->num_radix_blocks);
} else if (!scalar_divisor_ffi->is_divisor_zero) {
@@ -375,15 +356,13 @@ __host__ void host_integer_signed_scalar_div_rem_radix(
host_integer_scalar_mul_radix<Torus>(
streams, remainder_ct, decomposed_divisor,
divisor_has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks, ksks,
ms_noise_reduction_key, mem_ptr->params.message_modulus,
num_scalars_divisor);
mem_ptr->params.message_modulus, num_scalars_divisor);
}
}
host_sub_and_propagate_single_carry(
streams, numerator_ct, remainder_ct, nullptr, nullptr,
mem_ptr->sub_and_propagate_mem, bsks, ksks, ms_noise_reduction_key,
FLAG_NONE, (uint32_t)0);
mem_ptr->sub_and_propagate_mem, bsks, ksks, FLAG_NONE, (uint32_t)0);
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
remainder_ct, numerator_ct);

View File

@@ -22,15 +22,13 @@ uint64_t scratch_cuda_integer_scalar_mul_kb_64(
void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
uint64_t const *decomposed_scalar, uint64_t const *has_at_least_one_set,
int8_t *mem, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t polynomial_size, uint32_t message_modulus, uint32_t num_scalars) {
int8_t *mem, void *const *bsks, void *const *ksks, uint32_t polynomial_size,
uint32_t message_modulus, uint32_t num_scalars) {
host_integer_scalar_mul_radix<uint64_t>(
CudaStreams(streams), lwe_array, decomposed_scalar, has_at_least_one_set,
reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, message_modulus,
num_scalars);
(uint64_t **)(ksks), message_modulus, num_scalars);
}
void cleanup_cuda_integer_radix_scalar_mul(CudaStreamsFFI streams,

View File

@@ -46,7 +46,6 @@ __host__ void host_integer_scalar_mul_radix(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array,
T const *decomposed_scalar, T const *has_at_least_one_set,
int_scalar_mul_buffer<T> *mem, void *const *bsks, T *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t message_modulus, uint32_t num_scalars) {
auto num_radix_blocks = lwe_array->num_radix_blocks;
@@ -69,7 +68,7 @@ __host__ void host_integer_scalar_mul_radix(
num_radix_blocks, lwe_array, 0, num_radix_blocks);
host_integer_radix_logical_scalar_shift_kb_inplace<T>(
streams, &shift_input, shift_amount, mem->logical_scalar_shift_buffer,
bsks, ksks, ms_noise_reduction_key, num_radix_blocks);
bsks, ksks, num_radix_blocks);
} else {
// create trivial assign for value = 0
set_zero_radix_ciphertext_slice_async<T>(
@@ -113,15 +112,14 @@ __host__ void host_integer_scalar_mul_radix(
} else {
host_integer_partial_sum_ciphertexts_vec_kb<T>(
streams, lwe_array, all_shifted_buffer, bsks, ksks,
ms_noise_reduction_key, mem->sum_ciphertexts_vec_mem, num_radix_blocks,
j);
mem->sum_ciphertexts_vec_mem, num_radix_blocks, j);
auto scp_mem_ptr = mem->sc_prop_mem;
uint32_t requested_flag = outputFlag::FLAG_NONE;
uint32_t uses_carry = 0;
host_propagate_single_carry<T>(
streams, lwe_array, nullptr, nullptr, scp_mem_ptr, bsks, ksks,
ms_noise_reduction_key, requested_flag, uses_carry);
host_propagate_single_carry<T>(streams, lwe_array, nullptr, nullptr,
scp_mem_ptr, bsks, ksks, requested_flag,
uses_carry);
}
}
@@ -170,7 +168,6 @@ template <typename Torus>
__host__ void host_integer_radix_scalar_mul_high_kb(
CudaStreams streams, CudaRadixCiphertextFFI *ct,
int_scalar_mul_high_buffer<Torus> *mem_ptr, Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks, const CudaScalarDivisorFFI *scalar_divisor_ffi) {
if (scalar_divisor_ffi->is_chosen_multiplier_zero) {
@@ -191,7 +188,7 @@ __host__ void host_integer_radix_scalar_mul_high_kb(
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, tmp_ffi, scalar_divisor_ffi->ilog2_chosen_multiplier,
mem_ptr->logical_scalar_shift_mem, bsks, (uint64_t **)ksks,
ms_noise_reduction_key, tmp_ffi->num_radix_blocks);
tmp_ffi->num_radix_blocks);
} else {
@@ -199,8 +196,7 @@ __host__ void host_integer_radix_scalar_mul_high_kb(
streams, tmp_ffi, scalar_divisor_ffi->decomposed_chosen_multiplier,
scalar_divisor_ffi->chosen_multiplier_has_at_least_one_set,
mem_ptr->scalar_mul_mem, bsks, (uint64_t **)ksks,
ms_noise_reduction_key, mem_ptr->params.message_modulus,
scalar_divisor_ffi->num_scalars);
mem_ptr->params.message_modulus, scalar_divisor_ffi->num_scalars);
}
}
@@ -211,9 +207,7 @@ template <typename Torus>
__host__ void host_integer_radix_signed_scalar_mul_high_kb(
CudaStreams streams, CudaRadixCiphertextFFI *ct,
int_signed_scalar_mul_high_buffer<Torus> *mem_ptr, Torus *const *ksks,
const CudaScalarDivisorFFI *scalar_divisor_ffi,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks) {
const CudaScalarDivisorFFI *scalar_divisor_ffi, void *const *bsks) {
if (scalar_divisor_ffi->is_chosen_multiplier_zero) {
set_zero_radix_ciphertext_slice_async<Torus>(
@@ -225,7 +219,7 @@ __host__ void host_integer_radix_signed_scalar_mul_high_kb(
host_extend_radix_with_sign_msb<Torus>(
streams, tmp_ffi, ct, mem_ptr->extend_radix_mem, ct->num_radix_blocks,
bsks, (uint64_t **)ksks, ms_noise_reduction_key);
bsks, (uint64_t **)ksks);
if (scalar_divisor_ffi->active_bits != (uint32_t)0 &&
!scalar_divisor_ffi->is_abs_chosen_multiplier_one &&
@@ -235,14 +229,13 @@ __host__ void host_integer_radix_signed_scalar_mul_high_kb(
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, tmp_ffi, scalar_divisor_ffi->ilog2_chosen_multiplier,
mem_ptr->logical_scalar_shift_mem, bsks, (uint64_t **)ksks,
ms_noise_reduction_key, tmp_ffi->num_radix_blocks);
tmp_ffi->num_radix_blocks);
} else {
host_integer_scalar_mul_radix<Torus>(
streams, tmp_ffi, scalar_divisor_ffi->decomposed_chosen_multiplier,
scalar_divisor_ffi->chosen_multiplier_has_at_least_one_set,
mem_ptr->scalar_mul_mem, bsks, (uint64_t **)ksks,
ms_noise_reduction_key, mem_ptr->params.message_modulus,
scalar_divisor_ffi->num_scalars);
mem_ptr->params.message_modulus, scalar_divisor_ffi->num_scalars);
}
}

View File

@@ -22,13 +22,12 @@ uint64_t scratch_cuda_integer_radix_scalar_rotate_kb_64(
void cuda_integer_radix_scalar_rotate_kb_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array, uint32_t n,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
int8_t *mem_ptr, void *const *bsks, void *const *ksks) {
host_integer_radix_scalar_rotate_kb_inplace<uint64_t>(
CudaStreams(streams), lwe_array, n,
(int_logical_scalar_shift_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key);
(uint64_t **)(ksks));
}
void cleanup_cuda_integer_radix_scalar_rotate(CudaStreamsFFI streams,

View File

@@ -28,8 +28,7 @@ template <typename Torus>
__host__ void host_integer_radix_scalar_rotate_kb_inplace(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array, uint32_t n,
int_logical_scalar_shift_buffer<Torus> *mem, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
Torus *const *ksks) {
auto num_blocks = lwe_array->num_radix_blocks;
auto params = mem->params;
@@ -74,8 +73,7 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, lwe_array, receiver_blocks, giver_blocks, bsks, ksks,
ms_noise_reduction_key, lut_bivariate, num_blocks,
lut_bivariate->params.message_modulus);
lut_bivariate, num_blocks, lut_bivariate->params.message_modulus);
} else {
// rotate left as the blocks are from LSB to MSB
@@ -99,8 +97,7 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, lwe_array, receiver_blocks, giver_blocks, bsks, ksks,
ms_noise_reduction_key, lut_bivariate, num_blocks,
lut_bivariate->params.message_modulus);
lut_bivariate, num_blocks, lut_bivariate->params.message_modulus);
}
}

View File

@@ -26,13 +26,12 @@ uint64_t scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
/// rotations - 1 The remaining blocks are padded with zeros
void cuda_integer_radix_logical_scalar_shift_kb_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
int8_t *mem_ptr, void *const *bsks, void *const *ksks) {
host_integer_radix_logical_scalar_shift_kb_inplace<uint64_t>(
CudaStreams(streams), lwe_array, shift,
(int_logical_scalar_shift_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, lwe_array->num_radix_blocks);
(uint64_t **)(ksks), lwe_array->num_radix_blocks);
}
uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
@@ -64,13 +63,12 @@ uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
/// zeros as would be done in the logical shift.
void cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
int8_t *mem_ptr, void *const *bsks, void *const *ksks) {
host_integer_radix_arithmetic_scalar_shift_kb_inplace<uint64_t>(
CudaStreams(streams), lwe_array, shift,
(int_arithmetic_scalar_shift_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key);
(uint64_t **)(ksks));
}
void cleanup_cuda_integer_radix_logical_scalar_shift(CudaStreamsFFI streams,

View File

@@ -28,9 +28,7 @@ template <typename Torus>
__host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
int_logical_scalar_shift_buffer<Torus> *mem, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_blocks) {
Torus *const *ksks, uint32_t num_blocks) {
if (lwe_array->num_radix_blocks < num_blocks)
PANIC("Cuda error: input does not have enough blocks")
@@ -81,9 +79,8 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, &partial_current_blocks, &partial_current_blocks,
&partial_previous_blocks, bsks, ksks, ms_noise_reduction_key,
lut_bivariate, partial_block_count,
lut_bivariate->params.message_modulus);
&partial_previous_blocks, bsks, ksks, lut_bivariate,
partial_block_count, lut_bivariate->params.message_modulus);
} else {
// right shift
@@ -113,8 +110,8 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, partial_current_blocks, partial_current_blocks,
&partial_next_blocks, bsks, ksks, ms_noise_reduction_key, lut_bivariate,
partial_block_count, lut_bivariate->params.message_modulus);
&partial_next_blocks, bsks, ksks, lut_bivariate, partial_block_count,
lut_bivariate->params.message_modulus);
}
}
@@ -135,8 +132,7 @@ template <typename Torus>
__host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
int_arithmetic_scalar_shift_buffer<Torus> *mem, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
Torus *const *ksks) {
auto num_blocks = lwe_array->num_radix_blocks;
auto params = mem->params;
@@ -205,9 +201,8 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, partial_current_blocks, partial_current_blocks,
&partial_next_blocks, bsks, ksks, ms_noise_reduction_key,
lut_bivariate, partial_block_count,
lut_bivariate->params.message_modulus);
&partial_next_blocks, bsks, ksks, lut_bivariate,
partial_block_count, lut_bivariate->params.message_modulus);
}
// Since our CPU threads will be working on different streams we shall
// Ensure the work in the main stream is completed
@@ -216,7 +211,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
mem->lut_buffers_univariate[num_bits_in_block - 1];
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem->local_streams_1, &padding_block, &last_block_copy, bsks, ksks,
ms_noise_reduction_key, lut_univariate_padding_block, 1);
lut_univariate_padding_block, 1);
// Replace blocks 'pulled' from the left with the correct padding
// block
for (uint i = 0; i < rotations; i++) {
@@ -230,7 +225,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
mem->lut_buffers_univariate[shift_within_block - 1];
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem->local_streams_2, &last_block, &last_block_copy, bsks, ksks,
ms_noise_reduction_key, lut_univariate_shift_last_block, 1);
lut_univariate_shift_last_block, 1);
}
mem->local_streams_1.synchronize();

View File

@@ -22,13 +22,12 @@ uint64_t scratch_cuda_integer_radix_shift_and_rotate_kb_64(
void cuda_integer_radix_shift_and_rotate_kb_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
CudaRadixCiphertextFFI const *lwe_shift, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
void *const *ksks) {
host_integer_radix_shift_and_rotate_kb_inplace<uint64_t>(
CudaStreams(streams), lwe_array, lwe_shift,
(int_shift_and_rotate_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key);
(uint64_t **)(ksks));
}
void cleanup_cuda_integer_radix_shift_and_rotate(CudaStreamsFFI streams,

View File

@@ -29,8 +29,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array,
CudaRadixCiphertextFFI const *lwe_shift,
int_shift_and_rotate_buffer<Torus> *mem, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
Torus *const *ksks) {
cuda_set_device(streams.gpu_index(0));
if (lwe_array->num_radix_blocks != lwe_shift->num_radix_blocks)
@@ -57,7 +56,6 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
// Extract all bits
auto bits = mem->tmp_bits;
extract_n_bits<Torus>(streams, bits, lwe_array, bsks, ksks,
ms_noise_reduction_key,
num_radix_blocks * bits_per_block, num_radix_blocks,
mem->bit_extract_luts);
@@ -79,8 +77,8 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
// so that it is already aligned to the correct position of the cmux input
// and we reduce noise growth
extract_n_bits<Torus>(streams, shift_bits, lwe_shift, bsks, ksks,
ms_noise_reduction_key, max_num_bits_that_tell_shift,
num_radix_blocks, mem->bit_extract_luts_with_offset_2);
max_num_bits_that_tell_shift, num_radix_blocks,
mem->bit_extract_luts_with_offset_2);
// If signed, do an "arithmetic shift" by padding with the sign bit
CudaRadixCiphertextFFI last_bit;
@@ -163,8 +161,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
// we have
// control_bit|b|a
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, input_bits_a, mux_inputs, bsks, ksks, ms_noise_reduction_key,
mux_lut, total_nb_bits);
streams, input_bits_a, mux_inputs, bsks, ksks, mux_lut, total_nb_bits);
}
// Initializes the output
@@ -196,8 +193,8 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
// To give back a clean ciphertext
auto cleaning_lut = mem->cleaning_lut;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, lwe_array, lwe_array, bsks, ksks, ms_noise_reduction_key,
cleaning_lut, num_radix_blocks);
streams, lwe_array, lwe_array, bsks, ksks, cleaning_lut,
num_radix_blocks);
}
}
#endif

View File

@@ -23,14 +23,12 @@ void cuda_sub_and_propagate_single_carry_kb_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lhs_array,
const CudaRadixCiphertextFFI *rhs_array, CudaRadixCiphertextFFI *carry_out,
const CudaRadixCiphertextFFI *carry_in, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t requested_flag, uint32_t uses_carry) {
void *const *ksks, uint32_t requested_flag, uint32_t uses_carry) {
PUSH_RANGE("sub")
host_sub_and_propagate_single_carry<uint64_t>(
CudaStreams(streams), lhs_array, rhs_array, carry_out, carry_in,
(int_sub_and_propagate<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, requested_flag, uses_carry);
requested_flag, uses_carry);
POP_RANGE()
}

View File

@@ -33,7 +33,6 @@ void host_sub_and_propagate_single_carry(
const CudaRadixCiphertextFFI *rhs_array, CudaRadixCiphertextFFI *carry_out,
const CudaRadixCiphertextFFI *input_carries,
int_sub_and_propagate<Torus> *mem, void *const *bsks, Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t requested_flag, uint32_t uses_carry) {
host_integer_radix_negation<Torus>(
@@ -42,8 +41,7 @@ void host_sub_and_propagate_single_carry(
host_add_and_propagate_single_carry<Torus>(
streams, lhs_array, mem->neg_rhs_array, carry_out, input_carries,
mem->sc_prop_mem, bsks, ksks, ms_noise_reduction_key, requested_flag,
uses_carry);
mem->sc_prop_mem, bsks, ksks, requested_flag, uses_carry);
}
template <typename Torus>

View File

@@ -204,20 +204,20 @@ __device__ void mul_ggsw_glwe_in_fourier_domain_2_2_params(
}
template <typename InputTorus, typename OutputTorus>
void execute_pbs_async(
CudaStreams streams, const LweArrayVariant<OutputTorus> &lwe_array_out,
const LweArrayVariant<InputTorus> &lwe_output_indexes,
const std::vector<OutputTorus *> lut_vec,
const std::vector<InputTorus *> lut_indexes_vec,
const LweArrayVariant<InputTorus> &lwe_array_in,
const LweArrayVariant<InputTorus> &lwe_input_indexes,
void *const *bootstrapping_keys,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
std::vector<int8_t *> pbs_buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, PBS_TYPE pbs_type,
uint32_t num_many_lut, uint32_t lut_stride) {
void execute_pbs_async(CudaStreams streams,
const LweArrayVariant<OutputTorus> &lwe_array_out,
const LweArrayVariant<InputTorus> &lwe_output_indexes,
const std::vector<OutputTorus *> lut_vec,
const std::vector<InputTorus *> lut_indexes_vec,
const LweArrayVariant<InputTorus> &lwe_array_in,
const LweArrayVariant<InputTorus> &lwe_input_indexes,
void *const *bootstrapping_keys,
std::vector<int8_t *> pbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, PBS_TYPE pbs_type,
uint32_t num_many_lut, uint32_t lut_stride) {
if constexpr (std::is_same_v<OutputTorus, uint32_t>) {
// 32 bits
@@ -310,17 +310,13 @@ void execute_pbs_async(
auto d_lut_vector_indexes =
lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);
void *zeros = nullptr;
if (ms_noise_reduction_key != nullptr &&
ms_noise_reduction_key->ptr != nullptr)
zeros = ms_noise_reduction_key->ptr[i];
cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
streams.stream(i), streams.gpu_index(i), current_lwe_array_out,
current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
current_lwe_array_in, current_lwe_input_indexes,
bootstrapping_keys[i], ms_noise_reduction_key, zeros, pbs_buffer[i],
lwe_dimension, glwe_dimension, polynomial_size, base_log,
level_count, num_inputs_on_gpu, num_many_lut, lut_stride);
bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
polynomial_size, base_log, level_count, num_inputs_on_gpu,
num_many_lut, lut_stride);
}
break;
default:
@@ -374,16 +370,11 @@ void execute_pbs_async(
auto d_lut_vector_indexes =
lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);
void *zeros = nullptr;
if (ms_noise_reduction_key != nullptr &&
ms_noise_reduction_key->ptr != nullptr)
zeros = ms_noise_reduction_key->ptr[i];
cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
streams.stream(i), streams.gpu_index(i), current_lwe_array_out,
lut_vec[i], current_lwe_array_in, bootstrapping_keys[i],
ms_noise_reduction_key, zeros, pbs_buffer[i], lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count,
num_inputs_on_gpu);
pbs_buffer[i], lwe_dimension, glwe_dimension, polynomial_size,
base_log, level_count, num_inputs_on_gpu);
}
break;
default:

View File

@@ -80,9 +80,7 @@ __global__ void device_programmable_bootstrap_cg(
// The third dimension of the block is used to determine on which ciphertext
// this block is operating, in the case of batch bootstraps
const Torus *block_lwe_array_in =
(noise_reduction_type == PBS_MS_REDUCTION_T::DRIFT)
? &lwe_array_in[blockIdx.x * (lwe_dimension + 1)]
: &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
const Torus *block_lut_vector =
&lut_vector[lut_vector_indexes[blockIdx.x] * params::degree *

View File

@@ -650,33 +650,15 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *ms_drift_noise_reduction_ptr, int8_t *mem_ptr, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride) {
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride) {
if (base_log > 64)
PANIC("Cuda error (classical PBS): base log should be <= 64")
pbs_buffer<uint64_t, CLASSICAL> *buffer =
(pbs_buffer<uint64_t, CLASSICAL> *)mem_ptr;
// If the parameters contain drift noise reduction key, then apply it
if (buffer->noise_reduction_type == PBS_MS_REDUCTION_T::DRIFT) {
uint32_t log_modulus = log2(polynomial_size) + 1;
host_drift_modulus_switch<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index, buffer->temp_lwe_array_in,
static_cast<uint64_t const *>(lwe_array_in),
static_cast<uint64_t const *>(lwe_input_indexes),
static_cast<uint64_t *>(ms_drift_noise_reduction_ptr),
lwe_dimension + 1, num_samples, ms_noise_reduction_key->num_zeros,
ms_noise_reduction_key->ms_input_variance,
ms_noise_reduction_key->ms_r_sigma, ms_noise_reduction_key->ms_bound,
log_modulus);
} else {
buffer->temp_lwe_array_in =
const_cast<uint64_t *>(static_cast<const uint64_t *>(lwe_array_in));
}
check_cuda_error(cudaGetLastError());
switch (buffer->pbs_variant) {
@@ -687,7 +669,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const uint64_t *>(lut_vector),
static_cast<const uint64_t *>(lut_vector_indexes),
static_cast<const uint64_t *>(buffer->temp_lwe_array_in),
static_cast<const uint64_t *>(lwe_array_in),
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
@@ -702,7 +684,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const uint64_t *>(lut_vector),
static_cast<const uint64_t *>(lut_vector_indexes),
static_cast<const uint64_t *>(buffer->temp_lwe_array_in),
static_cast<const uint64_t *>(lwe_array_in),
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
@@ -714,7 +696,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const uint64_t *>(lut_vector),
static_cast<const uint64_t *>(lut_vector_indexes),
static_cast<const uint64_t *>(buffer->temp_lwe_array_in),
static_cast<const uint64_t *>(lwe_array_in),
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples,

View File

@@ -56,9 +56,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
// The third dimension of the block is used to determine on which ciphertext
// this block is operating, in the case of batch bootstraps
const Torus *block_lwe_array_in =
(noise_reduction_type == PBS_MS_REDUCTION_T::DRIFT)
? &lwe_array_in[blockIdx.x * (lwe_dimension + 1)]
: &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
const Torus *block_lut_vector =
&lut_vector[lut_vector_indexes[blockIdx.x] * params::degree *

View File

@@ -36,7 +36,7 @@ uint64_t scratch_cuda_programmable_bootstrap_128(
template <typename InputTorus>
void executor_cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
void *stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
__uint128_t const *lut_vector, InputTorus *lwe_array_in,
__uint128_t const *lut_vector, InputTorus const *lwe_array_in,
double const *bootstrapping_key,
pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> *buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
@@ -83,7 +83,7 @@ void executor_cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
template <typename InputTorus>
void executor_cuda_programmable_bootstrap_cg_lwe_ciphertext_vector_128(
void *stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
__uint128_t const *lut_vector, InputTorus *lwe_array_in,
__uint128_t const *lut_vector, InputTorus const *lwe_array_in,
double const *bootstrapping_key,
pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> *buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
@@ -132,36 +132,17 @@ void host_programmable_bootstrap_lwe_ciphertext_vector_128(
void *stream, uint32_t gpu_index, void *lwe_array_out,
__uint128_t const *lut_vector, void const *lwe_array_in,
void const *bootstrapping_key,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_drift_noise_reduction_ptr,
void const *ms_noise_reduction_ptr,
pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> *buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
if (base_log > 64)
PANIC("Cuda error (classical PBS): base log should be <= 64")
// If the parameters contain drift noise reduction key, then apply it
if (buffer->noise_reduction_type == PBS_MS_REDUCTION_T::DRIFT) {
uint32_t log_modulus = log2(polynomial_size) + 1;
host_drift_modulus_switch<InputTorus>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<InputTorus *>(buffer->temp_lwe_array_in),
static_cast<InputTorus const *>(lwe_array_in),
static_cast<uint64_t const *>(buffer->trivial_indexes),
static_cast<const InputTorus *>(ms_noise_reduction_ptr),
lwe_dimension + 1, num_samples, ms_drift_noise_reduction_ptr->num_zeros,
ms_drift_noise_reduction_ptr->ms_input_variance,
ms_drift_noise_reduction_ptr->ms_r_sigma,
ms_drift_noise_reduction_ptr->ms_bound, log_modulus);
} else {
buffer->temp_lwe_array_in =
const_cast<InputTorus *>(static_cast<const InputTorus *>(lwe_array_in));
}
switch (buffer->pbs_variant) {
case DEFAULT:
executor_cuda_programmable_bootstrap_lwe_ciphertext_vector_128<InputTorus>(
stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
lut_vector, static_cast<InputTorus *>(buffer->temp_lwe_array_in),
lut_vector, static_cast<InputTorus const *>(lwe_array_in),
static_cast<const double *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples);
break;
@@ -169,7 +150,7 @@ void host_programmable_bootstrap_lwe_ciphertext_vector_128(
executor_cuda_programmable_bootstrap_cg_lwe_ciphertext_vector_128<
InputTorus>(
stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
lut_vector, static_cast<InputTorus *>(buffer->temp_lwe_array_in),
lut_vector, static_cast<InputTorus const *>(lwe_array_in),
static_cast<const double *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples);
break;
@@ -234,9 +215,7 @@ void host_programmable_bootstrap_lwe_ciphertext_vector_128(
void cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
void *streams, uint32_t gpu_index, void *lwe_array_out,
void const *lut_vector, void const *lwe_array_in,
void const *bootstrapping_key,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void const *ms_noise_reduction_ptr, int8_t *mem_ptr, uint32_t lwe_dimension,
void const *bootstrapping_key, int8_t *mem_ptr, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples) {
pbs_buffer_128<uint64_t, PBS_TYPE::CLASSICAL> *buffer =
@@ -245,9 +224,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
host_programmable_bootstrap_lwe_ciphertext_vector_128<uint64_t>(
streams, gpu_index, lwe_array_out,
static_cast<const __uint128_t *>(lut_vector), lwe_array_in,
bootstrapping_key, ms_noise_reduction_key, ms_noise_reduction_ptr, buffer,
lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count,
num_samples);
bootstrapping_key, buffer, lwe_dimension, glwe_dimension, polynomial_size,
base_log, level_count, num_samples);
}
/*

View File

@@ -668,7 +668,7 @@ uint64_t scratch_cuda_programmable_bootstrap_128_vector(
template <typename InputTorus, class params, bool first_iter>
__host__ void execute_step_one_128(
cudaStream_t stream, uint32_t gpu_index, __uint128_t const *lut_vector,
InputTorus *lwe_array_in, double const *bootstrapping_key,
InputTorus const *lwe_array_in, double const *bootstrapping_key,
__uint128_t *global_accumulator, double *global_join_buffer,
PBS_MS_REDUCTION_T noise_reduction_type,
uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
@@ -752,7 +752,7 @@ __host__ void execute_step_two_128(
template <typename InputTorus, class params>
__host__ void host_programmable_bootstrap_128(
cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
__uint128_t const *lut_vector, InputTorus *lwe_array_in,
__uint128_t const *lut_vector, InputTorus const *lwe_array_in,
double const *bootstrapping_key,
pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> *pbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,

View File

@@ -84,9 +84,7 @@ __global__ void device_programmable_bootstrap_tbc(
// The third dimension of the block is used to determine on which ciphertext
// this block is operating, in the case of batch bootstraps
const Torus *block_lwe_array_in =
(noise_reduction_type == PBS_MS_REDUCTION_T::DRIFT)
? &lwe_array_in[blockIdx.x * (lwe_dimension + 1)]
: &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
const Torus *block_lut_vector =
&lut_vector[lut_vector_indexes[blockIdx.x] * params::degree *

View File

@@ -43,8 +43,7 @@ uint64_t scratch_cuda_expand_without_verification_64(
void cuda_expand_without_verification_64(
CudaStreamsFFI streams, void *lwe_array_out,
const void *lwe_flattened_compact_array_in, int8_t *mem_ptr,
void *const *bsks, void *const *computing_ksks, void *const *casting_keys,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
void *const *bsks, void *const *computing_ksks, void *const *casting_keys) {
auto expand_buffer = reinterpret_cast<zk_expand_mem<uint64_t> *>(mem_ptr);
@@ -54,49 +53,49 @@ void cuda_expand_without_verification_64(
streams, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_flattened_compact_array_in),
expand_buffer, (uint64_t **)casting_keys, bsks,
(uint64_t **)(computing_ksks), ms_noise_reduction_key);
(uint64_t **)(computing_ksks));
break;
case 512:
host_expand_without_verification<uint64_t, AmortizedDegree<512>>(
streams, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_flattened_compact_array_in),
expand_buffer, (uint64_t **)casting_keys, bsks,
(uint64_t **)(computing_ksks), ms_noise_reduction_key);
(uint64_t **)(computing_ksks));
break;
case 1024:
host_expand_without_verification<uint64_t, AmortizedDegree<1024>>(
streams, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_flattened_compact_array_in),
expand_buffer, (uint64_t **)casting_keys, bsks,
(uint64_t **)(computing_ksks), ms_noise_reduction_key);
(uint64_t **)(computing_ksks));
break;
case 2048:
host_expand_without_verification<uint64_t, AmortizedDegree<2048>>(
streams, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_flattened_compact_array_in),
expand_buffer, (uint64_t **)casting_keys, bsks,
(uint64_t **)(computing_ksks), ms_noise_reduction_key);
(uint64_t **)(computing_ksks));
break;
case 4096:
host_expand_without_verification<uint64_t, AmortizedDegree<4096>>(
streams, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_flattened_compact_array_in),
expand_buffer, (uint64_t **)casting_keys, bsks,
(uint64_t **)(computing_ksks), ms_noise_reduction_key);
(uint64_t **)(computing_ksks));
break;
case 8192:
host_expand_without_verification<uint64_t, AmortizedDegree<8192>>(
streams, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_flattened_compact_array_in),
expand_buffer, (uint64_t **)casting_keys, bsks,
(uint64_t **)(computing_ksks), ms_noise_reduction_key);
(uint64_t **)(computing_ksks));
break;
case 16384:
host_expand_without_verification<uint64_t, AmortizedDegree<16384>>(
streams, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_flattened_compact_array_in),
expand_buffer, (uint64_t **)casting_keys, bsks,
(uint64_t **)(computing_ksks), ms_noise_reduction_key);
(uint64_t **)(computing_ksks));
break;
default:
PANIC("CUDA error: lwe_dimension not supported."

View File

@@ -19,8 +19,7 @@ template <typename Torus, class params>
__host__ void host_expand_without_verification(
CudaStreams streams, Torus *lwe_array_out,
const Torus *lwe_flattened_compact_array_in, zk_expand_mem<Torus> *mem_ptr,
Torus *const *casting_keys, void *const *bsks, Torus *const *compute_ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
Torus *const *casting_keys, void *const *bsks, Torus *const *compute_ksks) {
// Expand
auto casting_key_type = mem_ptr->casting_key_type;
auto expanded_lwes = mem_ptr->tmp_expanded_lwes;
@@ -96,8 +95,8 @@ __host__ void host_expand_without_verification(
auto input = new CudaRadixCiphertextFFI;
into_radix_ciphertext(input, lwe_array_input, 2 * num_lwes, lwe_dimension);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, output, input, bsks, ksks, ms_noise_reduction_key,
message_and_carry_extract_luts, 2 * num_lwes);
streams, output, input, bsks, ksks, message_and_carry_extract_luts,
2 * num_lwes);
}
template <typename Torus>

View File

@@ -191,9 +191,9 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, bootstrap) {
stream, gpu_index, (void *)d_lwe_ct_out_array,
(void *)d_lwe_output_indexes, (void *)d_lut_pbs_identity,
(void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in,
(void *)d_lwe_input_indexes, (void *)d_fourier_bsk, nullptr, nullptr,
pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
pbs_base_log, pbs_level, number_of_inputs, num_many_lut, lut_stride);
(void *)d_lwe_input_indexes, (void *)d_fourier_bsk, pbs_buffer,
lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
pbs_level, number_of_inputs, num_many_lut, lut_stride);
// Copy result back
cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
(glwe_dimension * polynomial_size + 1) *

View File

@@ -98,37 +98,8 @@ pub const PBS_TYPE_MULTI_BIT: PBS_TYPE = 0;
pub const PBS_TYPE_CLASSICAL: PBS_TYPE = 1;
pub type PBS_TYPE = ffi::c_uint;
pub const PBS_MS_REDUCTION_T_NO_REDUCTION: PBS_MS_REDUCTION_T = 0;
pub const PBS_MS_REDUCTION_T_DRIFT: PBS_MS_REDUCTION_T = 1;
pub const PBS_MS_REDUCTION_T_CENTERED: PBS_MS_REDUCTION_T = 2;
pub const PBS_MS_REDUCTION_T_CENTERED: PBS_MS_REDUCTION_T = 1;
pub type PBS_MS_REDUCTION_T = ffi::c_uint;
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct CudaModulusSwitchNoiseReductionKeyFFI {
pub ptr: *const *mut ffi::c_void,
pub num_zeros: u32,
pub ms_bound: f64,
pub ms_r_sigma: f64,
pub ms_input_variance: f64,
}
#[allow(clippy::unnecessary_operation, clippy::identity_op)]
const _: () = {
["Size of CudaModulusSwitchNoiseReductionKeyFFI"]
[::std::mem::size_of::<CudaModulusSwitchNoiseReductionKeyFFI>() - 40usize];
["Alignment of CudaModulusSwitchNoiseReductionKeyFFI"]
[::std::mem::align_of::<CudaModulusSwitchNoiseReductionKeyFFI>() - 8usize];
["Offset of field: CudaModulusSwitchNoiseReductionKeyFFI::ptr"]
[::std::mem::offset_of!(CudaModulusSwitchNoiseReductionKeyFFI, ptr) - 0usize];
["Offset of field: CudaModulusSwitchNoiseReductionKeyFFI::num_zeros"]
[::std::mem::offset_of!(CudaModulusSwitchNoiseReductionKeyFFI, num_zeros) - 8usize];
["Offset of field: CudaModulusSwitchNoiseReductionKeyFFI::ms_bound"]
[::std::mem::offset_of!(CudaModulusSwitchNoiseReductionKeyFFI, ms_bound) - 16usize];
["Offset of field: CudaModulusSwitchNoiseReductionKeyFFI::ms_r_sigma"]
[::std::mem::offset_of!(CudaModulusSwitchNoiseReductionKeyFFI, ms_r_sigma) - 24usize];
["Offset of field: CudaModulusSwitchNoiseReductionKeyFFI::ms_input_variance"][::std::mem::offset_of!(
CudaModulusSwitchNoiseReductionKeyFFI,
ms_input_variance
) - 32usize];
};
pub const SHIFT_OR_ROTATE_TYPE_LEFT_SHIFT: SHIFT_OR_ROTATE_TYPE = 0;
pub const SHIFT_OR_ROTATE_TYPE_RIGHT_SHIFT: SHIFT_OR_ROTATE_TYPE = 1;
pub const SHIFT_OR_ROTATE_TYPE_LEFT_ROTATE: SHIFT_OR_ROTATE_TYPE = 2;
@@ -382,7 +353,6 @@ unsafe extern "C" {
input_radix_lwe: *const CudaRadixCiphertextFFI,
mem_ptr: *mut i8,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
bsks: *const *mut ffi::c_void,
);
}
@@ -422,7 +392,6 @@ unsafe extern "C" {
input_radix_lwe_2: *const CudaRadixCiphertextFFI,
mem_ptr: *mut i8,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
bsks: *const *mut ffi::c_void,
num_radix_blocks: u32,
shift: u32,
@@ -441,7 +410,6 @@ unsafe extern "C" {
input_radix_lwe: *const CudaRadixCiphertextFFI,
mem_ptr: *mut i8,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
bsks: *const *mut ffi::c_void,
num_luts: u32,
lut_stride: u32,
@@ -472,7 +440,6 @@ unsafe extern "C" {
input_blocks: *mut CudaRadixCiphertextFFI,
mem_ptr: *mut i8,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
bsks: *const *mut ffi::c_void,
num_blocks: u32,
);
@@ -512,7 +479,6 @@ unsafe extern "C" {
is_bool_right: bool,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
mem_ptr: *mut i8,
polynomial_size: u32,
num_blocks: u32,
@@ -572,7 +538,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
);
}
unsafe extern "C" {
@@ -605,7 +570,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
);
}
unsafe extern "C" {
@@ -651,7 +615,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
);
}
unsafe extern "C" {
@@ -692,7 +655,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
);
}
unsafe extern "C" {
@@ -705,7 +667,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
num_scalar_blocks: u32,
);
}
@@ -743,7 +704,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
);
}
unsafe extern "C" {
@@ -757,7 +717,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
);
}
unsafe extern "C" {
@@ -794,7 +753,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
);
}
unsafe extern "C" {
@@ -830,7 +788,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
);
}
unsafe extern "C" {
@@ -892,7 +849,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
requested_flag: u32,
uses_carry: u32,
);
@@ -907,7 +863,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
requested_flag: u32,
uses_carry: u32,
);
@@ -953,7 +908,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
compute_overflow: u32,
uses_input_borrow: u32,
);
@@ -994,7 +948,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
);
}
unsafe extern "C" {
@@ -1033,7 +986,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
polynomial_size: u32,
message_modulus: u32,
num_scalars: u32,
@@ -1078,7 +1030,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
);
}
unsafe extern "C" {
@@ -1113,7 +1064,6 @@ unsafe extern "C" {
generates_or_propagates: *mut CudaRadixCiphertextFFI,
mem_ptr: *mut i8,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
bsks: *const *mut ffi::c_void,
num_blocks: u32,
);
@@ -1160,7 +1110,6 @@ unsafe extern "C" {
is_signed: bool,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
);
}
unsafe extern "C" {
@@ -1195,7 +1144,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
num_radix_blocks: u32,
);
}
@@ -1234,7 +1182,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
num_radix_blocks: u32,
);
}
@@ -1288,7 +1235,6 @@ unsafe extern "C" {
input_radix_lwe: *const CudaRadixCiphertextFFI,
mem_ptr: *mut i8,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
bsks: *const *mut ffi::c_void,
);
}
@@ -1330,7 +1276,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
requested_flag: u32,
uses_carry: u32,
);
@@ -1369,7 +1314,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
scalar_divisor_ffi: *const CudaScalarDivisorFFI,
);
}
@@ -1409,7 +1353,6 @@ unsafe extern "C" {
num_additional_blocks: u32,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
);
}
unsafe extern "C" {
@@ -1446,7 +1389,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
scalar_divisor_ffi: *const CudaScalarDivisorFFI,
numerator_bits: u32,
);
@@ -1487,7 +1429,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
scalar_divisor_ffi: *const CudaScalarDivisorFFI,
divisor_has_at_least_one_set: *const u64,
decomposed_divisor: *const u64,
@@ -1533,7 +1474,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
scalar_divisor_ffi: *const CudaScalarDivisorFFI,
divisor_has_at_least_one_set: *const u64,
decomposed_divisor: *const u64,
@@ -1578,7 +1518,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
);
}
unsafe extern "C" {
@@ -1617,7 +1556,6 @@ unsafe extern "C" {
num_blocks_to_process: u32,
mem: *mut i8,
bsks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
);
}
unsafe extern "C" {
@@ -1659,7 +1597,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
);
}
unsafe extern "C" {
@@ -1832,7 +1769,6 @@ unsafe extern "C" {
bsks: *const *mut ffi::c_void,
computing_ksks: *const *mut ffi::c_void,
casting_keys: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
);
}
unsafe extern "C" {
@@ -2300,8 +2236,6 @@ unsafe extern "C" {
lwe_array_in: *const ffi::c_void,
lwe_input_indexes: *const ffi::c_void,
bootstrapping_key: *const ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
ms_noise_reduction_ptr: *mut ffi::c_void,
buffer: *mut i8,
lwe_dimension: u32,
glwe_dimension: u32,
@@ -2321,8 +2255,6 @@ unsafe extern "C" {
lut_vector: *const ffi::c_void,
lwe_array_in: *const ffi::c_void,
bootstrapping_key: *const ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
ms_noise_reduction_ptr: *const ffi::c_void,
buffer: *mut i8,
lwe_dimension: u32,
glwe_dimension: u32,