mirror of
https://github.com/zama-ai/tfhe-rs.git
synced 2026-01-07 22:04:10 -05:00
refactor(gpu): vector_find's functions to backend
This commit is contained in:
committed by
Agnès Leroy
parent
54cb87c491
commit
0aa0918fea
@@ -491,23 +491,6 @@ void cuda_integer_div_rem_radix_ciphertext_64(
|
||||
void cleanup_cuda_integer_div_rem(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, void const *input_lut,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
|
||||
uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
uint64_t lut_degree, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_integer_compute_prefix_sum_hillis_steele_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
|
||||
CudaRadixCiphertextFFI *generates_or_propagates, int8_t *mem_ptr,
|
||||
void *const *ksks, void *const *bsks, uint32_t num_blocks);
|
||||
|
||||
void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr_void);
|
||||
|
||||
void cuda_integer_reverse_blocks_64_inplace(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *lwe_array);
|
||||
|
||||
@@ -781,60 +764,6 @@ void cuda_integer_ilog2_64(
|
||||
void cleanup_cuda_integer_ilog2_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_cuda_compute_equality_selectors_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_possible_values, uint32_t num_blocks, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_compute_equality_selectors_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out_list,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in, uint32_t num_blocks,
|
||||
const uint64_t *h_decomposed_cleartexts, int8_t *mem, void *const *bsks,
|
||||
void *const *ksks);
|
||||
|
||||
void cleanup_cuda_compute_equality_selectors_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_cuda_create_possible_results_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_possible_values, uint32_t num_blocks, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_create_possible_results_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out_list,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in_list,
|
||||
uint32_t num_possible_values, const uint64_t *h_decomposed_cleartexts,
|
||||
uint32_t num_blocks, int8_t *mem, void *const *bsks, void *const *ksks);
|
||||
|
||||
void cleanup_cuda_create_possible_results_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_cuda_aggregate_one_hot_vector_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_blocks, uint32_t num_matches, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_aggregate_one_hot_vector_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in_list,
|
||||
uint32_t num_input_ciphertexts, uint32_t num_blocks, int8_t *mem,
|
||||
void *const *bsks, void *const *ksks);
|
||||
|
||||
void cleanup_cuda_aggregate_one_hot_vector_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_cuda_unchecked_match_value_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
@@ -894,6 +823,185 @@ void cuda_unchecked_match_value_or_64(
|
||||
|
||||
void cleanup_cuda_unchecked_match_value_or_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_cuda_unchecked_contains_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_inputs, uint32_t num_blocks, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_unchecked_contains_64(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *output,
|
||||
CudaRadixCiphertextFFI const *inputs,
|
||||
CudaRadixCiphertextFFI const *value,
|
||||
uint32_t num_inputs, uint32_t num_blocks,
|
||||
int8_t *mem, void *const *bsks,
|
||||
void *const *ksks);
|
||||
|
||||
void cleanup_cuda_unchecked_contains_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_cuda_unchecked_contains_clear_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_inputs, uint32_t num_blocks, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_unchecked_contains_clear_64(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *output,
|
||||
CudaRadixCiphertextFFI const *inputs,
|
||||
const uint64_t *h_clear_val,
|
||||
uint32_t num_inputs, uint32_t num_blocks,
|
||||
int8_t *mem, void *const *bsks,
|
||||
void *const *ksks);
|
||||
|
||||
void cleanup_cuda_unchecked_contains_clear_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_cuda_unchecked_is_in_clears_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_clears, uint32_t num_blocks, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_unchecked_is_in_clears_64(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *output,
|
||||
CudaRadixCiphertextFFI const *input,
|
||||
const uint64_t *h_cleartexts,
|
||||
uint32_t num_clears, uint32_t num_blocks,
|
||||
int8_t *mem, void *const *bsks,
|
||||
void *const *ksks);
|
||||
|
||||
void cleanup_cuda_unchecked_is_in_clears_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_cuda_compute_final_index_from_selectors_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_inputs, uint32_t num_blocks_index, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_compute_final_index_from_selectors_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *index_ct,
|
||||
CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *selectors,
|
||||
uint32_t num_inputs, uint32_t num_blocks_index, int8_t *mem,
|
||||
void *const *bsks, void *const *ksks);
|
||||
|
||||
void cleanup_cuda_compute_final_index_from_selectors_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_cuda_unchecked_index_in_clears_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_clears, uint32_t num_blocks, uint32_t num_blocks_index,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_unchecked_index_in_clears_64(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *index_ct,
|
||||
CudaRadixCiphertextFFI *match_ct,
|
||||
CudaRadixCiphertextFFI const *input,
|
||||
const uint64_t *h_cleartexts,
|
||||
uint32_t num_clears, uint32_t num_blocks,
|
||||
uint32_t num_blocks_index, int8_t *mem,
|
||||
void *const *bsks, void *const *ksks);
|
||||
|
||||
void cleanup_cuda_unchecked_index_in_clears_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_cuda_unchecked_first_index_in_clears_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_unique, uint32_t num_blocks, uint32_t num_blocks_index,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_unchecked_first_index_in_clears_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *index_ct,
|
||||
CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *input,
|
||||
const uint64_t *h_unique_values, const uint64_t *h_unique_indices,
|
||||
uint32_t num_unique, uint32_t num_blocks, uint32_t num_blocks_index,
|
||||
int8_t *mem, void *const *bsks, void *const *ksks);
|
||||
|
||||
void cleanup_cuda_unchecked_first_index_in_clears_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_cuda_unchecked_first_index_of_clear_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_unchecked_first_index_of_clear_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *index_ct,
|
||||
CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *inputs,
|
||||
const uint64_t *h_clear_val, uint32_t num_inputs, uint32_t num_blocks,
|
||||
uint32_t num_blocks_index, int8_t *mem, void *const *bsks,
|
||||
void *const *ksks);
|
||||
|
||||
void cleanup_cuda_unchecked_first_index_of_clear_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_cuda_unchecked_first_index_of_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_unchecked_first_index_of_64(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *index_ct,
|
||||
CudaRadixCiphertextFFI *match_ct,
|
||||
CudaRadixCiphertextFFI const *inputs,
|
||||
CudaRadixCiphertextFFI const *value,
|
||||
uint32_t num_inputs, uint32_t num_blocks,
|
||||
uint32_t num_blocks_index, int8_t *mem,
|
||||
void *const *bsks, void *const *ksks);
|
||||
|
||||
void cleanup_cuda_unchecked_first_index_of_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_cuda_unchecked_index_of_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_unchecked_index_of_64(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *index_ct,
|
||||
CudaRadixCiphertextFFI *match_ct,
|
||||
CudaRadixCiphertextFFI const *inputs,
|
||||
CudaRadixCiphertextFFI const *value,
|
||||
uint32_t num_inputs, uint32_t num_blocks,
|
||||
uint32_t num_blocks_index, int8_t *mem,
|
||||
void *const *bsks, void *const *ksks);
|
||||
|
||||
void cleanup_cuda_unchecked_index_of_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
} // extern C
|
||||
|
||||
#endif // CUDA_INTEGER_H
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -284,46 +284,6 @@ void cleanup_cuda_apply_bivariate_lut_64(CudaStreamsFFI streams,
|
||||
POP_RANGE()
|
||||
}
|
||||
|
||||
// FFI entry point that prepares the scratch LUT object used by the
// Hillis-Steele prefix-sum wrapper.
//
// Packs the cryptographic parameters into an int_radix_params and forwards to
// scratch_cuda_apply_bivariate_lut<uint64_t>, which receives `mem_ptr`
// reinterpreted as an int_radix_lut<uint64_t> handle and `input_lut` viewed as
// an array of uint64_t.
//
// Returns the value produced by scratch_cuda_apply_bivariate_lut<uint64_t>
// (presumably the size of the tracked allocation -- confirm against the
// template).
uint64_t scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
    CudaStreamsFFI streams, int8_t **mem_ptr, void const *input_lut,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
    uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    uint64_t lut_degree, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {

  // This entry point does not take the big LWE dimension as an argument; it is
  // derived from the GLWE parameters instead.
  const uint32_t big_lwe_dimension = glwe_dimension * polynomial_size;

  int_radix_params radix_params(
      pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension,
      lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
      grouping_factor, message_modulus, carry_modulus, noise_reduction_type);

  auto **lut_handle = reinterpret_cast<int_radix_lut<uint64_t> **>(mem_ptr);
  const auto *lut_values = static_cast<const uint64_t *>(input_lut);

  return scratch_cuda_apply_bivariate_lut<uint64_t>(
      CudaStreams(streams), lut_handle, lut_values, num_radix_blocks,
      radix_params, lut_degree, allocate_gpu_memory);
}
|
||||
|
||||
// FFI entry point that runs the Hillis-Steele prefix sum.
//
// `mem_ptr` is the opaque handle filled in by the matching scratch function; it
// is reinterpreted as an int_radix_lut<uint64_t> and handed, together with the
// ciphertexts and keys, to host_compute_prefix_sum_hillis_steele<uint64_t>.
void cuda_integer_compute_prefix_sum_hillis_steele_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
    CudaRadixCiphertextFFI *generates_or_propagates, int8_t *mem_ptr,
    void *const *ksks, void *const *bsks, uint32_t num_radix_blocks) {

  auto *lut = reinterpret_cast<int_radix_lut<uint64_t> *>(mem_ptr);

  // NOTE(review): this cast drops the inner const of `ksks`, unlike the
  // `uint64_t *const *` casts used by the other wrappers; kept as-is because
  // the callee's signature is not visible here.
  host_compute_prefix_sum_hillis_steele<uint64_t>(
      CudaStreams(streams), output_radix_lwe, generates_or_propagates, lut,
      bsks, (uint64_t **)(ksks), num_radix_blocks);
}
|
||||
|
||||
// FFI entry point that tears down the LUT object created by the matching
// scratch function.
//
// Calls release() on the int_radix_lut<uint64_t> stored behind `mem_ptr_void`,
// deletes it, and resets the caller's handle to nullptr.
//
// A null `mem_ptr_void`, or a handle that is already nullptr (for example after
// a previous cleanup call), is treated as "nothing to free" instead of being
// dereferenced.
void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
    CudaStreamsFFI streams, int8_t **mem_ptr_void) {
  if (mem_ptr_void == nullptr || *mem_ptr_void == nullptr)
    return;

  auto *mem_ptr = reinterpret_cast<int_radix_lut<uint64_t> *>(*mem_ptr_void);
  mem_ptr->release(CudaStreams(streams));
  delete mem_ptr;
  // Clear the handle so a repeated cleanup hits the guard above.
  *mem_ptr_void = nullptr;
}
|
||||
|
||||
void cuda_integer_reverse_blocks_64_inplace(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *lwe_array) {
|
||||
|
||||
|
||||
@@ -1,133 +1,5 @@
|
||||
#include "integer/vector_find.cuh"
|
||||
|
||||
// FFI entry point that prepares the scratch buffer for
// cuda_compute_equality_selectors_64.
//
// Bundles the cryptographic parameters into an int_radix_params and forwards to
// scratch_cuda_compute_equality_selectors<uint64_t>, with `mem_ptr`
// reinterpreted as an int_equality_selectors_buffer<uint64_t> handle.
//
// Returns the value produced by that template (presumably the size of the
// tracked allocation -- confirm against the template).
uint64_t scratch_cuda_compute_equality_selectors_64(
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_possible_values, uint32_t num_blocks, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {

  int_radix_params radix_params(
      pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension,
      small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
      grouping_factor, message_modulus, carry_modulus, noise_reduction_type);

  auto **buffer_handle =
      reinterpret_cast<int_equality_selectors_buffer<uint64_t> **>(mem_ptr);

  return scratch_cuda_compute_equality_selectors<uint64_t>(
      CudaStreams(streams), buffer_handle, radix_params, num_possible_values,
      num_blocks, allocate_gpu_memory);
}
|
||||
|
||||
// FFI entry point that forwards to host_compute_equality_selectors<uint64_t>.
//
// `mem` is the opaque handle produced by the matching scratch function and is
// reinterpreted as an int_equality_selectors_buffer<uint64_t>; `ksks` is viewed
// as an array of uint64_t pointers. All other arguments are passed through
// unchanged.
void cuda_compute_equality_selectors_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out_list,
    CudaRadixCiphertextFFI const *lwe_array_in, uint32_t num_blocks,
    const uint64_t *h_decomposed_cleartexts, int8_t *mem, void *const *bsks,
    void *const *ksks) {

  auto *buffer =
      reinterpret_cast<int_equality_selectors_buffer<uint64_t> *>(mem);
  uint64_t *const *keyswitch_keys =
      reinterpret_cast<uint64_t *const *>(ksks);

  host_compute_equality_selectors<uint64_t>(
      CudaStreams(streams), lwe_array_out_list, lwe_array_in, num_blocks,
      h_decomposed_cleartexts, buffer, bsks, keyswitch_keys);
}
|
||||
|
||||
// FFI entry point that tears down the buffer created by
// scratch_cuda_compute_equality_selectors_64.
//
// Calls release() on the int_equality_selectors_buffer<uint64_t> stored behind
// `mem_ptr_void`, deletes it, and resets the caller's handle to nullptr.
//
// A null `mem_ptr_void`, or a handle that is already nullptr (for example after
// a previous cleanup call), is treated as "nothing to free" instead of being
// dereferenced.
void cleanup_cuda_compute_equality_selectors_64(CudaStreamsFFI streams,
                                                int8_t **mem_ptr_void) {
  if (mem_ptr_void == nullptr || *mem_ptr_void == nullptr)
    return;

  auto *mem_ptr = reinterpret_cast<int_equality_selectors_buffer<uint64_t> *>(
      *mem_ptr_void);

  mem_ptr->release(CudaStreams(streams));

  delete mem_ptr;
  // Clear the handle so a repeated cleanup hits the guard above.
  *mem_ptr_void = nullptr;
}
|
||||
|
||||
// FFI entry point that prepares the scratch buffer for
// cuda_create_possible_results_64.
//
// Bundles the cryptographic parameters into an int_radix_params and forwards to
// scratch_cuda_create_possible_results<uint64_t>, with `mem_ptr` reinterpreted
// as an int_possible_results_buffer<uint64_t> handle.
//
// Returns the value produced by that template (presumably the size of the
// tracked allocation -- confirm against the template).
uint64_t scratch_cuda_create_possible_results_64(
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_possible_values, uint32_t num_blocks, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {

  int_radix_params radix_params(
      pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension,
      small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
      grouping_factor, message_modulus, carry_modulus, noise_reduction_type);

  auto **buffer_handle =
      reinterpret_cast<int_possible_results_buffer<uint64_t> **>(mem_ptr);

  // The template is given num_blocks before num_possible_values, i.e. the
  // reverse of this function's own parameter order.
  return scratch_cuda_create_possible_results<uint64_t>(
      CudaStreams(streams), buffer_handle, radix_params, num_blocks,
      num_possible_values, allocate_gpu_memory);
}
|
||||
|
||||
// FFI entry point that forwards to host_create_possible_results<uint64_t>.
//
// `mem` is the opaque handle produced by the matching scratch function and is
// reinterpreted as an int_possible_results_buffer<uint64_t>; `ksks` is viewed
// as an array of uint64_t pointers. All other arguments are passed through
// unchanged.
void cuda_create_possible_results_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out_list,
    CudaRadixCiphertextFFI const *lwe_array_in_list,
    uint32_t num_possible_values, const uint64_t *h_decomposed_cleartexts,
    uint32_t num_blocks, int8_t *mem, void *const *bsks, void *const *ksks) {

  auto *buffer = reinterpret_cast<int_possible_results_buffer<uint64_t> *>(mem);
  uint64_t *const *keyswitch_keys =
      reinterpret_cast<uint64_t *const *>(ksks);

  host_create_possible_results<uint64_t>(
      CudaStreams(streams), lwe_array_out_list, lwe_array_in_list,
      num_possible_values, h_decomposed_cleartexts, num_blocks, buffer, bsks,
      keyswitch_keys);
}
|
||||
|
||||
// FFI entry point that tears down the buffer created by
// scratch_cuda_create_possible_results_64.
//
// Calls release() on the int_possible_results_buffer<uint64_t> stored behind
// `mem_ptr_void`, deletes it, and resets the caller's handle to nullptr.
//
// A null `mem_ptr_void`, or a handle that is already nullptr (for example after
// a previous cleanup call), is treated as "nothing to free" instead of being
// dereferenced.
void cleanup_cuda_create_possible_results_64(CudaStreamsFFI streams,
                                             int8_t **mem_ptr_void) {
  if (mem_ptr_void == nullptr || *mem_ptr_void == nullptr)
    return;

  auto *mem_ptr =
      reinterpret_cast<int_possible_results_buffer<uint64_t> *>(*mem_ptr_void);

  mem_ptr->release(CudaStreams(streams));

  delete mem_ptr;
  // Clear the handle so a repeated cleanup hits the guard above.
  *mem_ptr_void = nullptr;
}
|
||||
|
||||
// FFI entry point that prepares the scratch buffer for
// cuda_aggregate_one_hot_vector_64.
//
// Bundles the cryptographic parameters into an int_radix_params and forwards to
// scratch_cuda_aggregate_one_hot_vector<uint64_t>, with `mem_ptr` reinterpreted
// as an int_aggregate_one_hot_buffer<uint64_t> handle.
//
// Returns the value produced by that template (presumably the size of the
// tracked allocation -- confirm against the template).
uint64_t scratch_cuda_aggregate_one_hot_vector_64(
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_blocks, uint32_t num_matches, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {

  int_radix_params radix_params(
      pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension,
      small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
      grouping_factor, message_modulus, carry_modulus, noise_reduction_type);

  auto **buffer_handle =
      reinterpret_cast<int_aggregate_one_hot_buffer<uint64_t> **>(mem_ptr);

  return scratch_cuda_aggregate_one_hot_vector<uint64_t>(
      CudaStreams(streams), buffer_handle, radix_params, num_blocks,
      num_matches, allocate_gpu_memory);
}
|
||||
|
||||
// FFI entry point that forwards to host_aggregate_one_hot_vector<uint64_t>.
//
// `mem` is the opaque handle produced by the matching scratch function and is
// reinterpreted as an int_aggregate_one_hot_buffer<uint64_t>; `ksks` is viewed
// as an array of uint64_t pointers. All other arguments are passed through
// unchanged.
void cuda_aggregate_one_hot_vector_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_in_list,
    uint32_t num_input_ciphertexts, uint32_t num_blocks, int8_t *mem,
    void *const *bsks, void *const *ksks) {

  auto *buffer =
      reinterpret_cast<int_aggregate_one_hot_buffer<uint64_t> *>(mem);
  uint64_t *const *keyswitch_keys =
      reinterpret_cast<uint64_t *const *>(ksks);

  host_aggregate_one_hot_vector<uint64_t>(
      CudaStreams(streams), lwe_array_out, lwe_array_in_list,
      num_input_ciphertexts, num_blocks, buffer, bsks, keyswitch_keys);
}
|
||||
|
||||
// FFI entry point that tears down the buffer created by
// scratch_cuda_aggregate_one_hot_vector_64.
//
// Calls release() on the int_aggregate_one_hot_buffer<uint64_t> stored behind
// `mem_ptr_void`, deletes it, and resets the caller's handle to nullptr.
//
// A null `mem_ptr_void`, or a handle that is already nullptr (for example after
// a previous cleanup call), is treated as "nothing to free" instead of being
// dereferenced.
void cleanup_cuda_aggregate_one_hot_vector_64(CudaStreamsFFI streams,
                                              int8_t **mem_ptr_void) {
  if (mem_ptr_void == nullptr || *mem_ptr_void == nullptr)
    return;

  auto *mem_ptr =
      reinterpret_cast<int_aggregate_one_hot_buffer<uint64_t> *>(*mem_ptr_void);

  mem_ptr->release(CudaStreams(streams));

  delete mem_ptr;
  // Clear the handle so a repeated cleanup hits the guard above.
  *mem_ptr_void = nullptr;
}
|
||||
|
||||
uint64_t scratch_cuda_unchecked_match_value_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
@@ -221,3 +93,410 @@ void cleanup_cuda_unchecked_match_value_or_64(CudaStreamsFFI streams,
|
||||
delete mem_ptr;
|
||||
*mem_ptr_void = nullptr;
|
||||
}
|
||||
|
||||
// FFI entry point that prepares the scratch buffer for
// cuda_unchecked_contains_64.
//
// Bundles the cryptographic parameters into an int_radix_params and forwards to
// scratch_cuda_unchecked_contains<uint64_t>, with `mem_ptr` reinterpreted as an
// int_unchecked_contains_buffer<uint64_t> handle.
//
// Returns the value produced by that template (presumably the size of the
// tracked allocation -- confirm against the template).
uint64_t scratch_cuda_unchecked_contains_64(
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_inputs, uint32_t num_blocks, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {

  int_radix_params radix_params(
      pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension,
      small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
      grouping_factor, message_modulus, carry_modulus, noise_reduction_type);

  auto **buffer_handle =
      reinterpret_cast<int_unchecked_contains_buffer<uint64_t> **>(mem_ptr);

  return scratch_cuda_unchecked_contains<uint64_t>(
      CudaStreams(streams), buffer_handle, radix_params, num_inputs, num_blocks,
      allocate_gpu_memory);
}
|
||||
|
||||
// FFI entry point that forwards to host_unchecked_contains<uint64_t>.
//
// `mem` is the opaque handle produced by the matching scratch function and is
// reinterpreted as an int_unchecked_contains_buffer<uint64_t>; `ksks` is viewed
// as an array of uint64_t pointers. All other arguments are passed through
// unchanged.
void cuda_unchecked_contains_64(CudaStreamsFFI streams,
                                CudaRadixCiphertextFFI *output,
                                CudaRadixCiphertextFFI const *inputs,
                                CudaRadixCiphertextFFI const *value,
                                uint32_t num_inputs, uint32_t num_blocks,
                                int8_t *mem, void *const *bsks,
                                void *const *ksks) {

  auto *buffer =
      reinterpret_cast<int_unchecked_contains_buffer<uint64_t> *>(mem);
  uint64_t *const *keyswitch_keys =
      reinterpret_cast<uint64_t *const *>(ksks);

  host_unchecked_contains<uint64_t>(CudaStreams(streams), output, inputs, value,
                                    num_inputs, num_blocks, buffer, bsks,
                                    keyswitch_keys);
}
|
||||
|
||||
// FFI entry point that tears down the buffer created by
// scratch_cuda_unchecked_contains_64.
//
// Calls release() on the int_unchecked_contains_buffer<uint64_t> stored behind
// `mem_ptr_void`, deletes it, and resets the caller's handle to nullptr.
//
// A null `mem_ptr_void`, or a handle that is already nullptr (for example after
// a previous cleanup call), is treated as "nothing to free" instead of being
// dereferenced.
void cleanup_cuda_unchecked_contains_64(CudaStreamsFFI streams,
                                        int8_t **mem_ptr_void) {
  if (mem_ptr_void == nullptr || *mem_ptr_void == nullptr)
    return;

  auto *mem_ptr = reinterpret_cast<int_unchecked_contains_buffer<uint64_t> *>(
      *mem_ptr_void);

  mem_ptr->release(CudaStreams(streams));

  delete mem_ptr;
  // Clear the handle so a repeated cleanup hits the guard above.
  *mem_ptr_void = nullptr;
}
|
||||
|
||||
// FFI entry point that prepares the scratch buffer for
// cuda_unchecked_contains_clear_64.
//
// Bundles the cryptographic parameters into an int_radix_params and forwards to
// scratch_cuda_unchecked_contains_clear<uint64_t>, with `mem_ptr` reinterpreted
// as an int_unchecked_contains_clear_buffer<uint64_t> handle.
//
// Returns the value produced by that template (presumably the size of the
// tracked allocation -- confirm against the template).
uint64_t scratch_cuda_unchecked_contains_clear_64(
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_inputs, uint32_t num_blocks, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {

  int_radix_params radix_params(
      pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension,
      small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
      grouping_factor, message_modulus, carry_modulus, noise_reduction_type);

  auto **buffer_handle =
      reinterpret_cast<int_unchecked_contains_clear_buffer<uint64_t> **>(
          mem_ptr);

  return scratch_cuda_unchecked_contains_clear<uint64_t>(
      CudaStreams(streams), buffer_handle, radix_params, num_inputs, num_blocks,
      allocate_gpu_memory);
}
|
||||
|
||||
// FFI entry point that forwards to host_unchecked_contains_clear<uint64_t>.
//
// `mem` is the opaque handle produced by the matching scratch function and is
// reinterpreted as an int_unchecked_contains_clear_buffer<uint64_t>; `ksks` is
// viewed as an array of uint64_t pointers. All other arguments are passed
// through unchanged.
void cuda_unchecked_contains_clear_64(CudaStreamsFFI streams,
                                      CudaRadixCiphertextFFI *output,
                                      CudaRadixCiphertextFFI const *inputs,
                                      const uint64_t *h_clear_val,
                                      uint32_t num_inputs, uint32_t num_blocks,
                                      int8_t *mem, void *const *bsks,
                                      void *const *ksks) {

  auto *buffer =
      reinterpret_cast<int_unchecked_contains_clear_buffer<uint64_t> *>(mem);
  uint64_t *const *keyswitch_keys =
      reinterpret_cast<uint64_t *const *>(ksks);

  host_unchecked_contains_clear<uint64_t>(
      CudaStreams(streams), output, inputs, h_clear_val, num_inputs, num_blocks,
      buffer, bsks, keyswitch_keys);
}
|
||||
|
||||
// FFI entry point that tears down the buffer created by
// scratch_cuda_unchecked_contains_clear_64.
//
// Calls release() on the int_unchecked_contains_clear_buffer<uint64_t> stored
// behind `mem_ptr_void`, deletes it, and resets the caller's handle to nullptr.
//
// A null `mem_ptr_void`, or a handle that is already nullptr (for example after
// a previous cleanup call), is treated as "nothing to free" instead of being
// dereferenced.
void cleanup_cuda_unchecked_contains_clear_64(CudaStreamsFFI streams,
                                              int8_t **mem_ptr_void) {
  if (mem_ptr_void == nullptr || *mem_ptr_void == nullptr)
    return;

  auto *mem_ptr =
      reinterpret_cast<int_unchecked_contains_clear_buffer<uint64_t> *>(
          *mem_ptr_void);

  mem_ptr->release(CudaStreams(streams));

  delete mem_ptr;
  // Clear the handle so a repeated cleanup hits the guard above.
  *mem_ptr_void = nullptr;
}
|
||||
|
||||
// FFI entry point that prepares the scratch buffer for
// cuda_unchecked_is_in_clears_64.
//
// Bundles the cryptographic parameters into an int_radix_params and forwards to
// scratch_cuda_unchecked_is_in_clears<uint64_t>, with `mem_ptr` reinterpreted
// as an int_unchecked_is_in_clears_buffer<uint64_t> handle.
//
// Returns the value produced by that template (presumably the size of the
// tracked allocation -- confirm against the template).
uint64_t scratch_cuda_unchecked_is_in_clears_64(
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_clears, uint32_t num_blocks, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {

  int_radix_params radix_params(
      pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension,
      small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
      grouping_factor, message_modulus, carry_modulus, noise_reduction_type);

  auto **buffer_handle =
      reinterpret_cast<int_unchecked_is_in_clears_buffer<uint64_t> **>(mem_ptr);

  return scratch_cuda_unchecked_is_in_clears<uint64_t>(
      CudaStreams(streams), buffer_handle, radix_params, num_clears, num_blocks,
      allocate_gpu_memory);
}
|
||||
|
||||
// FFI entry point that forwards to host_unchecked_is_in_clears<uint64_t>.
//
// `mem` is the opaque handle produced by the matching scratch function and is
// reinterpreted as an int_unchecked_is_in_clears_buffer<uint64_t>; `ksks` is
// viewed as an array of uint64_t pointers. All other arguments are passed
// through unchanged.
void cuda_unchecked_is_in_clears_64(CudaStreamsFFI streams,
                                    CudaRadixCiphertextFFI *output,
                                    CudaRadixCiphertextFFI const *input,
                                    const uint64_t *h_cleartexts,
                                    uint32_t num_clears, uint32_t num_blocks,
                                    int8_t *mem, void *const *bsks,
                                    void *const *ksks) {

  auto *buffer =
      reinterpret_cast<int_unchecked_is_in_clears_buffer<uint64_t> *>(mem);
  uint64_t *const *keyswitch_keys =
      reinterpret_cast<uint64_t *const *>(ksks);

  host_unchecked_is_in_clears<uint64_t>(
      CudaStreams(streams), output, input, h_cleartexts, num_clears, num_blocks,
      buffer, bsks, keyswitch_keys);
}
|
||||
|
||||
// FFI entry point that tears down the buffer created by
// scratch_cuda_unchecked_is_in_clears_64.
//
// Calls release() on the int_unchecked_is_in_clears_buffer<uint64_t> stored
// behind `mem_ptr_void`, deletes it, and resets the caller's handle to nullptr.
//
// A null `mem_ptr_void`, or a handle that is already nullptr (for example after
// a previous cleanup call), is treated as "nothing to free" instead of being
// dereferenced.
void cleanup_cuda_unchecked_is_in_clears_64(CudaStreamsFFI streams,
                                            int8_t **mem_ptr_void) {
  if (mem_ptr_void == nullptr || *mem_ptr_void == nullptr)
    return;

  auto *mem_ptr =
      reinterpret_cast<int_unchecked_is_in_clears_buffer<uint64_t> *>(
          *mem_ptr_void);

  mem_ptr->release(CudaStreams(streams));

  delete mem_ptr;
  // Clear the handle so a repeated cleanup hits the guard above.
  *mem_ptr_void = nullptr;
}
|
||||
|
||||
// FFI entry point that prepares the scratch buffer for
// cuda_compute_final_index_from_selectors_64.
//
// Bundles the cryptographic parameters into an int_radix_params and forwards to
// scratch_cuda_compute_final_index_from_selectors<uint64_t>, with `mem_ptr`
// reinterpreted as an int_final_index_from_selectors_buffer<uint64_t> handle.
//
// Returns the value produced by that template (presumably the size of the
// tracked allocation -- confirm against the template).
uint64_t scratch_cuda_compute_final_index_from_selectors_64(
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_inputs, uint32_t num_blocks_index, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {

  int_radix_params radix_params(
      pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension,
      small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
      grouping_factor, message_modulus, carry_modulus, noise_reduction_type);

  auto **buffer_handle =
      reinterpret_cast<int_final_index_from_selectors_buffer<uint64_t> **>(
          mem_ptr);

  return scratch_cuda_compute_final_index_from_selectors<uint64_t>(
      CudaStreams(streams), buffer_handle, radix_params, num_inputs,
      num_blocks_index, allocate_gpu_memory);
}
|
||||
|
||||
// FFI entry point that forwards to
// host_compute_final_index_from_selectors<uint64_t>.
//
// `mem` is the opaque handle produced by the matching scratch function and is
// reinterpreted as an int_final_index_from_selectors_buffer<uint64_t>; `ksks`
// is viewed as an array of uint64_t pointers. All other arguments are passed
// through unchanged.
void cuda_compute_final_index_from_selectors_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *index_ct,
    CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *selectors,
    uint32_t num_inputs, uint32_t num_blocks_index, int8_t *mem,
    void *const *bsks, void *const *ksks) {

  auto *buffer =
      reinterpret_cast<int_final_index_from_selectors_buffer<uint64_t> *>(mem);
  uint64_t *const *keyswitch_keys =
      reinterpret_cast<uint64_t *const *>(ksks);

  host_compute_final_index_from_selectors<uint64_t>(
      CudaStreams(streams), index_ct, match_ct, selectors, num_inputs,
      num_blocks_index, buffer, bsks, keyswitch_keys);
}
|
||||
|
||||
// FFI entry point that tears down the buffer created by
// scratch_cuda_compute_final_index_from_selectors_64.
//
// Calls release() on the int_final_index_from_selectors_buffer<uint64_t> stored
// behind `mem_ptr_void`, deletes it, and resets the caller's handle to nullptr.
//
// A null `mem_ptr_void`, or a handle that is already nullptr (for example after
// a previous cleanup call), is treated as "nothing to free" instead of being
// dereferenced.
void cleanup_cuda_compute_final_index_from_selectors_64(CudaStreamsFFI streams,
                                                        int8_t **mem_ptr_void) {
  if (mem_ptr_void == nullptr || *mem_ptr_void == nullptr)
    return;

  auto *mem_ptr =
      reinterpret_cast<int_final_index_from_selectors_buffer<uint64_t> *>(
          *mem_ptr_void);

  mem_ptr->release(CudaStreams(streams));

  delete mem_ptr;
  // Clear the handle so a repeated cleanup hits the guard above.
  *mem_ptr_void = nullptr;
}
|
||||
|
||||
// FFI entry point that prepares the scratch buffer for
// cuda_unchecked_index_in_clears_64.
//
// Bundles the cryptographic parameters into an int_radix_params and forwards to
// scratch_cuda_unchecked_index_in_clears<uint64_t>, with `mem_ptr`
// reinterpreted as an int_unchecked_index_in_clears_buffer<uint64_t> handle.
//
// Returns the value produced by that template (presumably the size of the
// tracked allocation -- confirm against the template).
uint64_t scratch_cuda_unchecked_index_in_clears_64(
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_clears, uint32_t num_blocks, uint32_t num_blocks_index,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {

  int_radix_params radix_params(
      pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension,
      small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
      grouping_factor, message_modulus, carry_modulus, noise_reduction_type);

  auto **buffer_handle =
      reinterpret_cast<int_unchecked_index_in_clears_buffer<uint64_t> **>(
          mem_ptr);

  return scratch_cuda_unchecked_index_in_clears<uint64_t>(
      CudaStreams(streams), buffer_handle, radix_params, num_clears, num_blocks,
      num_blocks_index, allocate_gpu_memory);
}
|
||||
|
||||
// 64-bit entry point: reinterprets the opaque scratch pointer and the
// keyswitch key array, then forwards everything unchanged to
// host_unchecked_index_in_clears<uint64_t>.
void cuda_unchecked_index_in_clears_64(CudaStreamsFFI streams,
                                       CudaRadixCiphertextFFI *index_ct,
                                       CudaRadixCiphertextFFI *match_ct,
                                       CudaRadixCiphertextFFI const *input,
                                       const uint64_t *h_cleartexts,
                                       uint32_t num_clears, uint32_t num_blocks,
                                       uint32_t num_blocks_index, int8_t *mem,
                                       void *const *bsks, void *const *ksks) {
  using buffer_t = int_unchecked_index_in_clears_buffer<uint64_t>;

  buffer_t *buffer = reinterpret_cast<buffer_t *>(mem);
  uint64_t *const *typed_ksks = reinterpret_cast<uint64_t *const *>(ksks);

  host_unchecked_index_in_clears<uint64_t>(
      CudaStreams(streams), index_ct, match_ct, input, h_cleartexts, num_clears,
      num_blocks, num_blocks_index, buffer, bsks, typed_ksks);
}
|
||||
|
||||
// Tears down the scratch object referenced by the opaque handle
// `*mem_ptr_void`: calls its release() with the given streams, deletes it and
// resets the caller's handle to nullptr.
void cleanup_cuda_unchecked_index_in_clears_64(CudaStreamsFFI streams,
                                               int8_t **mem_ptr_void) {
  using buffer_t = int_unchecked_index_in_clears_buffer<uint64_t>;

  // The opaque byte pointer is reinterpreted as the typed buffer.
  buffer_t *buffer = reinterpret_cast<buffer_t *>(*mem_ptr_void);

  // release() runs before the object itself is destroyed.
  buffer->release(CudaStreams(streams));
  delete buffer;

  // The handle must not keep pointing at the deleted object.
  *mem_ptr_void = nullptr;
}
|
||||
|
||||
// 64-bit entry point: bundles the raw parameters into an int_radix_params and
// forwards to scratch_cuda_unchecked_first_index_in_clears<uint64_t>, which
// stores the created buffer through `mem_ptr`. Returns whatever the templated
// function returns.
uint64_t scratch_cuda_unchecked_first_index_in_clears_64(
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_unique, uint32_t num_blocks, uint32_t num_blocks_index,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
  using buffer_t = int_unchecked_first_index_in_clears_buffer<uint64_t>;

  int_radix_params radix_params(
      pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension,
      small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
      grouping_factor, message_modulus, carry_modulus, noise_reduction_type);

  // The opaque handle slot is viewed as a slot for the typed buffer pointer.
  buffer_t **typed_mem_ptr = reinterpret_cast<buffer_t **>(mem_ptr);

  uint64_t tracked_size =
      scratch_cuda_unchecked_first_index_in_clears<uint64_t>(
          CudaStreams(streams), typed_mem_ptr, radix_params, num_unique,
          num_blocks, num_blocks_index, allocate_gpu_memory);
  return tracked_size;
}
|
||||
|
||||
// 64-bit entry point: reinterprets the opaque scratch pointer and the
// keyswitch key array, then forwards everything unchanged to
// host_unchecked_first_index_in_clears<uint64_t>.
void cuda_unchecked_first_index_in_clears_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *index_ct,
    CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *input,
    const uint64_t *h_unique_values, const uint64_t *h_unique_indices,
    uint32_t num_unique, uint32_t num_blocks, uint32_t num_blocks_index,
    int8_t *mem, void *const *bsks, void *const *ksks) {
  using buffer_t = int_unchecked_first_index_in_clears_buffer<uint64_t>;

  buffer_t *buffer = reinterpret_cast<buffer_t *>(mem);
  uint64_t *const *typed_ksks = reinterpret_cast<uint64_t *const *>(ksks);

  host_unchecked_first_index_in_clears<uint64_t>(
      CudaStreams(streams), index_ct, match_ct, input, h_unique_values,
      h_unique_indices, num_unique, num_blocks, num_blocks_index, buffer, bsks,
      typed_ksks);
}
|
||||
|
||||
// Tears down the scratch object referenced by the opaque handle
// `*mem_ptr_void`: calls its release() with the given streams, deletes it and
// resets the caller's handle to nullptr.
void cleanup_cuda_unchecked_first_index_in_clears_64(CudaStreamsFFI streams,
                                                     int8_t **mem_ptr_void) {
  using buffer_t = int_unchecked_first_index_in_clears_buffer<uint64_t>;

  // The opaque byte pointer is reinterpreted as the typed buffer.
  buffer_t *buffer = reinterpret_cast<buffer_t *>(*mem_ptr_void);

  // release() runs before the object itself is destroyed.
  buffer->release(CudaStreams(streams));
  delete buffer;

  // The handle must not keep pointing at the deleted object.
  *mem_ptr_void = nullptr;
}
|
||||
|
||||
// 64-bit entry point: bundles the raw parameters into an int_radix_params and
// forwards to scratch_cuda_unchecked_first_index_of_clear<uint64_t>, which
// stores the created buffer through `mem_ptr`. Returns whatever the templated
// function returns.
uint64_t scratch_cuda_unchecked_first_index_of_clear_64(
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
  using buffer_t = int_unchecked_first_index_of_clear_buffer<uint64_t>;

  int_radix_params radix_params(
      pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension,
      small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
      grouping_factor, message_modulus, carry_modulus, noise_reduction_type);

  // The opaque handle slot is viewed as a slot for the typed buffer pointer.
  buffer_t **typed_mem_ptr = reinterpret_cast<buffer_t **>(mem_ptr);

  uint64_t tracked_size =
      scratch_cuda_unchecked_first_index_of_clear<uint64_t>(
          CudaStreams(streams), typed_mem_ptr, radix_params, num_inputs,
          num_blocks, num_blocks_index, allocate_gpu_memory);
  return tracked_size;
}
|
||||
|
||||
// 64-bit entry point: reinterprets the opaque scratch pointer and the
// keyswitch key array, then forwards everything unchanged to
// host_unchecked_first_index_of_clear<uint64_t>.
void cuda_unchecked_first_index_of_clear_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *index_ct,
    CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *inputs,
    const uint64_t *h_clear_val, uint32_t num_inputs, uint32_t num_blocks,
    uint32_t num_blocks_index, int8_t *mem, void *const *bsks,
    void *const *ksks) {
  using buffer_t = int_unchecked_first_index_of_clear_buffer<uint64_t>;

  buffer_t *buffer = reinterpret_cast<buffer_t *>(mem);
  uint64_t *const *typed_ksks = reinterpret_cast<uint64_t *const *>(ksks);

  host_unchecked_first_index_of_clear<uint64_t>(
      CudaStreams(streams), index_ct, match_ct, inputs, h_clear_val, num_inputs,
      num_blocks, num_blocks_index, buffer, bsks, typed_ksks);
}
|
||||
|
||||
// Tears down the scratch object referenced by the opaque handle
// `*mem_ptr_void`: calls its release() with the given streams, deletes it and
// resets the caller's handle to nullptr.
void cleanup_cuda_unchecked_first_index_of_clear_64(CudaStreamsFFI streams,
                                                    int8_t **mem_ptr_void) {
  using buffer_t = int_unchecked_first_index_of_clear_buffer<uint64_t>;

  // The opaque byte pointer is reinterpreted as the typed buffer.
  buffer_t *buffer = reinterpret_cast<buffer_t *>(*mem_ptr_void);

  // release() runs before the object itself is destroyed.
  buffer->release(CudaStreams(streams));
  delete buffer;

  // The handle must not keep pointing at the deleted object.
  *mem_ptr_void = nullptr;
}
|
||||
|
||||
// 64-bit entry point: bundles the raw parameters into an int_radix_params and
// forwards to scratch_cuda_unchecked_first_index_of<uint64_t>, which stores
// the created buffer through `mem_ptr`. Returns whatever the templated
// function returns.
uint64_t scratch_cuda_unchecked_first_index_of_64(
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
  using buffer_t = int_unchecked_first_index_of_buffer<uint64_t>;

  int_radix_params radix_params(
      pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension,
      small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
      grouping_factor, message_modulus, carry_modulus, noise_reduction_type);

  // The opaque handle slot is viewed as a slot for the typed buffer pointer.
  buffer_t **typed_mem_ptr = reinterpret_cast<buffer_t **>(mem_ptr);

  uint64_t tracked_size = scratch_cuda_unchecked_first_index_of<uint64_t>(
      CudaStreams(streams), typed_mem_ptr, radix_params, num_inputs,
      num_blocks, num_blocks_index, allocate_gpu_memory);
  return tracked_size;
}
|
||||
|
||||
// 64-bit entry point: reinterprets the opaque scratch pointer and the
// keyswitch key array, then forwards everything unchanged to
// host_unchecked_first_index_of<uint64_t>.
void cuda_unchecked_first_index_of_64(CudaStreamsFFI streams,
                                      CudaRadixCiphertextFFI *index_ct,
                                      CudaRadixCiphertextFFI *match_ct,
                                      CudaRadixCiphertextFFI const *inputs,
                                      CudaRadixCiphertextFFI const *value,
                                      uint32_t num_inputs, uint32_t num_blocks,
                                      uint32_t num_blocks_index, int8_t *mem,
                                      void *const *bsks, void *const *ksks) {
  using buffer_t = int_unchecked_first_index_of_buffer<uint64_t>;

  buffer_t *buffer = reinterpret_cast<buffer_t *>(mem);
  uint64_t *const *typed_ksks = reinterpret_cast<uint64_t *const *>(ksks);

  host_unchecked_first_index_of<uint64_t>(
      CudaStreams(streams), index_ct, match_ct, inputs, value, num_inputs,
      num_blocks, num_blocks_index, buffer, bsks, typed_ksks);
}
|
||||
|
||||
// Tears down the scratch object referenced by the opaque handle
// `*mem_ptr_void`: calls its release() with the given streams, deletes it and
// resets the caller's handle to nullptr.
void cleanup_cuda_unchecked_first_index_of_64(CudaStreamsFFI streams,
                                              int8_t **mem_ptr_void) {
  using buffer_t = int_unchecked_first_index_of_buffer<uint64_t>;

  // The opaque byte pointer is reinterpreted as the typed buffer.
  buffer_t *buffer = reinterpret_cast<buffer_t *>(*mem_ptr_void);

  // release() runs before the object itself is destroyed.
  buffer->release(CudaStreams(streams));
  delete buffer;

  // The handle must not keep pointing at the deleted object.
  *mem_ptr_void = nullptr;
}
|
||||
|
||||
// 64-bit entry point: bundles the raw parameters into an int_radix_params and
// forwards to scratch_cuda_unchecked_index_of<uint64_t>, which stores the
// created buffer through `mem_ptr`. Returns whatever the templated function
// returns.
uint64_t scratch_cuda_unchecked_index_of_64(
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
  using buffer_t = int_unchecked_index_of_buffer<uint64_t>;

  int_radix_params radix_params(
      pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension,
      small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
      grouping_factor, message_modulus, carry_modulus, noise_reduction_type);

  // The opaque handle slot is viewed as a slot for the typed buffer pointer.
  buffer_t **typed_mem_ptr = reinterpret_cast<buffer_t **>(mem_ptr);

  uint64_t tracked_size = scratch_cuda_unchecked_index_of<uint64_t>(
      CudaStreams(streams), typed_mem_ptr, radix_params, num_inputs,
      num_blocks, num_blocks_index, allocate_gpu_memory);
  return tracked_size;
}
|
||||
|
||||
// 64-bit entry point: reinterprets the opaque scratch pointer and the
// keyswitch key array, then forwards everything unchanged to
// host_unchecked_index_of<uint64_t>.
void cuda_unchecked_index_of_64(CudaStreamsFFI streams,
                                CudaRadixCiphertextFFI *index_ct,
                                CudaRadixCiphertextFFI *match_ct,
                                CudaRadixCiphertextFFI const *inputs,
                                CudaRadixCiphertextFFI const *value,
                                uint32_t num_inputs, uint32_t num_blocks,
                                uint32_t num_blocks_index, int8_t *mem,
                                void *const *bsks, void *const *ksks) {
  using buffer_t = int_unchecked_index_of_buffer<uint64_t>;

  buffer_t *buffer = reinterpret_cast<buffer_t *>(mem);
  uint64_t *const *typed_ksks = reinterpret_cast<uint64_t *const *>(ksks);

  host_unchecked_index_of<uint64_t>(CudaStreams(streams), index_ct, match_ct,
                                    inputs, value, num_inputs, num_blocks,
                                    num_blocks_index, buffer, bsks, typed_ksks);
}
|
||||
|
||||
// Tears down the scratch object referenced by the opaque handle
// `*mem_ptr_void`: calls its release() with the given streams, deletes it and
// resets the caller's handle to nullptr.
void cleanup_cuda_unchecked_index_of_64(CudaStreamsFFI streams,
                                        int8_t **mem_ptr_void) {
  using buffer_t = int_unchecked_index_of_buffer<uint64_t>;

  // The opaque byte pointer is reinterpreted as the typed buffer.
  buffer_t *buffer = reinterpret_cast<buffer_t *>(*mem_ptr_void);

  // release() runs before the object itself is destroyed.
  buffer->release(CudaStreams(streams));
  delete buffer;

  // The handle must not keep pointing at the deleted object.
  *mem_ptr_void = nullptr;
}
|
||||
|
||||
@@ -28,22 +28,23 @@ __host__ void host_compute_equality_selectors(
|
||||
|
||||
for (uint32_t j = 0; j < mem_ptr->num_streams; j++) {
|
||||
for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) {
|
||||
cuda_stream_wait_event(mem_ptr->sub_streams_vec[j].stream(i),
|
||||
cuda_stream_wait_event(mem_ptr->sub_streams[j].stream(i),
|
||||
mem_ptr->incoming_event,
|
||||
mem_ptr->sub_streams_vec[j].gpu_index(i));
|
||||
mem_ptr->sub_streams[j].gpu_index(i));
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t num_streams = mem_ptr->num_streams;
|
||||
uint32_t num_gpus = mem_ptr->active_streams.count();
|
||||
|
||||
for (uint32_t i = 0; i < num_possible_values; i++) {
|
||||
|
||||
uint32_t stream_idx = i % num_streams;
|
||||
|
||||
CudaStreams current_stream = mem_ptr->sub_streams_vec[stream_idx];
|
||||
CudaStreams current_stream = mem_ptr->sub_streams[stream_idx];
|
||||
|
||||
CudaRadixCiphertextFFI *current_tmp_block_comparisons =
|
||||
mem_ptr->tmp_block_comparisons_vec[stream_idx];
|
||||
mem_ptr->tmp_block_comparisons[stream_idx];
|
||||
int_comparison_buffer<Torus> *current_reduction_buffer =
|
||||
mem_ptr->reduction_buffers[stream_idx];
|
||||
|
||||
@@ -75,10 +76,11 @@ __host__ void host_compute_equality_selectors(
|
||||
|
||||
for (uint32_t j = 0; j < mem_ptr->num_streams; j++) {
|
||||
for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) {
|
||||
cuda_event_record(mem_ptr->outgoing_events[j][i],
|
||||
mem_ptr->sub_streams_vec[j].stream(i),
|
||||
mem_ptr->sub_streams_vec[j].gpu_index(i));
|
||||
cuda_stream_wait_event(streams.stream(0), mem_ptr->outgoing_events[j][i],
|
||||
cuda_event_record(mem_ptr->outgoing_events[j * num_gpus + i],
|
||||
mem_ptr->sub_streams[j].stream(i),
|
||||
mem_ptr->sub_streams[j].gpu_index(i));
|
||||
cuda_stream_wait_event(streams.stream(0),
|
||||
mem_ptr->outgoing_events[j * num_gpus + i],
|
||||
streams.gpu_index(0));
|
||||
}
|
||||
}
|
||||
@@ -110,24 +112,25 @@ __host__ void host_create_possible_results(
|
||||
uint32_t max_luts_per_call = mem_ptr->max_luts_per_call;
|
||||
uint32_t num_lut_accumulators = mem_ptr->num_lut_accumulators;
|
||||
uint32_t num_streams = mem_ptr->num_streams;
|
||||
uint32_t num_gpus = mem_ptr->active_streams.count();
|
||||
|
||||
cuda_event_record(mem_ptr->incoming_event, streams.stream(0),
|
||||
streams.gpu_index(0));
|
||||
|
||||
for (uint32_t j = 0; j < mem_ptr->num_streams; j++) {
|
||||
for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) {
|
||||
cuda_stream_wait_event(mem_ptr->sub_streams_vec[j].stream(i),
|
||||
cuda_stream_wait_event(mem_ptr->sub_streams[j].stream(i),
|
||||
mem_ptr->incoming_event,
|
||||
mem_ptr->sub_streams_vec[j].gpu_index(i));
|
||||
mem_ptr->sub_streams[j].gpu_index(i));
|
||||
}
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < num_possible_values; i++) {
|
||||
|
||||
uint32_t stream_idx = i % num_streams;
|
||||
CudaStreams current_stream = mem_ptr->sub_streams_vec[stream_idx];
|
||||
CudaStreams current_stream = mem_ptr->sub_streams[stream_idx];
|
||||
CudaRadixCiphertextFFI *current_tmp_buffer =
|
||||
mem_ptr->tmp_many_luts_output_vec[stream_idx];
|
||||
mem_ptr->tmp_many_luts_output[stream_idx];
|
||||
|
||||
CudaRadixCiphertextFFI const *current_selector = &lwe_array_in_list[i];
|
||||
CudaRadixCiphertextFFI *current_output = &lwe_array_out_list[i];
|
||||
@@ -138,7 +141,7 @@ __host__ void host_create_possible_results(
|
||||
|
||||
uint32_t lut_index = stream_idx * num_lut_accumulators + k;
|
||||
|
||||
int_radix_lut<Torus> *current_lut = mem_ptr->stream_luts_vec[lut_index];
|
||||
int_radix_lut<Torus> *current_lut = mem_ptr->stream_luts[lut_index];
|
||||
|
||||
uint32_t luts_in_this_call = current_lut->num_many_lut;
|
||||
|
||||
@@ -172,10 +175,11 @@ __host__ void host_create_possible_results(
|
||||
|
||||
for (uint32_t j = 0; j < mem_ptr->num_streams; j++) {
|
||||
for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) {
|
||||
cuda_event_record(mem_ptr->outgoing_events[j][i],
|
||||
mem_ptr->sub_streams_vec[j].stream(i),
|
||||
mem_ptr->sub_streams_vec[j].gpu_index(i));
|
||||
cuda_stream_wait_event(streams.stream(0), mem_ptr->outgoing_events[j][i],
|
||||
cuda_event_record(mem_ptr->outgoing_events[j * num_gpus + i],
|
||||
mem_ptr->sub_streams[j].stream(i),
|
||||
mem_ptr->sub_streams[j].gpu_index(i));
|
||||
cuda_stream_wait_event(streams.stream(0),
|
||||
mem_ptr->outgoing_events[j * num_gpus + i],
|
||||
streams.gpu_index(0));
|
||||
}
|
||||
}
|
||||
@@ -206,15 +210,16 @@ __host__ void host_aggregate_one_hot_vector(
|
||||
int_radix_params params = mem_ptr->params;
|
||||
uint32_t chunk_size = mem_ptr->chunk_size;
|
||||
uint32_t num_streams = mem_ptr->num_streams;
|
||||
uint32_t num_gpus = mem_ptr->active_streams.count();
|
||||
|
||||
cuda_event_record(mem_ptr->incoming_event, streams.stream(0),
|
||||
streams.gpu_index(0));
|
||||
|
||||
for (uint32_t s = 0; s < num_streams; s++) {
|
||||
for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) {
|
||||
cuda_stream_wait_event(mem_ptr->sub_streams_vec[s].stream(i),
|
||||
cuda_stream_wait_event(mem_ptr->sub_streams[s].stream(i),
|
||||
mem_ptr->incoming_event,
|
||||
mem_ptr->sub_streams_vec[s].gpu_index(i));
|
||||
mem_ptr->sub_streams[s].gpu_index(i));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -223,7 +228,7 @@ __host__ void host_aggregate_one_hot_vector(
|
||||
|
||||
for (uint32_t s = 0; s < num_streams; s++) {
|
||||
|
||||
CudaStreams current_stream = mem_ptr->sub_streams_vec[s];
|
||||
CudaStreams current_stream = mem_ptr->sub_streams[s];
|
||||
|
||||
CudaRadixCiphertextFFI *current_agg =
|
||||
mem_ptr->partial_aggregated_vectors[s];
|
||||
@@ -287,10 +292,11 @@ __host__ void host_aggregate_one_hot_vector(
|
||||
|
||||
for (uint32_t s = 0; s < num_streams; s++) {
|
||||
for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) {
|
||||
cuda_event_record(mem_ptr->outgoing_events[s][i],
|
||||
mem_ptr->sub_streams_vec[s].stream(i),
|
||||
mem_ptr->sub_streams_vec[s].gpu_index(i));
|
||||
cuda_stream_wait_event(streams.stream(0), mem_ptr->outgoing_events[s][i],
|
||||
cuda_event_record(mem_ptr->outgoing_events[s * num_gpus + i],
|
||||
mem_ptr->sub_streams[s].stream(i),
|
||||
mem_ptr->sub_streams[s].gpu_index(i));
|
||||
cuda_stream_wait_event(streams.stream(0),
|
||||
mem_ptr->outgoing_events[s * num_gpus + i],
|
||||
streams.gpu_index(0));
|
||||
}
|
||||
}
|
||||
@@ -322,8 +328,8 @@ __host__ void host_aggregate_one_hot_vector(
|
||||
streams.stream(0), streams.gpu_index(0), temp_agg, 0, num_blocks,
|
||||
final_agg, 0, num_blocks);
|
||||
|
||||
CudaStreams message_stream = mem_ptr->sub_streams_vec[0];
|
||||
CudaStreams carry_stream = mem_ptr->sub_streams_vec[1];
|
||||
CudaStreams message_stream = mem_ptr->sub_streams[0];
|
||||
CudaStreams carry_stream = mem_ptr->sub_streams[1];
|
||||
|
||||
cuda_event_record(mem_ptr->reduction_done_event, streams.stream(0),
|
||||
streams.gpu_index(0));
|
||||
@@ -498,3 +504,609 @@ __host__ void host_unchecked_match_value_or(
|
||||
mem_ptr->tmp_match_result, mem_ptr->tmp_or_value,
|
||||
mem_ptr->cmux_buffer, bsks, (Torus **)ksks);
|
||||
}
|
||||
|
||||
// Heap-allocates an int_unchecked_contains_buffer<Torus>, stores it in
// `*mem_ptr`, and returns the size tracker that the buffer constructor was
// given (initialised to 0 here).
template <typename Torus>
uint64_t
scratch_cuda_unchecked_contains(CudaStreams streams,
                                int_unchecked_contains_buffer<Torus> **mem_ptr,
                                int_radix_params params, uint32_t num_inputs,
                                uint32_t num_blocks, bool allocate_gpu_memory) {
  using buffer_t = int_unchecked_contains_buffer<Torus>;

  uint64_t tracked_size = 0;
  buffer_t *fresh_buffer =
      new buffer_t(streams, params, num_inputs, num_blocks,
                   allocate_gpu_memory, tracked_size);
  *mem_ptr = fresh_buffer;

  return tracked_size;
}
|
||||
|
||||
// Compares `value` against each of the `num_inputs` radix ciphertexts in
// `inputs` and reduces the per-input comparison blocks into `output`.
//
// Steps, as visible in this function:
//   1. `incoming_event` is recorded on streams.stream(0) and every stream of
//      every sub-stream set is made to wait on it.
//   2. Input i is handed to host_equality_check on sub-stream set
//      (i % num_streams), together with a one-block slice [i, i + 1) of
//      mem_ptr->packed_selectors (presumably the comparison output -- confirm
//      against host_equality_check).
//   3. Each sub-stream records its outgoing event (flat index
//      j * num_gpus + i) and streams.stream(0) waits on it.
//   4. host_integer_is_at_least_one_comparisons_block_true is run over
//      packed_selectors with `num_inputs` as the count, writing `output`.
//
// `mem_ptr` must come from scratch_cuda_unchecked_contains.
template <typename Torus>
__host__ void
host_unchecked_contains(CudaStreams streams, CudaRadixCiphertextFFI *output,
                        CudaRadixCiphertextFFI const *inputs,
                        CudaRadixCiphertextFFI const *value,
                        uint32_t num_inputs, uint32_t num_blocks,
                        int_unchecked_contains_buffer<Torus> *mem_ptr,
                        void *const *bsks, Torus *const *ksks) {

  uint32_t num_streams = mem_ptr->num_streams;
  uint32_t num_gpus = mem_ptr->active_streams.count();

  // Fork: sub-streams must not start before the work already queued on the
  // main stream.
  cuda_event_record(mem_ptr->incoming_event, streams.stream(0),
                    streams.gpu_index(0));

  for (uint32_t j = 0; j < num_streams; j++) {
    for (uint32_t i = 0; i < num_gpus; i++) {
      cuda_stream_wait_event(mem_ptr->sub_streams[j].stream(i),
                             mem_ptr->incoming_event,
                             mem_ptr->sub_streams[j].gpu_index(i));
    }
  }

  // Round-robin the equality checks over the sub-stream sets.
  for (uint32_t i = 0; i < num_inputs; i++) {
    uint32_t stream_idx = i % num_streams;
    CudaStreams current_stream = mem_ptr->sub_streams[stream_idx];

    CudaRadixCiphertextFFI const *input_ct = &inputs[i];

    // View on block i of the packed selectors.
    CudaRadixCiphertextFFI current_selector_block;
    as_radix_ciphertext_slice<Torus>(&current_selector_block,
                                     mem_ptr->packed_selectors, i, i + 1);

    host_equality_check<Torus>(current_stream, &current_selector_block,
                               input_ct, value, mem_ptr->eq_buffers[stream_idx],
                               bsks, ksks, num_blocks);
  }

  // Join: the main stream waits for every sub-stream before the reduction.
  for (uint32_t j = 0; j < num_streams; j++) {
    for (uint32_t i = 0; i < num_gpus; i++) {
      cuda_event_record(mem_ptr->outgoing_events[j * num_gpus + i],
                        mem_ptr->sub_streams[j].stream(i),
                        mem_ptr->sub_streams[j].gpu_index(i));
      cuda_stream_wait_event(streams.stream(0),
                             mem_ptr->outgoing_events[j * num_gpus + i],
                             streams.gpu_index(0));
    }
  }

  host_integer_is_at_least_one_comparisons_block_true<Torus>(
      streams, output, mem_ptr->packed_selectors, mem_ptr->reduction_buffer,
      bsks, const_cast<Torus **>(ksks), num_inputs);
}
|
||||
|
||||
// Heap-allocates an int_unchecked_contains_clear_buffer<Torus>, stores it in
// `*mem_ptr`, and returns the size tracker that the buffer constructor was
// given (initialised to 0 here).
template <typename Torus>
uint64_t scratch_cuda_unchecked_contains_clear(
    CudaStreams streams, int_unchecked_contains_clear_buffer<Torus> **mem_ptr,
    int_radix_params params, uint32_t num_inputs, uint32_t num_blocks,
    bool allocate_gpu_memory) {
  using buffer_t = int_unchecked_contains_clear_buffer<Torus>;

  uint64_t tracked_size = 0;
  buffer_t *fresh_buffer =
      new buffer_t(streams, params, num_inputs, num_blocks,
                   allocate_gpu_memory, tracked_size);
  *mem_ptr = fresh_buffer;

  return tracked_size;
}
|
||||
|
||||
// Compares the clear value `h_clear_val` against each of the `num_inputs`
// radix ciphertexts in `inputs` and reduces the per-input comparison blocks
// into `output`.
//
// Steps, as visible in this function:
//   1. `num_blocks` elements of `h_clear_val` are copied to
//      mem_ptr->d_clear_val on streams.stream(0), and
//      set_trivial_radix_ciphertext_async fills mem_ptr->tmp_clear_val from
//      them.
//   2. `incoming_event` is recorded on streams.stream(0) and every stream of
//      every sub-stream set waits on it, so the sub-streams are ordered after
//      step 1.
//   3. Input i is handed to host_equality_check on sub-stream set
//      (i % num_streams), against tmp_clear_val, together with a one-block
//      slice [i, i + 1) of mem_ptr->packed_selectors (presumably the
//      comparison output -- confirm against host_equality_check).
//   4. Each sub-stream records its outgoing event (flat index
//      j * num_gpus + i) and streams.stream(0) waits on it.
//   5. host_integer_is_at_least_one_comparisons_block_true is run over
//      packed_selectors with `num_inputs` as the count, writing `output`.
//
// NOTE(review): `h_clear_val` is declared as uint64_t but is copied with
// sizeof(Torus) and cast to Torus*; this assumes Torus is a 64-bit type.
template <typename Torus>
__host__ void host_unchecked_contains_clear(
    CudaStreams streams, CudaRadixCiphertextFFI *output,
    CudaRadixCiphertextFFI const *inputs, const uint64_t *h_clear_val,
    uint32_t num_inputs, uint32_t num_blocks,
    int_unchecked_contains_clear_buffer<Torus> *mem_ptr, void *const *bsks,
    Torus *const *ksks) {

  uint32_t num_streams = mem_ptr->num_streams;
  uint32_t num_gpus = mem_ptr->active_streams.count();

  // Stage the clear value on the device and build the trivial ciphertext the
  // inputs are compared against.
  cuda_memcpy_async_to_gpu(mem_ptr->d_clear_val, h_clear_val,
                           num_blocks * sizeof(Torus), streams.stream(0),
                           streams.gpu_index(0));

  set_trivial_radix_ciphertext_async<Torus>(
      streams.stream(0), streams.gpu_index(0), mem_ptr->tmp_clear_val,
      mem_ptr->d_clear_val, (Torus *)h_clear_val, num_blocks,
      mem_ptr->params.message_modulus, mem_ptr->params.carry_modulus);

  // Fork: sub-streams must not start before the work queued above.
  cuda_event_record(mem_ptr->incoming_event, streams.stream(0),
                    streams.gpu_index(0));

  for (uint32_t j = 0; j < num_streams; j++) {
    for (uint32_t i = 0; i < num_gpus; i++) {
      cuda_stream_wait_event(mem_ptr->sub_streams[j].stream(i),
                             mem_ptr->incoming_event,
                             mem_ptr->sub_streams[j].gpu_index(i));
    }
  }

  // Round-robin the equality checks over the sub-stream sets.
  for (uint32_t i = 0; i < num_inputs; i++) {
    uint32_t stream_idx = i % num_streams;
    CudaStreams current_stream = mem_ptr->sub_streams[stream_idx];

    CudaRadixCiphertextFFI const *input_ct = &inputs[i];

    // View on block i of the packed selectors.
    CudaRadixCiphertextFFI current_selector_block;
    as_radix_ciphertext_slice<Torus>(&current_selector_block,
                                     mem_ptr->packed_selectors, i, i + 1);

    host_equality_check<Torus>(current_stream, &current_selector_block,
                               input_ct, mem_ptr->tmp_clear_val,
                               mem_ptr->eq_buffers[stream_idx], bsks, ksks,
                               num_blocks);
  }

  // Join: the main stream waits for every sub-stream before the reduction.
  for (uint32_t j = 0; j < num_streams; j++) {
    for (uint32_t i = 0; i < num_gpus; i++) {
      cuda_event_record(mem_ptr->outgoing_events[j * num_gpus + i],
                        mem_ptr->sub_streams[j].stream(i),
                        mem_ptr->sub_streams[j].gpu_index(i));
      cuda_stream_wait_event(streams.stream(0),
                             mem_ptr->outgoing_events[j * num_gpus + i],
                             streams.gpu_index(0));
    }
  }

  host_integer_is_at_least_one_comparisons_block_true<Torus>(
      streams, output, mem_ptr->packed_selectors, mem_ptr->reduction_buffer,
      bsks, const_cast<Torus **>(ksks), num_inputs);
}
|
||||
|
||||
// Heap-allocates an int_unchecked_is_in_clears_buffer<Torus>, stores it in
// `*mem_ptr`, and returns the size tracker that the buffer constructor was
// given (initialised to 0 here).
template <typename Torus>
uint64_t scratch_cuda_unchecked_is_in_clears(
    CudaStreams streams, int_unchecked_is_in_clears_buffer<Torus> **mem_ptr,
    int_radix_params params, uint32_t num_clears, uint32_t num_blocks,
    bool allocate_gpu_memory) {
  using buffer_t = int_unchecked_is_in_clears_buffer<Torus>;

  uint64_t tracked_size = 0;
  buffer_t *fresh_buffer =
      new buffer_t(streams, params, num_clears, num_blocks,
                   allocate_gpu_memory, tracked_size);
  *mem_ptr = fresh_buffer;

  return tracked_size;
}
|
||||
|
||||
// Two-stage pipeline on the caller's streams:
//   1. host_compute_equality_selectors is run for `input` against
//      `h_cleartexts`, targeting mem_ptr->unpacked_selectors.
//   2. host_integer_is_at_least_one_comparisons_block_true is run over
//      mem_ptr->packed_selectors with `num_clears` as the count, writing
//      `output`.
// NOTE(review): stage 2 reads packed_selectors while stage 1 is given
// unpacked_selectors; presumably both refer to the same storage -- confirm in
// the buffer definition.
template <typename Torus>
__host__ void
host_unchecked_is_in_clears(CudaStreams streams, CudaRadixCiphertextFFI *output,
                            CudaRadixCiphertextFFI const *input,
                            const uint64_t *h_cleartexts, uint32_t num_clears,
                            uint32_t num_blocks,
                            int_unchecked_is_in_clears_buffer<Torus> *mem_ptr,
                            void *const *bsks, Torus *const *ksks) {
  // The reduction helper is called with a non-const key array.
  Torus **mutable_ksks = const_cast<Torus **>(ksks);

  host_compute_equality_selectors<Torus>(
      streams, mem_ptr->unpacked_selectors, input, num_blocks, h_cleartexts,
      mem_ptr->eq_buffer, bsks, ksks);

  host_integer_is_at_least_one_comparisons_block_true<Torus>(
      streams, output, mem_ptr->packed_selectors, mem_ptr->reduction_buffer,
      bsks, mutable_ksks, num_clears);
}
|
||||
|
||||
// Derives `index_ct` and `match_ct` from `num_inputs` selector ciphertexts.
//
// Steps, as visible in this function:
//   1. Block 0 of selectors[i] is copied into block i of
//      mem_ptr->packed_selectors on streams.stream(0).
//   2. host_create_possible_results is run with mem_ptr->unpacked_selectors
//      and mem_ptr->h_indices, filling mem_ptr->possible_results_ct_list.
//   3. host_aggregate_one_hot_vector combines that list into `index_ct`.
//   4. host_integer_is_at_least_one_comparisons_block_true is run over
//      packed_selectors with `num_inputs` as the count, writing `match_ct`.
// Steps 2 and 3 are passed a length of (num_blocks_index + 1) / 2.
template <typename Torus>
__host__ void host_compute_final_index_from_selectors(
    CudaStreams streams, CudaRadixCiphertextFFI *index_ct,
    CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *selectors,
    uint32_t num_inputs, uint32_t num_blocks_index,
    int_final_index_from_selectors_buffer<Torus> *mem_ptr, void *const *bsks,
    Torus *const *ksks) {

  // Gather one block per selector into the packed buffer.
  for (uint32_t slot = 0; slot < num_inputs; ++slot) {
    copy_radix_ciphertext_slice_async<Torus>(
        streams.stream(0), streams.gpu_index(0), mem_ptr->packed_selectors,
        slot, slot + 1, &selectors[slot], 0, 1);
  }

  // Half of num_blocks_index, rounded up.
  uint32_t packed_index_len = (num_blocks_index + 1) / 2;

  host_create_possible_results<Torus>(
      streams, mem_ptr->possible_results_ct_list, mem_ptr->unpacked_selectors,
      num_inputs, mem_ptr->h_indices, packed_index_len,
      mem_ptr->possible_results_buf, bsks, ksks);

  host_aggregate_one_hot_vector<Torus>(
      streams, index_ct, mem_ptr->possible_results_ct_list, num_inputs,
      packed_index_len, mem_ptr->aggregate_buf, bsks, ksks);

  // The reduction helper is called with a non-const key array.
  Torus **mutable_ksks = const_cast<Torus **>(ksks);
  host_integer_is_at_least_one_comparisons_block_true<Torus>(
      streams, match_ct, mem_ptr->packed_selectors, mem_ptr->reduction_buf,
      bsks, mutable_ksks, num_inputs);
}
|
||||
|
||||
// Heap-allocates an int_final_index_from_selectors_buffer<Torus>, stores it in
// `*mem_ptr`, and returns the size tracker that the buffer constructor was
// given (initialised to 0 here).
template <typename Torus>
uint64_t scratch_cuda_compute_final_index_from_selectors(
    CudaStreams streams, int_final_index_from_selectors_buffer<Torus> **mem_ptr,
    int_radix_params params, uint32_t num_inputs, uint32_t num_blocks_index,
    bool allocate_gpu_memory) {
  using buffer_t = int_final_index_from_selectors_buffer<Torus>;

  uint64_t tracked_size = 0;
  buffer_t *fresh_buffer =
      new buffer_t(streams, params, num_inputs, num_blocks_index,
                   allocate_gpu_memory, tracked_size);
  *mem_ptr = fresh_buffer;

  return tracked_size;
}
|
||||
|
||||
// Heap-allocates an int_unchecked_index_in_clears_buffer<Torus>, stores it in
// `*mem_ptr`, and returns the size tracker that the buffer constructor was
// given (initialised to 0 here).
template <typename Torus>
uint64_t scratch_cuda_unchecked_index_in_clears(
    CudaStreams streams, int_unchecked_index_in_clears_buffer<Torus> **mem_ptr,
    int_radix_params params, uint32_t num_clears, uint32_t num_blocks,
    uint32_t num_blocks_index, bool allocate_gpu_memory) {
  using buffer_t = int_unchecked_index_in_clears_buffer<Torus>;

  uint64_t tracked_size = 0;
  buffer_t *fresh_buffer =
      new buffer_t(streams, params, num_clears, num_blocks, num_blocks_index,
                   allocate_gpu_memory, tracked_size);
  *mem_ptr = fresh_buffer;

  return tracked_size;
}
|
||||
|
||||
// Produces `index_ct` and `match_ct` for `input` against the `num_clears`
// clear values in `h_cleartexts`.
//
// Steps, as visible in this function (all scratch state except the equality
// buffer lives in mem_ptr->final_index_buf):
//   1. host_compute_equality_selectors targets unpacked_selectors, using
//      mem_ptr->eq_selectors_buf.
//   2. host_create_possible_results fills possible_results_ct_list from the
//      unpacked selectors and h_indices.
//   3. host_aggregate_one_hot_vector combines that list into `index_ct`.
//   4. host_integer_is_at_least_one_comparisons_block_true is run over
//      packed_selectors with `num_clears` as the count, writing `match_ct`.
// Steps 2 and 3 are passed a length of (num_blocks_index + 1) / 2.
template <typename Torus>
__host__ void host_unchecked_index_in_clears(
    CudaStreams streams, CudaRadixCiphertextFFI *index_ct,
    CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *input,
    const uint64_t *h_cleartexts, uint32_t num_clears, uint32_t num_blocks,
    uint32_t num_blocks_index,
    int_unchecked_index_in_clears_buffer<Torus> *mem_ptr, void *const *bsks,
    Torus *const *ksks) {
  // Shorthand for the nested scratch object used by every stage below.
  auto &final_buf = *mem_ptr->final_index_buf;

  host_compute_equality_selectors<Torus>(
      streams, final_buf.unpacked_selectors, input, num_blocks, h_cleartexts,
      mem_ptr->eq_selectors_buf, bsks, ksks);

  // Half of num_blocks_index, rounded up.
  uint32_t packed_index_len = (num_blocks_index + 1) / 2;

  host_create_possible_results<Torus>(
      streams, final_buf.possible_results_ct_list,
      final_buf.unpacked_selectors, num_clears, final_buf.h_indices,
      packed_index_len, final_buf.possible_results_buf, bsks, ksks);

  host_aggregate_one_hot_vector<Torus>(
      streams, index_ct, final_buf.possible_results_ct_list, num_clears,
      packed_index_len, final_buf.aggregate_buf, bsks, ksks);

  // The reduction helper is called with a non-const key array.
  Torus **mutable_ksks = const_cast<Torus **>(ksks);
  host_integer_is_at_least_one_comparisons_block_true<Torus>(
      streams, match_ct, final_buf.packed_selectors, final_buf.reduction_buf,
      bsks, mutable_ksks, num_clears);
}
|
||||
|
||||
// Heap-allocates an int_unchecked_first_index_in_clears_buffer<Torus>, stores
// it in `*mem_ptr`, and returns the size tracker that the buffer constructor
// was given (initialised to 0 here).
template <typename Torus>
uint64_t scratch_cuda_unchecked_first_index_in_clears(
    CudaStreams streams,
    int_unchecked_first_index_in_clears_buffer<Torus> **mem_ptr,
    int_radix_params params, uint32_t num_unique, uint32_t num_blocks,
    uint32_t num_blocks_index, bool allocate_gpu_memory) {
  using buffer_t = int_unchecked_first_index_in_clears_buffer<Torus>;

  uint64_t tracked_size = 0;
  buffer_t *fresh_buffer =
      new buffer_t(streams, params, num_unique, num_blocks, num_blocks_index,
                   allocate_gpu_memory, tracked_size);
  *mem_ptr = fresh_buffer;

  return tracked_size;
}
|
||||
|
||||
// Produces `index_ct` and `match_ct` for `input` against `num_unique` clear
// values (`h_unique_values`) with caller-supplied indices
// (`h_unique_indices`).
//
// Steps, as visible in this function:
//   1. host_compute_equality_selectors targets mem_ptr->unpacked_selectors
//      using `h_unique_values`.
//   2. host_create_possible_results fills possible_results_ct_list from the
//      unpacked selectors and `h_unique_indices`.
//   3. host_aggregate_one_hot_vector combines that list into `index_ct`.
//   4. host_integer_is_at_least_one_comparisons_block_true is run over
//      packed_selectors with `num_unique` as the count, writing `match_ct`.
// Steps 2 and 3 are passed a length of (num_blocks_index + 1) / 2.
template <typename Torus>
__host__ void host_unchecked_first_index_in_clears(
    CudaStreams streams, CudaRadixCiphertextFFI *index_ct,
    CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *input,
    const uint64_t *h_unique_values, const uint64_t *h_unique_indices,
    uint32_t num_unique, uint32_t num_blocks, uint32_t num_blocks_index,
    int_unchecked_first_index_in_clears_buffer<Torus> *mem_ptr,
    void *const *bsks, Torus *const *ksks) {
  host_compute_equality_selectors<Torus>(
      streams, mem_ptr->unpacked_selectors, input, num_blocks, h_unique_values,
      mem_ptr->eq_selectors_buf, bsks, ksks);

  // Half of num_blocks_index, rounded up.
  uint32_t packed_index_len = (num_blocks_index + 1) / 2;

  host_create_possible_results<Torus>(
      streams, mem_ptr->possible_results_ct_list, mem_ptr->unpacked_selectors,
      num_unique, h_unique_indices, packed_index_len,
      mem_ptr->possible_results_buf, bsks, ksks);

  host_aggregate_one_hot_vector<Torus>(
      streams, index_ct, mem_ptr->possible_results_ct_list, num_unique,
      packed_index_len, mem_ptr->aggregate_buf, bsks, ksks);

  // The reduction helper is called with a non-const key array.
  Torus **mutable_ksks = const_cast<Torus **>(ksks);
  host_integer_is_at_least_one_comparisons_block_true<Torus>(
      streams, match_ct, mem_ptr->packed_selectors, mem_ptr->reduction_buf,
      bsks, mutable_ksks, num_unique);
}
|
||||
|
||||
// Heap-allocates an int_unchecked_first_index_of_clear_buffer<Torus>, stores
// it in `*mem_ptr`, and returns the size tracker that the buffer constructor
// was given (initialised to 0 here).
template <typename Torus>
uint64_t scratch_cuda_unchecked_first_index_of_clear(
    CudaStreams streams,
    int_unchecked_first_index_of_clear_buffer<Torus> **mem_ptr,
    int_radix_params params, uint32_t num_inputs, uint32_t num_blocks,
    uint32_t num_blocks_index, bool allocate_gpu_memory) {
  using buffer_t = int_unchecked_first_index_of_clear_buffer<Torus>;

  uint64_t tracked_size = 0;
  buffer_t *fresh_buffer =
      new buffer_t(streams, params, num_inputs, num_blocks, num_blocks_index,
                   allocate_gpu_memory, tracked_size);
  *mem_ptr = fresh_buffer;

  return tracked_size;
}
|
||||
|
||||
/// Searches `inputs` for a clear value and reports the index of the first
/// match together with a "found" flag.
///
/// Steps, all enqueued asynchronously:
///  1. Copy `h_clear_val` to `mem_ptr->d_clear_val` and build the trivial
///     radix ciphertext `mem_ptr->tmp_clear_val` on stream 0.
///  2. Fork: every sub-stream waits on an event recorded on stream 0.
///  3. For each input `i`, run an equality check against the clear value on
///     sub-stream `i % num_streams`, writing block `i` of
///     `mem_ptr->packed_selectors`.
///  4. Join: stream 0 waits on one event per (sub-stream, GPU) pair.
///  5. Combine the selectors with a doubling-offset pass using
///     `prefix_sum_lut`, then apply `cleanup_lut` to every selector block.
///  6. Build the possible results from `h_indices`, aggregate them into
///     `index_ct`, and reduce the selectors into `match_ct`.
///
/// @param streams           streams used for the sequential part (stream 0
///                          is the fork/join point)
/// @param index_ct          output radix ciphertext receiving the index
/// @param match_ct          output receiving the "at least one match" result
/// @param inputs            array of `num_inputs` radix ciphertexts
/// @param h_clear_val       host array; `num_blocks * sizeof(Torus)` bytes
///                          are copied from it
/// @param num_inputs        number of ciphertexts in `inputs`
/// @param num_blocks        blocks per input ciphertext
/// @param num_blocks_index  blocks of the index; halved (rounded up) to get
///                          the packed length
/// @param mem_ptr           scratch buffer created by the matching scratch
///                          function
/// @param bsks, ksks        bootstrapping / keyswitching keys forwarded to
///                          the sub-operations
///
/// NOTE(review): `unpacked_selectors` is read but never written in this
/// function; presumably it aliases `packed_selectors` - confirm in the
/// buffer definition.
template <typename Torus>
__host__ void host_unchecked_first_index_of_clear(
    CudaStreams streams, CudaRadixCiphertextFFI *index_ct,
    CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *inputs,
    const uint64_t *h_clear_val, uint32_t num_inputs, uint32_t num_blocks,
    uint32_t num_blocks_index,
    int_unchecked_first_index_of_clear_buffer<Torus> *mem_ptr,
    void *const *bsks, Torus *const *ksks) {

  const uint32_t num_streams = mem_ptr->num_streams;
  const uint32_t num_gpus = mem_ptr->active_streams.count();

  // Stage the clear value on the device and wrap it as a trivial ciphertext
  // so it can be compared with the encrypted inputs.
  cuda_memcpy_async_to_gpu(mem_ptr->d_clear_val, h_clear_val,
                           num_blocks * sizeof(Torus), streams.stream(0),
                           streams.gpu_index(0));

  set_trivial_radix_ciphertext_async<Torus>(
      streams.stream(0), streams.gpu_index(0), mem_ptr->tmp_clear_val,
      mem_ptr->d_clear_val, (Torus *)h_clear_val, num_blocks,
      mem_ptr->params.message_modulus, mem_ptr->params.carry_modulus);

  // Fork: sub-streams must not start before the work queued on stream 0.
  cuda_event_record(mem_ptr->incoming_event, streams.stream(0),
                    streams.gpu_index(0));

  for (uint32_t j = 0; j < num_streams; j++) {
    for (uint32_t i = 0; i < num_gpus; i++) {
      cuda_stream_wait_event(mem_ptr->sub_streams[j].stream(i),
                             mem_ptr->incoming_event,
                             mem_ptr->sub_streams[j].gpu_index(i));
    }
  }

  // One equality check per input, distributed round-robin over sub-streams.
  for (uint32_t i = 0; i < num_inputs; i++) {
    uint32_t stream_idx = i % num_streams;
    CudaStreams current_stream = mem_ptr->sub_streams[stream_idx];

    CudaRadixCiphertextFFI const *input_ct = &inputs[i];

    CudaRadixCiphertextFFI current_selector_block;
    as_radix_ciphertext_slice<Torus>(&current_selector_block,
                                     mem_ptr->packed_selectors, i, i + 1);

    host_equality_check<Torus>(current_stream, &current_selector_block,
                               input_ct, mem_ptr->tmp_clear_val,
                               mem_ptr->eq_buffers[stream_idx], bsks, ksks,
                               num_blocks);
  }

  // Join: stream 0 waits for every (sub-stream, GPU) pair.
  for (uint32_t j = 0; j < num_streams; j++) {
    for (uint32_t i = 0; i < num_gpus; i++) {
      cuda_event_record(mem_ptr->outgoing_events[j * num_gpus + i],
                        mem_ptr->sub_streams[j].stream(i),
                        mem_ptr->sub_streams[j].gpu_index(i));
      cuda_stream_wait_event(streams.stream(0),
                             mem_ptr->outgoing_events[j * num_gpus + i],
                             streams.gpu_index(0));
    }
  }

  // Doubling-offset pass: blocks [offset, num_inputs) are combined in place
  // with blocks [0, num_inputs - offset) through the bivariate LUT.
  for (uint32_t offset = 1; offset < num_inputs; offset <<= 1) {
    uint32_t count = num_inputs - offset;

    CudaRadixCiphertextFFI current_slice;
    as_radix_ciphertext_slice<Torus>(&current_slice, mem_ptr->packed_selectors,
                                     offset, num_inputs);

    CudaRadixCiphertextFFI prev_slice;
    as_radix_ciphertext_slice<Torus>(&prev_slice, mem_ptr->packed_selectors, 0,
                                     count);

    integer_radix_apply_bivariate_lookup_table<Torus>(
        streams, &current_slice, &current_slice, &prev_slice, bsks, ksks,
        mem_ptr->prefix_sum_lut, count, mem_ptr->params.message_modulus);
  }

  integer_radix_apply_univariate_lookup_table<Torus>(
      streams, mem_ptr->packed_selectors, mem_ptr->packed_selectors, bsks, ksks,
      mem_ptr->cleanup_lut, num_inputs);

  // ceil(num_blocks_index / 2)
  uint32_t packed_len = (num_blocks_index + 1) / 2;

  host_create_possible_results<Torus>(
      streams, mem_ptr->possible_results_ct_list, mem_ptr->unpacked_selectors,
      num_inputs, (const uint64_t *)mem_ptr->h_indices, packed_len,
      mem_ptr->possible_results_buf, bsks, ksks);

  host_aggregate_one_hot_vector<Torus>(
      streams, index_ct, mem_ptr->possible_results_ct_list, num_inputs,
      packed_len, mem_ptr->aggregate_buf, bsks, ksks);

  host_integer_is_at_least_one_comparisons_block_true<Torus>(
      streams, match_ct, mem_ptr->packed_selectors, mem_ptr->reduction_buf,
      bsks, (Torus **)ksks, num_inputs);
}
|
||||
|
||||
/// Builds the scratch buffer consumed by host_unchecked_first_index_of.
///
/// A new int_unchecked_first_index_of_buffer<Torus> is heap-allocated from
/// the given arguments and its address is written to *mem_ptr.
///
/// Returns the value left in the size tracker that is handed to the buffer
/// constructor (it starts at 0 here).
/// NOTE(review): the buffer is not freed in this function; presumably a
/// matching cleanup entry point releases it - confirm against the caller.
template <typename Torus>
uint64_t scratch_cuda_unchecked_first_index_of(
    CudaStreams streams, int_unchecked_first_index_of_buffer<Torus> **mem_ptr,
    int_radix_params params, uint32_t num_inputs, uint32_t num_blocks,
    uint32_t num_blocks_index, bool allocate_gpu_memory) {
  using buffer_t = int_unchecked_first_index_of_buffer<Torus>;

  uint64_t tracked_size = 0;
  auto *buffer =
      new buffer_t(streams, params, num_inputs, num_blocks, num_blocks_index,
                   allocate_gpu_memory, tracked_size);
  *mem_ptr = buffer;
  return tracked_size;
}
|
||||
|
||||
/// Searches `inputs` for the encrypted `value` and reports the index of the
/// first match together with a "found" flag.
///
/// Steps, all enqueued asynchronously:
///  1. Fork: every sub-stream waits on an event recorded on stream 0.
///  2. For each input `i`, run an equality check against `value` on
///     sub-stream `i % num_streams`, writing block `i` of
///     `mem_ptr->packed_selectors`.
///  3. Join: stream 0 waits on one event per (sub-stream, GPU) pair.
///  4. Combine the selectors with a doubling-offset pass using
///     `prefix_sum_lut`, then apply `cleanup_lut` to every selector block.
///  5. Build the possible results from `h_indices`, aggregate them into
///     `index_ct`, and reduce the selectors into `match_ct`.
///
/// @param streams           streams used for the sequential part (stream 0
///                          is the fork/join point)
/// @param index_ct          output radix ciphertext receiving the index
/// @param match_ct          output receiving the "at least one match" result
/// @param inputs            array of `num_inputs` radix ciphertexts
/// @param value             ciphertext compared against every input
/// @param num_inputs        number of ciphertexts in `inputs`
/// @param num_blocks        blocks per input ciphertext
/// @param num_blocks_index  blocks of the index; halved (rounded up) to get
///                          the packed length
/// @param mem_ptr           scratch buffer created by the matching scratch
///                          function
/// @param bsks, ksks        bootstrapping / keyswitching keys forwarded to
///                          the sub-operations
///
/// NOTE(review): `unpacked_selectors` is read but never written in this
/// function; presumably it aliases `packed_selectors` - confirm in the
/// buffer definition.
template <typename Torus>
__host__ void host_unchecked_first_index_of(
    CudaStreams streams, CudaRadixCiphertextFFI *index_ct,
    CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *inputs,
    CudaRadixCiphertextFFI const *value, uint32_t num_inputs,
    uint32_t num_blocks, uint32_t num_blocks_index,
    int_unchecked_first_index_of_buffer<Torus> *mem_ptr, void *const *bsks,
    Torus *const *ksks) {

  const uint32_t num_streams = mem_ptr->num_streams;
  const uint32_t num_gpus = mem_ptr->active_streams.count();

  // Fork: sub-streams must not start before the work queued on stream 0.
  cuda_event_record(mem_ptr->incoming_event, streams.stream(0),
                    streams.gpu_index(0));

  for (uint32_t j = 0; j < num_streams; j++) {
    for (uint32_t i = 0; i < num_gpus; i++) {
      cuda_stream_wait_event(mem_ptr->sub_streams[j].stream(i),
                             mem_ptr->incoming_event,
                             mem_ptr->sub_streams[j].gpu_index(i));
    }
  }

  // One equality check per input, distributed round-robin over sub-streams.
  for (uint32_t i = 0; i < num_inputs; i++) {
    uint32_t stream_idx = i % num_streams;
    CudaStreams current_stream = mem_ptr->sub_streams[stream_idx];

    CudaRadixCiphertextFFI const *input_ct = &inputs[i];

    CudaRadixCiphertextFFI current_selector_block;
    as_radix_ciphertext_slice<Torus>(&current_selector_block,
                                     mem_ptr->packed_selectors, i, i + 1);

    host_equality_check<Torus>(current_stream, &current_selector_block,
                               input_ct, value, mem_ptr->eq_buffers[stream_idx],
                               bsks, ksks, num_blocks);
  }

  // Join: stream 0 waits for every (sub-stream, GPU) pair.
  for (uint32_t j = 0; j < num_streams; j++) {
    for (uint32_t i = 0; i < num_gpus; i++) {
      cuda_event_record(mem_ptr->outgoing_events[j * num_gpus + i],
                        mem_ptr->sub_streams[j].stream(i),
                        mem_ptr->sub_streams[j].gpu_index(i));
      cuda_stream_wait_event(streams.stream(0),
                             mem_ptr->outgoing_events[j * num_gpus + i],
                             streams.gpu_index(0));
    }
  }

  // Doubling-offset pass: blocks [offset, num_inputs) are combined in place
  // with blocks [0, num_inputs - offset) through the bivariate LUT.
  for (uint32_t offset = 1; offset < num_inputs; offset <<= 1) {
    uint32_t count = num_inputs - offset;

    CudaRadixCiphertextFFI current_slice;
    as_radix_ciphertext_slice<Torus>(&current_slice, mem_ptr->packed_selectors,
                                     offset, num_inputs);

    CudaRadixCiphertextFFI prev_slice;
    as_radix_ciphertext_slice<Torus>(&prev_slice, mem_ptr->packed_selectors, 0,
                                     count);

    integer_radix_apply_bivariate_lookup_table<Torus>(
        streams, &current_slice, &current_slice, &prev_slice, bsks, ksks,
        mem_ptr->prefix_sum_lut, count, mem_ptr->params.message_modulus);
  }

  integer_radix_apply_univariate_lookup_table<Torus>(
      streams, mem_ptr->packed_selectors, mem_ptr->packed_selectors, bsks, ksks,
      mem_ptr->cleanup_lut, num_inputs);

  // ceil(num_blocks_index / 2)
  uint32_t packed_len = (num_blocks_index + 1) / 2;

  host_create_possible_results<Torus>(
      streams, mem_ptr->possible_results_ct_list, mem_ptr->unpacked_selectors,
      num_inputs, (const uint64_t *)mem_ptr->h_indices, packed_len,
      mem_ptr->possible_results_buf, bsks, ksks);

  host_aggregate_one_hot_vector<Torus>(
      streams, index_ct, mem_ptr->possible_results_ct_list, num_inputs,
      packed_len, mem_ptr->aggregate_buf, bsks, ksks);

  host_integer_is_at_least_one_comparisons_block_true<Torus>(
      streams, match_ct, mem_ptr->packed_selectors, mem_ptr->reduction_buf,
      bsks, (Torus **)ksks, num_inputs);
}
|
||||
|
||||
/// Builds the scratch buffer consumed by host_unchecked_index_of.
///
/// A new int_unchecked_index_of_buffer<Torus> is heap-allocated from the
/// given arguments and its address is written to *mem_ptr.
///
/// Returns the value left in the size tracker that is handed to the buffer
/// constructor (it starts at 0 here).
/// NOTE(review): the buffer is not freed in this function; presumably a
/// matching cleanup entry point releases it - confirm against the caller.
template <typename Torus>
uint64_t scratch_cuda_unchecked_index_of(
    CudaStreams streams, int_unchecked_index_of_buffer<Torus> **mem_ptr,
    int_radix_params params, uint32_t num_inputs, uint32_t num_blocks,
    uint32_t num_blocks_index, bool allocate_gpu_memory) {
  using buffer_t = int_unchecked_index_of_buffer<Torus>;

  uint64_t tracked_size = 0;
  auto *buffer =
      new buffer_t(streams, params, num_inputs, num_blocks, num_blocks_index,
                   allocate_gpu_memory, tracked_size);
  *mem_ptr = buffer;
  return tracked_size;
}
|
||||
|
||||
/// Searches `inputs` for the encrypted `value` and reports an index together
/// with a "found" flag. Unlike the first-index variants in this file, no
/// prefix pass is applied to the selectors before the index is built.
///
/// Steps, all enqueued asynchronously:
///  1. Fork: every sub-stream waits on an event recorded on stream 0.
///  2. For each input `i`, run an equality check against `value` on
///     sub-stream `i % num_streams`, writing block `i` of
///     `mem_ptr->final_index_buf->packed_selectors`.
///  3. Join: stream 0 waits on one event per (sub-stream, GPU) pair.
///  4. Build the possible results from `final_index_buf->h_indices`,
///     aggregate them into `index_ct`, and reduce the selectors into
///     `match_ct`.
///
/// @param streams           streams used for the sequential part (stream 0
///                          is the fork/join point)
/// @param index_ct          output radix ciphertext receiving the index
/// @param match_ct          output receiving the "at least one match" result
/// @param inputs            array of `num_inputs` radix ciphertexts
/// @param value             ciphertext compared against every input
/// @param num_inputs        number of ciphertexts in `inputs`
/// @param num_blocks        blocks per input ciphertext
/// @param num_blocks_index  blocks of the index; halved (rounded up) to get
///                          the packed length
/// @param mem_ptr           scratch buffer created by the matching scratch
///                          function
/// @param bsks, ksks        bootstrapping / keyswitching keys forwarded to
///                          the sub-operations
///
/// NOTE(review): `final_index_buf->unpacked_selectors` is read but never
/// written in this function; presumably it aliases `packed_selectors` -
/// confirm in the buffer definition.
template <typename Torus>
__host__ void host_unchecked_index_of(
    CudaStreams streams, CudaRadixCiphertextFFI *index_ct,
    CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *inputs,
    CudaRadixCiphertextFFI const *value, uint32_t num_inputs,
    uint32_t num_blocks, uint32_t num_blocks_index,
    int_unchecked_index_of_buffer<Torus> *mem_ptr, void *const *bsks,
    Torus *const *ksks) {

  const uint32_t num_streams = mem_ptr->num_streams;
  const uint32_t num_gpus = mem_ptr->active_streams.count();

  // Fork: sub-streams must not start before the work queued on stream 0.
  cuda_event_record(mem_ptr->incoming_event, streams.stream(0),
                    streams.gpu_index(0));

  for (uint32_t j = 0; j < num_streams; j++) {
    for (uint32_t i = 0; i < num_gpus; i++) {
      cuda_stream_wait_event(mem_ptr->sub_streams[j].stream(i),
                             mem_ptr->incoming_event,
                             mem_ptr->sub_streams[j].gpu_index(i));
    }
  }

  // One equality check per input, distributed round-robin over sub-streams.
  for (uint32_t i = 0; i < num_inputs; i++) {
    uint32_t stream_idx = i % num_streams;
    CudaStreams current_stream = mem_ptr->sub_streams[stream_idx];

    CudaRadixCiphertextFFI const *input_ct = &inputs[i];

    CudaRadixCiphertextFFI current_selector_block;
    as_radix_ciphertext_slice<Torus>(&current_selector_block,
                                     mem_ptr->final_index_buf->packed_selectors,
                                     i, i + 1);

    host_equality_check<Torus>(current_stream, &current_selector_block,
                               input_ct, value, mem_ptr->eq_buffers[stream_idx],
                               bsks, ksks, num_blocks);
  }

  // Join: stream 0 waits for every (sub-stream, GPU) pair.
  for (uint32_t j = 0; j < num_streams; j++) {
    for (uint32_t i = 0; i < num_gpus; i++) {
      cuda_event_record(mem_ptr->outgoing_events[j * num_gpus + i],
                        mem_ptr->sub_streams[j].stream(i),
                        mem_ptr->sub_streams[j].gpu_index(i));
      cuda_stream_wait_event(streams.stream(0),
                             mem_ptr->outgoing_events[j * num_gpus + i],
                             streams.gpu_index(0));
    }
  }

  // ceil(num_blocks_index / 2)
  uint32_t packed_len = (num_blocks_index + 1) / 2;

  host_create_possible_results<Torus>(
      streams, mem_ptr->final_index_buf->possible_results_ct_list,
      mem_ptr->final_index_buf->unpacked_selectors, num_inputs,
      (const uint64_t *)mem_ptr->final_index_buf->h_indices, packed_len,
      mem_ptr->final_index_buf->possible_results_buf, bsks, ksks);

  host_aggregate_one_hot_vector<Torus>(
      streams, index_ct, mem_ptr->final_index_buf->possible_results_ct_list,
      num_inputs, packed_len, mem_ptr->final_index_buf->aggregate_buf, bsks,
      ksks);

  host_integer_is_at_least_one_comparisons_block_true<Torus>(
      streams, match_ct, mem_ptr->final_index_buf->packed_selectors,
      mem_ptr->final_index_buf->reduction_buf, bsks, (Torus **)ksks,
      num_inputs);
}
|
||||
|
||||
@@ -1100,45 +1100,6 @@ unsafe extern "C" {
|
||||
unsafe extern "C" {
|
||||
pub fn cleanup_cuda_integer_div_rem(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr: *mut *mut i8,
|
||||
input_lut: *const ffi::c_void,
|
||||
lwe_dimension: u32,
|
||||
glwe_dimension: u32,
|
||||
polynomial_size: u32,
|
||||
ks_level: u32,
|
||||
ks_base_log: u32,
|
||||
pbs_level: u32,
|
||||
pbs_base_log: u32,
|
||||
grouping_factor: u32,
|
||||
num_radix_blocks: u32,
|
||||
message_modulus: u32,
|
||||
carry_modulus: u32,
|
||||
pbs_type: PBS_TYPE,
|
||||
lut_degree: u64,
|
||||
allocate_gpu_memory: bool,
|
||||
noise_reduction_type: PBS_MS_REDUCTION_T,
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_integer_compute_prefix_sum_hillis_steele_64(
|
||||
streams: CudaStreamsFFI,
|
||||
output_radix_lwe: *mut CudaRadixCiphertextFFI,
|
||||
generates_or_propagates: *mut CudaRadixCiphertextFFI,
|
||||
mem_ptr: *mut i8,
|
||||
ksks: *const *mut ffi::c_void,
|
||||
bsks: *const *mut ffi::c_void,
|
||||
num_blocks: u32,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr_void: *mut *mut i8,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_integer_reverse_blocks_64_inplace(
|
||||
streams: CudaStreamsFFI,
|
||||
@@ -1715,127 +1676,6 @@ unsafe extern "C" {
|
||||
unsafe extern "C" {
|
||||
pub fn cleanup_cuda_integer_ilog2_64(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_compute_equality_selectors_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr: *mut *mut i8,
|
||||
glwe_dimension: u32,
|
||||
polynomial_size: u32,
|
||||
big_lwe_dimension: u32,
|
||||
small_lwe_dimension: u32,
|
||||
ks_level: u32,
|
||||
ks_base_log: u32,
|
||||
pbs_level: u32,
|
||||
pbs_base_log: u32,
|
||||
grouping_factor: u32,
|
||||
num_possible_values: u32,
|
||||
num_blocks: u32,
|
||||
message_modulus: u32,
|
||||
carry_modulus: u32,
|
||||
pbs_type: PBS_TYPE,
|
||||
allocate_gpu_memory: bool,
|
||||
noise_reduction_type: PBS_MS_REDUCTION_T,
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_compute_equality_selectors_64(
|
||||
streams: CudaStreamsFFI,
|
||||
lwe_array_out_list: *mut CudaRadixCiphertextFFI,
|
||||
lwe_array_in: *const CudaRadixCiphertextFFI,
|
||||
num_blocks: u32,
|
||||
h_decomposed_cleartexts: *const u64,
|
||||
mem: *mut i8,
|
||||
bsks: *const *mut ffi::c_void,
|
||||
ksks: *const *mut ffi::c_void,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cleanup_cuda_compute_equality_selectors_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr_void: *mut *mut i8,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_create_possible_results_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr: *mut *mut i8,
|
||||
glwe_dimension: u32,
|
||||
polynomial_size: u32,
|
||||
big_lwe_dimension: u32,
|
||||
small_lwe_dimension: u32,
|
||||
ks_level: u32,
|
||||
ks_base_log: u32,
|
||||
pbs_level: u32,
|
||||
pbs_base_log: u32,
|
||||
grouping_factor: u32,
|
||||
num_possible_values: u32,
|
||||
num_blocks: u32,
|
||||
message_modulus: u32,
|
||||
carry_modulus: u32,
|
||||
pbs_type: PBS_TYPE,
|
||||
allocate_gpu_memory: bool,
|
||||
noise_reduction_type: PBS_MS_REDUCTION_T,
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_create_possible_results_64(
|
||||
streams: CudaStreamsFFI,
|
||||
lwe_array_out_list: *mut CudaRadixCiphertextFFI,
|
||||
lwe_array_in_list: *const CudaRadixCiphertextFFI,
|
||||
num_possible_values: u32,
|
||||
h_decomposed_cleartexts: *const u64,
|
||||
num_blocks: u32,
|
||||
mem: *mut i8,
|
||||
bsks: *const *mut ffi::c_void,
|
||||
ksks: *const *mut ffi::c_void,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cleanup_cuda_create_possible_results_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr_void: *mut *mut i8,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_aggregate_one_hot_vector_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr: *mut *mut i8,
|
||||
glwe_dimension: u32,
|
||||
polynomial_size: u32,
|
||||
big_lwe_dimension: u32,
|
||||
small_lwe_dimension: u32,
|
||||
ks_level: u32,
|
||||
ks_base_log: u32,
|
||||
pbs_level: u32,
|
||||
pbs_base_log: u32,
|
||||
grouping_factor: u32,
|
||||
num_blocks: u32,
|
||||
num_matches: u32,
|
||||
message_modulus: u32,
|
||||
carry_modulus: u32,
|
||||
pbs_type: PBS_TYPE,
|
||||
allocate_gpu_memory: bool,
|
||||
noise_reduction_type: PBS_MS_REDUCTION_T,
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_aggregate_one_hot_vector_64(
|
||||
streams: CudaStreamsFFI,
|
||||
lwe_array_out: *mut CudaRadixCiphertextFFI,
|
||||
lwe_array_in_list: *const CudaRadixCiphertextFFI,
|
||||
num_input_ciphertexts: u32,
|
||||
num_blocks: u32,
|
||||
mem: *mut i8,
|
||||
bsks: *const *mut ffi::c_void,
|
||||
ksks: *const *mut ffi::c_void,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cleanup_cuda_aggregate_one_hot_vector_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr_void: *mut *mut i8,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_unchecked_match_value_64(
|
||||
streams: CudaStreamsFFI,
|
||||
@@ -1962,6 +1802,385 @@ unsafe extern "C" {
|
||||
mem_ptr_void: *mut *mut i8,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_unchecked_contains_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr: *mut *mut i8,
|
||||
glwe_dimension: u32,
|
||||
polynomial_size: u32,
|
||||
big_lwe_dimension: u32,
|
||||
small_lwe_dimension: u32,
|
||||
ks_level: u32,
|
||||
ks_base_log: u32,
|
||||
pbs_level: u32,
|
||||
pbs_base_log: u32,
|
||||
grouping_factor: u32,
|
||||
num_inputs: u32,
|
||||
num_blocks: u32,
|
||||
message_modulus: u32,
|
||||
carry_modulus: u32,
|
||||
pbs_type: PBS_TYPE,
|
||||
allocate_gpu_memory: bool,
|
||||
noise_reduction_type: PBS_MS_REDUCTION_T,
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_unchecked_contains_64(
|
||||
streams: CudaStreamsFFI,
|
||||
output: *mut CudaRadixCiphertextFFI,
|
||||
inputs: *const CudaRadixCiphertextFFI,
|
||||
value: *const CudaRadixCiphertextFFI,
|
||||
num_inputs: u32,
|
||||
num_blocks: u32,
|
||||
mem: *mut i8,
|
||||
bsks: *const *mut ffi::c_void,
|
||||
ksks: *const *mut ffi::c_void,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cleanup_cuda_unchecked_contains_64(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_unchecked_contains_clear_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr: *mut *mut i8,
|
||||
glwe_dimension: u32,
|
||||
polynomial_size: u32,
|
||||
big_lwe_dimension: u32,
|
||||
small_lwe_dimension: u32,
|
||||
ks_level: u32,
|
||||
ks_base_log: u32,
|
||||
pbs_level: u32,
|
||||
pbs_base_log: u32,
|
||||
grouping_factor: u32,
|
||||
num_inputs: u32,
|
||||
num_blocks: u32,
|
||||
message_modulus: u32,
|
||||
carry_modulus: u32,
|
||||
pbs_type: PBS_TYPE,
|
||||
allocate_gpu_memory: bool,
|
||||
noise_reduction_type: PBS_MS_REDUCTION_T,
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_unchecked_contains_clear_64(
|
||||
streams: CudaStreamsFFI,
|
||||
output: *mut CudaRadixCiphertextFFI,
|
||||
inputs: *const CudaRadixCiphertextFFI,
|
||||
h_clear_val: *const u64,
|
||||
num_inputs: u32,
|
||||
num_blocks: u32,
|
||||
mem: *mut i8,
|
||||
bsks: *const *mut ffi::c_void,
|
||||
ksks: *const *mut ffi::c_void,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cleanup_cuda_unchecked_contains_clear_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr_void: *mut *mut i8,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_unchecked_is_in_clears_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr: *mut *mut i8,
|
||||
glwe_dimension: u32,
|
||||
polynomial_size: u32,
|
||||
big_lwe_dimension: u32,
|
||||
small_lwe_dimension: u32,
|
||||
ks_level: u32,
|
||||
ks_base_log: u32,
|
||||
pbs_level: u32,
|
||||
pbs_base_log: u32,
|
||||
grouping_factor: u32,
|
||||
num_clears: u32,
|
||||
num_blocks: u32,
|
||||
message_modulus: u32,
|
||||
carry_modulus: u32,
|
||||
pbs_type: PBS_TYPE,
|
||||
allocate_gpu_memory: bool,
|
||||
noise_reduction_type: PBS_MS_REDUCTION_T,
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_unchecked_is_in_clears_64(
|
||||
streams: CudaStreamsFFI,
|
||||
output: *mut CudaRadixCiphertextFFI,
|
||||
input: *const CudaRadixCiphertextFFI,
|
||||
h_cleartexts: *const u64,
|
||||
num_clears: u32,
|
||||
num_blocks: u32,
|
||||
mem: *mut i8,
|
||||
bsks: *const *mut ffi::c_void,
|
||||
ksks: *const *mut ffi::c_void,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cleanup_cuda_unchecked_is_in_clears_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr_void: *mut *mut i8,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_compute_final_index_from_selectors_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr: *mut *mut i8,
|
||||
glwe_dimension: u32,
|
||||
polynomial_size: u32,
|
||||
big_lwe_dimension: u32,
|
||||
small_lwe_dimension: u32,
|
||||
ks_level: u32,
|
||||
ks_base_log: u32,
|
||||
pbs_level: u32,
|
||||
pbs_base_log: u32,
|
||||
grouping_factor: u32,
|
||||
num_inputs: u32,
|
||||
num_blocks_index: u32,
|
||||
message_modulus: u32,
|
||||
carry_modulus: u32,
|
||||
pbs_type: PBS_TYPE,
|
||||
allocate_gpu_memory: bool,
|
||||
noise_reduction_type: PBS_MS_REDUCTION_T,
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_compute_final_index_from_selectors_64(
|
||||
streams: CudaStreamsFFI,
|
||||
index_ct: *mut CudaRadixCiphertextFFI,
|
||||
match_ct: *mut CudaRadixCiphertextFFI,
|
||||
selectors: *const CudaRadixCiphertextFFI,
|
||||
num_inputs: u32,
|
||||
num_blocks_index: u32,
|
||||
mem: *mut i8,
|
||||
bsks: *const *mut ffi::c_void,
|
||||
ksks: *const *mut ffi::c_void,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cleanup_cuda_compute_final_index_from_selectors_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr_void: *mut *mut i8,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_unchecked_index_in_clears_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr: *mut *mut i8,
|
||||
glwe_dimension: u32,
|
||||
polynomial_size: u32,
|
||||
big_lwe_dimension: u32,
|
||||
small_lwe_dimension: u32,
|
||||
ks_level: u32,
|
||||
ks_base_log: u32,
|
||||
pbs_level: u32,
|
||||
pbs_base_log: u32,
|
||||
grouping_factor: u32,
|
||||
num_clears: u32,
|
||||
num_blocks: u32,
|
||||
num_blocks_index: u32,
|
||||
message_modulus: u32,
|
||||
carry_modulus: u32,
|
||||
pbs_type: PBS_TYPE,
|
||||
allocate_gpu_memory: bool,
|
||||
noise_reduction_type: PBS_MS_REDUCTION_T,
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_unchecked_index_in_clears_64(
|
||||
streams: CudaStreamsFFI,
|
||||
index_ct: *mut CudaRadixCiphertextFFI,
|
||||
match_ct: *mut CudaRadixCiphertextFFI,
|
||||
input: *const CudaRadixCiphertextFFI,
|
||||
h_cleartexts: *const u64,
|
||||
num_clears: u32,
|
||||
num_blocks: u32,
|
||||
num_blocks_index: u32,
|
||||
mem: *mut i8,
|
||||
bsks: *const *mut ffi::c_void,
|
||||
ksks: *const *mut ffi::c_void,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cleanup_cuda_unchecked_index_in_clears_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr_void: *mut *mut i8,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_unchecked_first_index_in_clears_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr: *mut *mut i8,
|
||||
glwe_dimension: u32,
|
||||
polynomial_size: u32,
|
||||
big_lwe_dimension: u32,
|
||||
small_lwe_dimension: u32,
|
||||
ks_level: u32,
|
||||
ks_base_log: u32,
|
||||
pbs_level: u32,
|
||||
pbs_base_log: u32,
|
||||
grouping_factor: u32,
|
||||
num_unique: u32,
|
||||
num_blocks: u32,
|
||||
num_blocks_index: u32,
|
||||
message_modulus: u32,
|
||||
carry_modulus: u32,
|
||||
pbs_type: PBS_TYPE,
|
||||
allocate_gpu_memory: bool,
|
||||
noise_reduction_type: PBS_MS_REDUCTION_T,
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_unchecked_first_index_in_clears_64(
|
||||
streams: CudaStreamsFFI,
|
||||
index_ct: *mut CudaRadixCiphertextFFI,
|
||||
match_ct: *mut CudaRadixCiphertextFFI,
|
||||
input: *const CudaRadixCiphertextFFI,
|
||||
h_unique_values: *const u64,
|
||||
h_unique_indices: *const u64,
|
||||
num_unique: u32,
|
||||
num_blocks: u32,
|
||||
num_blocks_index: u32,
|
||||
mem: *mut i8,
|
||||
bsks: *const *mut ffi::c_void,
|
||||
ksks: *const *mut ffi::c_void,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cleanup_cuda_unchecked_first_index_in_clears_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr_void: *mut *mut i8,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_unchecked_first_index_of_clear_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr: *mut *mut i8,
|
||||
glwe_dimension: u32,
|
||||
polynomial_size: u32,
|
||||
big_lwe_dimension: u32,
|
||||
small_lwe_dimension: u32,
|
||||
ks_level: u32,
|
||||
ks_base_log: u32,
|
||||
pbs_level: u32,
|
||||
pbs_base_log: u32,
|
||||
grouping_factor: u32,
|
||||
num_inputs: u32,
|
||||
num_blocks: u32,
|
||||
num_blocks_index: u32,
|
||||
message_modulus: u32,
|
||||
carry_modulus: u32,
|
||||
pbs_type: PBS_TYPE,
|
||||
allocate_gpu_memory: bool,
|
||||
noise_reduction_type: PBS_MS_REDUCTION_T,
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_unchecked_first_index_of_clear_64(
|
||||
streams: CudaStreamsFFI,
|
||||
index_ct: *mut CudaRadixCiphertextFFI,
|
||||
match_ct: *mut CudaRadixCiphertextFFI,
|
||||
inputs: *const CudaRadixCiphertextFFI,
|
||||
h_clear_val: *const u64,
|
||||
num_inputs: u32,
|
||||
num_blocks: u32,
|
||||
num_blocks_index: u32,
|
||||
mem: *mut i8,
|
||||
bsks: *const *mut ffi::c_void,
|
||||
ksks: *const *mut ffi::c_void,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cleanup_cuda_unchecked_first_index_of_clear_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr_void: *mut *mut i8,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_unchecked_first_index_of_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr: *mut *mut i8,
|
||||
glwe_dimension: u32,
|
||||
polynomial_size: u32,
|
||||
big_lwe_dimension: u32,
|
||||
small_lwe_dimension: u32,
|
||||
ks_level: u32,
|
||||
ks_base_log: u32,
|
||||
pbs_level: u32,
|
||||
pbs_base_log: u32,
|
||||
grouping_factor: u32,
|
||||
num_inputs: u32,
|
||||
num_blocks: u32,
|
||||
num_blocks_index: u32,
|
||||
message_modulus: u32,
|
||||
carry_modulus: u32,
|
||||
pbs_type: PBS_TYPE,
|
||||
allocate_gpu_memory: bool,
|
||||
noise_reduction_type: PBS_MS_REDUCTION_T,
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_unchecked_first_index_of_64(
|
||||
streams: CudaStreamsFFI,
|
||||
index_ct: *mut CudaRadixCiphertextFFI,
|
||||
match_ct: *mut CudaRadixCiphertextFFI,
|
||||
inputs: *const CudaRadixCiphertextFFI,
|
||||
value: *const CudaRadixCiphertextFFI,
|
||||
num_inputs: u32,
|
||||
num_blocks: u32,
|
||||
num_blocks_index: u32,
|
||||
mem: *mut i8,
|
||||
bsks: *const *mut ffi::c_void,
|
||||
ksks: *const *mut ffi::c_void,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cleanup_cuda_unchecked_first_index_of_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr_void: *mut *mut i8,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_unchecked_index_of_64(
|
||||
streams: CudaStreamsFFI,
|
||||
mem_ptr: *mut *mut i8,
|
||||
glwe_dimension: u32,
|
||||
polynomial_size: u32,
|
||||
big_lwe_dimension: u32,
|
||||
small_lwe_dimension: u32,
|
||||
ks_level: u32,
|
||||
ks_base_log: u32,
|
||||
pbs_level: u32,
|
||||
pbs_base_log: u32,
|
||||
grouping_factor: u32,
|
||||
num_inputs: u32,
|
||||
num_blocks: u32,
|
||||
num_blocks_index: u32,
|
||||
message_modulus: u32,
|
||||
carry_modulus: u32,
|
||||
pbs_type: PBS_TYPE,
|
||||
allocate_gpu_memory: bool,
|
||||
noise_reduction_type: PBS_MS_REDUCTION_T,
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_unchecked_index_of_64(
|
||||
streams: CudaStreamsFFI,
|
||||
index_ct: *mut CudaRadixCiphertextFFI,
|
||||
match_ct: *mut CudaRadixCiphertextFFI,
|
||||
inputs: *const CudaRadixCiphertextFFI,
|
||||
value: *const CudaRadixCiphertextFFI,
|
||||
num_inputs: u32,
|
||||
num_blocks: u32,
|
||||
num_blocks_index: u32,
|
||||
mem: *mut i8,
|
||||
bsks: *const *mut ffi::c_void,
|
||||
ksks: *const *mut ffi::c_void,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cleanup_cuda_unchecked_index_of_64(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_integer_compress_radix_ciphertext_64(
|
||||
streams: CudaStreamsFFI,
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -18,7 +18,7 @@ use crate::integer::gpu::server_key::CudaBootstrappingKey;
|
||||
use crate::integer::gpu::{
|
||||
cuda_backend_apply_bivariate_lut, cuda_backend_apply_many_univariate_lut,
|
||||
cuda_backend_apply_univariate_lut, cuda_backend_cast_to_unsigned,
|
||||
cuda_backend_compute_prefix_sum_hillis_steele, cuda_backend_extend_radix_with_sign_msb,
|
||||
cuda_backend_extend_radix_with_sign_msb,
|
||||
cuda_backend_extend_radix_with_trivial_zero_blocks_msb, cuda_backend_full_propagate_assign,
|
||||
cuda_backend_noise_squashing, cuda_backend_propagate_single_carry_assign,
|
||||
cuda_backend_trim_radix_blocks_lsb, cuda_backend_trim_radix_blocks_msb, CudaServerKey, PBSType,
|
||||
@@ -1094,134 +1094,6 @@ impl CudaServerKey {
|
||||
ciphertexts
|
||||
}
|
||||
|
||||
/// Applies the lookup table on the range of ciphertexts
|
||||
///
|
||||
/// The output must have exactly block_range.len() blocks
|
||||
pub(crate) fn compute_prefix_sum_hillis_steele(
|
||||
&self,
|
||||
output: &mut CudaRadixCiphertext,
|
||||
generates_or_propagates: &mut CudaRadixCiphertext,
|
||||
lut: &BivariateLookupTableOwned,
|
||||
block_range: std::ops::Range<usize>,
|
||||
streams: &CudaStreams,
|
||||
) {
|
||||
if block_range.is_empty() {
|
||||
return;
|
||||
}
|
||||
assert_eq!(
|
||||
generates_or_propagates.d_blocks.lwe_dimension(),
|
||||
output.d_blocks.lwe_dimension()
|
||||
);
|
||||
|
||||
let lwe_dimension = generates_or_propagates.d_blocks.lwe_dimension();
|
||||
let lwe_size = lwe_dimension.to_lwe_size().0;
|
||||
let num_blocks = block_range.len();
|
||||
|
||||
let mut generates_or_propagates_slice = generates_or_propagates
|
||||
.d_blocks
|
||||
.0
|
||||
.d_vec
|
||||
.as_mut_slice(lwe_size * block_range.start..lwe_size * block_range.end, 0)
|
||||
.unwrap();
|
||||
let mut generates_or_propagates_degrees = vec![0; num_blocks];
|
||||
let mut generates_or_propagates_noise_levels = vec![0; num_blocks];
|
||||
for (i, block_index) in (block_range.clone()).enumerate() {
|
||||
generates_or_propagates_degrees[i] =
|
||||
generates_or_propagates.info.blocks[block_index].degree.0;
|
||||
generates_or_propagates_noise_levels[i] = generates_or_propagates.info.blocks
|
||||
[block_index]
|
||||
.noise_level
|
||||
.0;
|
||||
}
|
||||
let mut output_slice = output
|
||||
.d_blocks
|
||||
.0
|
||||
.d_vec
|
||||
.as_mut_slice(lwe_size * block_range.start..lwe_size * block_range.end, 0)
|
||||
.unwrap();
|
||||
let mut output_degrees = vec![0_u64; num_blocks];
|
||||
let mut output_noise_levels = vec![0_u64; num_blocks];
|
||||
unsafe {
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
cuda_backend_compute_prefix_sum_hillis_steele(
|
||||
streams,
|
||||
&mut output_slice,
|
||||
&mut output_degrees,
|
||||
&mut output_noise_levels,
|
||||
&mut generates_or_propagates_slice,
|
||||
&mut generates_or_propagates_degrees,
|
||||
&mut generates_or_propagates_noise_levels,
|
||||
lut.acc.acc.as_ref(),
|
||||
lut.acc.degree.0,
|
||||
&d_bsk.d_vec,
|
||||
&self.key_switching_key.d_vec,
|
||||
self.key_switching_key
|
||||
.output_key_lwe_size()
|
||||
.to_lwe_dimension(),
|
||||
d_bsk.glwe_dimension,
|
||||
d_bsk.polynomial_size,
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count,
|
||||
d_bsk.decomp_base_log,
|
||||
num_blocks as u32,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
cuda_backend_compute_prefix_sum_hillis_steele(
|
||||
streams,
|
||||
&mut output_slice,
|
||||
&mut output_degrees,
|
||||
&mut output_noise_levels,
|
||||
&mut generates_or_propagates_slice,
|
||||
&mut generates_or_propagates_degrees,
|
||||
&mut generates_or_propagates_noise_levels,
|
||||
lut.acc.acc.as_ref(),
|
||||
lut.acc.degree.0,
|
||||
&d_multibit_bsk.d_vec,
|
||||
&self.key_switching_key.d_vec,
|
||||
self.key_switching_key
|
||||
.output_key_lwe_size()
|
||||
.to_lwe_dimension(),
|
||||
d_multibit_bsk.glwe_dimension,
|
||||
d_multibit_bsk.polynomial_size,
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_multibit_bsk.decomp_level_count,
|
||||
d_multibit_bsk.decomp_base_log,
|
||||
num_blocks as u32,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
None,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (i, info) in output.info.blocks[block_range.start..block_range.end]
|
||||
.iter_mut()
|
||||
.enumerate()
|
||||
{
|
||||
info.degree = Degree(output_degrees[i]);
|
||||
info.noise_level = NoiseLevel(output_noise_levels[i]);
|
||||
}
|
||||
for (i, info) in generates_or_propagates.info.blocks[block_range.start..block_range.end]
|
||||
.iter_mut()
|
||||
.enumerate()
|
||||
{
|
||||
info.degree = Degree(generates_or_propagates_degrees[i]);
|
||||
info.noise_level = NoiseLevel(generates_or_propagates_noise_levels[i]);
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn extend_radix_with_sign_msb<T: CudaIntegerRadixCiphertext>(
|
||||
&self,
|
||||
ct: &T,
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user