refactor(gpu): move vector_find functions to the backend

This commit is contained in:
Enzo Di Maria
2025-11-24 14:57:34 +01:00
committed by Agnès Leroy
parent 54cb87c491
commit 0aa0918fea
9 changed files with 4731 additions and 1900 deletions

View File

@@ -491,23 +491,6 @@ void cuda_integer_div_rem_radix_ciphertext_64(
void cleanup_cuda_integer_div_rem(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
uint64_t scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
CudaStreamsFFI streams, int8_t **mem_ptr, void const *input_lut,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
uint64_t lut_degree, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
void cuda_integer_compute_prefix_sum_hillis_steele_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI *generates_or_propagates, int8_t *mem_ptr,
void *const *ksks, void *const *bsks, uint32_t num_blocks);
void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
CudaStreamsFFI streams, int8_t **mem_ptr_void);
void cuda_integer_reverse_blocks_64_inplace(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *lwe_array);
@@ -781,60 +764,6 @@ void cuda_integer_ilog2_64(
void cleanup_cuda_integer_ilog2_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
uint64_t scratch_cuda_compute_equality_selectors_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_possible_values, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
void cuda_compute_equality_selectors_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out_list,
CudaRadixCiphertextFFI const *lwe_array_in, uint32_t num_blocks,
const uint64_t *h_decomposed_cleartexts, int8_t *mem, void *const *bsks,
void *const *ksks);
void cleanup_cuda_compute_equality_selectors_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
uint64_t scratch_cuda_create_possible_results_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_possible_values, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
void cuda_create_possible_results_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out_list,
CudaRadixCiphertextFFI const *lwe_array_in_list,
uint32_t num_possible_values, const uint64_t *h_decomposed_cleartexts,
uint32_t num_blocks, int8_t *mem, void *const *bsks, void *const *ksks);
void cleanup_cuda_create_possible_results_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
uint64_t scratch_cuda_aggregate_one_hot_vector_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t num_matches, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
void cuda_aggregate_one_hot_vector_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in_list,
uint32_t num_input_ciphertexts, uint32_t num_blocks, int8_t *mem,
void *const *bsks, void *const *ksks);
void cleanup_cuda_aggregate_one_hot_vector_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
uint64_t scratch_cuda_unchecked_match_value_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
@@ -894,6 +823,185 @@ void cuda_unchecked_match_value_or_64(
void cleanup_cuda_unchecked_match_value_or_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
uint64_t scratch_cuda_unchecked_contains_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_inputs, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
void cuda_unchecked_contains_64(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *inputs,
CudaRadixCiphertextFFI const *value,
uint32_t num_inputs, uint32_t num_blocks,
int8_t *mem, void *const *bsks,
void *const *ksks);
void cleanup_cuda_unchecked_contains_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
uint64_t scratch_cuda_unchecked_contains_clear_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_inputs, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
void cuda_unchecked_contains_clear_64(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *inputs,
const uint64_t *h_clear_val,
uint32_t num_inputs, uint32_t num_blocks,
int8_t *mem, void *const *bsks,
void *const *ksks);
void cleanup_cuda_unchecked_contains_clear_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
uint64_t scratch_cuda_unchecked_is_in_clears_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_clears, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
void cuda_unchecked_is_in_clears_64(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input,
const uint64_t *h_cleartexts,
uint32_t num_clears, uint32_t num_blocks,
int8_t *mem, void *const *bsks,
void *const *ksks);
void cleanup_cuda_unchecked_is_in_clears_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
uint64_t scratch_cuda_compute_final_index_from_selectors_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_inputs, uint32_t num_blocks_index, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
void cuda_compute_final_index_from_selectors_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *index_ct,
CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *selectors,
uint32_t num_inputs, uint32_t num_blocks_index, int8_t *mem,
void *const *bsks, void *const *ksks);
void cleanup_cuda_compute_final_index_from_selectors_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
uint64_t scratch_cuda_unchecked_index_in_clears_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_clears, uint32_t num_blocks, uint32_t num_blocks_index,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
void cuda_unchecked_index_in_clears_64(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *index_ct,
CudaRadixCiphertextFFI *match_ct,
CudaRadixCiphertextFFI const *input,
const uint64_t *h_cleartexts,
uint32_t num_clears, uint32_t num_blocks,
uint32_t num_blocks_index, int8_t *mem,
void *const *bsks, void *const *ksks);
void cleanup_cuda_unchecked_index_in_clears_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
uint64_t scratch_cuda_unchecked_first_index_in_clears_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_unique, uint32_t num_blocks, uint32_t num_blocks_index,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
void cuda_unchecked_first_index_in_clears_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *index_ct,
CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *input,
const uint64_t *h_unique_values, const uint64_t *h_unique_indices,
uint32_t num_unique, uint32_t num_blocks, uint32_t num_blocks_index,
int8_t *mem, void *const *bsks, void *const *ksks);
void cleanup_cuda_unchecked_first_index_in_clears_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
uint64_t scratch_cuda_unchecked_first_index_of_clear_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
void cuda_unchecked_first_index_of_clear_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *index_ct,
CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *inputs,
const uint64_t *h_clear_val, uint32_t num_inputs, uint32_t num_blocks,
uint32_t num_blocks_index, int8_t *mem, void *const *bsks,
void *const *ksks);
void cleanup_cuda_unchecked_first_index_of_clear_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
uint64_t scratch_cuda_unchecked_first_index_of_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
void cuda_unchecked_first_index_of_64(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *index_ct,
CudaRadixCiphertextFFI *match_ct,
CudaRadixCiphertextFFI const *inputs,
CudaRadixCiphertextFFI const *value,
uint32_t num_inputs, uint32_t num_blocks,
uint32_t num_blocks_index, int8_t *mem,
void *const *bsks, void *const *ksks);
void cleanup_cuda_unchecked_first_index_of_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
uint64_t scratch_cuda_unchecked_index_of_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
void cuda_unchecked_index_of_64(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *index_ct,
CudaRadixCiphertextFFI *match_ct,
CudaRadixCiphertextFFI const *inputs,
CudaRadixCiphertextFFI const *value,
uint32_t num_inputs, uint32_t num_blocks,
uint32_t num_blocks_index, int8_t *mem,
void *const *bsks, void *const *ksks);
void cleanup_cuda_unchecked_index_of_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
} // extern C
#endif // CUDA_INTEGER_H

File diff suppressed because it is too large Load Diff

View File

@@ -284,46 +284,6 @@ void cleanup_cuda_apply_bivariate_lut_64(CudaStreamsFFI streams,
POP_RANGE()
}
uint64_t scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
    CudaStreamsFFI streams, int8_t **mem_ptr, void const *input_lut,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
    uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    uint64_t lut_degree, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
  // The big-LWE dimension is derived here as glwe_dimension * polynomial_size
  // instead of being passed in, unlike the other scratch_* entry points.
  const uint32_t big_lwe_dimension = glwe_dimension * polynomial_size;
  int_radix_params radix_params(pbs_type, glwe_dimension, polynomial_size,
                                big_lwe_dimension, lwe_dimension, ks_level,
                                ks_base_log, pbs_level, pbs_base_log,
                                grouping_factor, message_modulus, carry_modulus,
                                noise_reduction_type);
  // Forward to the generic bivariate-LUT scratch routine; the returned
  // uint64_t is propagated unchanged to the caller.
  return scratch_cuda_apply_bivariate_lut<uint64_t>(
      CudaStreams(streams), (int_radix_lut<uint64_t> **)mem_ptr,
      static_cast<const uint64_t *>(input_lut), num_radix_blocks, radix_params,
      lut_degree, allocate_gpu_memory);
}
void cuda_integer_compute_prefix_sum_hillis_steele_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
    CudaRadixCiphertextFFI *generates_or_propagates, int8_t *mem_ptr,
    void *const *ksks, void *const *bsks, uint32_t num_radix_blocks) {
  // NOTE(review): this FFI signature takes ksks before bsks, unlike most
  // entry points in this backend; the host call below uses (bsks, ksks).
  auto *lut = reinterpret_cast<int_radix_lut<uint64_t> *>(mem_ptr);
  host_compute_prefix_sum_hillis_steele<uint64_t>(
      CudaStreams(streams), output_radix_lwe, generates_or_propagates, lut,
      bsks, (uint64_t **)(ksks), num_radix_blocks);
}
void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
    CudaStreamsFFI streams, int8_t **mem_ptr_void) {
  // Release device-side allocations owned by the LUT scratch, then destroy
  // the host object and null the caller's handle to prevent reuse.
  auto *lut = reinterpret_cast<int_radix_lut<uint64_t> *>(*mem_ptr_void);
  lut->release(CudaStreams(streams));
  delete lut;
  *mem_ptr_void = nullptr;
}
void cuda_integer_reverse_blocks_64_inplace(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *lwe_array) {

View File

@@ -1,133 +1,5 @@
#include "integer/vector_find.cuh"
uint64_t scratch_cuda_compute_equality_selectors_64(
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_possible_values, uint32_t num_blocks, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
  // Bundle the radix/PBS parameters and forward to the typed scratch routine.
  int_radix_params radix_params(
      pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension,
      small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
      grouping_factor, message_modulus, carry_modulus, noise_reduction_type);
  auto typed_mem = (int_equality_selectors_buffer<uint64_t> **)mem_ptr;
  return scratch_cuda_compute_equality_selectors<uint64_t>(
      CudaStreams(streams), typed_mem, radix_params, num_possible_values,
      num_blocks, allocate_gpu_memory);
}
void cuda_compute_equality_selectors_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out_list,
    CudaRadixCiphertextFFI const *lwe_array_in, uint32_t num_blocks,
    const uint64_t *h_decomposed_cleartexts, int8_t *mem, void *const *bsks,
    void *const *ksks) {
  // Recover the typed scratch handle and dispatch to the host implementation.
  auto *buffer =
      reinterpret_cast<int_equality_selectors_buffer<uint64_t> *>(mem);
  host_compute_equality_selectors<uint64_t>(
      CudaStreams(streams), lwe_array_out_list, lwe_array_in, num_blocks,
      h_decomposed_cleartexts, buffer, bsks, (uint64_t *const *)ksks);
}
void cleanup_cuda_compute_equality_selectors_64(CudaStreamsFFI streams,
                                                int8_t **mem_ptr_void) {
  // Free device allocations first, then the host-side buffer object.
  auto *buffer = reinterpret_cast<int_equality_selectors_buffer<uint64_t> *>(
      *mem_ptr_void);
  buffer->release(CudaStreams(streams));
  delete buffer;
  *mem_ptr_void = nullptr; // null the handle so stale pointers are detectable
}
uint64_t scratch_cuda_create_possible_results_64(
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_possible_values, uint32_t num_blocks, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
  // Bundle the radix/PBS parameters and forward to the typed scratch routine.
  // Note the (num_blocks, num_possible_values) argument order below.
  int_radix_params radix_params(
      pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension,
      small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
      grouping_factor, message_modulus, carry_modulus, noise_reduction_type);
  auto typed_mem = (int_possible_results_buffer<uint64_t> **)mem_ptr;
  return scratch_cuda_create_possible_results<uint64_t>(
      CudaStreams(streams), typed_mem, radix_params, num_blocks,
      num_possible_values, allocate_gpu_memory);
}
void cuda_create_possible_results_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out_list,
    CudaRadixCiphertextFFI const *lwe_array_in_list,
    uint32_t num_possible_values, const uint64_t *h_decomposed_cleartexts,
    uint32_t num_blocks, int8_t *mem, void *const *bsks, void *const *ksks) {
  // Recover the typed scratch handle and dispatch to the host implementation.
  auto *buffer =
      reinterpret_cast<int_possible_results_buffer<uint64_t> *>(mem);
  host_create_possible_results<uint64_t>(
      CudaStreams(streams), lwe_array_out_list, lwe_array_in_list,
      num_possible_values, h_decomposed_cleartexts, num_blocks, buffer, bsks,
      (uint64_t *const *)ksks);
}
void cleanup_cuda_create_possible_results_64(CudaStreamsFFI streams,
                                             int8_t **mem_ptr_void) {
  // Free device allocations first, then the host-side buffer object.
  auto *buffer = reinterpret_cast<int_possible_results_buffer<uint64_t> *>(
      *mem_ptr_void);
  buffer->release(CudaStreams(streams));
  delete buffer;
  *mem_ptr_void = nullptr; // null the handle so stale pointers are detectable
}
uint64_t scratch_cuda_aggregate_one_hot_vector_64(
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_blocks, uint32_t num_matches, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
  // Bundle the radix/PBS parameters and forward to the typed scratch routine.
  int_radix_params radix_params(
      pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension,
      small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
      grouping_factor, message_modulus, carry_modulus, noise_reduction_type);
  auto typed_mem = (int_aggregate_one_hot_buffer<uint64_t> **)mem_ptr;
  return scratch_cuda_aggregate_one_hot_vector<uint64_t>(
      CudaStreams(streams), typed_mem, radix_params, num_blocks, num_matches,
      allocate_gpu_memory);
}
void cuda_aggregate_one_hot_vector_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_in_list,
    uint32_t num_input_ciphertexts, uint32_t num_blocks, int8_t *mem,
    void *const *bsks, void *const *ksks) {
  // Recover the typed scratch handle and dispatch to the host implementation.
  auto *buffer =
      reinterpret_cast<int_aggregate_one_hot_buffer<uint64_t> *>(mem);
  host_aggregate_one_hot_vector<uint64_t>(
      CudaStreams(streams), lwe_array_out, lwe_array_in_list,
      num_input_ciphertexts, num_blocks, buffer, bsks,
      (uint64_t *const *)ksks);
}
void cleanup_cuda_aggregate_one_hot_vector_64(CudaStreamsFFI streams,
                                              int8_t **mem_ptr_void) {
  // Free device allocations first, then the host-side buffer object.
  auto *buffer = reinterpret_cast<int_aggregate_one_hot_buffer<uint64_t> *>(
      *mem_ptr_void);
  buffer->release(CudaStreams(streams));
  delete buffer;
  *mem_ptr_void = nullptr; // null the handle so stale pointers are detectable
}
uint64_t scratch_cuda_unchecked_match_value_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
@@ -221,3 +93,410 @@ void cleanup_cuda_unchecked_match_value_or_64(CudaStreamsFFI streams,
delete mem_ptr;
*mem_ptr_void = nullptr;
}
uint64_t scratch_cuda_unchecked_contains_64(
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_inputs, uint32_t num_blocks, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
  // Bundle the radix/PBS parameters and forward to the typed scratch routine.
  int_radix_params radix_params(
      pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension,
      small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
      grouping_factor, message_modulus, carry_modulus, noise_reduction_type);
  auto typed_mem = (int_unchecked_contains_buffer<uint64_t> **)mem_ptr;
  return scratch_cuda_unchecked_contains<uint64_t>(
      CudaStreams(streams), typed_mem, radix_params, num_inputs, num_blocks,
      allocate_gpu_memory);
}
void cuda_unchecked_contains_64(CudaStreamsFFI streams,
                                CudaRadixCiphertextFFI *output,
                                CudaRadixCiphertextFFI const *inputs,
                                CudaRadixCiphertextFFI const *value,
                                uint32_t num_inputs, uint32_t num_blocks,
                                int8_t *mem, void *const *bsks,
                                void *const *ksks) {
  // Recover the typed scratch handle and dispatch to the host implementation.
  auto *buffer =
      reinterpret_cast<int_unchecked_contains_buffer<uint64_t> *>(mem);
  host_unchecked_contains<uint64_t>(CudaStreams(streams), output, inputs,
                                    value, num_inputs, num_blocks, buffer,
                                    bsks, (uint64_t *const *)ksks);
}
void cleanup_cuda_unchecked_contains_64(CudaStreamsFFI streams,
                                        int8_t **mem_ptr_void) {
  // Free device allocations first, then the host-side buffer object.
  auto *buffer = reinterpret_cast<int_unchecked_contains_buffer<uint64_t> *>(
      *mem_ptr_void);
  buffer->release(CudaStreams(streams));
  delete buffer;
  *mem_ptr_void = nullptr; // null the handle so stale pointers are detectable
}
uint64_t scratch_cuda_unchecked_contains_clear_64(
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_inputs, uint32_t num_blocks, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
  // Bundle the radix/PBS parameters and forward to the typed scratch routine.
  int_radix_params radix_params(
      pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension,
      small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
      grouping_factor, message_modulus, carry_modulus, noise_reduction_type);
  auto typed_mem = (int_unchecked_contains_clear_buffer<uint64_t> **)mem_ptr;
  return scratch_cuda_unchecked_contains_clear<uint64_t>(
      CudaStreams(streams), typed_mem, radix_params, num_inputs, num_blocks,
      allocate_gpu_memory);
}
void cuda_unchecked_contains_clear_64(CudaStreamsFFI streams,
                                      CudaRadixCiphertextFFI *output,
                                      CudaRadixCiphertextFFI const *inputs,
                                      const uint64_t *h_clear_val,
                                      uint32_t num_inputs, uint32_t num_blocks,
                                      int8_t *mem, void *const *bsks,
                                      void *const *ksks) {
  // Recover the typed scratch handle and dispatch to the host implementation.
  auto *buffer =
      reinterpret_cast<int_unchecked_contains_clear_buffer<uint64_t> *>(mem);
  host_unchecked_contains_clear<uint64_t>(
      CudaStreams(streams), output, inputs, h_clear_val, num_inputs,
      num_blocks, buffer, bsks, (uint64_t *const *)ksks);
}
void cleanup_cuda_unchecked_contains_clear_64(CudaStreamsFFI streams,
                                              int8_t **mem_ptr_void) {
  // Free device allocations first, then the host-side buffer object.
  auto *buffer =
      reinterpret_cast<int_unchecked_contains_clear_buffer<uint64_t> *>(
          *mem_ptr_void);
  buffer->release(CudaStreams(streams));
  delete buffer;
  *mem_ptr_void = nullptr; // null the handle so stale pointers are detectable
}
uint64_t scratch_cuda_unchecked_is_in_clears_64(
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_clears, uint32_t num_blocks, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
  // Bundle the radix/PBS parameters and forward to the typed scratch routine.
  int_radix_params radix_params(
      pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension,
      small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
      grouping_factor, message_modulus, carry_modulus, noise_reduction_type);
  auto typed_mem = (int_unchecked_is_in_clears_buffer<uint64_t> **)mem_ptr;
  return scratch_cuda_unchecked_is_in_clears<uint64_t>(
      CudaStreams(streams), typed_mem, radix_params, num_clears, num_blocks,
      allocate_gpu_memory);
}
void cuda_unchecked_is_in_clears_64(CudaStreamsFFI streams,
                                    CudaRadixCiphertextFFI *output,
                                    CudaRadixCiphertextFFI const *input,
                                    const uint64_t *h_cleartexts,
                                    uint32_t num_clears, uint32_t num_blocks,
                                    int8_t *mem, void *const *bsks,
                                    void *const *ksks) {
  // Recover the typed scratch handle and dispatch to the host implementation.
  auto *buffer =
      reinterpret_cast<int_unchecked_is_in_clears_buffer<uint64_t> *>(mem);
  host_unchecked_is_in_clears<uint64_t>(
      CudaStreams(streams), output, input, h_cleartexts, num_clears,
      num_blocks, buffer, bsks, (uint64_t *const *)ksks);
}
void cleanup_cuda_unchecked_is_in_clears_64(CudaStreamsFFI streams,
                                            int8_t **mem_ptr_void) {
  // Free device allocations first, then the host-side buffer object.
  auto *buffer = reinterpret_cast<int_unchecked_is_in_clears_buffer<uint64_t> *>(
      *mem_ptr_void);
  buffer->release(CudaStreams(streams));
  delete buffer;
  *mem_ptr_void = nullptr; // null the handle so stale pointers are detectable
}
uint64_t scratch_cuda_compute_final_index_from_selectors_64(
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_inputs, uint32_t num_blocks_index, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {
  // Bundle the radix/PBS parameters and forward to the typed scratch routine.
  int_radix_params radix_params(
      pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension,
      small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
      grouping_factor, message_modulus, carry_modulus, noise_reduction_type);
  auto typed_mem = (int_final_index_from_selectors_buffer<uint64_t> **)mem_ptr;
  return scratch_cuda_compute_final_index_from_selectors<uint64_t>(
      CudaStreams(streams), typed_mem, radix_params, num_inputs,
      num_blocks_index, allocate_gpu_memory);
}
void cuda_compute_final_index_from_selectors_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *index_ct,
    CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *selectors,
    uint32_t num_inputs, uint32_t num_blocks_index, int8_t *mem,
    void *const *bsks, void *const *ksks) {
  // Recover the typed scratch handle and dispatch to the host implementation.
  auto *buffer =
      reinterpret_cast<int_final_index_from_selectors_buffer<uint64_t> *>(mem);
  host_compute_final_index_from_selectors<uint64_t>(
      CudaStreams(streams), index_ct, match_ct, selectors, num_inputs,
      num_blocks_index, buffer, bsks, (uint64_t *const *)ksks);
}
void cleanup_cuda_compute_final_index_from_selectors_64(CudaStreamsFFI streams,
                                                        int8_t **mem_ptr_void) {
  // Free device allocations first, then the host-side buffer object.
  auto *buffer =
      reinterpret_cast<int_final_index_from_selectors_buffer<uint64_t> *>(
          *mem_ptr_void);
  buffer->release(CudaStreams(streams));
  delete buffer;
  *mem_ptr_void = nullptr; // null the handle so stale pointers are detectable
}
uint64_t scratch_cuda_unchecked_index_in_clears_64(
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_clears, uint32_t num_blocks, uint32_t num_blocks_index,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
  // Bundle the radix/PBS parameters and forward to the typed scratch routine.
  int_radix_params radix_params(
      pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension,
      small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
      grouping_factor, message_modulus, carry_modulus, noise_reduction_type);
  auto typed_mem = (int_unchecked_index_in_clears_buffer<uint64_t> **)mem_ptr;
  return scratch_cuda_unchecked_index_in_clears<uint64_t>(
      CudaStreams(streams), typed_mem, radix_params, num_clears, num_blocks,
      num_blocks_index, allocate_gpu_memory);
}
void cuda_unchecked_index_in_clears_64(CudaStreamsFFI streams,
                                       CudaRadixCiphertextFFI *index_ct,
                                       CudaRadixCiphertextFFI *match_ct,
                                       CudaRadixCiphertextFFI const *input,
                                       const uint64_t *h_cleartexts,
                                       uint32_t num_clears, uint32_t num_blocks,
                                       uint32_t num_blocks_index, int8_t *mem,
                                       void *const *bsks, void *const *ksks) {
  // Recover the typed scratch handle and dispatch to the host implementation.
  auto *buffer =
      reinterpret_cast<int_unchecked_index_in_clears_buffer<uint64_t> *>(mem);
  host_unchecked_index_in_clears<uint64_t>(
      CudaStreams(streams), index_ct, match_ct, input, h_cleartexts,
      num_clears, num_blocks, num_blocks_index, buffer, bsks,
      (uint64_t *const *)ksks);
}
void cleanup_cuda_unchecked_index_in_clears_64(CudaStreamsFFI streams,
                                               int8_t **mem_ptr_void) {
  // Free device allocations first, then the host-side buffer object.
  auto *buffer =
      reinterpret_cast<int_unchecked_index_in_clears_buffer<uint64_t> *>(
          *mem_ptr_void);
  buffer->release(CudaStreams(streams));
  delete buffer;
  *mem_ptr_void = nullptr; // null the handle so stale pointers are detectable
}
uint64_t scratch_cuda_unchecked_first_index_in_clears_64(
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_unique, uint32_t num_blocks, uint32_t num_blocks_index,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
  // Bundle the radix/PBS parameters and forward to the typed scratch routine.
  int_radix_params radix_params(
      pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension,
      small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
      grouping_factor, message_modulus, carry_modulus, noise_reduction_type);
  auto typed_mem =
      (int_unchecked_first_index_in_clears_buffer<uint64_t> **)mem_ptr;
  return scratch_cuda_unchecked_first_index_in_clears<uint64_t>(
      CudaStreams(streams), typed_mem, radix_params, num_unique, num_blocks,
      num_blocks_index, allocate_gpu_memory);
}
void cuda_unchecked_first_index_in_clears_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *index_ct,
    CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *input,
    const uint64_t *h_unique_values, const uint64_t *h_unique_indices,
    uint32_t num_unique, uint32_t num_blocks, uint32_t num_blocks_index,
    int8_t *mem, void *const *bsks, void *const *ksks) {
  // Recover the typed scratch handle and dispatch to the host implementation.
  auto *buffer =
      reinterpret_cast<int_unchecked_first_index_in_clears_buffer<uint64_t> *>(
          mem);
  host_unchecked_first_index_in_clears<uint64_t>(
      CudaStreams(streams), index_ct, match_ct, input, h_unique_values,
      h_unique_indices, num_unique, num_blocks, num_blocks_index, buffer, bsks,
      (uint64_t *const *)ksks);
}
void cleanup_cuda_unchecked_first_index_in_clears_64(CudaStreamsFFI streams,
                                                     int8_t **mem_ptr_void) {
  // Free device allocations first, then the host-side buffer object.
  auto *buffer =
      reinterpret_cast<int_unchecked_first_index_in_clears_buffer<uint64_t> *>(
          *mem_ptr_void);
  buffer->release(CudaStreams(streams));
  delete buffer;
  *mem_ptr_void = nullptr; // null the handle so stale pointers are detectable
}
uint64_t scratch_cuda_unchecked_first_index_of_clear_64(
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
  // Bundle the radix/PBS parameters and forward to the typed scratch routine.
  int_radix_params radix_params(
      pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension,
      small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
      grouping_factor, message_modulus, carry_modulus, noise_reduction_type);
  auto typed_mem =
      (int_unchecked_first_index_of_clear_buffer<uint64_t> **)mem_ptr;
  return scratch_cuda_unchecked_first_index_of_clear<uint64_t>(
      CudaStreams(streams), typed_mem, radix_params, num_inputs, num_blocks,
      num_blocks_index, allocate_gpu_memory);
}
void cuda_unchecked_first_index_of_clear_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *index_ct,
    CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *inputs,
    const uint64_t *h_clear_val, uint32_t num_inputs, uint32_t num_blocks,
    uint32_t num_blocks_index, int8_t *mem, void *const *bsks,
    void *const *ksks) {
  // Recover the typed scratch handle and dispatch to the host implementation.
  auto *buffer =
      reinterpret_cast<int_unchecked_first_index_of_clear_buffer<uint64_t> *>(
          mem);
  host_unchecked_first_index_of_clear<uint64_t>(
      CudaStreams(streams), index_ct, match_ct, inputs, h_clear_val,
      num_inputs, num_blocks, num_blocks_index, buffer, bsks,
      (uint64_t *const *)ksks);
}
// FFI cleanup: releases the first_index_of_clear scratch buffer's GPU
// resources and frees the host-side object, nulling the caller's handle.
void cleanup_cuda_unchecked_first_index_of_clear_64(CudaStreamsFFI streams,
                                                    int8_t **mem_ptr_void) {
  auto *buffer =
      reinterpret_cast<int_unchecked_first_index_of_clear_buffer<uint64_t> *>(
          *mem_ptr_void);
  buffer->release(CudaStreams(streams));
  delete buffer;
  *mem_ptr_void = nullptr;
}
// FFI entry point: packs the flat C parameters into an int_radix_params and
// allocates the 64-bit first_index_of scratch buffer. Returns the tracked
// GPU allocation size in bytes.
uint64_t scratch_cuda_unchecked_first_index_of_64(
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
  const int_radix_params radix_params(
      pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension,
      small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
      grouping_factor, message_modulus, carry_modulus, noise_reduction_type);
  auto **typed_mem =
      reinterpret_cast<int_unchecked_first_index_of_buffer<uint64_t> **>(
          mem_ptr);
  return scratch_cuda_unchecked_first_index_of<uint64_t>(
      CudaStreams(streams), typed_mem, radix_params, num_inputs, num_blocks,
      num_blocks_index, allocate_gpu_memory);
}
// FFI entry point: forwards to the templated host implementation that finds
// the first index of an encrypted value in a list of encrypted inputs.
void cuda_unchecked_first_index_of_64(CudaStreamsFFI streams,
                                      CudaRadixCiphertextFFI *index_ct,
                                      CudaRadixCiphertextFFI *match_ct,
                                      CudaRadixCiphertextFFI const *inputs,
                                      CudaRadixCiphertextFFI const *value,
                                      uint32_t num_inputs, uint32_t num_blocks,
                                      uint32_t num_blocks_index, int8_t *mem,
                                      void *const *bsks, void *const *ksks) {
  auto *buffer =
      reinterpret_cast<int_unchecked_first_index_of_buffer<uint64_t> *>(mem);
  host_unchecked_first_index_of<uint64_t>(
      CudaStreams(streams), index_ct, match_ct, inputs, value, num_inputs,
      num_blocks, num_blocks_index, buffer, bsks,
      reinterpret_cast<uint64_t *const *>(ksks));
}
// FFI cleanup: releases the first_index_of scratch buffer's GPU resources
// and frees the host-side object, nulling the caller's handle.
void cleanup_cuda_unchecked_first_index_of_64(CudaStreamsFFI streams,
                                              int8_t **mem_ptr_void) {
  auto *buffer =
      reinterpret_cast<int_unchecked_first_index_of_buffer<uint64_t> *>(
          *mem_ptr_void);
  buffer->release(CudaStreams(streams));
  delete buffer;
  *mem_ptr_void = nullptr;
}
// FFI entry point: packs the flat C parameters into an int_radix_params and
// allocates the 64-bit index_of scratch buffer. Returns the tracked GPU
// allocation size in bytes.
uint64_t scratch_cuda_unchecked_index_of_64(
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
  const int_radix_params radix_params(
      pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension,
      small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
      grouping_factor, message_modulus, carry_modulus, noise_reduction_type);
  auto **typed_mem =
      reinterpret_cast<int_unchecked_index_of_buffer<uint64_t> **>(mem_ptr);
  return scratch_cuda_unchecked_index_of<uint64_t>(
      CudaStreams(streams), typed_mem, radix_params, num_inputs, num_blocks,
      num_blocks_index, allocate_gpu_memory);
}
// FFI entry point: forwards to the templated host implementation that
// computes the index of an encrypted value in a list of encrypted inputs.
void cuda_unchecked_index_of_64(CudaStreamsFFI streams,
                                CudaRadixCiphertextFFI *index_ct,
                                CudaRadixCiphertextFFI *match_ct,
                                CudaRadixCiphertextFFI const *inputs,
                                CudaRadixCiphertextFFI const *value,
                                uint32_t num_inputs, uint32_t num_blocks,
                                uint32_t num_blocks_index, int8_t *mem,
                                void *const *bsks, void *const *ksks) {
  auto *buffer =
      reinterpret_cast<int_unchecked_index_of_buffer<uint64_t> *>(mem);
  host_unchecked_index_of<uint64_t>(
      CudaStreams(streams), index_ct, match_ct, inputs, value, num_inputs,
      num_blocks, num_blocks_index, buffer, bsks,
      reinterpret_cast<uint64_t *const *>(ksks));
}
// FFI cleanup: releases the index_of scratch buffer's GPU resources and
// frees the host-side object, nulling the caller's handle.
void cleanup_cuda_unchecked_index_of_64(CudaStreamsFFI streams,
                                        int8_t **mem_ptr_void) {
  auto *buffer = reinterpret_cast<int_unchecked_index_of_buffer<uint64_t> *>(
      *mem_ptr_void);
  buffer->release(CudaStreams(streams));
  delete buffer;
  *mem_ptr_void = nullptr;
}

View File

@@ -28,22 +28,23 @@ __host__ void host_compute_equality_selectors(
for (uint32_t j = 0; j < mem_ptr->num_streams; j++) {
for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) {
cuda_stream_wait_event(mem_ptr->sub_streams_vec[j].stream(i),
cuda_stream_wait_event(mem_ptr->sub_streams[j].stream(i),
mem_ptr->incoming_event,
mem_ptr->sub_streams_vec[j].gpu_index(i));
mem_ptr->sub_streams[j].gpu_index(i));
}
}
uint32_t num_streams = mem_ptr->num_streams;
uint32_t num_gpus = mem_ptr->active_streams.count();
for (uint32_t i = 0; i < num_possible_values; i++) {
uint32_t stream_idx = i % num_streams;
CudaStreams current_stream = mem_ptr->sub_streams_vec[stream_idx];
CudaStreams current_stream = mem_ptr->sub_streams[stream_idx];
CudaRadixCiphertextFFI *current_tmp_block_comparisons =
mem_ptr->tmp_block_comparisons_vec[stream_idx];
mem_ptr->tmp_block_comparisons[stream_idx];
int_comparison_buffer<Torus> *current_reduction_buffer =
mem_ptr->reduction_buffers[stream_idx];
@@ -75,10 +76,11 @@ __host__ void host_compute_equality_selectors(
for (uint32_t j = 0; j < mem_ptr->num_streams; j++) {
for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) {
cuda_event_record(mem_ptr->outgoing_events[j][i],
mem_ptr->sub_streams_vec[j].stream(i),
mem_ptr->sub_streams_vec[j].gpu_index(i));
cuda_stream_wait_event(streams.stream(0), mem_ptr->outgoing_events[j][i],
cuda_event_record(mem_ptr->outgoing_events[j * num_gpus + i],
mem_ptr->sub_streams[j].stream(i),
mem_ptr->sub_streams[j].gpu_index(i));
cuda_stream_wait_event(streams.stream(0),
mem_ptr->outgoing_events[j * num_gpus + i],
streams.gpu_index(0));
}
}
@@ -110,24 +112,25 @@ __host__ void host_create_possible_results(
uint32_t max_luts_per_call = mem_ptr->max_luts_per_call;
uint32_t num_lut_accumulators = mem_ptr->num_lut_accumulators;
uint32_t num_streams = mem_ptr->num_streams;
uint32_t num_gpus = mem_ptr->active_streams.count();
cuda_event_record(mem_ptr->incoming_event, streams.stream(0),
streams.gpu_index(0));
for (uint32_t j = 0; j < mem_ptr->num_streams; j++) {
for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) {
cuda_stream_wait_event(mem_ptr->sub_streams_vec[j].stream(i),
cuda_stream_wait_event(mem_ptr->sub_streams[j].stream(i),
mem_ptr->incoming_event,
mem_ptr->sub_streams_vec[j].gpu_index(i));
mem_ptr->sub_streams[j].gpu_index(i));
}
}
for (uint32_t i = 0; i < num_possible_values; i++) {
uint32_t stream_idx = i % num_streams;
CudaStreams current_stream = mem_ptr->sub_streams_vec[stream_idx];
CudaStreams current_stream = mem_ptr->sub_streams[stream_idx];
CudaRadixCiphertextFFI *current_tmp_buffer =
mem_ptr->tmp_many_luts_output_vec[stream_idx];
mem_ptr->tmp_many_luts_output[stream_idx];
CudaRadixCiphertextFFI const *current_selector = &lwe_array_in_list[i];
CudaRadixCiphertextFFI *current_output = &lwe_array_out_list[i];
@@ -138,7 +141,7 @@ __host__ void host_create_possible_results(
uint32_t lut_index = stream_idx * num_lut_accumulators + k;
int_radix_lut<Torus> *current_lut = mem_ptr->stream_luts_vec[lut_index];
int_radix_lut<Torus> *current_lut = mem_ptr->stream_luts[lut_index];
uint32_t luts_in_this_call = current_lut->num_many_lut;
@@ -172,10 +175,11 @@ __host__ void host_create_possible_results(
for (uint32_t j = 0; j < mem_ptr->num_streams; j++) {
for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) {
cuda_event_record(mem_ptr->outgoing_events[j][i],
mem_ptr->sub_streams_vec[j].stream(i),
mem_ptr->sub_streams_vec[j].gpu_index(i));
cuda_stream_wait_event(streams.stream(0), mem_ptr->outgoing_events[j][i],
cuda_event_record(mem_ptr->outgoing_events[j * num_gpus + i],
mem_ptr->sub_streams[j].stream(i),
mem_ptr->sub_streams[j].gpu_index(i));
cuda_stream_wait_event(streams.stream(0),
mem_ptr->outgoing_events[j * num_gpus + i],
streams.gpu_index(0));
}
}
@@ -206,15 +210,16 @@ __host__ void host_aggregate_one_hot_vector(
int_radix_params params = mem_ptr->params;
uint32_t chunk_size = mem_ptr->chunk_size;
uint32_t num_streams = mem_ptr->num_streams;
uint32_t num_gpus = mem_ptr->active_streams.count();
cuda_event_record(mem_ptr->incoming_event, streams.stream(0),
streams.gpu_index(0));
for (uint32_t s = 0; s < num_streams; s++) {
for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) {
cuda_stream_wait_event(mem_ptr->sub_streams_vec[s].stream(i),
cuda_stream_wait_event(mem_ptr->sub_streams[s].stream(i),
mem_ptr->incoming_event,
mem_ptr->sub_streams_vec[s].gpu_index(i));
mem_ptr->sub_streams[s].gpu_index(i));
}
}
@@ -223,7 +228,7 @@ __host__ void host_aggregate_one_hot_vector(
for (uint32_t s = 0; s < num_streams; s++) {
CudaStreams current_stream = mem_ptr->sub_streams_vec[s];
CudaStreams current_stream = mem_ptr->sub_streams[s];
CudaRadixCiphertextFFI *current_agg =
mem_ptr->partial_aggregated_vectors[s];
@@ -287,10 +292,11 @@ __host__ void host_aggregate_one_hot_vector(
for (uint32_t s = 0; s < num_streams; s++) {
for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) {
cuda_event_record(mem_ptr->outgoing_events[s][i],
mem_ptr->sub_streams_vec[s].stream(i),
mem_ptr->sub_streams_vec[s].gpu_index(i));
cuda_stream_wait_event(streams.stream(0), mem_ptr->outgoing_events[s][i],
cuda_event_record(mem_ptr->outgoing_events[s * num_gpus + i],
mem_ptr->sub_streams[s].stream(i),
mem_ptr->sub_streams[s].gpu_index(i));
cuda_stream_wait_event(streams.stream(0),
mem_ptr->outgoing_events[s * num_gpus + i],
streams.gpu_index(0));
}
}
@@ -322,8 +328,8 @@ __host__ void host_aggregate_one_hot_vector(
streams.stream(0), streams.gpu_index(0), temp_agg, 0, num_blocks,
final_agg, 0, num_blocks);
CudaStreams message_stream = mem_ptr->sub_streams_vec[0];
CudaStreams carry_stream = mem_ptr->sub_streams_vec[1];
CudaStreams message_stream = mem_ptr->sub_streams[0];
CudaStreams carry_stream = mem_ptr->sub_streams[1];
cuda_event_record(mem_ptr->reduction_done_event, streams.stream(0),
streams.gpu_index(0));
@@ -498,3 +504,609 @@ __host__ void host_unchecked_match_value_or(
mem_ptr->tmp_match_result, mem_ptr->tmp_or_value,
mem_ptr->cmux_buffer, bsks, (Torus **)ksks);
}
// Allocates the scratch buffer used by host_unchecked_contains and reports
// the GPU memory footprint tracked during construction.
template <typename Torus>
uint64_t
scratch_cuda_unchecked_contains(CudaStreams streams,
                                int_unchecked_contains_buffer<Torus> **mem_ptr,
                                int_radix_params params, uint32_t num_inputs,
                                uint32_t num_blocks, bool allocate_gpu_memory) {
  uint64_t tracked_bytes = 0;
  auto *buffer = new int_unchecked_contains_buffer<Torus>(
      streams, params, num_inputs, num_blocks, allocate_gpu_memory,
      tracked_bytes);
  *mem_ptr = buffer;
  return tracked_bytes;
}
// Encrypted "contains": evaluates inputs[i] == value for every candidate
// ciphertext and reduces the per-input selector blocks into a single
// encrypted boolean `output`.
// Equality checks are fanned out round-robin across mem_ptr->sub_streams;
// incoming_event orders them after the caller's stream, and outgoing_events
// join them back on streams.stream(0) before the final reduction.
template <typename Torus>
__host__ void
host_unchecked_contains(CudaStreams streams, CudaRadixCiphertextFFI *output,
                        CudaRadixCiphertextFFI const *inputs,
                        CudaRadixCiphertextFFI const *value,
                        uint32_t num_inputs, uint32_t num_blocks,
                        int_unchecked_contains_buffer<Torus> *mem_ptr,
                        void *const *bsks, Torus *const *ksks) {
  // Fork: make every sub-stream wait for the work queued by the caller.
  cuda_event_record(mem_ptr->incoming_event, streams.stream(0),
                    streams.gpu_index(0));
  for (uint32_t j = 0; j < mem_ptr->num_streams; j++) {
    for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) {
      cuda_stream_wait_event(mem_ptr->sub_streams[j].stream(i),
                             mem_ptr->incoming_event,
                             mem_ptr->sub_streams[j].gpu_index(i));
    }
  }
  uint32_t num_streams = mem_ptr->num_streams;
  uint32_t num_gpus = mem_ptr->active_streams.count();
  // One equality check per input; result i lands in selector block i.
  for (uint32_t i = 0; i < num_inputs; i++) {
    uint32_t stream_idx = i % num_streams;
    CudaStreams current_stream = mem_ptr->sub_streams[stream_idx];
    CudaRadixCiphertextFFI const *input_ct = &inputs[i];
    CudaRadixCiphertextFFI current_selector_block;
    as_radix_ciphertext_slice<Torus>(&current_selector_block,
                                     mem_ptr->packed_selectors, i, i + 1);
    host_equality_check<Torus>(current_stream, &current_selector_block,
                               input_ct, value, mem_ptr->eq_buffers[stream_idx],
                               bsks, ksks, num_blocks);
  }
  // Join: outgoing_events is a flat [num_streams x num_gpus] array.
  for (uint32_t j = 0; j < mem_ptr->num_streams; j++) {
    for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) {
      cuda_event_record(mem_ptr->outgoing_events[j * num_gpus + i],
                        mem_ptr->sub_streams[j].stream(i),
                        mem_ptr->sub_streams[j].gpu_index(i));
      cuda_stream_wait_event(streams.stream(0),
                             mem_ptr->outgoing_events[j * num_gpus + i],
                             streams.gpu_index(0));
    }
  }
  // output = OR over all selector blocks ("at least one input matched").
  host_integer_is_at_least_one_comparisons_block_true<Torus>(
      streams, output, mem_ptr->packed_selectors, mem_ptr->reduction_buffer,
      bsks, (Torus **)ksks, num_inputs);
}
// Allocates the scratch buffer used by host_unchecked_contains_clear and
// reports the GPU memory footprint tracked during construction.
template <typename Torus>
uint64_t scratch_cuda_unchecked_contains_clear(
    CudaStreams streams, int_unchecked_contains_clear_buffer<Torus> **mem_ptr,
    int_radix_params params, uint32_t num_inputs, uint32_t num_blocks,
    bool allocate_gpu_memory) {
  uint64_t tracked_bytes = 0;
  auto *buffer = new int_unchecked_contains_clear_buffer<Torus>(
      streams, params, num_inputs, num_blocks, allocate_gpu_memory,
      tracked_bytes);
  *mem_ptr = buffer;
  return tracked_bytes;
}
// Encrypted "contains" against a clear value: uploads the decomposed clear
// value, trivially encrypts it, then follows the same fan-out equality /
// reduce pattern as host_unchecked_contains.
template <typename Torus>
__host__ void host_unchecked_contains_clear(
    CudaStreams streams, CudaRadixCiphertextFFI *output,
    CudaRadixCiphertextFFI const *inputs, const uint64_t *h_clear_val,
    uint32_t num_inputs, uint32_t num_blocks,
    int_unchecked_contains_clear_buffer<Torus> *mem_ptr, void *const *bsks,
    Torus *const *ksks) {
  // Upload the clear value blocks and build a trivial radix ciphertext
  // (tmp_clear_val) so the generic equality check can be reused.
  cuda_memcpy_async_to_gpu(mem_ptr->d_clear_val, h_clear_val,
                           num_blocks * sizeof(Torus), streams.stream(0),
                           streams.gpu_index(0));
  set_trivial_radix_ciphertext_async<Torus>(
      streams.stream(0), streams.gpu_index(0), mem_ptr->tmp_clear_val,
      mem_ptr->d_clear_val, (Torus *)h_clear_val, num_blocks,
      mem_ptr->params.message_modulus, mem_ptr->params.carry_modulus);
  // Fork: sub-streams wait on the caller's stream (includes the upload).
  cuda_event_record(mem_ptr->incoming_event, streams.stream(0),
                    streams.gpu_index(0));
  for (uint32_t j = 0; j < mem_ptr->num_streams; j++) {
    for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) {
      cuda_stream_wait_event(mem_ptr->sub_streams[j].stream(i),
                             mem_ptr->incoming_event,
                             mem_ptr->sub_streams[j].gpu_index(i));
    }
  }
  uint32_t num_streams = mem_ptr->num_streams;
  uint32_t num_gpus = mem_ptr->active_streams.count();
  // One equality check per input; result i lands in selector block i.
  for (uint32_t i = 0; i < num_inputs; i++) {
    uint32_t stream_idx = i % num_streams;
    CudaStreams current_stream = mem_ptr->sub_streams[stream_idx];
    CudaRadixCiphertextFFI const *input_ct = &inputs[i];
    CudaRadixCiphertextFFI current_selector_block;
    as_radix_ciphertext_slice<Torus>(&current_selector_block,
                                     mem_ptr->packed_selectors, i, i + 1);
    host_equality_check<Torus>(current_stream, &current_selector_block,
                               input_ct, mem_ptr->tmp_clear_val,
                               mem_ptr->eq_buffers[stream_idx], bsks, ksks,
                               num_blocks);
  }
  // Join: outgoing_events is a flat [num_streams x num_gpus] array.
  for (uint32_t j = 0; j < mem_ptr->num_streams; j++) {
    for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) {
      cuda_event_record(mem_ptr->outgoing_events[j * num_gpus + i],
                        mem_ptr->sub_streams[j].stream(i),
                        mem_ptr->sub_streams[j].gpu_index(i));
      cuda_stream_wait_event(streams.stream(0),
                             mem_ptr->outgoing_events[j * num_gpus + i],
                             streams.gpu_index(0));
    }
  }
  // output = OR over all selector blocks ("at least one input matched").
  host_integer_is_at_least_one_comparisons_block_true<Torus>(
      streams, output, mem_ptr->packed_selectors, mem_ptr->reduction_buffer,
      bsks, (Torus **)ksks, num_inputs);
}
// Allocates the scratch buffer used by host_unchecked_is_in_clears and
// reports the GPU memory footprint tracked during construction.
template <typename Torus>
uint64_t scratch_cuda_unchecked_is_in_clears(
    CudaStreams streams, int_unchecked_is_in_clears_buffer<Torus> **mem_ptr,
    int_radix_params params, uint32_t num_clears, uint32_t num_blocks,
    bool allocate_gpu_memory) {
  uint64_t tracked_bytes = 0;
  auto *buffer = new int_unchecked_is_in_clears_buffer<Torus>(
      streams, params, num_clears, num_blocks, allocate_gpu_memory,
      tracked_bytes);
  *mem_ptr = buffer;
  return tracked_bytes;
}
// Encrypted "is `input` equal to any of `num_clears` clear values?".
// Computes one equality selector per clear value, then ORs them into
// `output`.
// NOTE(review): selectors are written via mem_ptr->unpacked_selectors but
// reduced via mem_ptr->packed_selectors — this assumes the buffer keeps the
// two views backed by the same storage; confirm in the buffer definition.
template <typename Torus>
__host__ void
host_unchecked_is_in_clears(CudaStreams streams, CudaRadixCiphertextFFI *output,
                            CudaRadixCiphertextFFI const *input,
                            const uint64_t *h_cleartexts, uint32_t num_clears,
                            uint32_t num_blocks,
                            int_unchecked_is_in_clears_buffer<Torus> *mem_ptr,
                            void *const *bsks, Torus *const *ksks) {
  host_compute_equality_selectors<Torus>(streams, mem_ptr->unpacked_selectors,
                                         input, num_blocks, h_cleartexts,
                                         mem_ptr->eq_buffer, bsks, ksks);
  // output = OR over all selector blocks ("input matched some clear value").
  host_integer_is_at_least_one_comparisons_block_true<Torus>(
      streams, output, mem_ptr->packed_selectors, mem_ptr->reduction_buffer,
      bsks, (Torus **)ksks, num_clears);
}
// Turns a list of one-block selectors (selectors[i] == "input i matched")
// into: index_ct — the encrypted index of the matching input, built by
// expanding each selector into its possible index value and aggregating the
// resulting one-hot vector — and match_ct, the encrypted "any match" flag.
// Index values are packed two message blocks per ciphertext block, hence
// packed_len = ceil(num_blocks_index / 2).
template <typename Torus>
__host__ void host_compute_final_index_from_selectors(
    CudaStreams streams, CudaRadixCiphertextFFI *index_ct,
    CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *selectors,
    uint32_t num_inputs, uint32_t num_blocks_index,
    int_final_index_from_selectors_buffer<Torus> *mem_ptr, void *const *bsks,
    Torus *const *ksks) {
  // Gather the single-block selectors into the contiguous packed buffer.
  for (uint32_t i = 0; i < num_inputs; i++) {
    CudaRadixCiphertextFFI const *src_selector = &selectors[i];
    copy_radix_ciphertext_slice_async<Torus>(
        streams.stream(0), streams.gpu_index(0), mem_ptr->packed_selectors, i,
        i + 1, src_selector, 0, 1);
  }
  uint32_t packed_len = (num_blocks_index + 1) / 2;
  // selector i -> (i-th index value if set, else 0), then sum the one-hot
  // vector to recover the single encrypted index.
  host_create_possible_results<Torus>(
      streams, mem_ptr->possible_results_ct_list, mem_ptr->unpacked_selectors,
      num_inputs, mem_ptr->h_indices, packed_len, mem_ptr->possible_results_buf,
      bsks, ksks);
  host_aggregate_one_hot_vector<Torus>(
      streams, index_ct, mem_ptr->possible_results_ct_list, num_inputs,
      packed_len, mem_ptr->aggregate_buf, bsks, ksks);
  // match_ct = OR over all selectors.
  host_integer_is_at_least_one_comparisons_block_true<Torus>(
      streams, match_ct, mem_ptr->packed_selectors, mem_ptr->reduction_buf,
      bsks, (Torus **)ksks, num_inputs);
}
// Allocates the scratch buffer used by host_compute_final_index_from_selectors
// and reports the GPU memory footprint tracked during construction.
template <typename Torus>
uint64_t scratch_cuda_compute_final_index_from_selectors(
    CudaStreams streams, int_final_index_from_selectors_buffer<Torus> **mem_ptr,
    int_radix_params params, uint32_t num_inputs, uint32_t num_blocks_index,
    bool allocate_gpu_memory) {
  uint64_t tracked_bytes = 0;
  auto *buffer = new int_final_index_from_selectors_buffer<Torus>(
      streams, params, num_inputs, num_blocks_index, allocate_gpu_memory,
      tracked_bytes);
  *mem_ptr = buffer;
  return tracked_bytes;
}
// Allocates the scratch buffer used by host_unchecked_index_in_clears and
// reports the GPU memory footprint tracked during construction.
template <typename Torus>
uint64_t scratch_cuda_unchecked_index_in_clears(
    CudaStreams streams, int_unchecked_index_in_clears_buffer<Torus> **mem_ptr,
    int_radix_params params, uint32_t num_clears, uint32_t num_blocks,
    uint32_t num_blocks_index, bool allocate_gpu_memory) {
  uint64_t tracked_bytes = 0;
  auto *buffer = new int_unchecked_index_in_clears_buffer<Torus>(
      streams, params, num_clears, num_blocks, num_blocks_index,
      allocate_gpu_memory, tracked_bytes);
  *mem_ptr = buffer;
  return tracked_bytes;
}
// Encrypted "index of `input` within a list of clear values": computes one
// equality selector per clear value, expands each selector into its clear
// index, aggregates the one-hot vector into index_ct, and ORs the selectors
// into match_ct. Index values are packed two message blocks per ciphertext
// block (packed_len = ceil(num_blocks_index / 2)).
// NOTE(review): selectors are written via final_index_buf->unpacked_selectors
// and reduced via final_index_buf->packed_selectors — assumes both views
// share storage; confirm in the buffer definition.
template <typename Torus>
__host__ void host_unchecked_index_in_clears(
    CudaStreams streams, CudaRadixCiphertextFFI *index_ct,
    CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *input,
    const uint64_t *h_cleartexts, uint32_t num_clears, uint32_t num_blocks,
    uint32_t num_blocks_index,
    int_unchecked_index_in_clears_buffer<Torus> *mem_ptr, void *const *bsks,
    Torus *const *ksks) {
  host_compute_equality_selectors<Torus>(
      streams, mem_ptr->final_index_buf->unpacked_selectors, input, num_blocks,
      h_cleartexts, mem_ptr->eq_selectors_buf, bsks, ksks);
  uint32_t packed_len = (num_blocks_index + 1) / 2;
  host_create_possible_results<Torus>(
      streams, mem_ptr->final_index_buf->possible_results_ct_list,
      mem_ptr->final_index_buf->unpacked_selectors, num_clears,
      mem_ptr->final_index_buf->h_indices, packed_len,
      mem_ptr->final_index_buf->possible_results_buf, bsks, ksks);
  host_aggregate_one_hot_vector<Torus>(
      streams, index_ct, mem_ptr->final_index_buf->possible_results_ct_list,
      num_clears, packed_len, mem_ptr->final_index_buf->aggregate_buf, bsks,
      ksks);
  // match_ct = OR over all selectors ("input matched some clear value").
  host_integer_is_at_least_one_comparisons_block_true<Torus>(
      streams, match_ct, mem_ptr->final_index_buf->packed_selectors,
      mem_ptr->final_index_buf->reduction_buf, bsks, (Torus **)ksks,
      num_clears);
}
// Allocates the scratch buffer used by host_unchecked_first_index_in_clears
// and reports the GPU memory footprint tracked during construction.
template <typename Torus>
uint64_t scratch_cuda_unchecked_first_index_in_clears(
    CudaStreams streams,
    int_unchecked_first_index_in_clears_buffer<Torus> **mem_ptr,
    int_radix_params params, uint32_t num_unique, uint32_t num_blocks,
    uint32_t num_blocks_index, bool allocate_gpu_memory) {
  uint64_t tracked_bytes = 0;
  auto *buffer = new int_unchecked_first_index_in_clears_buffer<Torus>(
      streams, params, num_unique, num_blocks, num_blocks_index,
      allocate_gpu_memory, tracked_bytes);
  *mem_ptr = buffer;
  return tracked_bytes;
}
// Encrypted "first index of `input` within unique clear values": the caller
// supplies deduplicated clear values plus, for each, the index of its first
// occurrence in the original list (h_unique_indices). Since the values are
// unique, at most one selector fires, so no prefix scan is needed — the
// one-hot aggregation directly yields the first-occurrence index.
// NOTE(review): selectors are written via mem_ptr->unpacked_selectors and
// reduced via mem_ptr->packed_selectors — assumes both views share storage;
// confirm in the buffer definition.
template <typename Torus>
__host__ void host_unchecked_first_index_in_clears(
    CudaStreams streams, CudaRadixCiphertextFFI *index_ct,
    CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *input,
    const uint64_t *h_unique_values, const uint64_t *h_unique_indices,
    uint32_t num_unique, uint32_t num_blocks, uint32_t num_blocks_index,
    int_unchecked_first_index_in_clears_buffer<Torus> *mem_ptr,
    void *const *bsks, Torus *const *ksks) {
  host_compute_equality_selectors<Torus>(streams, mem_ptr->unpacked_selectors,
                                         input, num_blocks, h_unique_values,
                                         mem_ptr->eq_selectors_buf, bsks, ksks);
  // Index values are packed two message blocks per ciphertext block.
  uint32_t packed_len = (num_blocks_index + 1) / 2;
  host_create_possible_results<Torus>(
      streams, mem_ptr->possible_results_ct_list, mem_ptr->unpacked_selectors,
      num_unique, h_unique_indices, packed_len, mem_ptr->possible_results_buf,
      bsks, ksks);
  host_aggregate_one_hot_vector<Torus>(
      streams, index_ct, mem_ptr->possible_results_ct_list, num_unique,
      packed_len, mem_ptr->aggregate_buf, bsks, ksks);
  // match_ct = OR over all selectors.
  host_integer_is_at_least_one_comparisons_block_true<Torus>(
      streams, match_ct, mem_ptr->packed_selectors, mem_ptr->reduction_buf,
      bsks, (Torus **)ksks, num_unique);
}
// Allocates the scratch buffer used by host_unchecked_first_index_of_clear
// and reports the GPU memory footprint tracked during construction.
template <typename Torus>
uint64_t scratch_cuda_unchecked_first_index_of_clear(
    CudaStreams streams,
    int_unchecked_first_index_of_clear_buffer<Torus> **mem_ptr,
    int_radix_params params, uint32_t num_inputs, uint32_t num_blocks,
    uint32_t num_blocks_index, bool allocate_gpu_memory) {
  uint64_t tracked_bytes = 0;
  auto *buffer = new int_unchecked_first_index_of_clear_buffer<Torus>(
      streams, params, num_inputs, num_blocks, num_blocks_index,
      allocate_gpu_memory, tracked_bytes);
  *mem_ptr = buffer;
  return tracked_bytes;
}
// Encrypted "first index of a clear value in a list of encrypted inputs".
// Steps: (1) trivially encrypt the clear value; (2) equality-check every
// input in parallel over sub-streams; (3) log-stepped prefix combination
// over the selectors (offset doubles each pass, combining each selector
// with its predecessors via prefix_sum_lut) followed by cleanup_lut, so
// only the FIRST matching position keeps its selector set; (4) expand the
// surviving selector into its index and aggregate; (5) OR the selectors
// into match_ct.
// NOTE(review): after the scan, selectors are consumed both as
// packed_selectors and unpacked_selectors — assumes both views share
// storage; confirm in the buffer definition.
template <typename Torus>
__host__ void host_unchecked_first_index_of_clear(
    CudaStreams streams, CudaRadixCiphertextFFI *index_ct,
    CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *inputs,
    const uint64_t *h_clear_val, uint32_t num_inputs, uint32_t num_blocks,
    uint32_t num_blocks_index,
    int_unchecked_first_index_of_clear_buffer<Torus> *mem_ptr,
    void *const *bsks, Torus *const *ksks) {
  // Upload the clear value blocks and build a trivial radix ciphertext.
  cuda_memcpy_async_to_gpu(mem_ptr->d_clear_val, h_clear_val,
                           num_blocks * sizeof(Torus), streams.stream(0),
                           streams.gpu_index(0));
  set_trivial_radix_ciphertext_async<Torus>(
      streams.stream(0), streams.gpu_index(0), mem_ptr->tmp_clear_val,
      mem_ptr->d_clear_val, (Torus *)h_clear_val, num_blocks,
      mem_ptr->params.message_modulus, mem_ptr->params.carry_modulus);
  // Fork: sub-streams wait on the caller's stream (includes the upload).
  cuda_event_record(mem_ptr->incoming_event, streams.stream(0),
                    streams.gpu_index(0));
  for (uint32_t j = 0; j < mem_ptr->num_streams; j++) {
    for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) {
      cuda_stream_wait_event(mem_ptr->sub_streams[j].stream(i),
                             mem_ptr->incoming_event,
                             mem_ptr->sub_streams[j].gpu_index(i));
    }
  }
  uint32_t num_streams = mem_ptr->num_streams;
  uint32_t num_gpus = mem_ptr->active_streams.count();
  // One equality check per input; result i lands in selector block i.
  for (uint32_t i = 0; i < num_inputs; i++) {
    uint32_t stream_idx = i % num_streams;
    CudaStreams current_stream = mem_ptr->sub_streams[stream_idx];
    CudaRadixCiphertextFFI const *input_ct = &inputs[i];
    CudaRadixCiphertextFFI current_selector_block;
    as_radix_ciphertext_slice<Torus>(&current_selector_block,
                                     mem_ptr->packed_selectors, i, i + 1);
    host_equality_check<Torus>(current_stream, &current_selector_block,
                               input_ct, mem_ptr->tmp_clear_val,
                               mem_ptr->eq_buffers[stream_idx], bsks, ksks,
                               num_blocks);
  }
  // Join: outgoing_events is a flat [num_streams x num_gpus] array.
  for (uint32_t j = 0; j < mem_ptr->num_streams; j++) {
    for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) {
      cuda_event_record(mem_ptr->outgoing_events[j * num_gpus + i],
                        mem_ptr->sub_streams[j].stream(i),
                        mem_ptr->sub_streams[j].gpu_index(i));
      cuda_stream_wait_event(streams.stream(0),
                             mem_ptr->outgoing_events[j * num_gpus + i],
                             streams.gpu_index(0));
    }
  }
  // Log-stepped prefix combination: selector[i] absorbs selector[i-offset]
  // via prefix_sum_lut, offset = 1, 2, 4, ...
  for (uint32_t offset = 1; offset < num_inputs; offset <<= 1) {
    uint32_t count = num_inputs - offset;
    CudaRadixCiphertextFFI current_slice;
    as_radix_ciphertext_slice<Torus>(&current_slice, mem_ptr->packed_selectors,
                                     offset, num_inputs);
    CudaRadixCiphertextFFI prev_slice;
    as_radix_ciphertext_slice<Torus>(&prev_slice, mem_ptr->packed_selectors, 0,
                                     count);
    integer_radix_apply_bivariate_lookup_table<Torus>(
        streams, &current_slice, &current_slice, &prev_slice, bsks, ksks,
        mem_ptr->prefix_sum_lut, count, mem_ptr->params.message_modulus);
  }
  // Normalize scan results back to one-hot selectors (first match only).
  integer_radix_apply_univariate_lookup_table<Torus>(
      streams, mem_ptr->packed_selectors, mem_ptr->packed_selectors, bsks, ksks,
      mem_ptr->cleanup_lut, num_inputs);
  // Index values are packed two message blocks per ciphertext block.
  uint32_t packed_len = (num_blocks_index + 1) / 2;
  host_create_possible_results<Torus>(
      streams, mem_ptr->possible_results_ct_list, mem_ptr->unpacked_selectors,
      num_inputs, (const uint64_t *)mem_ptr->h_indices, packed_len,
      mem_ptr->possible_results_buf, bsks, ksks);
  host_aggregate_one_hot_vector<Torus>(
      streams, index_ct, mem_ptr->possible_results_ct_list, num_inputs,
      packed_len, mem_ptr->aggregate_buf, bsks, ksks);
  // match_ct = OR over all selectors.
  host_integer_is_at_least_one_comparisons_block_true<Torus>(
      streams, match_ct, mem_ptr->packed_selectors, mem_ptr->reduction_buf,
      bsks, (Torus **)ksks, num_inputs);
}
// Allocates the scratch buffer used by host_unchecked_first_index_of and
// reports the GPU memory footprint tracked during construction.
template <typename Torus>
uint64_t scratch_cuda_unchecked_first_index_of(
    CudaStreams streams, int_unchecked_first_index_of_buffer<Torus> **mem_ptr,
    int_radix_params params, uint32_t num_inputs, uint32_t num_blocks,
    uint32_t num_blocks_index, bool allocate_gpu_memory) {
  uint64_t tracked_bytes = 0;
  auto *buffer = new int_unchecked_first_index_of_buffer<Torus>(
      streams, params, num_inputs, num_blocks, num_blocks_index,
      allocate_gpu_memory, tracked_bytes);
  *mem_ptr = buffer;
  return tracked_bytes;
}
// Encrypted "first index of an encrypted value in a list of encrypted
// inputs". Same pipeline as host_unchecked_first_index_of_clear minus the
// clear-value upload: parallel equality checks, log-stepped prefix
// combination + cleanup LUT so only the first match survives, one-hot
// expansion/aggregation into index_ct, and OR-reduction into match_ct.
// NOTE(review): after the scan, selectors are consumed both as
// packed_selectors and unpacked_selectors — assumes both views share
// storage; confirm in the buffer definition.
template <typename Torus>
__host__ void host_unchecked_first_index_of(
    CudaStreams streams, CudaRadixCiphertextFFI *index_ct,
    CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *inputs,
    CudaRadixCiphertextFFI const *value, uint32_t num_inputs,
    uint32_t num_blocks, uint32_t num_blocks_index,
    int_unchecked_first_index_of_buffer<Torus> *mem_ptr, void *const *bsks,
    Torus *const *ksks) {
  // Fork: make every sub-stream wait for the work queued by the caller.
  cuda_event_record(mem_ptr->incoming_event, streams.stream(0),
                    streams.gpu_index(0));
  for (uint32_t j = 0; j < mem_ptr->num_streams; j++) {
    for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) {
      cuda_stream_wait_event(mem_ptr->sub_streams[j].stream(i),
                             mem_ptr->incoming_event,
                             mem_ptr->sub_streams[j].gpu_index(i));
    }
  }
  uint32_t num_streams = mem_ptr->num_streams;
  uint32_t num_gpus = mem_ptr->active_streams.count();
  // One equality check per input; result i lands in selector block i.
  for (uint32_t i = 0; i < num_inputs; i++) {
    uint32_t stream_idx = i % num_streams;
    CudaStreams current_stream = mem_ptr->sub_streams[stream_idx];
    CudaRadixCiphertextFFI const *input_ct = &inputs[i];
    CudaRadixCiphertextFFI current_selector_block;
    as_radix_ciphertext_slice<Torus>(&current_selector_block,
                                     mem_ptr->packed_selectors, i, i + 1);
    host_equality_check<Torus>(current_stream, &current_selector_block,
                               input_ct, value, mem_ptr->eq_buffers[stream_idx],
                               bsks, ksks, num_blocks);
  }
  // Join: outgoing_events is a flat [num_streams x num_gpus] array.
  for (uint32_t j = 0; j < mem_ptr->num_streams; j++) {
    for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) {
      cuda_event_record(mem_ptr->outgoing_events[j * num_gpus + i],
                        mem_ptr->sub_streams[j].stream(i),
                        mem_ptr->sub_streams[j].gpu_index(i));
      cuda_stream_wait_event(streams.stream(0),
                             mem_ptr->outgoing_events[j * num_gpus + i],
                             streams.gpu_index(0));
    }
  }
  // Log-stepped prefix combination: selector[i] absorbs selector[i-offset]
  // via prefix_sum_lut, offset = 1, 2, 4, ...
  for (uint32_t offset = 1; offset < num_inputs; offset <<= 1) {
    uint32_t count = num_inputs - offset;
    CudaRadixCiphertextFFI current_slice;
    as_radix_ciphertext_slice<Torus>(&current_slice, mem_ptr->packed_selectors,
                                     offset, num_inputs);
    CudaRadixCiphertextFFI prev_slice;
    as_radix_ciphertext_slice<Torus>(&prev_slice, mem_ptr->packed_selectors, 0,
                                     count);
    integer_radix_apply_bivariate_lookup_table<Torus>(
        streams, &current_slice, &current_slice, &prev_slice, bsks, ksks,
        mem_ptr->prefix_sum_lut, count, mem_ptr->params.message_modulus);
  }
  // Normalize scan results back to one-hot selectors (first match only).
  integer_radix_apply_univariate_lookup_table<Torus>(
      streams, mem_ptr->packed_selectors, mem_ptr->packed_selectors, bsks, ksks,
      mem_ptr->cleanup_lut, num_inputs);
  // Index values are packed two message blocks per ciphertext block.
  uint32_t packed_len = (num_blocks_index + 1) / 2;
  host_create_possible_results<Torus>(
      streams, mem_ptr->possible_results_ct_list, mem_ptr->unpacked_selectors,
      num_inputs, (const uint64_t *)mem_ptr->h_indices, packed_len,
      mem_ptr->possible_results_buf, bsks, ksks);
  host_aggregate_one_hot_vector<Torus>(
      streams, index_ct, mem_ptr->possible_results_ct_list, num_inputs,
      packed_len, mem_ptr->aggregate_buf, bsks, ksks);
  // match_ct = OR over all selectors.
  host_integer_is_at_least_one_comparisons_block_true<Torus>(
      streams, match_ct, mem_ptr->packed_selectors, mem_ptr->reduction_buf,
      bsks, (Torus **)ksks, num_inputs);
}
// Allocates the scratch buffer used by host_unchecked_index_of and reports
// the GPU memory footprint tracked during construction.
template <typename Torus>
uint64_t scratch_cuda_unchecked_index_of(
    CudaStreams streams, int_unchecked_index_of_buffer<Torus> **mem_ptr,
    int_radix_params params, uint32_t num_inputs, uint32_t num_blocks,
    uint32_t num_blocks_index, bool allocate_gpu_memory) {
  uint64_t tracked_bytes = 0;
  auto *buffer = new int_unchecked_index_of_buffer<Torus>(
      streams, params, num_inputs, num_blocks, num_blocks_index,
      allocate_gpu_memory, tracked_bytes);
  *mem_ptr = buffer;
  return tracked_bytes;
}
// Encrypted "index of an encrypted value in a list of encrypted inputs":
// parallel equality checks over sub-streams, then one-hot expansion and
// aggregation into index_ct plus OR-reduction into match_ct. Unlike
// host_unchecked_first_index_of, no prefix scan is applied, so the result
// reflects the combined selectors as-is.
// NOTE(review): selectors are written into final_index_buf->packed_selectors
// but expanded via final_index_buf->unpacked_selectors — assumes both views
// share storage; confirm in the buffer definition.
template <typename Torus>
__host__ void host_unchecked_index_of(
    CudaStreams streams, CudaRadixCiphertextFFI *index_ct,
    CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *inputs,
    CudaRadixCiphertextFFI const *value, uint32_t num_inputs,
    uint32_t num_blocks, uint32_t num_blocks_index,
    int_unchecked_index_of_buffer<Torus> *mem_ptr, void *const *bsks,
    Torus *const *ksks) {
  // Fork: make every sub-stream wait for the work queued by the caller.
  cuda_event_record(mem_ptr->incoming_event, streams.stream(0),
                    streams.gpu_index(0));
  for (uint32_t j = 0; j < mem_ptr->num_streams; j++) {
    for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) {
      cuda_stream_wait_event(mem_ptr->sub_streams[j].stream(i),
                             mem_ptr->incoming_event,
                             mem_ptr->sub_streams[j].gpu_index(i));
    }
  }
  uint32_t num_streams = mem_ptr->num_streams;
  uint32_t num_gpus = mem_ptr->active_streams.count();
  // One equality check per input; result i lands in selector block i.
  for (uint32_t i = 0; i < num_inputs; i++) {
    uint32_t stream_idx = i % num_streams;
    CudaStreams current_stream = mem_ptr->sub_streams[stream_idx];
    CudaRadixCiphertextFFI const *input_ct = &inputs[i];
    CudaRadixCiphertextFFI current_selector_block;
    as_radix_ciphertext_slice<Torus>(&current_selector_block,
                                     mem_ptr->final_index_buf->packed_selectors,
                                     i, i + 1);
    host_equality_check<Torus>(current_stream, &current_selector_block,
                               input_ct, value, mem_ptr->eq_buffers[stream_idx],
                               bsks, ksks, num_blocks);
  }
  // Join: outgoing_events is a flat [num_streams x num_gpus] array.
  for (uint32_t j = 0; j < mem_ptr->num_streams; j++) {
    for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) {
      cuda_event_record(mem_ptr->outgoing_events[j * num_gpus + i],
                        mem_ptr->sub_streams[j].stream(i),
                        mem_ptr->sub_streams[j].gpu_index(i));
      cuda_stream_wait_event(streams.stream(0),
                             mem_ptr->outgoing_events[j * num_gpus + i],
                             streams.gpu_index(0));
    }
  }
  // Index values are packed two message blocks per ciphertext block.
  uint32_t packed_len = (num_blocks_index + 1) / 2;
  host_create_possible_results<Torus>(
      streams, mem_ptr->final_index_buf->possible_results_ct_list,
      mem_ptr->final_index_buf->unpacked_selectors, num_inputs,
      (const uint64_t *)mem_ptr->final_index_buf->h_indices, packed_len,
      mem_ptr->final_index_buf->possible_results_buf, bsks, ksks);
  host_aggregate_one_hot_vector<Torus>(
      streams, index_ct, mem_ptr->final_index_buf->possible_results_ct_list,
      num_inputs, packed_len, mem_ptr->final_index_buf->aggregate_buf, bsks,
      ksks);
  // match_ct = OR over all selectors.
  host_integer_is_at_least_one_comparisons_block_true<Torus>(
      streams, match_ct, mem_ptr->final_index_buf->packed_selectors,
      mem_ptr->final_index_buf->reduction_buf, bsks, (Torus **)ksks,
      num_inputs);
}

View File

@@ -1100,45 +1100,6 @@ unsafe extern "C" {
unsafe extern "C" {
pub fn cleanup_cuda_integer_div_rem(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
}
unsafe extern "C" {
pub fn scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
input_lut: *const ffi::c_void,
lwe_dimension: u32,
glwe_dimension: u32,
polynomial_size: u32,
ks_level: u32,
ks_base_log: u32,
pbs_level: u32,
pbs_base_log: u32,
grouping_factor: u32,
num_radix_blocks: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: PBS_TYPE,
lut_degree: u64,
allocate_gpu_memory: bool,
noise_reduction_type: PBS_MS_REDUCTION_T,
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_compute_prefix_sum_hillis_steele_64(
streams: CudaStreamsFFI,
output_radix_lwe: *mut CudaRadixCiphertextFFI,
generates_or_propagates: *mut CudaRadixCiphertextFFI,
mem_ptr: *mut i8,
ksks: *const *mut ffi::c_void,
bsks: *const *mut ffi::c_void,
num_blocks: u32,
);
}
unsafe extern "C" {
pub fn cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
streams: CudaStreamsFFI,
mem_ptr_void: *mut *mut i8,
);
}
unsafe extern "C" {
pub fn cuda_integer_reverse_blocks_64_inplace(
streams: CudaStreamsFFI,
@@ -1715,127 +1676,6 @@ unsafe extern "C" {
// Releases the ilog2 backend context allocated by its matching scratch call.
unsafe extern "C" {
    pub fn cleanup_cuda_integer_ilog2_64(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
}
// FFI bindings for the vector-find building blocks (scratch/run/cleanup trios).
// Each scratch call writes a backend context into `mem_ptr` and returns a u64
// (presumably the GPU memory footprint — confirm against the C prototypes in
// the backend header); the run call consumes that context via `mem`, and the
// cleanup call frees it.
//
// --- equality selectors: one selector ciphertext per possible cleartext value ---
unsafe extern "C" {
    pub fn scratch_cuda_compute_equality_selectors_64(
        streams: CudaStreamsFFI,
        mem_ptr: *mut *mut i8,
        glwe_dimension: u32,
        polynomial_size: u32,
        big_lwe_dimension: u32,
        small_lwe_dimension: u32,
        ks_level: u32,
        ks_base_log: u32,
        pbs_level: u32,
        pbs_base_log: u32,
        grouping_factor: u32,
        num_possible_values: u32,
        num_blocks: u32,
        message_modulus: u32,
        carry_modulus: u32,
        pbs_type: PBS_TYPE,
        allocate_gpu_memory: bool,
        noise_reduction_type: PBS_MS_REDUCTION_T,
    ) -> u64;
}
unsafe extern "C" {
    pub fn cuda_compute_equality_selectors_64(
        streams: CudaStreamsFFI,
        lwe_array_out_list: *mut CudaRadixCiphertextFFI,
        lwe_array_in: *const CudaRadixCiphertextFFI,
        num_blocks: u32,
        // host-side array of decomposed cleartext blocks to compare against
        h_decomposed_cleartexts: *const u64,
        mem: *mut i8,
        bsks: *const *mut ffi::c_void,
        ksks: *const *mut ffi::c_void,
    );
}
unsafe extern "C" {
    pub fn cleanup_cuda_compute_equality_selectors_64(
        streams: CudaStreamsFFI,
        mem_ptr_void: *mut *mut i8,
    );
}
// --- possible results: candidate outputs gated by the selector list ---
unsafe extern "C" {
    pub fn scratch_cuda_create_possible_results_64(
        streams: CudaStreamsFFI,
        mem_ptr: *mut *mut i8,
        glwe_dimension: u32,
        polynomial_size: u32,
        big_lwe_dimension: u32,
        small_lwe_dimension: u32,
        ks_level: u32,
        ks_base_log: u32,
        pbs_level: u32,
        pbs_base_log: u32,
        grouping_factor: u32,
        num_possible_values: u32,
        num_blocks: u32,
        message_modulus: u32,
        carry_modulus: u32,
        pbs_type: PBS_TYPE,
        allocate_gpu_memory: bool,
        noise_reduction_type: PBS_MS_REDUCTION_T,
    ) -> u64;
}
unsafe extern "C" {
    pub fn cuda_create_possible_results_64(
        streams: CudaStreamsFFI,
        lwe_array_out_list: *mut CudaRadixCiphertextFFI,
        lwe_array_in_list: *const CudaRadixCiphertextFFI,
        num_possible_values: u32,
        h_decomposed_cleartexts: *const u64,
        num_blocks: u32,
        mem: *mut i8,
        bsks: *const *mut ffi::c_void,
        ksks: *const *mut ffi::c_void,
    );
}
unsafe extern "C" {
    pub fn cleanup_cuda_create_possible_results_64(
        streams: CudaStreamsFFI,
        mem_ptr_void: *mut *mut i8,
    );
}
// --- one-hot aggregation: collapse a one-hot ciphertext list into one result ---
unsafe extern "C" {
    pub fn scratch_cuda_aggregate_one_hot_vector_64(
        streams: CudaStreamsFFI,
        mem_ptr: *mut *mut i8,
        glwe_dimension: u32,
        polynomial_size: u32,
        big_lwe_dimension: u32,
        small_lwe_dimension: u32,
        ks_level: u32,
        ks_base_log: u32,
        pbs_level: u32,
        pbs_base_log: u32,
        grouping_factor: u32,
        num_blocks: u32,
        num_matches: u32,
        message_modulus: u32,
        carry_modulus: u32,
        pbs_type: PBS_TYPE,
        allocate_gpu_memory: bool,
        noise_reduction_type: PBS_MS_REDUCTION_T,
    ) -> u64;
}
unsafe extern "C" {
    pub fn cuda_aggregate_one_hot_vector_64(
        streams: CudaStreamsFFI,
        lwe_array_out: *mut CudaRadixCiphertextFFI,
        lwe_array_in_list: *const CudaRadixCiphertextFFI,
        num_input_ciphertexts: u32,
        num_blocks: u32,
        mem: *mut i8,
        bsks: *const *mut ffi::c_void,
        ksks: *const *mut ffi::c_void,
    );
}
unsafe extern "C" {
    pub fn cleanup_cuda_aggregate_one_hot_vector_64(
        streams: CudaStreamsFFI,
        mem_ptr_void: *mut *mut i8,
    );
}
unsafe extern "C" {
pub fn scratch_cuda_unchecked_match_value_64(
streams: CudaStreamsFFI,
@@ -1962,6 +1802,385 @@ unsafe extern "C" {
mem_ptr_void: *mut *mut i8,
);
}
// FFI bindings for the vector-find "contains" family (scratch/run/cleanup trios).
// Each scratch call allocates a backend context into `mem_ptr` and returns a u64
// (presumably the GPU memory footprint — confirm against the C prototypes);
// the run call consumes the context via `mem`; cleanup frees it.
//
// --- contains: is an encrypted `value` present among `num_inputs` ciphertexts ---
unsafe extern "C" {
    pub fn scratch_cuda_unchecked_contains_64(
        streams: CudaStreamsFFI,
        mem_ptr: *mut *mut i8,
        glwe_dimension: u32,
        polynomial_size: u32,
        big_lwe_dimension: u32,
        small_lwe_dimension: u32,
        ks_level: u32,
        ks_base_log: u32,
        pbs_level: u32,
        pbs_base_log: u32,
        grouping_factor: u32,
        num_inputs: u32,
        num_blocks: u32,
        message_modulus: u32,
        carry_modulus: u32,
        pbs_type: PBS_TYPE,
        allocate_gpu_memory: bool,
        noise_reduction_type: PBS_MS_REDUCTION_T,
    ) -> u64;
}
unsafe extern "C" {
    pub fn cuda_unchecked_contains_64(
        streams: CudaStreamsFFI,
        output: *mut CudaRadixCiphertextFFI,
        inputs: *const CudaRadixCiphertextFFI,
        value: *const CudaRadixCiphertextFFI,
        num_inputs: u32,
        num_blocks: u32,
        mem: *mut i8,
        bsks: *const *mut ffi::c_void,
        ksks: *const *mut ffi::c_void,
    );
}
unsafe extern "C" {
    pub fn cleanup_cuda_unchecked_contains_64(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
}
// --- contains_clear: same, but the needle is a host-side cleartext value ---
unsafe extern "C" {
    pub fn scratch_cuda_unchecked_contains_clear_64(
        streams: CudaStreamsFFI,
        mem_ptr: *mut *mut i8,
        glwe_dimension: u32,
        polynomial_size: u32,
        big_lwe_dimension: u32,
        small_lwe_dimension: u32,
        ks_level: u32,
        ks_base_log: u32,
        pbs_level: u32,
        pbs_base_log: u32,
        grouping_factor: u32,
        num_inputs: u32,
        num_blocks: u32,
        message_modulus: u32,
        carry_modulus: u32,
        pbs_type: PBS_TYPE,
        allocate_gpu_memory: bool,
        noise_reduction_type: PBS_MS_REDUCTION_T,
    ) -> u64;
}
unsafe extern "C" {
    pub fn cuda_unchecked_contains_clear_64(
        streams: CudaStreamsFFI,
        output: *mut CudaRadixCiphertextFFI,
        inputs: *const CudaRadixCiphertextFFI,
        h_clear_val: *const u64,
        num_inputs: u32,
        num_blocks: u32,
        mem: *mut i8,
        bsks: *const *mut ffi::c_void,
        ksks: *const *mut ffi::c_void,
    );
}
unsafe extern "C" {
    pub fn cleanup_cuda_unchecked_contains_clear_64(
        streams: CudaStreamsFFI,
        mem_ptr_void: *mut *mut i8,
    );
}
// --- is_in_clears: is one encrypted input equal to any of `num_clears` host values ---
unsafe extern "C" {
    pub fn scratch_cuda_unchecked_is_in_clears_64(
        streams: CudaStreamsFFI,
        mem_ptr: *mut *mut i8,
        glwe_dimension: u32,
        polynomial_size: u32,
        big_lwe_dimension: u32,
        small_lwe_dimension: u32,
        ks_level: u32,
        ks_base_log: u32,
        pbs_level: u32,
        pbs_base_log: u32,
        grouping_factor: u32,
        num_clears: u32,
        num_blocks: u32,
        message_modulus: u32,
        carry_modulus: u32,
        pbs_type: PBS_TYPE,
        allocate_gpu_memory: bool,
        noise_reduction_type: PBS_MS_REDUCTION_T,
    ) -> u64;
}
unsafe extern "C" {
    pub fn cuda_unchecked_is_in_clears_64(
        streams: CudaStreamsFFI,
        output: *mut CudaRadixCiphertextFFI,
        input: *const CudaRadixCiphertextFFI,
        h_cleartexts: *const u64,
        num_clears: u32,
        num_blocks: u32,
        mem: *mut i8,
        bsks: *const *mut ffi::c_void,
        ksks: *const *mut ffi::c_void,
    );
}
unsafe extern "C" {
    pub fn cleanup_cuda_unchecked_is_in_clears_64(
        streams: CudaStreamsFFI,
        mem_ptr_void: *mut *mut i8,
    );
}
// FFI bindings for the vector-find index-producing operations. These return
// two ciphertexts: `index_ct` (the found index, `num_blocks_index` radix
// blocks) and `match_ct` (whether any match occurred). Each scratch call
// allocates a context into `mem_ptr` and returns a u64 (presumably the GPU
// memory footprint — confirm against the C prototypes).
//
// --- final index from precomputed selectors ---
unsafe extern "C" {
    pub fn scratch_cuda_compute_final_index_from_selectors_64(
        streams: CudaStreamsFFI,
        mem_ptr: *mut *mut i8,
        glwe_dimension: u32,
        polynomial_size: u32,
        big_lwe_dimension: u32,
        small_lwe_dimension: u32,
        ks_level: u32,
        ks_base_log: u32,
        pbs_level: u32,
        pbs_base_log: u32,
        grouping_factor: u32,
        num_inputs: u32,
        num_blocks_index: u32,
        message_modulus: u32,
        carry_modulus: u32,
        pbs_type: PBS_TYPE,
        allocate_gpu_memory: bool,
        noise_reduction_type: PBS_MS_REDUCTION_T,
    ) -> u64;
}
unsafe extern "C" {
    pub fn cuda_compute_final_index_from_selectors_64(
        streams: CudaStreamsFFI,
        index_ct: *mut CudaRadixCiphertextFFI,
        match_ct: *mut CudaRadixCiphertextFFI,
        selectors: *const CudaRadixCiphertextFFI,
        num_inputs: u32,
        num_blocks_index: u32,
        mem: *mut i8,
        bsks: *const *mut ffi::c_void,
        ksks: *const *mut ffi::c_void,
    );
}
unsafe extern "C" {
    pub fn cleanup_cuda_compute_final_index_from_selectors_64(
        streams: CudaStreamsFFI,
        mem_ptr_void: *mut *mut i8,
    );
}
// --- index of an encrypted input within a list of host cleartexts ---
unsafe extern "C" {
    pub fn scratch_cuda_unchecked_index_in_clears_64(
        streams: CudaStreamsFFI,
        mem_ptr: *mut *mut i8,
        glwe_dimension: u32,
        polynomial_size: u32,
        big_lwe_dimension: u32,
        small_lwe_dimension: u32,
        ks_level: u32,
        ks_base_log: u32,
        pbs_level: u32,
        pbs_base_log: u32,
        grouping_factor: u32,
        num_clears: u32,
        num_blocks: u32,
        num_blocks_index: u32,
        message_modulus: u32,
        carry_modulus: u32,
        pbs_type: PBS_TYPE,
        allocate_gpu_memory: bool,
        noise_reduction_type: PBS_MS_REDUCTION_T,
    ) -> u64;
}
unsafe extern "C" {
    pub fn cuda_unchecked_index_in_clears_64(
        streams: CudaStreamsFFI,
        index_ct: *mut CudaRadixCiphertextFFI,
        match_ct: *mut CudaRadixCiphertextFFI,
        input: *const CudaRadixCiphertextFFI,
        h_cleartexts: *const u64,
        num_clears: u32,
        num_blocks: u32,
        num_blocks_index: u32,
        mem: *mut i8,
        bsks: *const *mut ffi::c_void,
        ksks: *const *mut ffi::c_void,
    );
}
unsafe extern "C" {
    pub fn cleanup_cuda_unchecked_index_in_clears_64(
        streams: CudaStreamsFFI,
        mem_ptr_void: *mut *mut i8,
    );
}
// --- first index among deduplicated cleartexts; takes the unique values and
//     their original positions as two parallel host arrays ---
unsafe extern "C" {
    pub fn scratch_cuda_unchecked_first_index_in_clears_64(
        streams: CudaStreamsFFI,
        mem_ptr: *mut *mut i8,
        glwe_dimension: u32,
        polynomial_size: u32,
        big_lwe_dimension: u32,
        small_lwe_dimension: u32,
        ks_level: u32,
        ks_base_log: u32,
        pbs_level: u32,
        pbs_base_log: u32,
        grouping_factor: u32,
        num_unique: u32,
        num_blocks: u32,
        num_blocks_index: u32,
        message_modulus: u32,
        carry_modulus: u32,
        pbs_type: PBS_TYPE,
        allocate_gpu_memory: bool,
        noise_reduction_type: PBS_MS_REDUCTION_T,
    ) -> u64;
}
unsafe extern "C" {
    pub fn cuda_unchecked_first_index_in_clears_64(
        streams: CudaStreamsFFI,
        index_ct: *mut CudaRadixCiphertextFFI,
        match_ct: *mut CudaRadixCiphertextFFI,
        input: *const CudaRadixCiphertextFFI,
        h_unique_values: *const u64,
        h_unique_indices: *const u64,
        num_unique: u32,
        num_blocks: u32,
        num_blocks_index: u32,
        mem: *mut i8,
        bsks: *const *mut ffi::c_void,
        ksks: *const *mut ffi::c_void,
    );
}
unsafe extern "C" {
    pub fn cleanup_cuda_unchecked_first_index_in_clears_64(
        streams: CudaStreamsFFI,
        mem_ptr_void: *mut *mut i8,
    );
}
// FFI bindings for the "(first) index of" vector-find operations. All produce
// `index_ct` (`num_blocks_index` radix blocks) plus a `match_ct` flag; the
// needle is either a host cleartext (`h_clear_val`) or an encrypted `value`.
// Scratch calls allocate a context into `mem_ptr` and return a u64 (presumably
// the GPU memory footprint — confirm against the C prototypes).
//
// --- first index of a cleartext needle among encrypted inputs ---
unsafe extern "C" {
    pub fn scratch_cuda_unchecked_first_index_of_clear_64(
        streams: CudaStreamsFFI,
        mem_ptr: *mut *mut i8,
        glwe_dimension: u32,
        polynomial_size: u32,
        big_lwe_dimension: u32,
        small_lwe_dimension: u32,
        ks_level: u32,
        ks_base_log: u32,
        pbs_level: u32,
        pbs_base_log: u32,
        grouping_factor: u32,
        num_inputs: u32,
        num_blocks: u32,
        num_blocks_index: u32,
        message_modulus: u32,
        carry_modulus: u32,
        pbs_type: PBS_TYPE,
        allocate_gpu_memory: bool,
        noise_reduction_type: PBS_MS_REDUCTION_T,
    ) -> u64;
}
unsafe extern "C" {
    pub fn cuda_unchecked_first_index_of_clear_64(
        streams: CudaStreamsFFI,
        index_ct: *mut CudaRadixCiphertextFFI,
        match_ct: *mut CudaRadixCiphertextFFI,
        inputs: *const CudaRadixCiphertextFFI,
        h_clear_val: *const u64,
        num_inputs: u32,
        num_blocks: u32,
        num_blocks_index: u32,
        mem: *mut i8,
        bsks: *const *mut ffi::c_void,
        ksks: *const *mut ffi::c_void,
    );
}
unsafe extern "C" {
    pub fn cleanup_cuda_unchecked_first_index_of_clear_64(
        streams: CudaStreamsFFI,
        mem_ptr_void: *mut *mut i8,
    );
}
// --- first index of an encrypted needle among encrypted inputs ---
unsafe extern "C" {
    pub fn scratch_cuda_unchecked_first_index_of_64(
        streams: CudaStreamsFFI,
        mem_ptr: *mut *mut i8,
        glwe_dimension: u32,
        polynomial_size: u32,
        big_lwe_dimension: u32,
        small_lwe_dimension: u32,
        ks_level: u32,
        ks_base_log: u32,
        pbs_level: u32,
        pbs_base_log: u32,
        grouping_factor: u32,
        num_inputs: u32,
        num_blocks: u32,
        num_blocks_index: u32,
        message_modulus: u32,
        carry_modulus: u32,
        pbs_type: PBS_TYPE,
        allocate_gpu_memory: bool,
        noise_reduction_type: PBS_MS_REDUCTION_T,
    ) -> u64;
}
unsafe extern "C" {
    pub fn cuda_unchecked_first_index_of_64(
        streams: CudaStreamsFFI,
        index_ct: *mut CudaRadixCiphertextFFI,
        match_ct: *mut CudaRadixCiphertextFFI,
        inputs: *const CudaRadixCiphertextFFI,
        value: *const CudaRadixCiphertextFFI,
        num_inputs: u32,
        num_blocks: u32,
        num_blocks_index: u32,
        mem: *mut i8,
        bsks: *const *mut ffi::c_void,
        ksks: *const *mut ffi::c_void,
    );
}
unsafe extern "C" {
    pub fn cleanup_cuda_unchecked_first_index_of_64(
        streams: CudaStreamsFFI,
        mem_ptr_void: *mut *mut i8,
    );
}
// --- index of an encrypted needle (same signature as first_index_of) ---
unsafe extern "C" {
    pub fn scratch_cuda_unchecked_index_of_64(
        streams: CudaStreamsFFI,
        mem_ptr: *mut *mut i8,
        glwe_dimension: u32,
        polynomial_size: u32,
        big_lwe_dimension: u32,
        small_lwe_dimension: u32,
        ks_level: u32,
        ks_base_log: u32,
        pbs_level: u32,
        pbs_base_log: u32,
        grouping_factor: u32,
        num_inputs: u32,
        num_blocks: u32,
        num_blocks_index: u32,
        message_modulus: u32,
        carry_modulus: u32,
        pbs_type: PBS_TYPE,
        allocate_gpu_memory: bool,
        noise_reduction_type: PBS_MS_REDUCTION_T,
    ) -> u64;
}
unsafe extern "C" {
    pub fn cuda_unchecked_index_of_64(
        streams: CudaStreamsFFI,
        index_ct: *mut CudaRadixCiphertextFFI,
        match_ct: *mut CudaRadixCiphertextFFI,
        inputs: *const CudaRadixCiphertextFFI,
        value: *const CudaRadixCiphertextFFI,
        num_inputs: u32,
        num_blocks: u32,
        num_blocks_index: u32,
        mem: *mut i8,
        bsks: *const *mut ffi::c_void,
        ksks: *const *mut ffi::c_void,
    );
}
unsafe extern "C" {
    pub fn cleanup_cuda_unchecked_index_of_64(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
}
unsafe extern "C" {
pub fn scratch_cuda_integer_compress_radix_ciphertext_64(
streams: CudaStreamsFFI,

File diff suppressed because it is too large Load Diff

View File

@@ -18,7 +18,7 @@ use crate::integer::gpu::server_key::CudaBootstrappingKey;
use crate::integer::gpu::{
cuda_backend_apply_bivariate_lut, cuda_backend_apply_many_univariate_lut,
cuda_backend_apply_univariate_lut, cuda_backend_cast_to_unsigned,
cuda_backend_compute_prefix_sum_hillis_steele, cuda_backend_extend_radix_with_sign_msb,
cuda_backend_extend_radix_with_sign_msb,
cuda_backend_extend_radix_with_trivial_zero_blocks_msb, cuda_backend_full_propagate_assign,
cuda_backend_noise_squashing, cuda_backend_propagate_single_carry_assign,
cuda_backend_trim_radix_blocks_lsb, cuda_backend_trim_radix_blocks_msb, CudaServerKey, PBSType,
@@ -1094,134 +1094,6 @@ impl CudaServerKey {
ciphertexts
}
/// Applies the bivariate lookup table `lut` as a Hillis-Steele prefix sum over
/// the blocks of `generates_or_propagates` selected by `block_range`, writing
/// the scanned result into the same block range of `output` on the GPU.
///
/// The output must have exactly block_range.len() blocks
/// (both ciphertexts must also share the same LWE dimension — asserted below).
/// `generates_or_propagates` is passed mutably because the backend call takes
/// its device slice and per-block metadata as in/out parameters.
pub(crate) fn compute_prefix_sum_hillis_steele(
    &self,
    output: &mut CudaRadixCiphertext,
    generates_or_propagates: &mut CudaRadixCiphertext,
    lut: &BivariateLookupTableOwned,
    block_range: std::ops::Range<usize>,
    streams: &CudaStreams,
) {
    // Nothing to scan over an empty range.
    if block_range.is_empty() {
        return;
    }
    assert_eq!(
        generates_or_propagates.d_blocks.lwe_dimension(),
        output.d_blocks.lwe_dimension()
    );
    let lwe_dimension = generates_or_propagates.d_blocks.lwe_dimension();
    // Number of u64 words per LWE ciphertext block; used to convert a block
    // range into a flat device-vector element range.
    let lwe_size = lwe_dimension.to_lwe_size().0;
    let num_blocks = block_range.len();
    let mut generates_or_propagates_slice = generates_or_propagates
        .d_blocks
        .0
        .d_vec
        .as_mut_slice(lwe_size * block_range.start..lwe_size * block_range.end, 0)
        .unwrap();
    // The FFI layer exchanges per-block degree/noise metadata through plain
    // host vectors; gather the current values for the selected blocks.
    let mut generates_or_propagates_degrees = vec![0; num_blocks];
    let mut generates_or_propagates_noise_levels = vec![0; num_blocks];
    for (i, block_index) in (block_range.clone()).enumerate() {
        generates_or_propagates_degrees[i] =
            generates_or_propagates.info.blocks[block_index].degree.0;
        generates_or_propagates_noise_levels[i] = generates_or_propagates.info.blocks
            [block_index]
            .noise_level
            .0;
    }
    let mut output_slice = output
        .d_blocks
        .0
        .d_vec
        .as_mut_slice(lwe_size * block_range.start..lwe_size * block_range.end, 0)
        .unwrap();
    // Output metadata is filled in by the backend call below.
    let mut output_degrees = vec![0_u64; num_blocks];
    let mut output_noise_levels = vec![0_u64; num_blocks];
    // SAFETY: FFI into the CUDA backend; the slices above stay alive for the
    // duration of the call. Argument order must match the backend signature.
    unsafe {
        match &self.bootstrapping_key {
            CudaBootstrappingKey::Classic(d_bsk) => {
                cuda_backend_compute_prefix_sum_hillis_steele(
                    streams,
                    &mut output_slice,
                    &mut output_degrees,
                    &mut output_noise_levels,
                    &mut generates_or_propagates_slice,
                    &mut generates_or_propagates_degrees,
                    &mut generates_or_propagates_noise_levels,
                    lut.acc.acc.as_ref(),
                    lut.acc.degree.0,
                    &d_bsk.d_vec,
                    &self.key_switching_key.d_vec,
                    self.key_switching_key
                        .output_key_lwe_size()
                        .to_lwe_dimension(),
                    d_bsk.glwe_dimension,
                    d_bsk.polynomial_size,
                    self.key_switching_key.decomposition_level_count(),
                    self.key_switching_key.decomposition_base_log(),
                    d_bsk.decomp_level_count,
                    d_bsk.decomp_base_log,
                    num_blocks as u32,
                    self.message_modulus,
                    self.carry_modulus,
                    PBSType::Classical,
                    // grouping factor is unused for the classical PBS path
                    LweBskGroupingFactor(0),
                    d_bsk.ms_noise_reduction_configuration.as_ref(),
                );
            }
            CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
                cuda_backend_compute_prefix_sum_hillis_steele(
                    streams,
                    &mut output_slice,
                    &mut output_degrees,
                    &mut output_noise_levels,
                    &mut generates_or_propagates_slice,
                    &mut generates_or_propagates_degrees,
                    &mut generates_or_propagates_noise_levels,
                    lut.acc.acc.as_ref(),
                    lut.acc.degree.0,
                    &d_multibit_bsk.d_vec,
                    &self.key_switching_key.d_vec,
                    self.key_switching_key
                        .output_key_lwe_size()
                        .to_lwe_dimension(),
                    d_multibit_bsk.glwe_dimension,
                    d_multibit_bsk.polynomial_size,
                    self.key_switching_key.decomposition_level_count(),
                    self.key_switching_key.decomposition_base_log(),
                    d_multibit_bsk.decomp_level_count,
                    d_multibit_bsk.decomp_base_log,
                    num_blocks as u32,
                    self.message_modulus,
                    self.carry_modulus,
                    PBSType::MultiBit,
                    d_multibit_bsk.grouping_factor,
                    // no modulus-switch noise-reduction config on the multi-bit path
                    None,
                );
            }
        }
    }
    // Propagate the metadata the backend reported back into both ciphertexts'
    // block infos for the blocks that were touched.
    for (i, info) in output.info.blocks[block_range.start..block_range.end]
        .iter_mut()
        .enumerate()
    {
        info.degree = Degree(output_degrees[i]);
        info.noise_level = NoiseLevel(output_noise_levels[i]);
    }
    for (i, info) in generates_or_propagates.info.blocks[block_range.start..block_range.end]
        .iter_mut()
        .enumerate()
    {
        info.degree = Degree(generates_or_propagates_degrees[i]);
        info.noise_level = NoiseLevel(generates_or_propagates_noise_levels[i]);
    }
}
pub(crate) fn extend_radix_with_sign_msb<T: CudaIntegerRadixCiphertext>(
&self,
ct: &T,

File diff suppressed because it is too large Load Diff