diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h index 65b2c8f9e..ca5503a96 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h @@ -491,23 +491,6 @@ void cuda_integer_div_rem_radix_ciphertext_64( void cleanup_cuda_integer_div_rem(CudaStreamsFFI streams, int8_t **mem_ptr_void); -uint64_t scratch_cuda_integer_compute_prefix_sum_hillis_steele_64( - CudaStreamsFFI streams, int8_t **mem_ptr, void const *input_lut, - uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level, - uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks, - uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, - uint64_t lut_degree, bool allocate_gpu_memory, - PBS_MS_REDUCTION_T noise_reduction_type); - -void cuda_integer_compute_prefix_sum_hillis_steele_64( - CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe, - CudaRadixCiphertextFFI *generates_or_propagates, int8_t *mem_ptr, - void *const *ksks, void *const *bsks, uint32_t num_blocks); - -void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64( - CudaStreamsFFI streams, int8_t **mem_ptr_void); - void cuda_integer_reverse_blocks_64_inplace(CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array); @@ -781,60 +764,6 @@ void cuda_integer_ilog2_64( void cleanup_cuda_integer_ilog2_64(CudaStreamsFFI streams, int8_t **mem_ptr_void); -uint64_t scratch_cuda_compute_equality_selectors_64( - CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t big_lwe_dimension, - uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, - uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, - uint32_t num_possible_values, uint32_t num_blocks, uint32_t message_modulus, - uint32_t carry_modulus, 
PBS_TYPE pbs_type, bool allocate_gpu_memory, - PBS_MS_REDUCTION_T noise_reduction_type); - -void cuda_compute_equality_selectors_64( - CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out_list, - CudaRadixCiphertextFFI const *lwe_array_in, uint32_t num_blocks, - const uint64_t *h_decomposed_cleartexts, int8_t *mem, void *const *bsks, - void *const *ksks); - -void cleanup_cuda_compute_equality_selectors_64(CudaStreamsFFI streams, - int8_t **mem_ptr_void); - -uint64_t scratch_cuda_create_possible_results_64( - CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t big_lwe_dimension, - uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, - uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, - uint32_t num_possible_values, uint32_t num_blocks, uint32_t message_modulus, - uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory, - PBS_MS_REDUCTION_T noise_reduction_type); - -void cuda_create_possible_results_64( - CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out_list, - CudaRadixCiphertextFFI const *lwe_array_in_list, - uint32_t num_possible_values, const uint64_t *h_decomposed_cleartexts, - uint32_t num_blocks, int8_t *mem, void *const *bsks, void *const *ksks); - -void cleanup_cuda_create_possible_results_64(CudaStreamsFFI streams, - int8_t **mem_ptr_void); - -uint64_t scratch_cuda_aggregate_one_hot_vector_64( - CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t big_lwe_dimension, - uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, - uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, - uint32_t num_blocks, uint32_t num_matches, uint32_t message_modulus, - uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory, - PBS_MS_REDUCTION_T noise_reduction_type); - -void cuda_aggregate_one_hot_vector_64( - CudaStreamsFFI streams, CudaRadixCiphertextFFI 
*lwe_array_out, - CudaRadixCiphertextFFI const *lwe_array_in_list, - uint32_t num_input_ciphertexts, uint32_t num_blocks, int8_t *mem, - void *const *bsks, void *const *ksks); - -void cleanup_cuda_aggregate_one_hot_vector_64(CudaStreamsFFI streams, - int8_t **mem_ptr_void); - uint64_t scratch_cuda_unchecked_match_value_64( CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t big_lwe_dimension, @@ -894,6 +823,185 @@ void cuda_unchecked_match_value_or_64( void cleanup_cuda_unchecked_match_value_or_64(CudaStreamsFFI streams, int8_t **mem_ptr_void); + +uint64_t scratch_cuda_unchecked_contains_64( + CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t big_lwe_dimension, + uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, + uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, + uint32_t num_inputs, uint32_t num_blocks, uint32_t message_modulus, + uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory, + PBS_MS_REDUCTION_T noise_reduction_type); + +void cuda_unchecked_contains_64(CudaStreamsFFI streams, + CudaRadixCiphertextFFI *output, + CudaRadixCiphertextFFI const *inputs, + CudaRadixCiphertextFFI const *value, + uint32_t num_inputs, uint32_t num_blocks, + int8_t *mem, void *const *bsks, + void *const *ksks); + +void cleanup_cuda_unchecked_contains_64(CudaStreamsFFI streams, + int8_t **mem_ptr_void); + +uint64_t scratch_cuda_unchecked_contains_clear_64( + CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t big_lwe_dimension, + uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, + uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, + uint32_t num_inputs, uint32_t num_blocks, uint32_t message_modulus, + uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory, + PBS_MS_REDUCTION_T noise_reduction_type); + +void 
cuda_unchecked_contains_clear_64(CudaStreamsFFI streams, + CudaRadixCiphertextFFI *output, + CudaRadixCiphertextFFI const *inputs, + const uint64_t *h_clear_val, + uint32_t num_inputs, uint32_t num_blocks, + int8_t *mem, void *const *bsks, + void *const *ksks); + +void cleanup_cuda_unchecked_contains_clear_64(CudaStreamsFFI streams, + int8_t **mem_ptr_void); + +uint64_t scratch_cuda_unchecked_is_in_clears_64( + CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t big_lwe_dimension, + uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, + uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, + uint32_t num_clears, uint32_t num_blocks, uint32_t message_modulus, + uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory, + PBS_MS_REDUCTION_T noise_reduction_type); + +void cuda_unchecked_is_in_clears_64(CudaStreamsFFI streams, + CudaRadixCiphertextFFI *output, + CudaRadixCiphertextFFI const *input, + const uint64_t *h_cleartexts, + uint32_t num_clears, uint32_t num_blocks, + int8_t *mem, void *const *bsks, + void *const *ksks); + +void cleanup_cuda_unchecked_is_in_clears_64(CudaStreamsFFI streams, + int8_t **mem_ptr_void); + +uint64_t scratch_cuda_compute_final_index_from_selectors_64( + CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t big_lwe_dimension, + uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, + uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, + uint32_t num_inputs, uint32_t num_blocks_index, uint32_t message_modulus, + uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory, + PBS_MS_REDUCTION_T noise_reduction_type); + +void cuda_compute_final_index_from_selectors_64( + CudaStreamsFFI streams, CudaRadixCiphertextFFI *index_ct, + CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *selectors, + uint32_t num_inputs, uint32_t num_blocks_index, 
int8_t *mem, + void *const *bsks, void *const *ksks); + +void cleanup_cuda_compute_final_index_from_selectors_64(CudaStreamsFFI streams, + int8_t **mem_ptr_void); + +uint64_t scratch_cuda_unchecked_index_in_clears_64( + CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t big_lwe_dimension, + uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, + uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, + uint32_t num_clears, uint32_t num_blocks, uint32_t num_blocks_index, + uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, + bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type); + +void cuda_unchecked_index_in_clears_64(CudaStreamsFFI streams, + CudaRadixCiphertextFFI *index_ct, + CudaRadixCiphertextFFI *match_ct, + CudaRadixCiphertextFFI const *input, + const uint64_t *h_cleartexts, + uint32_t num_clears, uint32_t num_blocks, + uint32_t num_blocks_index, int8_t *mem, + void *const *bsks, void *const *ksks); + +void cleanup_cuda_unchecked_index_in_clears_64(CudaStreamsFFI streams, + int8_t **mem_ptr_void); + +uint64_t scratch_cuda_unchecked_first_index_in_clears_64( + CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t big_lwe_dimension, + uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, + uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, + uint32_t num_unique, uint32_t num_blocks, uint32_t num_blocks_index, + uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, + bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type); + +void cuda_unchecked_first_index_in_clears_64( + CudaStreamsFFI streams, CudaRadixCiphertextFFI *index_ct, + CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *input, + const uint64_t *h_unique_values, const uint64_t *h_unique_indices, + uint32_t num_unique, uint32_t num_blocks, uint32_t num_blocks_index, 
+ int8_t *mem, void *const *bsks, void *const *ksks); + +void cleanup_cuda_unchecked_first_index_in_clears_64(CudaStreamsFFI streams, + int8_t **mem_ptr_void); + +uint64_t scratch_cuda_unchecked_first_index_of_clear_64( + CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t big_lwe_dimension, + uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, + uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, + uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index, + uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, + bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type); + +void cuda_unchecked_first_index_of_clear_64( + CudaStreamsFFI streams, CudaRadixCiphertextFFI *index_ct, + CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *inputs, + const uint64_t *h_clear_val, uint32_t num_inputs, uint32_t num_blocks, + uint32_t num_blocks_index, int8_t *mem, void *const *bsks, + void *const *ksks); + +void cleanup_cuda_unchecked_first_index_of_clear_64(CudaStreamsFFI streams, + int8_t **mem_ptr_void); + +uint64_t scratch_cuda_unchecked_first_index_of_64( + CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t big_lwe_dimension, + uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, + uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, + uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index, + uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, + bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type); + +void cuda_unchecked_first_index_of_64(CudaStreamsFFI streams, + CudaRadixCiphertextFFI *index_ct, + CudaRadixCiphertextFFI *match_ct, + CudaRadixCiphertextFFI const *inputs, + CudaRadixCiphertextFFI const *value, + uint32_t num_inputs, uint32_t num_blocks, + uint32_t num_blocks_index, int8_t *mem, + void *const 
*bsks, void *const *ksks); + +void cleanup_cuda_unchecked_first_index_of_64(CudaStreamsFFI streams, + int8_t **mem_ptr_void); + +uint64_t scratch_cuda_unchecked_index_of_64( + CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t big_lwe_dimension, + uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, + uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, + uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index, + uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, + bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type); + +void cuda_unchecked_index_of_64(CudaStreamsFFI streams, + CudaRadixCiphertextFFI *index_ct, + CudaRadixCiphertextFFI *match_ct, + CudaRadixCiphertextFFI const *inputs, + CudaRadixCiphertextFFI const *value, + uint32_t num_inputs, uint32_t num_blocks, + uint32_t num_blocks_index, int8_t *mem, + void *const *bsks, void *const *ksks); + +void cleanup_cuda_unchecked_index_of_64(CudaStreamsFFI streams, + int8_t **mem_ptr_void); } // extern C #endif // CUDA_INTEGER_H diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/vector_find.h b/backends/tfhe-cuda-backend/cuda/include/integer/vector_find.h index 2fc65a3c5..55bc89be7 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer/vector_find.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer/vector_find.h @@ -18,13 +18,13 @@ template struct int_equality_selectors_buffer { CudaRadixCiphertextFFI *tmp_many_luts_output; CudaStreams active_streams; - std::vector sub_streams_vec; + CudaStreams *sub_streams; cudaEvent_t incoming_event; - std::vector> outgoing_events; + cudaEvent_t *outgoing_events; uint32_t num_streams; - std::vector tmp_block_comparisons_vec; - std::vector *> reduction_buffers; + CudaRadixCiphertextFFI **tmp_block_comparisons; + int_comparison_buffer **reduction_buffers; int_equality_selectors_buffer(CudaStreams streams, int_radix_params params, 
uint32_t num_possible_values, @@ -42,17 +42,18 @@ template struct int_equality_selectors_buffer { this->num_streams = num_streams_to_use; this->active_streams = streams.active_gpu_subset(num_blocks); + uint32_t num_gpus = active_streams.count(); incoming_event = cuda_create_event(streams.gpu_index(0)); - sub_streams_vec.resize(num_streams_to_use); - outgoing_events.resize(num_streams_to_use); + sub_streams = new CudaStreams[num_streams_to_use]; + outgoing_events = new cudaEvent_t[num_streams_to_use * num_gpus]; for (uint32_t i = 0; i < num_streams_to_use; i++) { - sub_streams_vec[i].create_on_same_gpus(active_streams); - outgoing_events[i].resize(active_streams.count()); - for (uint32_t j = 0; j < active_streams.count(); j++) { - outgoing_events[i][j] = cuda_create_event(active_streams.gpu_index(j)); + sub_streams[i].create_on_same_gpus(active_streams); + for (uint32_t j = 0; j < num_gpus; j++) { + outgoing_events[i * num_gpus + j] = + cuda_create_event(active_streams.gpu_index(j)); } } @@ -88,17 +89,19 @@ template struct int_equality_selectors_buffer { params.message_modulus * num_blocks, params.big_lwe_dimension, size_tracker, allocate_gpu_memory); - this->tmp_block_comparisons_vec.resize(this->num_streams); - this->reduction_buffers.resize(this->num_streams); + this->tmp_block_comparisons = + new CudaRadixCiphertextFFI *[this->num_streams]; + this->reduction_buffers = + new int_comparison_buffer *[this->num_streams]; for (uint32_t j = 0; j < this->num_streams; j++) { - this->tmp_block_comparisons_vec[j] = new CudaRadixCiphertextFFI; + this->tmp_block_comparisons[j] = new CudaRadixCiphertextFFI; create_zero_radix_ciphertext_async( streams.stream(0), streams.gpu_index(0), - this->tmp_block_comparisons_vec[j], num_blocks, - params.big_lwe_dimension, size_tracker, allocate_gpu_memory); + this->tmp_block_comparisons[j], num_blocks, params.big_lwe_dimension, + size_tracker, allocate_gpu_memory); this->reduction_buffers[j] = new int_comparison_buffer( - 
sub_streams_vec[j], COMPARISON_TYPE::EQ, params, num_blocks, false, + sub_streams[j], COMPARISON_TYPE::EQ, params, num_blocks, false, allocate_gpu_memory, size_tracker); } } @@ -112,33 +115,37 @@ template struct int_equality_selectors_buffer { this->allocate_gpu_memory); delete this->tmp_many_luts_output; - for (auto ct : this->tmp_block_comparisons_vec) { + for (uint32_t i = 0; i < this->num_streams; i++) { release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0), - ct, this->allocate_gpu_memory); - delete ct; + this->tmp_block_comparisons[i], + this->allocate_gpu_memory); + delete this->tmp_block_comparisons[i]; } - this->tmp_block_comparisons_vec.clear(); + delete[] this->tmp_block_comparisons; - for (auto buffer : this->reduction_buffers) { - buffer->release(streams); - delete buffer; + for (uint32_t i = 0; i < this->num_streams; i++) { + this->reduction_buffers[i]->release(streams); + delete this->reduction_buffers[i]; } - this->reduction_buffers.clear(); + delete[] this->reduction_buffers; cuda_event_destroy(incoming_event, streams.gpu_index(0)); - for (uint j = 0; j < num_streams; j++) { - for (uint k = 0; k < active_streams.count(); k++) { - cuda_event_destroy(outgoing_events[j][k], active_streams.gpu_index(k)); + + uint32_t num_gpus = active_streams.count(); + for (uint32_t i = 0; i < num_streams; i++) { + for (uint32_t j = 0; j < num_gpus; j++) { + cuda_event_destroy(outgoing_events[i * num_gpus + j], + active_streams.gpu_index(j)); } } - outgoing_events.clear(); + delete[] outgoing_events; + + for (uint32_t i = 0; i < num_streams; i++) { + sub_streams[i].release(); + } + delete[] sub_streams; cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0)); - - for (auto &stream : sub_streams_vec) { - stream.release(); - } - sub_streams_vec.clear(); } }; @@ -151,15 +158,15 @@ template struct int_possible_results_buffer { uint32_t num_lut_accumulators; uint32_t lut_stride; - std::vector *> stream_luts_vec; + int_radix_lut **stream_luts; 
CudaStreams active_streams; - std::vector sub_streams_vec; + CudaStreams *sub_streams; cudaEvent_t incoming_event; - std::vector> outgoing_events; + cudaEvent_t *outgoing_events; uint32_t num_streams; - std::vector tmp_many_luts_output_vec; + CudaRadixCiphertextFFI **tmp_many_luts_output; int_possible_results_buffer(CudaStreams streams, int_radix_params params, uint32_t num_blocks, uint32_t num_possible_values, @@ -176,17 +183,18 @@ template struct int_possible_results_buffer { this->num_streams = num_streams_to_use; this->active_streams = streams.active_gpu_subset(num_blocks); + uint32_t num_gpus = active_streams.count(); incoming_event = cuda_create_event(streams.gpu_index(0)); - sub_streams_vec.resize(num_streams_to_use); - outgoing_events.resize(num_streams_to_use); + sub_streams = new CudaStreams[num_streams_to_use]; + outgoing_events = new cudaEvent_t[num_streams_to_use * num_gpus]; for (uint32_t i = 0; i < num_streams_to_use; i++) { - sub_streams_vec[i].create_on_same_gpus(active_streams); - outgoing_events[i].resize(active_streams.count()); - for (uint32_t j = 0; j < active_streams.count(); j++) { - outgoing_events[i][j] = cuda_create_event(active_streams.gpu_index(j)); + sub_streams[i].create_on_same_gpus(active_streams); + for (uint32_t j = 0; j < num_gpus; j++) { + outgoing_events[i * num_gpus + j] = + cuda_create_event(active_streams.gpu_index(j)); } } @@ -207,11 +215,13 @@ template struct int_possible_results_buffer { this->num_lut_accumulators = (total_luts_needed + max_luts_per_call - 1) / max_luts_per_call; - stream_luts_vec.reserve(num_streams * num_lut_accumulators); + stream_luts = + new int_radix_lut *[num_streams * num_lut_accumulators]; std::vector> fns; fns.reserve(max_luts_per_call); + uint32_t lut_count = 0; for (uint32_t s = 0; s < num_streams; s++) { uint32_t lut_value_start = 0; @@ -221,7 +231,7 @@ template struct int_possible_results_buffer { std::min(max_luts_per_call, total_luts_needed - lut_value_start); int_radix_lut *current_lut = 
new int_radix_lut( - sub_streams_vec[s], params, 1, 1, luts_in_this_call, + sub_streams[s], params, 1, 1, luts_in_this_call, allocate_gpu_memory, size_tracker); for (uint32_t j = 0; j < luts_in_this_call; j++) { @@ -236,51 +246,56 @@ template struct int_possible_results_buffer { params.message_modulus, params.carry_modulus, fns, allocate_gpu_memory); - current_lut->broadcast_lut(sub_streams_vec[s].active_gpu_subset(1)); - stream_luts_vec.push_back(current_lut); + current_lut->broadcast_lut(sub_streams[s].active_gpu_subset(1)); + stream_luts[lut_count++] = current_lut; lut_value_start += luts_in_this_call; } } fns.clear(); - this->tmp_many_luts_output_vec.resize(this->num_streams); + this->tmp_many_luts_output = + new CudaRadixCiphertextFFI *[this->num_streams]; for (uint32_t j = 0; j < this->num_streams; j++) { - this->tmp_many_luts_output_vec[j] = new CudaRadixCiphertextFFI; + this->tmp_many_luts_output[j] = new CudaRadixCiphertextFFI; create_zero_radix_ciphertext_async( streams.stream(0), streams.gpu_index(0), - this->tmp_many_luts_output_vec[j], max_luts_per_call, + this->tmp_many_luts_output[j], max_luts_per_call, params.big_lwe_dimension, size_tracker, allocate_gpu_memory); } } void release(CudaStreams streams) { - for (auto lut : stream_luts_vec) { - lut->release(streams); - delete lut; + for (uint32_t i = 0; i < num_streams * num_lut_accumulators; i++) { + stream_luts[i]->release(streams); + delete stream_luts[i]; } - stream_luts_vec.clear(); + delete[] stream_luts; - for (auto ct : this->tmp_many_luts_output_vec) { + for (uint32_t i = 0; i < this->num_streams; i++) { release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0), - ct, this->allocate_gpu_memory); - delete ct; + this->tmp_many_luts_output[i], + this->allocate_gpu_memory); + delete this->tmp_many_luts_output[i]; } - this->tmp_many_luts_output_vec.clear(); + delete[] this->tmp_many_luts_output; cuda_event_destroy(incoming_event, streams.gpu_index(0)); + + uint32_t num_gpus = 
active_streams.count(); for (uint j = 0; j < num_streams; j++) { - for (uint k = 0; k < active_streams.count(); k++) { - cuda_event_destroy(outgoing_events[j][k], active_streams.gpu_index(k)); + for (uint k = 0; k < num_gpus; k++) { + cuda_event_destroy(outgoing_events[j * num_gpus + k], + active_streams.gpu_index(k)); } } - outgoing_events.clear(); + delete[] outgoing_events; + + for (uint32_t i = 0; i < num_streams; i++) { + sub_streams[i].release(); + } + delete[] sub_streams; cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0)); - - for (auto &stream : sub_streams_vec) { - stream.release(); - } - sub_streams_vec.clear(); } }; @@ -289,23 +304,23 @@ template struct int_aggregate_one_hot_buffer { bool allocate_gpu_memory; uint32_t chunk_size; - std::vector *> stream_identity_luts; + int_radix_lut **stream_identity_luts; int_radix_lut *message_extract_lut; int_radix_lut *carry_extract_lut; CudaStreams active_streams; - std::vector sub_streams_vec; + CudaStreams *sub_streams; cudaEvent_t incoming_event; - std::vector> outgoing_events; + cudaEvent_t *outgoing_events; cudaEvent_t reduction_done_event; - std::vector message_done_events; - std::vector carry_done_events; + cudaEvent_t *message_done_events; + cudaEvent_t *carry_done_events; uint32_t num_streams; - std::vector partial_aggregated_vectors; - std::vector partial_temp_vectors; + CudaRadixCiphertextFFI **partial_aggregated_vectors; + CudaRadixCiphertextFFI **partial_temp_vectors; CudaRadixCiphertextFFI *message_ct; CudaRadixCiphertextFFI *carry_ct; @@ -327,37 +342,37 @@ template struct int_aggregate_one_hot_buffer { this->num_streams = num_streams_to_use; this->active_streams = streams.active_gpu_subset(num_blocks); + uint32_t num_gpus = active_streams.count(); this->incoming_event = cuda_create_event(streams.gpu_index(0)); this->reduction_done_event = cuda_create_event(streams.gpu_index(0)); - this->message_done_events.resize(active_streams.count()); - 
this->carry_done_events.resize(active_streams.count()); - for (uint32_t i = 0; i < active_streams.count(); i++) { + this->message_done_events = new cudaEvent_t[num_gpus]; + this->carry_done_events = new cudaEvent_t[num_gpus]; + for (uint32_t i = 0; i < num_gpus; i++) { this->message_done_events[i] = cuda_create_event(active_streams.gpu_index(i)); this->carry_done_events[i] = cuda_create_event(active_streams.gpu_index(i)); } - this->sub_streams_vec.resize(num_streams); - this->outgoing_events.resize(num_streams); + this->sub_streams = new CudaStreams[num_streams]; + this->outgoing_events = new cudaEvent_t[num_streams * num_gpus]; for (uint32_t i = 0; i < num_streams; i++) { - this->sub_streams_vec[i].create_on_same_gpus(active_streams); - this->outgoing_events[i].resize(active_streams.count()); - for (uint32_t j = 0; j < active_streams.count(); j++) { - this->outgoing_events[i][j] = + this->sub_streams[i].create_on_same_gpus(active_streams); + for (uint32_t j = 0; j < num_gpus; j++) { + this->outgoing_events[i * num_gpus + j] = cuda_create_event(active_streams.gpu_index(j)); } } - this->stream_identity_luts.reserve(num_streams); + this->stream_identity_luts = new int_radix_lut *[num_streams]; std::function id_fn = [](Torus x) -> Torus { return x; }; for (uint32_t i = 0; i < num_streams; i++) { int_radix_lut *lut = - new int_radix_lut(sub_streams_vec[i], params, 1, num_blocks, + new int_radix_lut(sub_streams[i], params, 1, num_blocks, allocate_gpu_memory, size_tracker); generate_device_accumulator( @@ -366,8 +381,8 @@ template struct int_aggregate_one_hot_buffer { params.polynomial_size, params.message_modulus, params.carry_modulus, id_fn, allocate_gpu_memory); - lut->broadcast_lut(sub_streams_vec[i].active_gpu_subset(num_blocks)); - this->stream_identity_luts.push_back(lut); + lut->broadcast_lut(sub_streams[i].active_gpu_subset(num_blocks)); + this->stream_identity_luts[i] = lut; } std::function msg_fn = [params](Torus x) -> Torus { @@ -378,7 +393,7 @@ template 
struct int_aggregate_one_hot_buffer { }; this->message_extract_lut = - new int_radix_lut(sub_streams_vec[0], params, 1, num_blocks, + new int_radix_lut(sub_streams[0], params, 1, num_blocks, allocate_gpu_memory, size_tracker); generate_device_accumulator( streams.stream(0), streams.gpu_index(0), @@ -388,10 +403,10 @@ template struct int_aggregate_one_hot_buffer { params.polynomial_size, params.message_modulus, params.carry_modulus, msg_fn, allocate_gpu_memory); this->message_extract_lut->broadcast_lut( - sub_streams_vec[0].active_gpu_subset(num_blocks)); + sub_streams[0].active_gpu_subset(num_blocks)); this->carry_extract_lut = - new int_radix_lut(sub_streams_vec[1], params, 1, num_blocks, + new int_radix_lut(sub_streams[1], params, 1, num_blocks, allocate_gpu_memory, size_tracker); generate_device_accumulator( streams.stream(0), streams.gpu_index(0), @@ -401,10 +416,11 @@ template struct int_aggregate_one_hot_buffer { params.polynomial_size, params.message_modulus, params.carry_modulus, carry_fn, allocate_gpu_memory); this->carry_extract_lut->broadcast_lut( - sub_streams_vec[1].active_gpu_subset(num_blocks)); + sub_streams[1].active_gpu_subset(num_blocks)); - this->partial_aggregated_vectors.resize(num_streams); - this->partial_temp_vectors.resize(num_streams); + this->partial_aggregated_vectors = + new CudaRadixCiphertextFFI *[num_streams]; + this->partial_temp_vectors = new CudaRadixCiphertextFFI *[num_streams]; for (uint32_t i = 0; i < num_streams; i++) { this->partial_aggregated_vectors[i] = new CudaRadixCiphertextFFI; @@ -433,11 +449,11 @@ template struct int_aggregate_one_hot_buffer { } void release(CudaStreams streams) { - for (auto lut : stream_identity_luts) { - lut->release(streams); - delete lut; + for (uint32_t i = 0; i < num_streams; i++) { + stream_identity_luts[i]->release(streams); + delete stream_identity_luts[i]; } - stream_identity_luts.clear(); + delete[] stream_identity_luts; this->message_extract_lut->release(streams); delete 
this->message_extract_lut; @@ -446,17 +462,17 @@ template struct int_aggregate_one_hot_buffer { for (uint32_t i = 0; i < num_streams; i++) { release_radix_ciphertext_async( - sub_streams_vec[i].stream(0), sub_streams_vec[i].gpu_index(0), + sub_streams[i].stream(0), sub_streams[i].gpu_index(0), this->partial_aggregated_vectors[i], this->allocate_gpu_memory); delete this->partial_aggregated_vectors[i]; release_radix_ciphertext_async( - sub_streams_vec[i].stream(0), sub_streams_vec[i].gpu_index(0), + sub_streams[i].stream(0), sub_streams[i].gpu_index(0), this->partial_temp_vectors[i], this->allocate_gpu_memory); delete this->partial_temp_vectors[i]; } - partial_aggregated_vectors.clear(); - partial_temp_vectors.clear(); + delete[] partial_aggregated_vectors; + delete[] partial_temp_vectors; release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0), this->message_ct, this->allocate_gpu_memory); @@ -467,24 +483,28 @@ template struct int_aggregate_one_hot_buffer { cuda_event_destroy(incoming_event, streams.gpu_index(0)); cuda_event_destroy(reduction_done_event, streams.gpu_index(0)); - for (uint i = 0; i < active_streams.count(); i++) { + uint32_t num_gpus = active_streams.count(); + for (uint i = 0; i < num_gpus; i++) { cuda_event_destroy(message_done_events[i], active_streams.gpu_index(i)); cuda_event_destroy(carry_done_events[i], active_streams.gpu_index(i)); } + delete[] message_done_events; + delete[] carry_done_events; for (uint j = 0; j < num_streams; j++) { - for (uint k = 0; k < active_streams.count(); k++) { - cuda_event_destroy(outgoing_events[j][k], active_streams.gpu_index(k)); + for (uint k = 0; k < num_gpus; k++) { + cuda_event_destroy(outgoing_events[j * num_gpus + k], + active_streams.gpu_index(k)); } } - outgoing_events.clear(); + delete[] outgoing_events; + + for (uint32_t i = 0; i < num_streams; i++) { + sub_streams[i].release(); + } + delete[] sub_streams; cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0)); - - for (auto 
&stream : sub_streams_vec) { - stream.release(); - } - sub_streams_vec.clear(); } }; @@ -683,3 +703,1028 @@ template struct int_unchecked_match_value_or_buffer { cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0)); } }; + +template struct int_unchecked_contains_buffer { + int_radix_params params; + bool allocate_gpu_memory; + uint32_t num_inputs; + + int_comparison_buffer **eq_buffers; + int_comparison_buffer *reduction_buffer; + + CudaRadixCiphertextFFI *packed_selectors; + + CudaStreams active_streams; + CudaStreams *sub_streams; + cudaEvent_t incoming_event; + cudaEvent_t *outgoing_events; + uint32_t num_streams; + + int_unchecked_contains_buffer(CudaStreams streams, int_radix_params params, + uint32_t num_inputs, uint32_t num_blocks, + bool allocate_gpu_memory, + uint64_t &size_tracker) { + this->params = params; + this->allocate_gpu_memory = allocate_gpu_memory; + this->num_inputs = num_inputs; + + uint32_t num_streams_to_use = + std::min((uint32_t)MAX_STREAMS_FOR_VECTOR_FIND, num_inputs); + if (num_streams_to_use == 0) + num_streams_to_use = 1; + + this->num_streams = num_streams_to_use; + this->active_streams = streams.active_gpu_subset(num_blocks); + uint32_t num_gpus = active_streams.count(); + + incoming_event = cuda_create_event(streams.gpu_index(0)); + sub_streams = new CudaStreams[num_streams_to_use]; + outgoing_events = new cudaEvent_t[num_streams_to_use * num_gpus]; + + for (uint32_t i = 0; i < num_streams_to_use; i++) { + sub_streams[i].create_on_same_gpus(active_streams); + for (uint32_t j = 0; j < num_gpus; j++) { + outgoing_events[i * num_gpus + j] = + cuda_create_event(active_streams.gpu_index(j)); + } + } + + this->eq_buffers = new int_comparison_buffer *[num_streams]; + for (uint32_t i = 0; i < num_streams; i++) { + this->eq_buffers[i] = new int_comparison_buffer( + sub_streams[i], EQ, params, num_blocks, false, allocate_gpu_memory, + size_tracker); + } + + this->reduction_buffer = + new int_comparison_buffer(streams, EQ, params, 
num_inputs, false, + allocate_gpu_memory, size_tracker); + + this->packed_selectors = new CudaRadixCiphertextFFI; + create_zero_radix_ciphertext_async( + streams.stream(0), streams.gpu_index(0), this->packed_selectors, + num_inputs, params.big_lwe_dimension, size_tracker, + allocate_gpu_memory); + } + + void release(CudaStreams streams) { + for (uint32_t i = 0; i < num_streams; i++) { + eq_buffers[i]->release(streams); + delete eq_buffers[i]; + } + delete[] eq_buffers; + + this->reduction_buffer->release(streams); + delete this->reduction_buffer; + + release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0), + this->packed_selectors, + this->allocate_gpu_memory); + delete this->packed_selectors; + + cuda_event_destroy(incoming_event, streams.gpu_index(0)); + + uint32_t num_gpus = active_streams.count(); + for (uint j = 0; j < num_streams; j++) { + for (uint k = 0; k < num_gpus; k++) { + cuda_event_destroy(outgoing_events[j * num_gpus + k], + active_streams.gpu_index(k)); + } + } + delete[] outgoing_events; + + for (uint32_t i = 0; i < num_streams; i++) { + sub_streams[i].release(); + } + delete[] sub_streams; + + cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0)); + } +}; + +template struct int_unchecked_contains_clear_buffer { + int_radix_params params; + bool allocate_gpu_memory; + uint32_t num_inputs; + + int_comparison_buffer **eq_buffers; + int_comparison_buffer *reduction_buffer; + + CudaRadixCiphertextFFI *packed_selectors; + CudaRadixCiphertextFFI *tmp_clear_val; + Torus *d_clear_val; + + CudaStreams active_streams; + CudaStreams *sub_streams; + cudaEvent_t incoming_event; + cudaEvent_t *outgoing_events; + uint32_t num_streams; + + int_unchecked_contains_clear_buffer(CudaStreams streams, + int_radix_params params, + uint32_t num_inputs, uint32_t num_blocks, + bool allocate_gpu_memory, + uint64_t &size_tracker) { + this->params = params; + this->allocate_gpu_memory = allocate_gpu_memory; + this->num_inputs = num_inputs; + + 
uint32_t num_streams_to_use = + std::min((uint32_t)MAX_STREAMS_FOR_VECTOR_FIND, num_inputs); + if (num_streams_to_use == 0) + num_streams_to_use = 1; + + this->num_streams = num_streams_to_use; + this->active_streams = streams.active_gpu_subset(num_blocks); + uint32_t num_gpus = active_streams.count(); + + incoming_event = cuda_create_event(streams.gpu_index(0)); + sub_streams = new CudaStreams[num_streams_to_use]; + outgoing_events = new cudaEvent_t[num_streams_to_use * num_gpus]; + + for (uint32_t i = 0; i < num_streams_to_use; i++) { + sub_streams[i].create_on_same_gpus(active_streams); + for (uint32_t j = 0; j < num_gpus; j++) { + outgoing_events[i * num_gpus + j] = + cuda_create_event(active_streams.gpu_index(j)); + } + } + + this->eq_buffers = new int_comparison_buffer *[num_streams]; + for (uint32_t i = 0; i < num_streams; i++) { + this->eq_buffers[i] = new int_comparison_buffer( + sub_streams[i], EQ, params, num_blocks, false, allocate_gpu_memory, + size_tracker); + } + + this->reduction_buffer = + new int_comparison_buffer(streams, EQ, params, num_inputs, false, + allocate_gpu_memory, size_tracker); + + this->packed_selectors = new CudaRadixCiphertextFFI; + create_zero_radix_ciphertext_async( + streams.stream(0), streams.gpu_index(0), this->packed_selectors, + num_inputs, params.big_lwe_dimension, size_tracker, + allocate_gpu_memory); + + this->tmp_clear_val = new CudaRadixCiphertextFFI; + create_zero_radix_ciphertext_async( + streams.stream(0), streams.gpu_index(0), this->tmp_clear_val, + num_blocks, params.big_lwe_dimension, size_tracker, + allocate_gpu_memory); + + this->d_clear_val = (Torus *)cuda_malloc_with_size_tracking_async( + num_blocks * sizeof(Torus), streams.stream(0), streams.gpu_index(0), + size_tracker, allocate_gpu_memory); + } + + void release(CudaStreams streams) { + for (uint32_t i = 0; i < num_streams; i++) { + eq_buffers[i]->release(streams); + delete eq_buffers[i]; + } + delete[] eq_buffers; + + 
this->reduction_buffer->release(streams); + delete this->reduction_buffer; + + release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0), + this->packed_selectors, + this->allocate_gpu_memory); + delete this->packed_selectors; + + release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0), + this->tmp_clear_val, + this->allocate_gpu_memory); + delete this->tmp_clear_val; + + cuda_drop_async(this->d_clear_val, streams.stream(0), streams.gpu_index(0)); + + cuda_event_destroy(incoming_event, streams.gpu_index(0)); + + uint32_t num_gpus = active_streams.count(); + for (uint j = 0; j < num_streams; j++) { + for (uint k = 0; k < num_gpus; k++) { + cuda_event_destroy(outgoing_events[j * num_gpus + k], + active_streams.gpu_index(k)); + } + } + delete[] outgoing_events; + + for (uint32_t i = 0; i < num_streams; i++) { + sub_streams[i].release(); + } + delete[] sub_streams; + + cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0)); + } +}; + +template struct int_unchecked_is_in_clears_buffer { + int_radix_params params; + bool allocate_gpu_memory; + uint32_t num_clears; + + int_equality_selectors_buffer *eq_buffer; + int_comparison_buffer *reduction_buffer; + + CudaRadixCiphertextFFI *packed_selectors; + CudaRadixCiphertextFFI *unpacked_selectors; + + int_unchecked_is_in_clears_buffer(CudaStreams streams, + int_radix_params params, + uint32_t num_clears, uint32_t num_blocks, + bool allocate_gpu_memory, + uint64_t &size_tracker) { + this->params = params; + this->allocate_gpu_memory = allocate_gpu_memory; + this->num_clears = num_clears; + + this->eq_buffer = new int_equality_selectors_buffer( + streams, params, num_clears, num_blocks, allocate_gpu_memory, + size_tracker); + + this->reduction_buffer = + new int_comparison_buffer(streams, EQ, params, num_clears, false, + allocate_gpu_memory, size_tracker); + + this->packed_selectors = new CudaRadixCiphertextFFI; + create_zero_radix_ciphertext_async( + streams.stream(0), streams.gpu_index(0), 
this->packed_selectors, + num_clears, params.big_lwe_dimension, size_tracker, + allocate_gpu_memory); + + this->unpacked_selectors = new CudaRadixCiphertextFFI[num_clears]; + + for (uint32_t i = 0; i < num_clears; i++) { + as_radix_ciphertext_slice(&this->unpacked_selectors[i], + this->packed_selectors, i, i + 1); + } + } + + void release(CudaStreams streams) { + this->eq_buffer->release(streams); + delete this->eq_buffer; + + this->reduction_buffer->release(streams); + delete this->reduction_buffer; + + release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0), + this->packed_selectors, + this->allocate_gpu_memory); + delete this->packed_selectors; + + delete[] this->unpacked_selectors; + + cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0)); + } +}; + +template struct int_final_index_from_selectors_buffer { + int_radix_params params; + bool allocate_gpu_memory; + uint32_t num_inputs; + + int_possible_results_buffer *possible_results_buf; + int_aggregate_one_hot_buffer *aggregate_buf; + int_comparison_buffer *reduction_buf; + + CudaRadixCiphertextFFI *packed_selectors; + CudaRadixCiphertextFFI *unpacked_selectors; + CudaRadixCiphertextFFI *possible_results_ct_list; + + uint64_t *h_indices; + + int_final_index_from_selectors_buffer(CudaStreams streams, + int_radix_params params, + uint32_t num_inputs, + uint32_t num_blocks_index, + bool allocate_gpu_memory, + uint64_t &size_tracker) { + this->params = params; + this->allocate_gpu_memory = allocate_gpu_memory; + this->num_inputs = num_inputs; + + uint32_t packed_len = (num_blocks_index + 1) / 2; + + this->possible_results_buf = new int_possible_results_buffer( + streams, params, packed_len, num_inputs, allocate_gpu_memory, + size_tracker); + + this->aggregate_buf = new int_aggregate_one_hot_buffer( + streams, params, packed_len, num_inputs, allocate_gpu_memory, + size_tracker); + + this->reduction_buf = + new int_comparison_buffer(streams, EQ, params, num_inputs, false, + 
allocate_gpu_memory, size_tracker); + + this->packed_selectors = new CudaRadixCiphertextFFI; + create_zero_radix_ciphertext_async( + streams.stream(0), streams.gpu_index(0), this->packed_selectors, + num_inputs, params.big_lwe_dimension, size_tracker, + allocate_gpu_memory); + + this->unpacked_selectors = new CudaRadixCiphertextFFI[num_inputs]; + for (uint32_t i = 0; i < num_inputs; i++) { + as_radix_ciphertext_slice(&this->unpacked_selectors[i], + this->packed_selectors, i, i + 1); + } + + this->possible_results_ct_list = new CudaRadixCiphertextFFI[num_inputs]; + for (uint32_t i = 0; i < num_inputs; i++) { + create_zero_radix_ciphertext_async( + streams.stream(0), streams.gpu_index(0), + &this->possible_results_ct_list[i], packed_len, + params.big_lwe_dimension, size_tracker, allocate_gpu_memory); + } + + uint32_t num_bits_in_message = log2_int(params.message_modulus); + uint32_t bits_per_packed_block = 2 * num_bits_in_message; + + h_indices = new uint64_t[num_inputs * packed_len]; + for (uint32_t i = 0; i < num_inputs; i++) { + uint64_t val = i; + for (uint32_t b = 0; b < packed_len; b++) { + uint64_t mask = (1ULL << bits_per_packed_block) - 1; + uint64_t block_val = (val >> (b * bits_per_packed_block)) & mask; + h_indices[i * packed_len + b] = block_val; + } + } + } + + void release(CudaStreams streams) { + this->possible_results_buf->release(streams); + delete this->possible_results_buf; + + this->aggregate_buf->release(streams); + delete this->aggregate_buf; + + this->reduction_buf->release(streams); + delete this->reduction_buf; + + release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0), + this->packed_selectors, + this->allocate_gpu_memory); + delete this->packed_selectors; + + delete[] this->unpacked_selectors; + + for (uint32_t i = 0; i < num_inputs; i++) { + release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0), + &this->possible_results_ct_list[i], + this->allocate_gpu_memory); + } + delete[] 
this->possible_results_ct_list; + + cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0)); + + delete[] h_indices; + } +}; + +template struct int_unchecked_index_in_clears_buffer { + int_radix_params params; + bool allocate_gpu_memory; + uint32_t num_clears; + + int_equality_selectors_buffer *eq_selectors_buf; + int_final_index_from_selectors_buffer *final_index_buf; + + int_unchecked_index_in_clears_buffer(CudaStreams streams, + int_radix_params params, + uint32_t num_clears, uint32_t num_blocks, + uint32_t num_blocks_index, + bool allocate_gpu_memory, + uint64_t &size_tracker) { + this->params = params; + this->allocate_gpu_memory = allocate_gpu_memory; + this->num_clears = num_clears; + + this->eq_selectors_buf = new int_equality_selectors_buffer( + streams, params, num_clears, num_blocks, allocate_gpu_memory, + size_tracker); + + this->final_index_buf = new int_final_index_from_selectors_buffer( + streams, params, num_clears, num_blocks_index, allocate_gpu_memory, + size_tracker); + } + + void release(CudaStreams streams) { + this->eq_selectors_buf->release(streams); + delete this->eq_selectors_buf; + + this->final_index_buf->release(streams); + delete this->final_index_buf; + + cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0)); + } +}; + +template struct int_unchecked_first_index_in_clears_buffer { + int_radix_params params; + bool allocate_gpu_memory; + uint32_t num_unique; + + int_equality_selectors_buffer *eq_selectors_buf; + int_possible_results_buffer *possible_results_buf; + int_aggregate_one_hot_buffer *aggregate_buf; + int_comparison_buffer *reduction_buf; + + CudaRadixCiphertextFFI *packed_selectors; + CudaRadixCiphertextFFI *unpacked_selectors; + CudaRadixCiphertextFFI *possible_results_ct_list; + + int_unchecked_first_index_in_clears_buffer( + CudaStreams streams, int_radix_params params, uint32_t num_unique, + uint32_t num_blocks, uint32_t num_blocks_index, bool allocate_gpu_memory, + uint64_t &size_tracker) { + 
this->params = params; + this->allocate_gpu_memory = allocate_gpu_memory; + this->num_unique = num_unique; + + this->eq_selectors_buf = new int_equality_selectors_buffer( + streams, params, num_unique, num_blocks, allocate_gpu_memory, + size_tracker); + + uint32_t packed_len = (num_blocks_index + 1) / 2; + this->possible_results_buf = new int_possible_results_buffer( + streams, params, packed_len, num_unique, allocate_gpu_memory, + size_tracker); + + this->aggregate_buf = new int_aggregate_one_hot_buffer( + streams, params, packed_len, num_unique, allocate_gpu_memory, + size_tracker); + + this->reduction_buf = + new int_comparison_buffer(streams, EQ, params, num_unique, false, + allocate_gpu_memory, size_tracker); + + this->packed_selectors = new CudaRadixCiphertextFFI; + create_zero_radix_ciphertext_async( + streams.stream(0), streams.gpu_index(0), this->packed_selectors, + num_unique, params.big_lwe_dimension, size_tracker, + allocate_gpu_memory); + + this->unpacked_selectors = new CudaRadixCiphertextFFI[num_unique]; + for (uint32_t i = 0; i < num_unique; i++) { + as_radix_ciphertext_slice(&this->unpacked_selectors[i], + this->packed_selectors, i, i + 1); + } + + this->possible_results_ct_list = new CudaRadixCiphertextFFI[num_unique]; + for (uint32_t i = 0; i < num_unique; i++) { + create_zero_radix_ciphertext_async( + streams.stream(0), streams.gpu_index(0), + &this->possible_results_ct_list[i], packed_len, + params.big_lwe_dimension, size_tracker, allocate_gpu_memory); + } + } + + void release(CudaStreams streams) { + this->eq_selectors_buf->release(streams); + delete this->eq_selectors_buf; + + this->possible_results_buf->release(streams); + delete this->possible_results_buf; + + this->aggregate_buf->release(streams); + delete this->aggregate_buf; + + this->reduction_buf->release(streams); + delete this->reduction_buf; + + release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0), + this->packed_selectors, + this->allocate_gpu_memory); + delete 
this->packed_selectors; + + delete[] this->unpacked_selectors; + + for (uint32_t i = 0; i < num_unique; i++) { + release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0), + &this->possible_results_ct_list[i], + this->allocate_gpu_memory); + } + delete[] this->possible_results_ct_list; + + cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0)); + } +}; + +template struct int_unchecked_first_index_of_clear_buffer { + int_radix_params params; + bool allocate_gpu_memory; + uint32_t num_inputs; + + int_comparison_buffer **eq_buffers; + int_possible_results_buffer *possible_results_buf; + int_aggregate_one_hot_buffer *aggregate_buf; + int_comparison_buffer *reduction_buf; + + CudaRadixCiphertextFFI *packed_selectors; + CudaRadixCiphertextFFI *unpacked_selectors; + CudaRadixCiphertextFFI *possible_results_ct_list; + CudaRadixCiphertextFFI *tmp_clear_val; + Torus *d_clear_val; + uint64_t *h_indices; + + int_radix_lut *prefix_sum_lut; + int_radix_lut *cleanup_lut; + + CudaStreams active_streams; + CudaStreams *sub_streams; + cudaEvent_t incoming_event; + cudaEvent_t *outgoing_events; + uint32_t num_streams; + + int_unchecked_first_index_of_clear_buffer( + CudaStreams streams, int_radix_params params, uint32_t num_inputs, + uint32_t num_blocks, uint32_t num_blocks_index, bool allocate_gpu_memory, + uint64_t &size_tracker) { + this->params = params; + this->allocate_gpu_memory = allocate_gpu_memory; + this->num_inputs = num_inputs; + + uint32_t num_streams_to_use = + std::min((uint32_t)MAX_STREAMS_FOR_VECTOR_FIND, num_inputs); + if (num_streams_to_use == 0) + num_streams_to_use = 1; + + this->num_streams = num_streams_to_use; + this->active_streams = streams.active_gpu_subset(num_blocks); + + incoming_event = cuda_create_event(streams.gpu_index(0)); + sub_streams = new CudaStreams[num_streams_to_use]; + outgoing_events = + new cudaEvent_t[num_streams_to_use * active_streams.count()]; + + for (uint32_t i = 0; i < num_streams_to_use; i++) { + 
sub_streams[i].create_on_same_gpus(active_streams); + for (uint32_t j = 0; j < active_streams.count(); j++) { + outgoing_events[i * active_streams.count() + j] = + cuda_create_event(active_streams.gpu_index(j)); + } + } + + uint32_t packed_len = (num_blocks_index + 1) / 2; + + this->eq_buffers = new int_comparison_buffer *[num_streams]; + for (uint32_t i = 0; i < num_streams; i++) { + this->eq_buffers[i] = new int_comparison_buffer( + sub_streams[i], EQ, params, num_blocks, false, allocate_gpu_memory, + size_tracker); + } + + this->possible_results_buf = new int_possible_results_buffer( + streams, params, packed_len, num_inputs, allocate_gpu_memory, + size_tracker); + + this->aggregate_buf = new int_aggregate_one_hot_buffer( + streams, params, packed_len, num_inputs, allocate_gpu_memory, + size_tracker); + + this->reduction_buf = + new int_comparison_buffer(streams, EQ, params, num_inputs, false, + allocate_gpu_memory, size_tracker); + + this->packed_selectors = new CudaRadixCiphertextFFI; + create_zero_radix_ciphertext_async( + streams.stream(0), streams.gpu_index(0), this->packed_selectors, + num_inputs, params.big_lwe_dimension, size_tracker, + allocate_gpu_memory); + + this->unpacked_selectors = new CudaRadixCiphertextFFI[num_inputs]; + for (uint32_t i = 0; i < num_inputs; i++) { + as_radix_ciphertext_slice(&this->unpacked_selectors[i], + this->packed_selectors, i, i + 1); + } + + this->possible_results_ct_list = new CudaRadixCiphertextFFI[num_inputs]; + for (uint32_t i = 0; i < num_inputs; i++) { + create_zero_radix_ciphertext_async( + streams.stream(0), streams.gpu_index(0), + &this->possible_results_ct_list[i], packed_len, + params.big_lwe_dimension, size_tracker, allocate_gpu_memory); + } + + this->tmp_clear_val = new CudaRadixCiphertextFFI; + create_zero_radix_ciphertext_async( + streams.stream(0), streams.gpu_index(0), this->tmp_clear_val, + num_blocks, params.big_lwe_dimension, size_tracker, + allocate_gpu_memory); + + this->d_clear_val = (Torus 
*)cuda_malloc_with_size_tracking_async( + num_blocks * sizeof(Torus), streams.stream(0), streams.gpu_index(0), + size_tracker, allocate_gpu_memory); + + h_indices = nullptr; + if (allocate_gpu_memory) { + uint32_t num_bits_in_message = log2_int(params.message_modulus); + uint32_t bits_per_packed_block = 2 * num_bits_in_message; + + h_indices = new uint64_t[num_inputs * packed_len]; + for (uint32_t i = 0; i < num_inputs; i++) { + uint64_t val = i; + for (uint32_t b = 0; b < packed_len; b++) { + uint64_t mask = (1ULL << bits_per_packed_block) - 1; + uint64_t block_val = (val >> (b * bits_per_packed_block)) & mask; + h_indices[i * packed_len + b] = block_val; + } + } + } + + const Torus ALREADY_SEEN = 2; + auto prefix_sum_fn = [ALREADY_SEEN](Torus current, + Torus previous) -> Torus { + if (previous == 1 || previous == ALREADY_SEEN) { + return ALREADY_SEEN; + } + return current; + }; + this->prefix_sum_lut = new int_radix_lut( + streams, params, 2, num_inputs, allocate_gpu_memory, size_tracker); + + generate_device_accumulator_bivariate( + streams.stream(0), streams.gpu_index(0), + this->prefix_sum_lut->get_lut(0, 0), + this->prefix_sum_lut->get_degree(0), + this->prefix_sum_lut->get_max_degree(0), params.glwe_dimension, + params.polynomial_size, params.message_modulus, params.carry_modulus, + prefix_sum_fn, allocate_gpu_memory); + this->prefix_sum_lut->broadcast_lut(streams.active_gpu_subset(num_inputs)); + + auto cleanup_fn = [ALREADY_SEEN, params](Torus x) -> Torus { + Torus val = x % params.message_modulus; + if (val == ALREADY_SEEN) + return 0; + return val; + }; + this->cleanup_lut = new int_radix_lut( + streams, params, 1, num_inputs, allocate_gpu_memory, size_tracker); + generate_device_accumulator( + streams.stream(0), streams.gpu_index(0), + this->cleanup_lut->get_lut(0, 0), this->cleanup_lut->get_degree(0), + this->cleanup_lut->get_max_degree(0), params.glwe_dimension, + params.polynomial_size, params.message_modulus, params.carry_modulus, + cleanup_fn, 
allocate_gpu_memory); + this->cleanup_lut->broadcast_lut(streams.active_gpu_subset(num_inputs)); + } + + void release(CudaStreams streams) { + for (uint32_t i = 0; i < num_streams; i++) { + eq_buffers[i]->release(streams); + delete eq_buffers[i]; + } + delete[] eq_buffers; + + this->possible_results_buf->release(streams); + delete this->possible_results_buf; + + this->aggregate_buf->release(streams); + delete this->aggregate_buf; + + this->reduction_buf->release(streams); + delete this->reduction_buf; + + this->prefix_sum_lut->release(streams); + delete this->prefix_sum_lut; + + this->cleanup_lut->release(streams); + delete this->cleanup_lut; + + release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0), + this->packed_selectors, + this->allocate_gpu_memory); + delete this->packed_selectors; + + delete[] this->unpacked_selectors; + + for (uint32_t i = 0; i < num_inputs; i++) { + release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0), + &this->possible_results_ct_list[i], + this->allocate_gpu_memory); + } + delete[] this->possible_results_ct_list; + + release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0), + this->tmp_clear_val, + this->allocate_gpu_memory); + delete this->tmp_clear_val; + + cuda_drop_async(this->d_clear_val, streams.stream(0), streams.gpu_index(0)); + + cuda_event_destroy(incoming_event, streams.gpu_index(0)); + + uint32_t num_gpus = active_streams.count(); + for (uint j = 0; j < num_streams; j++) { + for (uint k = 0; k < num_gpus; k++) { + cuda_event_destroy(outgoing_events[j * num_gpus + k], + active_streams.gpu_index(k)); + } + } + delete[] outgoing_events; + + for (uint32_t i = 0; i < num_streams; i++) { + sub_streams[i].release(); + } + delete[] sub_streams; + + cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0)); + + delete[] h_indices; + } +}; + +template struct int_unchecked_first_index_of_buffer { + int_radix_params params; + bool allocate_gpu_memory; + uint32_t num_inputs; + + 
int_comparison_buffer **eq_buffers; + int_possible_results_buffer *possible_results_buf; + int_aggregate_one_hot_buffer *aggregate_buf; + int_comparison_buffer *reduction_buf; + + CudaRadixCiphertextFFI *packed_selectors; + CudaRadixCiphertextFFI *unpacked_selectors; + CudaRadixCiphertextFFI *possible_results_ct_list; + uint64_t *h_indices; + + int_radix_lut *prefix_sum_lut; + int_radix_lut *cleanup_lut; + + CudaStreams active_streams; + CudaStreams *sub_streams; + cudaEvent_t incoming_event; + cudaEvent_t *outgoing_events; + uint32_t num_streams; + + int_unchecked_first_index_of_buffer(CudaStreams streams, + int_radix_params params, + uint32_t num_inputs, uint32_t num_blocks, + uint32_t num_blocks_index, + bool allocate_gpu_memory, + uint64_t &size_tracker) { + this->params = params; + this->allocate_gpu_memory = allocate_gpu_memory; + this->num_inputs = num_inputs; + + uint32_t num_streams_to_use = + std::min((uint32_t)MAX_STREAMS_FOR_VECTOR_FIND, num_inputs); + if (num_streams_to_use == 0) + num_streams_to_use = 1; + + this->num_streams = num_streams_to_use; + this->active_streams = streams.active_gpu_subset(num_blocks); + + incoming_event = cuda_create_event(streams.gpu_index(0)); + sub_streams = new CudaStreams[num_streams_to_use]; + outgoing_events = + new cudaEvent_t[num_streams_to_use * active_streams.count()]; + + for (uint32_t i = 0; i < num_streams_to_use; i++) { + sub_streams[i].create_on_same_gpus(active_streams); + for (uint32_t j = 0; j < active_streams.count(); j++) { + outgoing_events[i * active_streams.count() + j] = + cuda_create_event(active_streams.gpu_index(j)); + } + } + + uint32_t packed_len = (num_blocks_index + 1) / 2; + + this->eq_buffers = new int_comparison_buffer *[num_streams]; + for (uint32_t i = 0; i < num_streams; i++) { + this->eq_buffers[i] = new int_comparison_buffer( + sub_streams[i], EQ, params, num_blocks, false, allocate_gpu_memory, + size_tracker); + } + + this->possible_results_buf = new int_possible_results_buffer( + 
streams, params, packed_len, num_inputs, allocate_gpu_memory, + size_tracker); + + this->aggregate_buf = new int_aggregate_one_hot_buffer( + streams, params, packed_len, num_inputs, allocate_gpu_memory, + size_tracker); + + this->reduction_buf = + new int_comparison_buffer(streams, EQ, params, num_inputs, false, + allocate_gpu_memory, size_tracker); + + this->packed_selectors = new CudaRadixCiphertextFFI; + create_zero_radix_ciphertext_async( + streams.stream(0), streams.gpu_index(0), this->packed_selectors, + num_inputs, params.big_lwe_dimension, size_tracker, + allocate_gpu_memory); + + this->unpacked_selectors = new CudaRadixCiphertextFFI[num_inputs]; + for (uint32_t i = 0; i < num_inputs; i++) { + as_radix_ciphertext_slice(&this->unpacked_selectors[i], + this->packed_selectors, i, i + 1); + } + + this->possible_results_ct_list = new CudaRadixCiphertextFFI[num_inputs]; + for (uint32_t i = 0; i < num_inputs; i++) { + create_zero_radix_ciphertext_async( + streams.stream(0), streams.gpu_index(0), + &this->possible_results_ct_list[i], packed_len, + params.big_lwe_dimension, size_tracker, allocate_gpu_memory); + } + + h_indices = nullptr; + if (allocate_gpu_memory) { + uint32_t num_bits_in_message = log2_int(params.message_modulus); + uint32_t bits_per_packed_block = 2 * num_bits_in_message; + + h_indices = new uint64_t[num_inputs * packed_len]; + for (uint32_t i = 0; i < num_inputs; i++) { + uint64_t val = i; + for (uint32_t b = 0; b < packed_len; b++) { + uint64_t mask = (1ULL << bits_per_packed_block) - 1; + uint64_t block_val = (val >> (b * bits_per_packed_block)) & mask; + h_indices[i * packed_len + b] = block_val; + } + } + } + + const Torus ALREADY_SEEN = 2; + auto prefix_sum_fn = [ALREADY_SEEN](Torus current, + Torus previous) -> Torus { + if (previous == 1 || previous == ALREADY_SEEN) { + return ALREADY_SEEN; + } + return current; + }; + this->prefix_sum_lut = new int_radix_lut( + streams, params, 2, num_inputs, allocate_gpu_memory, size_tracker); + + 
generate_device_accumulator_bivariate( + streams.stream(0), streams.gpu_index(0), + this->prefix_sum_lut->get_lut(0, 0), + this->prefix_sum_lut->get_degree(0), + this->prefix_sum_lut->get_max_degree(0), params.glwe_dimension, + params.polynomial_size, params.message_modulus, params.carry_modulus, + prefix_sum_fn, allocate_gpu_memory); + this->prefix_sum_lut->broadcast_lut(streams.active_gpu_subset(num_inputs)); + + auto cleanup_fn = [ALREADY_SEEN, params](Torus x) -> Torus { + Torus val = x % params.message_modulus; + if (val == ALREADY_SEEN) + return 0; + return val; + }; + this->cleanup_lut = new int_radix_lut( + streams, params, 1, num_inputs, allocate_gpu_memory, size_tracker); + generate_device_accumulator( + streams.stream(0), streams.gpu_index(0), + this->cleanup_lut->get_lut(0, 0), this->cleanup_lut->get_degree(0), + this->cleanup_lut->get_max_degree(0), params.glwe_dimension, + params.polynomial_size, params.message_modulus, params.carry_modulus, + cleanup_fn, allocate_gpu_memory); + this->cleanup_lut->broadcast_lut(streams.active_gpu_subset(num_inputs)); + } + + void release(CudaStreams streams) { + for (uint32_t i = 0; i < num_streams; i++) { + eq_buffers[i]->release(streams); + delete eq_buffers[i]; + } + delete[] eq_buffers; + + this->possible_results_buf->release(streams); + delete this->possible_results_buf; + + this->aggregate_buf->release(streams); + delete this->aggregate_buf; + + this->reduction_buf->release(streams); + delete this->reduction_buf; + + this->prefix_sum_lut->release(streams); + delete this->prefix_sum_lut; + + this->cleanup_lut->release(streams); + delete this->cleanup_lut; + + release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0), + this->packed_selectors, + this->allocate_gpu_memory); + delete this->packed_selectors; + + delete[] this->unpacked_selectors; + + for (uint32_t i = 0; i < num_inputs; i++) { + release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0), + &this->possible_results_ct_list[i], 
+ this->allocate_gpu_memory); + } + delete[] this->possible_results_ct_list; + + cuda_event_destroy(incoming_event, streams.gpu_index(0)); + + uint32_t num_gpus = active_streams.count(); + for (uint j = 0; j < num_streams; j++) { + for (uint k = 0; k < num_gpus; k++) { + cuda_event_destroy(outgoing_events[j * num_gpus + k], + active_streams.gpu_index(k)); + } + } + delete[] outgoing_events; + + for (uint32_t i = 0; i < num_streams; i++) { + sub_streams[i].release(); + } + delete[] sub_streams; + + cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0)); + + delete[] h_indices; + } +}; + +template struct int_unchecked_index_of_buffer { + int_radix_params params; + bool allocate_gpu_memory; + uint32_t num_inputs; + + int_comparison_buffer **eq_buffers; + int_final_index_from_selectors_buffer *final_index_buf; + + CudaStreams active_streams; + CudaStreams *sub_streams; + cudaEvent_t incoming_event; + cudaEvent_t *outgoing_events; + uint32_t num_streams; + + int_unchecked_index_of_buffer(CudaStreams streams, int_radix_params params, + uint32_t num_inputs, uint32_t num_blocks, + uint32_t num_blocks_index, + bool allocate_gpu_memory, + uint64_t &size_tracker) { + this->params = params; + this->allocate_gpu_memory = allocate_gpu_memory; + this->num_inputs = num_inputs; + + uint32_t num_streams_to_use = + std::min((uint32_t)MAX_STREAMS_FOR_VECTOR_FIND, num_inputs); + if (num_streams_to_use == 0) + num_streams_to_use = 1; + + this->num_streams = num_streams_to_use; + this->active_streams = streams.active_gpu_subset(num_blocks); + + incoming_event = cuda_create_event(streams.gpu_index(0)); + sub_streams = new CudaStreams[num_streams_to_use]; + outgoing_events = + new cudaEvent_t[num_streams_to_use * active_streams.count()]; + + for (uint32_t i = 0; i < num_streams_to_use; i++) { + sub_streams[i].create_on_same_gpus(active_streams); + for (uint32_t j = 0; j < active_streams.count(); j++) { + outgoing_events[i * active_streams.count() + j] = + 
cuda_create_event(active_streams.gpu_index(j)); + } + } + + this->eq_buffers = new int_comparison_buffer *[num_streams]; + for (uint32_t i = 0; i < num_streams; i++) { + this->eq_buffers[i] = new int_comparison_buffer( + sub_streams[i], EQ, params, num_blocks, false, allocate_gpu_memory, + size_tracker); + } + + this->final_index_buf = new int_final_index_from_selectors_buffer( + streams, params, num_inputs, num_blocks_index, allocate_gpu_memory, + size_tracker); + } + + void release(CudaStreams streams) { + for (uint32_t i = 0; i < num_streams; i++) { + eq_buffers[i]->release(streams); + delete eq_buffers[i]; + } + delete[] eq_buffers; + + this->final_index_buf->release(streams); + delete this->final_index_buf; + + cuda_event_destroy(incoming_event, streams.gpu_index(0)); + + uint32_t num_gpus = active_streams.count(); + for (uint j = 0; j < num_streams; j++) { + for (uint k = 0; k < num_gpus; k++) { + cuda_event_destroy(outgoing_events[j * num_gpus + k], + active_streams.gpu_index(k)); + } + } + delete[] outgoing_events; + + for (uint32_t i = 0; i < num_streams; i++) { + sub_streams[i].release(); + } + delete[] sub_streams; + + cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0)); + } +}; diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu index 250311791..3508cabdb 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu @@ -284,46 +284,6 @@ void cleanup_cuda_apply_bivariate_lut_64(CudaStreamsFFI streams, POP_RANGE() } -uint64_t scratch_cuda_integer_compute_prefix_sum_hillis_steele_64( - CudaStreamsFFI streams, int8_t **mem_ptr, void const *input_lut, - uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level, - uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks, - uint32_t message_modulus, uint32_t 
carry_modulus, PBS_TYPE pbs_type, - uint64_t lut_degree, bool allocate_gpu_memory, - PBS_MS_REDUCTION_T noise_reduction_type) { - - int_radix_params params(pbs_type, glwe_dimension, polynomial_size, - glwe_dimension * polynomial_size, lwe_dimension, - ks_level, ks_base_log, pbs_level, pbs_base_log, - grouping_factor, message_modulus, carry_modulus, - noise_reduction_type); - - return scratch_cuda_apply_bivariate_lut( - CudaStreams(streams), (int_radix_lut **)mem_ptr, - static_cast(input_lut), num_radix_blocks, params, - lut_degree, allocate_gpu_memory); -} - -void cuda_integer_compute_prefix_sum_hillis_steele_64( - CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe, - CudaRadixCiphertextFFI *generates_or_propagates, int8_t *mem_ptr, - void *const *ksks, void *const *bsks, uint32_t num_radix_blocks) { - - host_compute_prefix_sum_hillis_steele( - CudaStreams(streams), output_radix_lwe, generates_or_propagates, - (int_radix_lut *)mem_ptr, bsks, (uint64_t **)(ksks), - num_radix_blocks); -} - -void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64( - CudaStreamsFFI streams, int8_t **mem_ptr_void) { - int_radix_lut *mem_ptr = (int_radix_lut *)(*mem_ptr_void); - mem_ptr->release(CudaStreams(streams)); - delete mem_ptr; - *mem_ptr_void = nullptr; -} - void cuda_integer_reverse_blocks_64_inplace(CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array) { diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/vector_find.cu b/backends/tfhe-cuda-backend/cuda/src/integer/vector_find.cu index f953e5a74..9d74f8981 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/vector_find.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/vector_find.cu @@ -1,133 +1,5 @@ #include "integer/vector_find.cuh" -uint64_t scratch_cuda_compute_equality_selectors_64( - CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t big_lwe_dimension, - uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, - 
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, - uint32_t num_possible_values, uint32_t num_blocks, uint32_t message_modulus, - uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory, - PBS_MS_REDUCTION_T noise_reduction_type) { - - int_radix_params params(pbs_type, glwe_dimension, polynomial_size, - big_lwe_dimension, small_lwe_dimension, ks_level, - ks_base_log, pbs_level, pbs_base_log, grouping_factor, - message_modulus, carry_modulus, noise_reduction_type); - - return scratch_cuda_compute_equality_selectors( - CudaStreams(streams), (int_equality_selectors_buffer **)mem_ptr, - params, num_possible_values, num_blocks, allocate_gpu_memory); -} - -void cuda_compute_equality_selectors_64( - CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out_list, - CudaRadixCiphertextFFI const *lwe_array_in, uint32_t num_blocks, - const uint64_t *h_decomposed_cleartexts, int8_t *mem, void *const *bsks, - void *const *ksks) { - - host_compute_equality_selectors( - CudaStreams(streams), lwe_array_out_list, lwe_array_in, num_blocks, - h_decomposed_cleartexts, (int_equality_selectors_buffer *)mem, - bsks, (uint64_t *const *)ksks); -} - -void cleanup_cuda_compute_equality_selectors_64(CudaStreamsFFI streams, - int8_t **mem_ptr_void) { - int_equality_selectors_buffer *mem_ptr = - (int_equality_selectors_buffer *)(*mem_ptr_void); - - mem_ptr->release(CudaStreams(streams)); - - delete mem_ptr; - *mem_ptr_void = nullptr; -} - -uint64_t scratch_cuda_create_possible_results_64( - CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t big_lwe_dimension, - uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, - uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, - uint32_t num_possible_values, uint32_t num_blocks, uint32_t message_modulus, - uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory, - PBS_MS_REDUCTION_T noise_reduction_type) { - - 
int_radix_params params(pbs_type, glwe_dimension, polynomial_size, - big_lwe_dimension, small_lwe_dimension, ks_level, - ks_base_log, pbs_level, pbs_base_log, grouping_factor, - message_modulus, carry_modulus, noise_reduction_type); - - return scratch_cuda_create_possible_results( - CudaStreams(streams), (int_possible_results_buffer **)mem_ptr, - params, num_blocks, num_possible_values, allocate_gpu_memory); -} - -void cuda_create_possible_results_64( - CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out_list, - CudaRadixCiphertextFFI const *lwe_array_in_list, - uint32_t num_possible_values, const uint64_t *h_decomposed_cleartexts, - uint32_t num_blocks, int8_t *mem, void *const *bsks, void *const *ksks) { - - host_create_possible_results( - CudaStreams(streams), lwe_array_out_list, lwe_array_in_list, - num_possible_values, h_decomposed_cleartexts, num_blocks, - (int_possible_results_buffer *)mem, bsks, - (uint64_t *const *)ksks); -} - -void cleanup_cuda_create_possible_results_64(CudaStreamsFFI streams, - int8_t **mem_ptr_void) { - int_possible_results_buffer *mem_ptr = - (int_possible_results_buffer *)(*mem_ptr_void); - - mem_ptr->release(CudaStreams(streams)); - - delete mem_ptr; - *mem_ptr_void = nullptr; -} - -uint64_t scratch_cuda_aggregate_one_hot_vector_64( - CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t big_lwe_dimension, - uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, - uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, - uint32_t num_blocks, uint32_t num_matches, uint32_t message_modulus, - uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory, - PBS_MS_REDUCTION_T noise_reduction_type) { - - int_radix_params params(pbs_type, glwe_dimension, polynomial_size, - big_lwe_dimension, small_lwe_dimension, ks_level, - ks_base_log, pbs_level, pbs_base_log, grouping_factor, - message_modulus, carry_modulus, noise_reduction_type); - - return 
scratch_cuda_aggregate_one_hot_vector( - CudaStreams(streams), (int_aggregate_one_hot_buffer **)mem_ptr, - params, num_blocks, num_matches, allocate_gpu_memory); -} - -void cuda_aggregate_one_hot_vector_64( - CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out, - CudaRadixCiphertextFFI const *lwe_array_in_list, - uint32_t num_input_ciphertexts, uint32_t num_blocks, int8_t *mem, - void *const *bsks, void *const *ksks) { - - host_aggregate_one_hot_vector( - CudaStreams(streams), lwe_array_out, lwe_array_in_list, - num_input_ciphertexts, num_blocks, - (int_aggregate_one_hot_buffer *)mem, bsks, - (uint64_t *const *)ksks); -} - -void cleanup_cuda_aggregate_one_hot_vector_64(CudaStreamsFFI streams, - int8_t **mem_ptr_void) { - int_aggregate_one_hot_buffer *mem_ptr = - (int_aggregate_one_hot_buffer *)(*mem_ptr_void); - - mem_ptr->release(CudaStreams(streams)); - - delete mem_ptr; - *mem_ptr_void = nullptr; -} - uint64_t scratch_cuda_unchecked_match_value_64( CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t big_lwe_dimension, @@ -221,3 +93,410 @@ void cleanup_cuda_unchecked_match_value_or_64(CudaStreamsFFI streams, delete mem_ptr; *mem_ptr_void = nullptr; } + +uint64_t scratch_cuda_unchecked_contains_64( + CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t big_lwe_dimension, + uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, + uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, + uint32_t num_inputs, uint32_t num_blocks, uint32_t message_modulus, + uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory, + PBS_MS_REDUCTION_T noise_reduction_type) { + + int_radix_params params(pbs_type, glwe_dimension, polynomial_size, + big_lwe_dimension, small_lwe_dimension, ks_level, + ks_base_log, pbs_level, pbs_base_log, grouping_factor, + message_modulus, carry_modulus, noise_reduction_type); + + return 
scratch_cuda_unchecked_contains( + CudaStreams(streams), (int_unchecked_contains_buffer **)mem_ptr, + params, num_inputs, num_blocks, allocate_gpu_memory); +} + +void cuda_unchecked_contains_64(CudaStreamsFFI streams, + CudaRadixCiphertextFFI *output, + CudaRadixCiphertextFFI const *inputs, + CudaRadixCiphertextFFI const *value, + uint32_t num_inputs, uint32_t num_blocks, + int8_t *mem, void *const *bsks, + void *const *ksks) { + + host_unchecked_contains( + CudaStreams(streams), output, inputs, value, num_inputs, num_blocks, + (int_unchecked_contains_buffer *)mem, bsks, + (uint64_t *const *)ksks); +} + +void cleanup_cuda_unchecked_contains_64(CudaStreamsFFI streams, + int8_t **mem_ptr_void) { + int_unchecked_contains_buffer *mem_ptr = + (int_unchecked_contains_buffer *)(*mem_ptr_void); + + mem_ptr->release(CudaStreams(streams)); + + delete mem_ptr; + *mem_ptr_void = nullptr; +} + +uint64_t scratch_cuda_unchecked_contains_clear_64( + CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t big_lwe_dimension, + uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, + uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, + uint32_t num_inputs, uint32_t num_blocks, uint32_t message_modulus, + uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory, + PBS_MS_REDUCTION_T noise_reduction_type) { + + int_radix_params params(pbs_type, glwe_dimension, polynomial_size, + big_lwe_dimension, small_lwe_dimension, ks_level, + ks_base_log, pbs_level, pbs_base_log, grouping_factor, + message_modulus, carry_modulus, noise_reduction_type); + + return scratch_cuda_unchecked_contains_clear( + CudaStreams(streams), + (int_unchecked_contains_clear_buffer **)mem_ptr, params, + num_inputs, num_blocks, allocate_gpu_memory); +} + +void cuda_unchecked_contains_clear_64(CudaStreamsFFI streams, + CudaRadixCiphertextFFI *output, + CudaRadixCiphertextFFI const *inputs, + const uint64_t *h_clear_val, + 
uint32_t num_inputs, uint32_t num_blocks, + int8_t *mem, void *const *bsks, + void *const *ksks) { + + host_unchecked_contains_clear( + CudaStreams(streams), output, inputs, h_clear_val, num_inputs, num_blocks, + (int_unchecked_contains_clear_buffer *)mem, bsks, + (uint64_t *const *)ksks); +} + +void cleanup_cuda_unchecked_contains_clear_64(CudaStreamsFFI streams, + int8_t **mem_ptr_void) { + int_unchecked_contains_clear_buffer *mem_ptr = + (int_unchecked_contains_clear_buffer *)(*mem_ptr_void); + + mem_ptr->release(CudaStreams(streams)); + + delete mem_ptr; + *mem_ptr_void = nullptr; +} + +uint64_t scratch_cuda_unchecked_is_in_clears_64( + CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t big_lwe_dimension, + uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, + uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, + uint32_t num_clears, uint32_t num_blocks, uint32_t message_modulus, + uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory, + PBS_MS_REDUCTION_T noise_reduction_type) { + + int_radix_params params(pbs_type, glwe_dimension, polynomial_size, + big_lwe_dimension, small_lwe_dimension, ks_level, + ks_base_log, pbs_level, pbs_base_log, grouping_factor, + message_modulus, carry_modulus, noise_reduction_type); + + return scratch_cuda_unchecked_is_in_clears( + CudaStreams(streams), + (int_unchecked_is_in_clears_buffer **)mem_ptr, params, + num_clears, num_blocks, allocate_gpu_memory); +} + +void cuda_unchecked_is_in_clears_64(CudaStreamsFFI streams, + CudaRadixCiphertextFFI *output, + CudaRadixCiphertextFFI const *input, + const uint64_t *h_cleartexts, + uint32_t num_clears, uint32_t num_blocks, + int8_t *mem, void *const *bsks, + void *const *ksks) { + + host_unchecked_is_in_clears( + CudaStreams(streams), output, input, h_cleartexts, num_clears, num_blocks, + (int_unchecked_is_in_clears_buffer *)mem, bsks, + (uint64_t *const *)ksks); +} + +void 
cleanup_cuda_unchecked_is_in_clears_64(CudaStreamsFFI streams, + int8_t **mem_ptr_void) { + int_unchecked_is_in_clears_buffer *mem_ptr = + (int_unchecked_is_in_clears_buffer *)(*mem_ptr_void); + + mem_ptr->release(CudaStreams(streams)); + + delete mem_ptr; + *mem_ptr_void = nullptr; +} + +uint64_t scratch_cuda_compute_final_index_from_selectors_64( + CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t big_lwe_dimension, + uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, + uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, + uint32_t num_inputs, uint32_t num_blocks_index, uint32_t message_modulus, + uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory, + PBS_MS_REDUCTION_T noise_reduction_type) { + + int_radix_params params(pbs_type, glwe_dimension, polynomial_size, + big_lwe_dimension, small_lwe_dimension, ks_level, + ks_base_log, pbs_level, pbs_base_log, grouping_factor, + message_modulus, carry_modulus, noise_reduction_type); + + return scratch_cuda_compute_final_index_from_selectors( + CudaStreams(streams), + (int_final_index_from_selectors_buffer **)mem_ptr, params, + num_inputs, num_blocks_index, allocate_gpu_memory); +} + +void cuda_compute_final_index_from_selectors_64( + CudaStreamsFFI streams, CudaRadixCiphertextFFI *index_ct, + CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *selectors, + uint32_t num_inputs, uint32_t num_blocks_index, int8_t *mem, + void *const *bsks, void *const *ksks) { + + host_compute_final_index_from_selectors( + CudaStreams(streams), index_ct, match_ct, selectors, num_inputs, + num_blocks_index, (int_final_index_from_selectors_buffer *)mem, + bsks, (uint64_t *const *)ksks); +} + +void cleanup_cuda_compute_final_index_from_selectors_64(CudaStreamsFFI streams, + int8_t **mem_ptr_void) { + int_final_index_from_selectors_buffer *mem_ptr = + (int_final_index_from_selectors_buffer *)(*mem_ptr_void); + + 
mem_ptr->release(CudaStreams(streams)); + + delete mem_ptr; + *mem_ptr_void = nullptr; +} + +uint64_t scratch_cuda_unchecked_index_in_clears_64( + CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t big_lwe_dimension, + uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, + uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, + uint32_t num_clears, uint32_t num_blocks, uint32_t num_blocks_index, + uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, + bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) { + + int_radix_params params(pbs_type, glwe_dimension, polynomial_size, + big_lwe_dimension, small_lwe_dimension, ks_level, + ks_base_log, pbs_level, pbs_base_log, grouping_factor, + message_modulus, carry_modulus, noise_reduction_type); + + return scratch_cuda_unchecked_index_in_clears( + CudaStreams(streams), + (int_unchecked_index_in_clears_buffer **)mem_ptr, params, + num_clears, num_blocks, num_blocks_index, allocate_gpu_memory); +} + +void cuda_unchecked_index_in_clears_64(CudaStreamsFFI streams, + CudaRadixCiphertextFFI *index_ct, + CudaRadixCiphertextFFI *match_ct, + CudaRadixCiphertextFFI const *input, + const uint64_t *h_cleartexts, + uint32_t num_clears, uint32_t num_blocks, + uint32_t num_blocks_index, int8_t *mem, + void *const *bsks, void *const *ksks) { + + host_unchecked_index_in_clears( + CudaStreams(streams), index_ct, match_ct, input, h_cleartexts, num_clears, + num_blocks, num_blocks_index, + (int_unchecked_index_in_clears_buffer *)mem, bsks, + (uint64_t *const *)ksks); +} + +void cleanup_cuda_unchecked_index_in_clears_64(CudaStreamsFFI streams, + int8_t **mem_ptr_void) { + int_unchecked_index_in_clears_buffer *mem_ptr = + (int_unchecked_index_in_clears_buffer *)(*mem_ptr_void); + + mem_ptr->release(CudaStreams(streams)); + + delete mem_ptr; + *mem_ptr_void = nullptr; +} + +uint64_t scratch_cuda_unchecked_first_index_in_clears_64( 
+ CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t big_lwe_dimension, + uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, + uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, + uint32_t num_unique, uint32_t num_blocks, uint32_t num_blocks_index, + uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, + bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) { + + int_radix_params params(pbs_type, glwe_dimension, polynomial_size, + big_lwe_dimension, small_lwe_dimension, ks_level, + ks_base_log, pbs_level, pbs_base_log, grouping_factor, + message_modulus, carry_modulus, noise_reduction_type); + + return scratch_cuda_unchecked_first_index_in_clears( + CudaStreams(streams), + (int_unchecked_first_index_in_clears_buffer **)mem_ptr, params, + num_unique, num_blocks, num_blocks_index, allocate_gpu_memory); +} + +void cuda_unchecked_first_index_in_clears_64( + CudaStreamsFFI streams, CudaRadixCiphertextFFI *index_ct, + CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *input, + const uint64_t *h_unique_values, const uint64_t *h_unique_indices, + uint32_t num_unique, uint32_t num_blocks, uint32_t num_blocks_index, + int8_t *mem, void *const *bsks, void *const *ksks) { + + host_unchecked_first_index_in_clears( + CudaStreams(streams), index_ct, match_ct, input, h_unique_values, + h_unique_indices, num_unique, num_blocks, num_blocks_index, + (int_unchecked_first_index_in_clears_buffer *)mem, bsks, + (uint64_t *const *)ksks); +} + +void cleanup_cuda_unchecked_first_index_in_clears_64(CudaStreamsFFI streams, + int8_t **mem_ptr_void) { + int_unchecked_first_index_in_clears_buffer *mem_ptr = + (int_unchecked_first_index_in_clears_buffer *)(*mem_ptr_void); + + mem_ptr->release(CudaStreams(streams)); + + delete mem_ptr; + *mem_ptr_void = nullptr; +} + +uint64_t scratch_cuda_unchecked_first_index_of_clear_64( + CudaStreamsFFI streams, int8_t **mem_ptr, 
uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t big_lwe_dimension, + uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, + uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, + uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index, + uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, + bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) { + + int_radix_params params(pbs_type, glwe_dimension, polynomial_size, + big_lwe_dimension, small_lwe_dimension, ks_level, + ks_base_log, pbs_level, pbs_base_log, grouping_factor, + message_modulus, carry_modulus, noise_reduction_type); + + return scratch_cuda_unchecked_first_index_of_clear( + CudaStreams(streams), + (int_unchecked_first_index_of_clear_buffer **)mem_ptr, params, + num_inputs, num_blocks, num_blocks_index, allocate_gpu_memory); +} + +void cuda_unchecked_first_index_of_clear_64( + CudaStreamsFFI streams, CudaRadixCiphertextFFI *index_ct, + CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *inputs, + const uint64_t *h_clear_val, uint32_t num_inputs, uint32_t num_blocks, + uint32_t num_blocks_index, int8_t *mem, void *const *bsks, + void *const *ksks) { + + host_unchecked_first_index_of_clear( + CudaStreams(streams), index_ct, match_ct, inputs, h_clear_val, num_inputs, + num_blocks, num_blocks_index, + (int_unchecked_first_index_of_clear_buffer *)mem, bsks, + (uint64_t *const *)ksks); +} + +void cleanup_cuda_unchecked_first_index_of_clear_64(CudaStreamsFFI streams, + int8_t **mem_ptr_void) { + int_unchecked_first_index_of_clear_buffer *mem_ptr = + (int_unchecked_first_index_of_clear_buffer *)(*mem_ptr_void); + + mem_ptr->release(CudaStreams(streams)); + + delete mem_ptr; + *mem_ptr_void = nullptr; +} + +uint64_t scratch_cuda_unchecked_first_index_of_64( + CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t big_lwe_dimension, + uint32_t small_lwe_dimension, 
uint32_t ks_level, uint32_t ks_base_log, + uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, + uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index, + uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, + bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) { + + int_radix_params params(pbs_type, glwe_dimension, polynomial_size, + big_lwe_dimension, small_lwe_dimension, ks_level, + ks_base_log, pbs_level, pbs_base_log, grouping_factor, + message_modulus, carry_modulus, noise_reduction_type); + + return scratch_cuda_unchecked_first_index_of( + CudaStreams(streams), + (int_unchecked_first_index_of_buffer **)mem_ptr, params, + num_inputs, num_blocks, num_blocks_index, allocate_gpu_memory); +} + +void cuda_unchecked_first_index_of_64(CudaStreamsFFI streams, + CudaRadixCiphertextFFI *index_ct, + CudaRadixCiphertextFFI *match_ct, + CudaRadixCiphertextFFI const *inputs, + CudaRadixCiphertextFFI const *value, + uint32_t num_inputs, uint32_t num_blocks, + uint32_t num_blocks_index, int8_t *mem, + void *const *bsks, void *const *ksks) { + + host_unchecked_first_index_of( + CudaStreams(streams), index_ct, match_ct, inputs, value, num_inputs, + num_blocks, num_blocks_index, + (int_unchecked_first_index_of_buffer *)mem, bsks, + (uint64_t *const *)ksks); +} + +void cleanup_cuda_unchecked_first_index_of_64(CudaStreamsFFI streams, + int8_t **mem_ptr_void) { + int_unchecked_first_index_of_buffer *mem_ptr = + (int_unchecked_first_index_of_buffer *)(*mem_ptr_void); + + mem_ptr->release(CudaStreams(streams)); + + delete mem_ptr; + *mem_ptr_void = nullptr; +} + +uint64_t scratch_cuda_unchecked_index_of_64( + CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t big_lwe_dimension, + uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, + uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, + uint32_t num_inputs, uint32_t num_blocks, uint32_t 
num_blocks_index, + uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, + bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) { + + int_radix_params params(pbs_type, glwe_dimension, polynomial_size, + big_lwe_dimension, small_lwe_dimension, ks_level, + ks_base_log, pbs_level, pbs_base_log, grouping_factor, + message_modulus, carry_modulus, noise_reduction_type); + + return scratch_cuda_unchecked_index_of( + CudaStreams(streams), (int_unchecked_index_of_buffer **)mem_ptr, + params, num_inputs, num_blocks, num_blocks_index, allocate_gpu_memory); +} + +void cuda_unchecked_index_of_64(CudaStreamsFFI streams, + CudaRadixCiphertextFFI *index_ct, + CudaRadixCiphertextFFI *match_ct, + CudaRadixCiphertextFFI const *inputs, + CudaRadixCiphertextFFI const *value, + uint32_t num_inputs, uint32_t num_blocks, + uint32_t num_blocks_index, int8_t *mem, + void *const *bsks, void *const *ksks) { + + host_unchecked_index_of( + CudaStreams(streams), index_ct, match_ct, inputs, value, num_inputs, + num_blocks, num_blocks_index, + (int_unchecked_index_of_buffer *)mem, bsks, + (uint64_t *const *)ksks); +} + +void cleanup_cuda_unchecked_index_of_64(CudaStreamsFFI streams, + int8_t **mem_ptr_void) { + int_unchecked_index_of_buffer *mem_ptr = + (int_unchecked_index_of_buffer *)(*mem_ptr_void); + + mem_ptr->release(CudaStreams(streams)); + + delete mem_ptr; + *mem_ptr_void = nullptr; +} diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/vector_find.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/vector_find.cuh index ea52d1788..dbf00a593 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/vector_find.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/vector_find.cuh @@ -28,22 +28,23 @@ __host__ void host_compute_equality_selectors( for (uint32_t j = 0; j < mem_ptr->num_streams; j++) { for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) { - cuda_stream_wait_event(mem_ptr->sub_streams_vec[j].stream(i), + 
cuda_stream_wait_event(mem_ptr->sub_streams[j].stream(i), mem_ptr->incoming_event, - mem_ptr->sub_streams_vec[j].gpu_index(i)); + mem_ptr->sub_streams[j].gpu_index(i)); } } uint32_t num_streams = mem_ptr->num_streams; + uint32_t num_gpus = mem_ptr->active_streams.count(); for (uint32_t i = 0; i < num_possible_values; i++) { uint32_t stream_idx = i % num_streams; - CudaStreams current_stream = mem_ptr->sub_streams_vec[stream_idx]; + CudaStreams current_stream = mem_ptr->sub_streams[stream_idx]; CudaRadixCiphertextFFI *current_tmp_block_comparisons = - mem_ptr->tmp_block_comparisons_vec[stream_idx]; + mem_ptr->tmp_block_comparisons[stream_idx]; int_comparison_buffer *current_reduction_buffer = mem_ptr->reduction_buffers[stream_idx]; @@ -75,10 +76,11 @@ __host__ void host_compute_equality_selectors( for (uint32_t j = 0; j < mem_ptr->num_streams; j++) { for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) { - cuda_event_record(mem_ptr->outgoing_events[j][i], - mem_ptr->sub_streams_vec[j].stream(i), - mem_ptr->sub_streams_vec[j].gpu_index(i)); - cuda_stream_wait_event(streams.stream(0), mem_ptr->outgoing_events[j][i], + cuda_event_record(mem_ptr->outgoing_events[j * num_gpus + i], + mem_ptr->sub_streams[j].stream(i), + mem_ptr->sub_streams[j].gpu_index(i)); + cuda_stream_wait_event(streams.stream(0), + mem_ptr->outgoing_events[j * num_gpus + i], streams.gpu_index(0)); } } @@ -110,24 +112,25 @@ __host__ void host_create_possible_results( uint32_t max_luts_per_call = mem_ptr->max_luts_per_call; uint32_t num_lut_accumulators = mem_ptr->num_lut_accumulators; uint32_t num_streams = mem_ptr->num_streams; + uint32_t num_gpus = mem_ptr->active_streams.count(); cuda_event_record(mem_ptr->incoming_event, streams.stream(0), streams.gpu_index(0)); for (uint32_t j = 0; j < mem_ptr->num_streams; j++) { for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) { - cuda_stream_wait_event(mem_ptr->sub_streams_vec[j].stream(i), + 
cuda_stream_wait_event(mem_ptr->sub_streams[j].stream(i), mem_ptr->incoming_event, - mem_ptr->sub_streams_vec[j].gpu_index(i)); + mem_ptr->sub_streams[j].gpu_index(i)); } } for (uint32_t i = 0; i < num_possible_values; i++) { uint32_t stream_idx = i % num_streams; - CudaStreams current_stream = mem_ptr->sub_streams_vec[stream_idx]; + CudaStreams current_stream = mem_ptr->sub_streams[stream_idx]; CudaRadixCiphertextFFI *current_tmp_buffer = - mem_ptr->tmp_many_luts_output_vec[stream_idx]; + mem_ptr->tmp_many_luts_output[stream_idx]; CudaRadixCiphertextFFI const *current_selector = &lwe_array_in_list[i]; CudaRadixCiphertextFFI *current_output = &lwe_array_out_list[i]; @@ -138,7 +141,7 @@ __host__ void host_create_possible_results( uint32_t lut_index = stream_idx * num_lut_accumulators + k; - int_radix_lut *current_lut = mem_ptr->stream_luts_vec[lut_index]; + int_radix_lut *current_lut = mem_ptr->stream_luts[lut_index]; uint32_t luts_in_this_call = current_lut->num_many_lut; @@ -172,10 +175,11 @@ __host__ void host_create_possible_results( for (uint32_t j = 0; j < mem_ptr->num_streams; j++) { for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) { - cuda_event_record(mem_ptr->outgoing_events[j][i], - mem_ptr->sub_streams_vec[j].stream(i), - mem_ptr->sub_streams_vec[j].gpu_index(i)); - cuda_stream_wait_event(streams.stream(0), mem_ptr->outgoing_events[j][i], + cuda_event_record(mem_ptr->outgoing_events[j * num_gpus + i], + mem_ptr->sub_streams[j].stream(i), + mem_ptr->sub_streams[j].gpu_index(i)); + cuda_stream_wait_event(streams.stream(0), + mem_ptr->outgoing_events[j * num_gpus + i], streams.gpu_index(0)); } } @@ -206,15 +210,16 @@ __host__ void host_aggregate_one_hot_vector( int_radix_params params = mem_ptr->params; uint32_t chunk_size = mem_ptr->chunk_size; uint32_t num_streams = mem_ptr->num_streams; + uint32_t num_gpus = mem_ptr->active_streams.count(); cuda_event_record(mem_ptr->incoming_event, streams.stream(0), streams.gpu_index(0)); for (uint32_t s 
= 0; s < num_streams; s++) { for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) { - cuda_stream_wait_event(mem_ptr->sub_streams_vec[s].stream(i), + cuda_stream_wait_event(mem_ptr->sub_streams[s].stream(i), mem_ptr->incoming_event, - mem_ptr->sub_streams_vec[s].gpu_index(i)); + mem_ptr->sub_streams[s].gpu_index(i)); } } @@ -223,7 +228,7 @@ __host__ void host_aggregate_one_hot_vector( for (uint32_t s = 0; s < num_streams; s++) { - CudaStreams current_stream = mem_ptr->sub_streams_vec[s]; + CudaStreams current_stream = mem_ptr->sub_streams[s]; CudaRadixCiphertextFFI *current_agg = mem_ptr->partial_aggregated_vectors[s]; @@ -287,10 +292,11 @@ __host__ void host_aggregate_one_hot_vector( for (uint32_t s = 0; s < num_streams; s++) { for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) { - cuda_event_record(mem_ptr->outgoing_events[s][i], - mem_ptr->sub_streams_vec[s].stream(i), - mem_ptr->sub_streams_vec[s].gpu_index(i)); - cuda_stream_wait_event(streams.stream(0), mem_ptr->outgoing_events[s][i], + cuda_event_record(mem_ptr->outgoing_events[s * num_gpus + i], + mem_ptr->sub_streams[s].stream(i), + mem_ptr->sub_streams[s].gpu_index(i)); + cuda_stream_wait_event(streams.stream(0), + mem_ptr->outgoing_events[s * num_gpus + i], streams.gpu_index(0)); } } @@ -322,8 +328,8 @@ __host__ void host_aggregate_one_hot_vector( streams.stream(0), streams.gpu_index(0), temp_agg, 0, num_blocks, final_agg, 0, num_blocks); - CudaStreams message_stream = mem_ptr->sub_streams_vec[0]; - CudaStreams carry_stream = mem_ptr->sub_streams_vec[1]; + CudaStreams message_stream = mem_ptr->sub_streams[0]; + CudaStreams carry_stream = mem_ptr->sub_streams[1]; cuda_event_record(mem_ptr->reduction_done_event, streams.stream(0), streams.gpu_index(0)); @@ -498,3 +504,609 @@ __host__ void host_unchecked_match_value_or( mem_ptr->tmp_match_result, mem_ptr->tmp_or_value, mem_ptr->cmux_buffer, bsks, (Torus **)ksks); } + +template +uint64_t +scratch_cuda_unchecked_contains(CudaStreams 
streams, + int_unchecked_contains_buffer **mem_ptr, + int_radix_params params, uint32_t num_inputs, + uint32_t num_blocks, bool allocate_gpu_memory) { + + uint64_t size_tracker = 0; + *mem_ptr = new int_unchecked_contains_buffer( + streams, params, num_inputs, num_blocks, allocate_gpu_memory, + size_tracker); + + return size_tracker; +} + +template +__host__ void +host_unchecked_contains(CudaStreams streams, CudaRadixCiphertextFFI *output, + CudaRadixCiphertextFFI const *inputs, + CudaRadixCiphertextFFI const *value, + uint32_t num_inputs, uint32_t num_blocks, + int_unchecked_contains_buffer *mem_ptr, + void *const *bsks, Torus *const *ksks) { + + cuda_event_record(mem_ptr->incoming_event, streams.stream(0), + streams.gpu_index(0)); + + for (uint32_t j = 0; j < mem_ptr->num_streams; j++) { + for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) { + cuda_stream_wait_event(mem_ptr->sub_streams[j].stream(i), + mem_ptr->incoming_event, + mem_ptr->sub_streams[j].gpu_index(i)); + } + } + + uint32_t num_streams = mem_ptr->num_streams; + uint32_t num_gpus = mem_ptr->active_streams.count(); + + for (uint32_t i = 0; i < num_inputs; i++) { + uint32_t stream_idx = i % num_streams; + CudaStreams current_stream = mem_ptr->sub_streams[stream_idx]; + + CudaRadixCiphertextFFI const *input_ct = &inputs[i]; + + CudaRadixCiphertextFFI current_selector_block; + as_radix_ciphertext_slice(¤t_selector_block, + mem_ptr->packed_selectors, i, i + 1); + + host_equality_check(current_stream, ¤t_selector_block, + input_ct, value, mem_ptr->eq_buffers[stream_idx], + bsks, ksks, num_blocks); + } + + for (uint32_t j = 0; j < mem_ptr->num_streams; j++) { + for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) { + cuda_event_record(mem_ptr->outgoing_events[j * num_gpus + i], + mem_ptr->sub_streams[j].stream(i), + mem_ptr->sub_streams[j].gpu_index(i)); + cuda_stream_wait_event(streams.stream(0), + mem_ptr->outgoing_events[j * num_gpus + i], + streams.gpu_index(0)); + } + } + + 
host_integer_is_at_least_one_comparisons_block_true( + streams, output, mem_ptr->packed_selectors, mem_ptr->reduction_buffer, + bsks, (Torus **)ksks, num_inputs); +} + +template +uint64_t scratch_cuda_unchecked_contains_clear( + CudaStreams streams, int_unchecked_contains_clear_buffer **mem_ptr, + int_radix_params params, uint32_t num_inputs, uint32_t num_blocks, + bool allocate_gpu_memory) { + + uint64_t size_tracker = 0; + *mem_ptr = new int_unchecked_contains_clear_buffer( + streams, params, num_inputs, num_blocks, allocate_gpu_memory, + size_tracker); + + return size_tracker; +} + +template +__host__ void host_unchecked_contains_clear( + CudaStreams streams, CudaRadixCiphertextFFI *output, + CudaRadixCiphertextFFI const *inputs, const uint64_t *h_clear_val, + uint32_t num_inputs, uint32_t num_blocks, + int_unchecked_contains_clear_buffer *mem_ptr, void *const *bsks, + Torus *const *ksks) { + + cuda_memcpy_async_to_gpu(mem_ptr->d_clear_val, h_clear_val, + num_blocks * sizeof(Torus), streams.stream(0), + streams.gpu_index(0)); + + set_trivial_radix_ciphertext_async( + streams.stream(0), streams.gpu_index(0), mem_ptr->tmp_clear_val, + mem_ptr->d_clear_val, (Torus *)h_clear_val, num_blocks, + mem_ptr->params.message_modulus, mem_ptr->params.carry_modulus); + + cuda_event_record(mem_ptr->incoming_event, streams.stream(0), + streams.gpu_index(0)); + + for (uint32_t j = 0; j < mem_ptr->num_streams; j++) { + for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) { + cuda_stream_wait_event(mem_ptr->sub_streams[j].stream(i), + mem_ptr->incoming_event, + mem_ptr->sub_streams[j].gpu_index(i)); + } + } + + uint32_t num_streams = mem_ptr->num_streams; + uint32_t num_gpus = mem_ptr->active_streams.count(); + + for (uint32_t i = 0; i < num_inputs; i++) { + uint32_t stream_idx = i % num_streams; + CudaStreams current_stream = mem_ptr->sub_streams[stream_idx]; + + CudaRadixCiphertextFFI const *input_ct = &inputs[i]; + + CudaRadixCiphertextFFI current_selector_block; + 
as_radix_ciphertext_slice(¤t_selector_block, + mem_ptr->packed_selectors, i, i + 1); + + host_equality_check(current_stream, ¤t_selector_block, + input_ct, mem_ptr->tmp_clear_val, + mem_ptr->eq_buffers[stream_idx], bsks, ksks, + num_blocks); + } + + for (uint32_t j = 0; j < mem_ptr->num_streams; j++) { + for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) { + cuda_event_record(mem_ptr->outgoing_events[j * num_gpus + i], + mem_ptr->sub_streams[j].stream(i), + mem_ptr->sub_streams[j].gpu_index(i)); + cuda_stream_wait_event(streams.stream(0), + mem_ptr->outgoing_events[j * num_gpus + i], + streams.gpu_index(0)); + } + } + + host_integer_is_at_least_one_comparisons_block_true( + streams, output, mem_ptr->packed_selectors, mem_ptr->reduction_buffer, + bsks, (Torus **)ksks, num_inputs); +} + +template +uint64_t scratch_cuda_unchecked_is_in_clears( + CudaStreams streams, int_unchecked_is_in_clears_buffer **mem_ptr, + int_radix_params params, uint32_t num_clears, uint32_t num_blocks, + bool allocate_gpu_memory) { + + uint64_t size_tracker = 0; + *mem_ptr = new int_unchecked_is_in_clears_buffer( + streams, params, num_clears, num_blocks, allocate_gpu_memory, + size_tracker); + + return size_tracker; +} + +template +__host__ void +host_unchecked_is_in_clears(CudaStreams streams, CudaRadixCiphertextFFI *output, + CudaRadixCiphertextFFI const *input, + const uint64_t *h_cleartexts, uint32_t num_clears, + uint32_t num_blocks, + int_unchecked_is_in_clears_buffer *mem_ptr, + void *const *bsks, Torus *const *ksks) { + + host_compute_equality_selectors(streams, mem_ptr->unpacked_selectors, + input, num_blocks, h_cleartexts, + mem_ptr->eq_buffer, bsks, ksks); + + host_integer_is_at_least_one_comparisons_block_true( + streams, output, mem_ptr->packed_selectors, mem_ptr->reduction_buffer, + bsks, (Torus **)ksks, num_clears); +} + +template +__host__ void host_compute_final_index_from_selectors( + CudaStreams streams, CudaRadixCiphertextFFI *index_ct, + 
CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *selectors, + uint32_t num_inputs, uint32_t num_blocks_index, + int_final_index_from_selectors_buffer *mem_ptr, void *const *bsks, + Torus *const *ksks) { + + for (uint32_t i = 0; i < num_inputs; i++) { + CudaRadixCiphertextFFI const *src_selector = &selectors[i]; + + copy_radix_ciphertext_slice_async( + streams.stream(0), streams.gpu_index(0), mem_ptr->packed_selectors, i, + i + 1, src_selector, 0, 1); + } + + uint32_t packed_len = (num_blocks_index + 1) / 2; + + host_create_possible_results( + streams, mem_ptr->possible_results_ct_list, mem_ptr->unpacked_selectors, + num_inputs, mem_ptr->h_indices, packed_len, mem_ptr->possible_results_buf, + bsks, ksks); + + host_aggregate_one_hot_vector( + streams, index_ct, mem_ptr->possible_results_ct_list, num_inputs, + packed_len, mem_ptr->aggregate_buf, bsks, ksks); + + host_integer_is_at_least_one_comparisons_block_true( + streams, match_ct, mem_ptr->packed_selectors, mem_ptr->reduction_buf, + bsks, (Torus **)ksks, num_inputs); +} + +template +uint64_t scratch_cuda_compute_final_index_from_selectors( + CudaStreams streams, int_final_index_from_selectors_buffer **mem_ptr, + int_radix_params params, uint32_t num_inputs, uint32_t num_blocks_index, + bool allocate_gpu_memory) { + + uint64_t size_tracker = 0; + *mem_ptr = new int_final_index_from_selectors_buffer( + streams, params, num_inputs, num_blocks_index, allocate_gpu_memory, + size_tracker); + + return size_tracker; +} + +template +uint64_t scratch_cuda_unchecked_index_in_clears( + CudaStreams streams, int_unchecked_index_in_clears_buffer **mem_ptr, + int_radix_params params, uint32_t num_clears, uint32_t num_blocks, + uint32_t num_blocks_index, bool allocate_gpu_memory) { + + uint64_t size_tracker = 0; + *mem_ptr = new int_unchecked_index_in_clears_buffer( + streams, params, num_clears, num_blocks, num_blocks_index, + allocate_gpu_memory, size_tracker); + + return size_tracker; +} + +template +__host__ 
void host_unchecked_index_in_clears( + CudaStreams streams, CudaRadixCiphertextFFI *index_ct, + CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *input, + const uint64_t *h_cleartexts, uint32_t num_clears, uint32_t num_blocks, + uint32_t num_blocks_index, + int_unchecked_index_in_clears_buffer *mem_ptr, void *const *bsks, + Torus *const *ksks) { + + host_compute_equality_selectors( + streams, mem_ptr->final_index_buf->unpacked_selectors, input, num_blocks, + h_cleartexts, mem_ptr->eq_selectors_buf, bsks, ksks); + + uint32_t packed_len = (num_blocks_index + 1) / 2; + + host_create_possible_results( + streams, mem_ptr->final_index_buf->possible_results_ct_list, + mem_ptr->final_index_buf->unpacked_selectors, num_clears, + mem_ptr->final_index_buf->h_indices, packed_len, + mem_ptr->final_index_buf->possible_results_buf, bsks, ksks); + + host_aggregate_one_hot_vector( + streams, index_ct, mem_ptr->final_index_buf->possible_results_ct_list, + num_clears, packed_len, mem_ptr->final_index_buf->aggregate_buf, bsks, + ksks); + + host_integer_is_at_least_one_comparisons_block_true( + streams, match_ct, mem_ptr->final_index_buf->packed_selectors, + mem_ptr->final_index_buf->reduction_buf, bsks, (Torus **)ksks, + num_clears); +} + +template +uint64_t scratch_cuda_unchecked_first_index_in_clears( + CudaStreams streams, + int_unchecked_first_index_in_clears_buffer **mem_ptr, + int_radix_params params, uint32_t num_unique, uint32_t num_blocks, + uint32_t num_blocks_index, bool allocate_gpu_memory) { + + uint64_t size_tracker = 0; + *mem_ptr = new int_unchecked_first_index_in_clears_buffer( + streams, params, num_unique, num_blocks, num_blocks_index, + allocate_gpu_memory, size_tracker); + + return size_tracker; +} + +template +__host__ void host_unchecked_first_index_in_clears( + CudaStreams streams, CudaRadixCiphertextFFI *index_ct, + CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *input, + const uint64_t *h_unique_values, const uint64_t 
*h_unique_indices, + uint32_t num_unique, uint32_t num_blocks, uint32_t num_blocks_index, + int_unchecked_first_index_in_clears_buffer *mem_ptr, + void *const *bsks, Torus *const *ksks) { + + host_compute_equality_selectors(streams, mem_ptr->unpacked_selectors, + input, num_blocks, h_unique_values, + mem_ptr->eq_selectors_buf, bsks, ksks); + + uint32_t packed_len = (num_blocks_index + 1) / 2; + + host_create_possible_results( + streams, mem_ptr->possible_results_ct_list, mem_ptr->unpacked_selectors, + num_unique, h_unique_indices, packed_len, mem_ptr->possible_results_buf, + bsks, ksks); + + host_aggregate_one_hot_vector( + streams, index_ct, mem_ptr->possible_results_ct_list, num_unique, + packed_len, mem_ptr->aggregate_buf, bsks, ksks); + + host_integer_is_at_least_one_comparisons_block_true( + streams, match_ct, mem_ptr->packed_selectors, mem_ptr->reduction_buf, + bsks, (Torus **)ksks, num_unique); +} + +template +uint64_t scratch_cuda_unchecked_first_index_of_clear( + CudaStreams streams, + int_unchecked_first_index_of_clear_buffer **mem_ptr, + int_radix_params params, uint32_t num_inputs, uint32_t num_blocks, + uint32_t num_blocks_index, bool allocate_gpu_memory) { + + uint64_t size_tracker = 0; + *mem_ptr = new int_unchecked_first_index_of_clear_buffer( + streams, params, num_inputs, num_blocks, num_blocks_index, + allocate_gpu_memory, size_tracker); + + return size_tracker; +} + +template +__host__ void host_unchecked_first_index_of_clear( + CudaStreams streams, CudaRadixCiphertextFFI *index_ct, + CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *inputs, + const uint64_t *h_clear_val, uint32_t num_inputs, uint32_t num_blocks, + uint32_t num_blocks_index, + int_unchecked_first_index_of_clear_buffer *mem_ptr, + void *const *bsks, Torus *const *ksks) { + + cuda_memcpy_async_to_gpu(mem_ptr->d_clear_val, h_clear_val, + num_blocks * sizeof(Torus), streams.stream(0), + streams.gpu_index(0)); + + set_trivial_radix_ciphertext_async( + streams.stream(0), 
streams.gpu_index(0), mem_ptr->tmp_clear_val, + mem_ptr->d_clear_val, (Torus *)h_clear_val, num_blocks, + mem_ptr->params.message_modulus, mem_ptr->params.carry_modulus); + + cuda_event_record(mem_ptr->incoming_event, streams.stream(0), + streams.gpu_index(0)); + + for (uint32_t j = 0; j < mem_ptr->num_streams; j++) { + for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) { + cuda_stream_wait_event(mem_ptr->sub_streams[j].stream(i), + mem_ptr->incoming_event, + mem_ptr->sub_streams[j].gpu_index(i)); + } + } + + uint32_t num_streams = mem_ptr->num_streams; + uint32_t num_gpus = mem_ptr->active_streams.count(); + + for (uint32_t i = 0; i < num_inputs; i++) { + uint32_t stream_idx = i % num_streams; + CudaStreams current_stream = mem_ptr->sub_streams[stream_idx]; + + CudaRadixCiphertextFFI const *input_ct = &inputs[i]; + + CudaRadixCiphertextFFI current_selector_block; + as_radix_ciphertext_slice<Torus>(&current_selector_block, + mem_ptr->packed_selectors, i, i + 1); + + host_equality_check<Torus>(current_stream, &current_selector_block, + input_ct, mem_ptr->tmp_clear_val, + mem_ptr->eq_buffers[stream_idx], bsks, ksks, + num_blocks); + } + + for (uint32_t j = 0; j < mem_ptr->num_streams; j++) { + for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) { + cuda_event_record(mem_ptr->outgoing_events[j * num_gpus + i], + mem_ptr->sub_streams[j].stream(i), + mem_ptr->sub_streams[j].gpu_index(i)); + cuda_stream_wait_event(streams.stream(0), + mem_ptr->outgoing_events[j * num_gpus + i], + streams.gpu_index(0)); + } + } + + for (uint32_t offset = 1; offset < num_inputs; offset <<= 1) { + uint32_t count = num_inputs - offset; + + CudaRadixCiphertextFFI current_slice; + as_radix_ciphertext_slice<Torus>(&current_slice, mem_ptr->packed_selectors, + offset, num_inputs); + + CudaRadixCiphertextFFI prev_slice; + as_radix_ciphertext_slice<Torus>(&prev_slice, mem_ptr->packed_selectors, 0, + count); + + integer_radix_apply_bivariate_lookup_table<Torus>( + streams, &current_slice, &current_slice, &prev_slice, bsks, ksks, + 
mem_ptr->prefix_sum_lut, count, mem_ptr->params.message_modulus); + } + + integer_radix_apply_univariate_lookup_table( + streams, mem_ptr->packed_selectors, mem_ptr->packed_selectors, bsks, ksks, + mem_ptr->cleanup_lut, num_inputs); + + uint32_t packed_len = (num_blocks_index + 1) / 2; + + host_create_possible_results( + streams, mem_ptr->possible_results_ct_list, mem_ptr->unpacked_selectors, + num_inputs, (const uint64_t *)mem_ptr->h_indices, packed_len, + mem_ptr->possible_results_buf, bsks, ksks); + + host_aggregate_one_hot_vector( + streams, index_ct, mem_ptr->possible_results_ct_list, num_inputs, + packed_len, mem_ptr->aggregate_buf, bsks, ksks); + + host_integer_is_at_least_one_comparisons_block_true( + streams, match_ct, mem_ptr->packed_selectors, mem_ptr->reduction_buf, + bsks, (Torus **)ksks, num_inputs); +} + +template +uint64_t scratch_cuda_unchecked_first_index_of( + CudaStreams streams, int_unchecked_first_index_of_buffer **mem_ptr, + int_radix_params params, uint32_t num_inputs, uint32_t num_blocks, + uint32_t num_blocks_index, bool allocate_gpu_memory) { + + uint64_t size_tracker = 0; + *mem_ptr = new int_unchecked_first_index_of_buffer( + streams, params, num_inputs, num_blocks, num_blocks_index, + allocate_gpu_memory, size_tracker); + + return size_tracker; +} + +template +__host__ void host_unchecked_first_index_of( + CudaStreams streams, CudaRadixCiphertextFFI *index_ct, + CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *inputs, + CudaRadixCiphertextFFI const *value, uint32_t num_inputs, + uint32_t num_blocks, uint32_t num_blocks_index, + int_unchecked_first_index_of_buffer *mem_ptr, void *const *bsks, + Torus *const *ksks) { + + cuda_event_record(mem_ptr->incoming_event, streams.stream(0), + streams.gpu_index(0)); + + for (uint32_t j = 0; j < mem_ptr->num_streams; j++) { + for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) { + cuda_stream_wait_event(mem_ptr->sub_streams[j].stream(i), + mem_ptr->incoming_event, + 
mem_ptr->sub_streams[j].gpu_index(i)); + } + } + + uint32_t num_streams = mem_ptr->num_streams; + uint32_t num_gpus = mem_ptr->active_streams.count(); + + for (uint32_t i = 0; i < num_inputs; i++) { + uint32_t stream_idx = i % num_streams; + CudaStreams current_stream = mem_ptr->sub_streams[stream_idx]; + + CudaRadixCiphertextFFI const *input_ct = &inputs[i]; + + CudaRadixCiphertextFFI current_selector_block; + as_radix_ciphertext_slice<Torus>(&current_selector_block, + mem_ptr->packed_selectors, i, i + 1); + + host_equality_check<Torus>(current_stream, &current_selector_block, + input_ct, value, mem_ptr->eq_buffers[stream_idx], + bsks, ksks, num_blocks); + } + + for (uint32_t j = 0; j < mem_ptr->num_streams; j++) { + for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) { + cuda_event_record(mem_ptr->outgoing_events[j * num_gpus + i], + mem_ptr->sub_streams[j].stream(i), + mem_ptr->sub_streams[j].gpu_index(i)); + cuda_stream_wait_event(streams.stream(0), + mem_ptr->outgoing_events[j * num_gpus + i], + streams.gpu_index(0)); + } + } + + for (uint32_t offset = 1; offset < num_inputs; offset <<= 1) { + uint32_t count = num_inputs - offset; + + CudaRadixCiphertextFFI current_slice; + as_radix_ciphertext_slice<Torus>(&current_slice, mem_ptr->packed_selectors, + offset, num_inputs); + + CudaRadixCiphertextFFI prev_slice; + as_radix_ciphertext_slice<Torus>(&prev_slice, mem_ptr->packed_selectors, 0, + count); + + integer_radix_apply_bivariate_lookup_table<Torus>( + streams, &current_slice, &current_slice, &prev_slice, bsks, ksks, + mem_ptr->prefix_sum_lut, count, mem_ptr->params.message_modulus); + } + + integer_radix_apply_univariate_lookup_table<Torus>( + streams, mem_ptr->packed_selectors, mem_ptr->packed_selectors, bsks, ksks, + mem_ptr->cleanup_lut, num_inputs); + + uint32_t packed_len = (num_blocks_index + 1) / 2; + + host_create_possible_results<Torus>( + streams, mem_ptr->possible_results_ct_list, mem_ptr->unpacked_selectors, + num_inputs, (const uint64_t *)mem_ptr->h_indices, packed_len, + mem_ptr->possible_results_buf, bsks, 
ksks); + + host_aggregate_one_hot_vector<Torus>( + streams, index_ct, mem_ptr->possible_results_ct_list, num_inputs, + packed_len, mem_ptr->aggregate_buf, bsks, ksks); + + host_integer_is_at_least_one_comparisons_block_true<Torus>( + streams, match_ct, mem_ptr->packed_selectors, mem_ptr->reduction_buf, + bsks, (Torus **)ksks, num_inputs); +} + +template <typename Torus> +uint64_t scratch_cuda_unchecked_index_of( + CudaStreams streams, int_unchecked_index_of_buffer<Torus> **mem_ptr, + int_radix_params params, uint32_t num_inputs, uint32_t num_blocks, + uint32_t num_blocks_index, bool allocate_gpu_memory) { + + uint64_t size_tracker = 0; + *mem_ptr = new int_unchecked_index_of_buffer<Torus>( + streams, params, num_inputs, num_blocks, num_blocks_index, + allocate_gpu_memory, size_tracker); + + return size_tracker; +} + +template <typename Torus> +__host__ void host_unchecked_index_of( + CudaStreams streams, CudaRadixCiphertextFFI *index_ct, + CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *inputs, + CudaRadixCiphertextFFI const *value, uint32_t num_inputs, + uint32_t num_blocks, uint32_t num_blocks_index, + int_unchecked_index_of_buffer<Torus> *mem_ptr, void *const *bsks, + Torus *const *ksks) { + + cuda_event_record(mem_ptr->incoming_event, streams.stream(0), + streams.gpu_index(0)); + + for (uint32_t j = 0; j < mem_ptr->num_streams; j++) { + for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) { + cuda_stream_wait_event(mem_ptr->sub_streams[j].stream(i), + mem_ptr->incoming_event, + mem_ptr->sub_streams[j].gpu_index(i)); + } + } + + uint32_t num_streams = mem_ptr->num_streams; + uint32_t num_gpus = mem_ptr->active_streams.count(); + + for (uint32_t i = 0; i < num_inputs; i++) { + uint32_t stream_idx = i % num_streams; + CudaStreams current_stream = mem_ptr->sub_streams[stream_idx]; + + CudaRadixCiphertextFFI const *input_ct = &inputs[i]; + + CudaRadixCiphertextFFI current_selector_block; + as_radix_ciphertext_slice<Torus>(&current_selector_block, + mem_ptr->final_index_buf->packed_selectors, + i, i + 1); + + 
host_equality_check<Torus>(current_stream, &current_selector_block, + input_ct, value, mem_ptr->eq_buffers[stream_idx], + bsks, ksks, num_blocks); + } + + for (uint32_t j = 0; j < mem_ptr->num_streams; j++) { + for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) { + cuda_event_record(mem_ptr->outgoing_events[j * num_gpus + i], + mem_ptr->sub_streams[j].stream(i), + mem_ptr->sub_streams[j].gpu_index(i)); + cuda_stream_wait_event(streams.stream(0), + mem_ptr->outgoing_events[j * num_gpus + i], + streams.gpu_index(0)); + } + } + + uint32_t packed_len = (num_blocks_index + 1) / 2; + + host_create_possible_results<Torus>( + streams, mem_ptr->final_index_buf->possible_results_ct_list, + mem_ptr->final_index_buf->unpacked_selectors, num_inputs, + (const uint64_t *)mem_ptr->final_index_buf->h_indices, packed_len, + mem_ptr->final_index_buf->possible_results_buf, bsks, ksks); + + host_aggregate_one_hot_vector<Torus>( + streams, index_ct, mem_ptr->final_index_buf->possible_results_ct_list, + num_inputs, packed_len, mem_ptr->final_index_buf->aggregate_buf, bsks, + ksks); + + host_integer_is_at_least_one_comparisons_block_true<Torus>( + streams, match_ct, mem_ptr->final_index_buf->packed_selectors, + mem_ptr->final_index_buf->reduction_buf, bsks, (Torus **)ksks, + num_inputs); +} diff --git a/backends/tfhe-cuda-backend/src/bindings.rs b/backends/tfhe-cuda-backend/src/bindings.rs index de14e045e..875a520c4 100644 --- a/backends/tfhe-cuda-backend/src/bindings.rs +++ b/backends/tfhe-cuda-backend/src/bindings.rs @@ -1100,45 +1100,6 @@ unsafe extern "C" { unsafe extern "C" { pub fn cleanup_cuda_integer_div_rem(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8); } -unsafe extern "C" { - pub fn scratch_cuda_integer_compute_prefix_sum_hillis_steele_64( - streams: CudaStreamsFFI, - mem_ptr: *mut *mut i8, - input_lut: *const ffi::c_void, - lwe_dimension: u32, - glwe_dimension: u32, - polynomial_size: u32, - ks_level: u32, - ks_base_log: u32, - pbs_level: u32, - pbs_base_log: u32, - grouping_factor: u32, - 
num_radix_blocks: u32, - message_modulus: u32, - carry_modulus: u32, - pbs_type: PBS_TYPE, - lut_degree: u64, - allocate_gpu_memory: bool, - noise_reduction_type: PBS_MS_REDUCTION_T, - ) -> u64; -} -unsafe extern "C" { - pub fn cuda_integer_compute_prefix_sum_hillis_steele_64( - streams: CudaStreamsFFI, - output_radix_lwe: *mut CudaRadixCiphertextFFI, - generates_or_propagates: *mut CudaRadixCiphertextFFI, - mem_ptr: *mut i8, - ksks: *const *mut ffi::c_void, - bsks: *const *mut ffi::c_void, - num_blocks: u32, - ); -} -unsafe extern "C" { - pub fn cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64( - streams: CudaStreamsFFI, - mem_ptr_void: *mut *mut i8, - ); -} unsafe extern "C" { pub fn cuda_integer_reverse_blocks_64_inplace( streams: CudaStreamsFFI, @@ -1715,127 +1676,6 @@ unsafe extern "C" { unsafe extern "C" { pub fn cleanup_cuda_integer_ilog2_64(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8); } -unsafe extern "C" { - pub fn scratch_cuda_compute_equality_selectors_64( - streams: CudaStreamsFFI, - mem_ptr: *mut *mut i8, - glwe_dimension: u32, - polynomial_size: u32, - big_lwe_dimension: u32, - small_lwe_dimension: u32, - ks_level: u32, - ks_base_log: u32, - pbs_level: u32, - pbs_base_log: u32, - grouping_factor: u32, - num_possible_values: u32, - num_blocks: u32, - message_modulus: u32, - carry_modulus: u32, - pbs_type: PBS_TYPE, - allocate_gpu_memory: bool, - noise_reduction_type: PBS_MS_REDUCTION_T, - ) -> u64; -} -unsafe extern "C" { - pub fn cuda_compute_equality_selectors_64( - streams: CudaStreamsFFI, - lwe_array_out_list: *mut CudaRadixCiphertextFFI, - lwe_array_in: *const CudaRadixCiphertextFFI, - num_blocks: u32, - h_decomposed_cleartexts: *const u64, - mem: *mut i8, - bsks: *const *mut ffi::c_void, - ksks: *const *mut ffi::c_void, - ); -} -unsafe extern "C" { - pub fn cleanup_cuda_compute_equality_selectors_64( - streams: CudaStreamsFFI, - mem_ptr_void: *mut *mut i8, - ); -} -unsafe extern "C" { - pub fn 
scratch_cuda_create_possible_results_64( - streams: CudaStreamsFFI, - mem_ptr: *mut *mut i8, - glwe_dimension: u32, - polynomial_size: u32, - big_lwe_dimension: u32, - small_lwe_dimension: u32, - ks_level: u32, - ks_base_log: u32, - pbs_level: u32, - pbs_base_log: u32, - grouping_factor: u32, - num_possible_values: u32, - num_blocks: u32, - message_modulus: u32, - carry_modulus: u32, - pbs_type: PBS_TYPE, - allocate_gpu_memory: bool, - noise_reduction_type: PBS_MS_REDUCTION_T, - ) -> u64; -} -unsafe extern "C" { - pub fn cuda_create_possible_results_64( - streams: CudaStreamsFFI, - lwe_array_out_list: *mut CudaRadixCiphertextFFI, - lwe_array_in_list: *const CudaRadixCiphertextFFI, - num_possible_values: u32, - h_decomposed_cleartexts: *const u64, - num_blocks: u32, - mem: *mut i8, - bsks: *const *mut ffi::c_void, - ksks: *const *mut ffi::c_void, - ); -} -unsafe extern "C" { - pub fn cleanup_cuda_create_possible_results_64( - streams: CudaStreamsFFI, - mem_ptr_void: *mut *mut i8, - ); -} -unsafe extern "C" { - pub fn scratch_cuda_aggregate_one_hot_vector_64( - streams: CudaStreamsFFI, - mem_ptr: *mut *mut i8, - glwe_dimension: u32, - polynomial_size: u32, - big_lwe_dimension: u32, - small_lwe_dimension: u32, - ks_level: u32, - ks_base_log: u32, - pbs_level: u32, - pbs_base_log: u32, - grouping_factor: u32, - num_blocks: u32, - num_matches: u32, - message_modulus: u32, - carry_modulus: u32, - pbs_type: PBS_TYPE, - allocate_gpu_memory: bool, - noise_reduction_type: PBS_MS_REDUCTION_T, - ) -> u64; -} -unsafe extern "C" { - pub fn cuda_aggregate_one_hot_vector_64( - streams: CudaStreamsFFI, - lwe_array_out: *mut CudaRadixCiphertextFFI, - lwe_array_in_list: *const CudaRadixCiphertextFFI, - num_input_ciphertexts: u32, - num_blocks: u32, - mem: *mut i8, - bsks: *const *mut ffi::c_void, - ksks: *const *mut ffi::c_void, - ); -} -unsafe extern "C" { - pub fn cleanup_cuda_aggregate_one_hot_vector_64( - streams: CudaStreamsFFI, - mem_ptr_void: *mut *mut i8, - ); -} unsafe 
extern "C" { pub fn scratch_cuda_unchecked_match_value_64( streams: CudaStreamsFFI, @@ -1962,6 +1802,385 @@ unsafe extern "C" { mem_ptr_void: *mut *mut i8, ); } +unsafe extern "C" { + pub fn scratch_cuda_unchecked_contains_64( + streams: CudaStreamsFFI, + mem_ptr: *mut *mut i8, + glwe_dimension: u32, + polynomial_size: u32, + big_lwe_dimension: u32, + small_lwe_dimension: u32, + ks_level: u32, + ks_base_log: u32, + pbs_level: u32, + pbs_base_log: u32, + grouping_factor: u32, + num_inputs: u32, + num_blocks: u32, + message_modulus: u32, + carry_modulus: u32, + pbs_type: PBS_TYPE, + allocate_gpu_memory: bool, + noise_reduction_type: PBS_MS_REDUCTION_T, + ) -> u64; +} +unsafe extern "C" { + pub fn cuda_unchecked_contains_64( + streams: CudaStreamsFFI, + output: *mut CudaRadixCiphertextFFI, + inputs: *const CudaRadixCiphertextFFI, + value: *const CudaRadixCiphertextFFI, + num_inputs: u32, + num_blocks: u32, + mem: *mut i8, + bsks: *const *mut ffi::c_void, + ksks: *const *mut ffi::c_void, + ); +} +unsafe extern "C" { + pub fn cleanup_cuda_unchecked_contains_64(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8); +} +unsafe extern "C" { + pub fn scratch_cuda_unchecked_contains_clear_64( + streams: CudaStreamsFFI, + mem_ptr: *mut *mut i8, + glwe_dimension: u32, + polynomial_size: u32, + big_lwe_dimension: u32, + small_lwe_dimension: u32, + ks_level: u32, + ks_base_log: u32, + pbs_level: u32, + pbs_base_log: u32, + grouping_factor: u32, + num_inputs: u32, + num_blocks: u32, + message_modulus: u32, + carry_modulus: u32, + pbs_type: PBS_TYPE, + allocate_gpu_memory: bool, + noise_reduction_type: PBS_MS_REDUCTION_T, + ) -> u64; +} +unsafe extern "C" { + pub fn cuda_unchecked_contains_clear_64( + streams: CudaStreamsFFI, + output: *mut CudaRadixCiphertextFFI, + inputs: *const CudaRadixCiphertextFFI, + h_clear_val: *const u64, + num_inputs: u32, + num_blocks: u32, + mem: *mut i8, + bsks: *const *mut ffi::c_void, + ksks: *const *mut ffi::c_void, + ); +} +unsafe extern "C" { + 
pub fn cleanup_cuda_unchecked_contains_clear_64( + streams: CudaStreamsFFI, + mem_ptr_void: *mut *mut i8, + ); +} +unsafe extern "C" { + pub fn scratch_cuda_unchecked_is_in_clears_64( + streams: CudaStreamsFFI, + mem_ptr: *mut *mut i8, + glwe_dimension: u32, + polynomial_size: u32, + big_lwe_dimension: u32, + small_lwe_dimension: u32, + ks_level: u32, + ks_base_log: u32, + pbs_level: u32, + pbs_base_log: u32, + grouping_factor: u32, + num_clears: u32, + num_blocks: u32, + message_modulus: u32, + carry_modulus: u32, + pbs_type: PBS_TYPE, + allocate_gpu_memory: bool, + noise_reduction_type: PBS_MS_REDUCTION_T, + ) -> u64; +} +unsafe extern "C" { + pub fn cuda_unchecked_is_in_clears_64( + streams: CudaStreamsFFI, + output: *mut CudaRadixCiphertextFFI, + input: *const CudaRadixCiphertextFFI, + h_cleartexts: *const u64, + num_clears: u32, + num_blocks: u32, + mem: *mut i8, + bsks: *const *mut ffi::c_void, + ksks: *const *mut ffi::c_void, + ); +} +unsafe extern "C" { + pub fn cleanup_cuda_unchecked_is_in_clears_64( + streams: CudaStreamsFFI, + mem_ptr_void: *mut *mut i8, + ); +} +unsafe extern "C" { + pub fn scratch_cuda_compute_final_index_from_selectors_64( + streams: CudaStreamsFFI, + mem_ptr: *mut *mut i8, + glwe_dimension: u32, + polynomial_size: u32, + big_lwe_dimension: u32, + small_lwe_dimension: u32, + ks_level: u32, + ks_base_log: u32, + pbs_level: u32, + pbs_base_log: u32, + grouping_factor: u32, + num_inputs: u32, + num_blocks_index: u32, + message_modulus: u32, + carry_modulus: u32, + pbs_type: PBS_TYPE, + allocate_gpu_memory: bool, + noise_reduction_type: PBS_MS_REDUCTION_T, + ) -> u64; +} +unsafe extern "C" { + pub fn cuda_compute_final_index_from_selectors_64( + streams: CudaStreamsFFI, + index_ct: *mut CudaRadixCiphertextFFI, + match_ct: *mut CudaRadixCiphertextFFI, + selectors: *const CudaRadixCiphertextFFI, + num_inputs: u32, + num_blocks_index: u32, + mem: *mut i8, + bsks: *const *mut ffi::c_void, + ksks: *const *mut ffi::c_void, + ); +} +unsafe 
extern "C" { + pub fn cleanup_cuda_compute_final_index_from_selectors_64( + streams: CudaStreamsFFI, + mem_ptr_void: *mut *mut i8, + ); +} +unsafe extern "C" { + pub fn scratch_cuda_unchecked_index_in_clears_64( + streams: CudaStreamsFFI, + mem_ptr: *mut *mut i8, + glwe_dimension: u32, + polynomial_size: u32, + big_lwe_dimension: u32, + small_lwe_dimension: u32, + ks_level: u32, + ks_base_log: u32, + pbs_level: u32, + pbs_base_log: u32, + grouping_factor: u32, + num_clears: u32, + num_blocks: u32, + num_blocks_index: u32, + message_modulus: u32, + carry_modulus: u32, + pbs_type: PBS_TYPE, + allocate_gpu_memory: bool, + noise_reduction_type: PBS_MS_REDUCTION_T, + ) -> u64; +} +unsafe extern "C" { + pub fn cuda_unchecked_index_in_clears_64( + streams: CudaStreamsFFI, + index_ct: *mut CudaRadixCiphertextFFI, + match_ct: *mut CudaRadixCiphertextFFI, + input: *const CudaRadixCiphertextFFI, + h_cleartexts: *const u64, + num_clears: u32, + num_blocks: u32, + num_blocks_index: u32, + mem: *mut i8, + bsks: *const *mut ffi::c_void, + ksks: *const *mut ffi::c_void, + ); +} +unsafe extern "C" { + pub fn cleanup_cuda_unchecked_index_in_clears_64( + streams: CudaStreamsFFI, + mem_ptr_void: *mut *mut i8, + ); +} +unsafe extern "C" { + pub fn scratch_cuda_unchecked_first_index_in_clears_64( + streams: CudaStreamsFFI, + mem_ptr: *mut *mut i8, + glwe_dimension: u32, + polynomial_size: u32, + big_lwe_dimension: u32, + small_lwe_dimension: u32, + ks_level: u32, + ks_base_log: u32, + pbs_level: u32, + pbs_base_log: u32, + grouping_factor: u32, + num_unique: u32, + num_blocks: u32, + num_blocks_index: u32, + message_modulus: u32, + carry_modulus: u32, + pbs_type: PBS_TYPE, + allocate_gpu_memory: bool, + noise_reduction_type: PBS_MS_REDUCTION_T, + ) -> u64; +} +unsafe extern "C" { + pub fn cuda_unchecked_first_index_in_clears_64( + streams: CudaStreamsFFI, + index_ct: *mut CudaRadixCiphertextFFI, + match_ct: *mut CudaRadixCiphertextFFI, + input: *const CudaRadixCiphertextFFI, + 
h_unique_values: *const u64, + h_unique_indices: *const u64, + num_unique: u32, + num_blocks: u32, + num_blocks_index: u32, + mem: *mut i8, + bsks: *const *mut ffi::c_void, + ksks: *const *mut ffi::c_void, + ); +} +unsafe extern "C" { + pub fn cleanup_cuda_unchecked_first_index_in_clears_64( + streams: CudaStreamsFFI, + mem_ptr_void: *mut *mut i8, + ); +} +unsafe extern "C" { + pub fn scratch_cuda_unchecked_first_index_of_clear_64( + streams: CudaStreamsFFI, + mem_ptr: *mut *mut i8, + glwe_dimension: u32, + polynomial_size: u32, + big_lwe_dimension: u32, + small_lwe_dimension: u32, + ks_level: u32, + ks_base_log: u32, + pbs_level: u32, + pbs_base_log: u32, + grouping_factor: u32, + num_inputs: u32, + num_blocks: u32, + num_blocks_index: u32, + message_modulus: u32, + carry_modulus: u32, + pbs_type: PBS_TYPE, + allocate_gpu_memory: bool, + noise_reduction_type: PBS_MS_REDUCTION_T, + ) -> u64; +} +unsafe extern "C" { + pub fn cuda_unchecked_first_index_of_clear_64( + streams: CudaStreamsFFI, + index_ct: *mut CudaRadixCiphertextFFI, + match_ct: *mut CudaRadixCiphertextFFI, + inputs: *const CudaRadixCiphertextFFI, + h_clear_val: *const u64, + num_inputs: u32, + num_blocks: u32, + num_blocks_index: u32, + mem: *mut i8, + bsks: *const *mut ffi::c_void, + ksks: *const *mut ffi::c_void, + ); +} +unsafe extern "C" { + pub fn cleanup_cuda_unchecked_first_index_of_clear_64( + streams: CudaStreamsFFI, + mem_ptr_void: *mut *mut i8, + ); +} +unsafe extern "C" { + pub fn scratch_cuda_unchecked_first_index_of_64( + streams: CudaStreamsFFI, + mem_ptr: *mut *mut i8, + glwe_dimension: u32, + polynomial_size: u32, + big_lwe_dimension: u32, + small_lwe_dimension: u32, + ks_level: u32, + ks_base_log: u32, + pbs_level: u32, + pbs_base_log: u32, + grouping_factor: u32, + num_inputs: u32, + num_blocks: u32, + num_blocks_index: u32, + message_modulus: u32, + carry_modulus: u32, + pbs_type: PBS_TYPE, + allocate_gpu_memory: bool, + noise_reduction_type: PBS_MS_REDUCTION_T, + ) -> u64; +} 
+unsafe extern "C" { + pub fn cuda_unchecked_first_index_of_64( + streams: CudaStreamsFFI, + index_ct: *mut CudaRadixCiphertextFFI, + match_ct: *mut CudaRadixCiphertextFFI, + inputs: *const CudaRadixCiphertextFFI, + value: *const CudaRadixCiphertextFFI, + num_inputs: u32, + num_blocks: u32, + num_blocks_index: u32, + mem: *mut i8, + bsks: *const *mut ffi::c_void, + ksks: *const *mut ffi::c_void, + ); +} +unsafe extern "C" { + pub fn cleanup_cuda_unchecked_first_index_of_64( + streams: CudaStreamsFFI, + mem_ptr_void: *mut *mut i8, + ); +} +unsafe extern "C" { + pub fn scratch_cuda_unchecked_index_of_64( + streams: CudaStreamsFFI, + mem_ptr: *mut *mut i8, + glwe_dimension: u32, + polynomial_size: u32, + big_lwe_dimension: u32, + small_lwe_dimension: u32, + ks_level: u32, + ks_base_log: u32, + pbs_level: u32, + pbs_base_log: u32, + grouping_factor: u32, + num_inputs: u32, + num_blocks: u32, + num_blocks_index: u32, + message_modulus: u32, + carry_modulus: u32, + pbs_type: PBS_TYPE, + allocate_gpu_memory: bool, + noise_reduction_type: PBS_MS_REDUCTION_T, + ) -> u64; +} +unsafe extern "C" { + pub fn cuda_unchecked_index_of_64( + streams: CudaStreamsFFI, + index_ct: *mut CudaRadixCiphertextFFI, + match_ct: *mut CudaRadixCiphertextFFI, + inputs: *const CudaRadixCiphertextFFI, + value: *const CudaRadixCiphertextFFI, + num_inputs: u32, + num_blocks: u32, + num_blocks_index: u32, + mem: *mut i8, + bsks: *const *mut ffi::c_void, + ksks: *const *mut ffi::c_void, + ); +} +unsafe extern "C" { + pub fn cleanup_cuda_unchecked_index_of_64(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8); +} unsafe extern "C" { pub fn scratch_cuda_integer_compress_radix_ciphertext_64( streams: CudaStreamsFFI, diff --git a/tfhe/src/integer/gpu/mod.rs b/tfhe/src/integer/gpu/mod.rs index a6e65b5ee..98b47a6ca 100644 --- a/tfhe/src/integer/gpu/mod.rs +++ b/tfhe/src/integer/gpu/mod.rs @@ -32,10 +32,13 @@ use crate::prelude::{CastFrom, CastInto}; use crate::shortint::ciphertext::{Degree, NoiseLevel}; 
use crate::shortint::parameters::ModulusSwitchType; use crate::shortint::{CarryModulus, MessageModulus}; +use crate::MatchValues; use itertools::Itertools; +use rayon::prelude::*; pub use server_key::CudaServerKey; use std::any::TypeId; use std::cmp::min; +use std::hash::Hash; use tfhe_cuda_backend::bindings::*; use tfhe_cuda_backend::cuda_bind::*; @@ -6795,124 +6798,6 @@ pub(crate) unsafe fn cuda_backend_ilog2( update_noise_degree(output, &cuda_ffi_output); } -#[allow(clippy::too_many_arguments)] -/// # Safety -/// -/// - The data must not be moved or dropped while being used by the CUDA kernel. -/// - This function assumes exclusive access to the passed data; violating this may lead to -/// undefined behavior. -pub(crate) unsafe fn cuda_backend_compute_prefix_sum_hillis_steele< - T: UnsignedInteger, - B: Numeric, ->( - streams: &CudaStreams, - output: &mut CudaSliceMut, - output_degrees: &mut Vec, - output_noise_levels: &mut Vec, - generates_or_propagates: &mut CudaSliceMut, - generates_or_propagates_degrees: &mut Vec, - generates_or_propagates_noise_levels: &mut Vec, - input_lut: &[T], - lut_degree: u64, - bootstrapping_key: &CudaVec, - keyswitch_key: &CudaVec, - lwe_dimension: LweDimension, - glwe_dimension: GlweDimension, - polynomial_size: PolynomialSize, - ks_level: DecompositionLevelCount, - ks_base_log: DecompositionBaseLog, - pbs_level: DecompositionLevelCount, - pbs_base_log: DecompositionBaseLog, - num_blocks: u32, - message_modulus: MessageModulus, - carry_modulus: CarryModulus, - pbs_type: PBSType, - grouping_factor: LweBskGroupingFactor, - ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>, -) { - assert_eq!( - streams.gpu_indexes[0], - generates_or_propagates.gpu_index(0), - "GPU error: first stream is on GPU {}, first generates_or_propagates pointer is on GPU {}", - streams.gpu_indexes[0].get(), - generates_or_propagates.gpu_index(0).get(), - ); - assert_eq!( - streams.gpu_indexes[0], - output.gpu_index(0), - 
"GPU error: first stream is on GPU {}, first output pointer is on GPU {}", - streams.gpu_indexes[0].get(), - output.gpu_index(0).get(), - ); - assert_eq!( - streams.gpu_indexes[0], - bootstrapping_key.gpu_index(0), - "GPU error: first stream is on GPU {}, first bsk pointer is on GPU {}", - streams.gpu_indexes[0].get(), - bootstrapping_key.gpu_index(0).get(), - ); - assert_eq!( - streams.gpu_indexes[0], - keyswitch_key.gpu_index(0), - "GPU error: first stream is on GPU {}, first ksk pointer is on GPU {}", - streams.gpu_indexes[0].get(), - keyswitch_key.gpu_index(0).get(), - ); - - let noise_reduction_type = resolve_ms_noise_reduction_config(ms_noise_reduction_configuration); - - let mut mem_ptr: *mut i8 = std::ptr::null_mut(); - let mut cuda_ffi_output = prepare_cuda_radix_ffi_from_slice_mut( - output, - output_degrees, - output_noise_levels, - num_blocks, - (glwe_dimension.0 * polynomial_size.0) as u32, - ); - let mut cuda_ffi_generates_or_propagates = prepare_cuda_radix_ffi_from_slice_mut( - generates_or_propagates, - generates_or_propagates_degrees, - generates_or_propagates_noise_levels, - num_blocks, - (glwe_dimension.0 * polynomial_size.0) as u32, - ); - scratch_cuda_integer_compute_prefix_sum_hillis_steele_64( - streams.ffi(), - std::ptr::addr_of_mut!(mem_ptr), - input_lut.as_ptr().cast(), - lwe_dimension.0 as u32, - glwe_dimension.0 as u32, - polynomial_size.0 as u32, - ks_level.0 as u32, - ks_base_log.0 as u32, - pbs_level.0 as u32, - pbs_base_log.0 as u32, - grouping_factor.0 as u32, - num_blocks, - message_modulus.0 as u32, - carry_modulus.0 as u32, - pbs_type as u32, - lut_degree, - true, - noise_reduction_type as u32, - ); - - cuda_integer_compute_prefix_sum_hillis_steele_64( - streams.ffi(), - &raw mut cuda_ffi_output, - &raw mut cuda_ffi_generates_or_propagates, - mem_ptr, - keyswitch_key.ptr.as_ptr(), - bootstrapping_key.ptr.as_ptr(), - num_blocks, - ); - - cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64( - streams.ffi(), - 
std::ptr::addr_of_mut!(mem_ptr), - ); -} - #[allow(clippy::too_many_arguments)] /// # Safety /// @@ -8431,402 +8316,6 @@ pub(crate) unsafe fn cuda_backend_unchecked_bitnot_assign( update_noise_degree(ciphertext, &cuda_ffi_ciphertext); } -#[allow(clippy::too_many_arguments)] -/// # Safety -/// -/// - The data must not be moved or dropped while being used by the CUDA kernel. -/// - This function assumes exclusive access to the passed data; violating this may lead to -/// undefined behavior. -pub(crate) unsafe fn cuda_backend_compute_equality_selectors( - streams: &CudaStreams, - lwe_array_out_list: &mut [CudaBooleanBlock], - lwe_array_in: &CudaRadixCiphertext, - h_decomposed_cleartexts: &[u64], - num_possible_values: u32, - num_blocks: u32, - message_modulus: MessageModulus, - carry_modulus: CarryModulus, - bootstrapping_key: &CudaVec, - keyswitch_key: &CudaVec, - glwe_dimension: GlweDimension, - polynomial_size: PolynomialSize, - big_lwe_dimension: LweDimension, - small_lwe_dimension: LweDimension, - ks_level: DecompositionLevelCount, - ks_base_log: DecompositionBaseLog, - pbs_level: DecompositionLevelCount, - pbs_base_log: DecompositionBaseLog, - pbs_type: PBSType, - grouping_factor: LweBskGroupingFactor, - ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>, -) { - assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0)); - assert_eq!(streams.gpu_indexes[0], keyswitch_key.gpu_index(0)); - assert_eq!( - streams.gpu_indexes[0], - lwe_array_in.d_blocks.0.d_vec.gpu_index(0) - ); - - let noise_reduction_type = resolve_ms_noise_reduction_config(ms_noise_reduction_configuration); - - let mut ffi_out_degrees: Vec> = Vec::with_capacity(lwe_array_out_list.len()); - let mut ffi_out_noise_levels: Vec> = Vec::with_capacity(lwe_array_out_list.len()); - - let mut ffi_out_structs: Vec = lwe_array_out_list - .iter_mut() - .map(|ct| { - assert_eq!( - streams.gpu_indexes[0], - ct.0.ciphertext.d_blocks.0.d_vec.gpu_index(0) - ); - 
ffi_out_degrees.push(vec![ct.0.ciphertext.info.blocks[0].degree.get()]); - ffi_out_noise_levels.push(vec![ct.0.ciphertext.info.blocks[0].noise_level.0]); - prepare_cuda_radix_ffi( - &ct.0.ciphertext, - ffi_out_degrees.last_mut().unwrap(), - ffi_out_noise_levels.last_mut().unwrap(), - ) - }) - .collect(); - - let mut ffi_in_degrees: Vec = lwe_array_in - .info - .blocks - .iter() - .map(|b| b.degree.get()) - .collect(); - let mut ffi_in_noise_levels: Vec = lwe_array_in - .info - .blocks - .iter() - .map(|b| b.noise_level.0) - .collect(); - - let ffi_in_struct: CudaRadixCiphertextFFI = - prepare_cuda_radix_ffi(lwe_array_in, &mut ffi_in_degrees, &mut ffi_in_noise_levels); - - let mut mem_ptr: *mut i8 = std::ptr::null_mut(); - scratch_cuda_compute_equality_selectors_64( - streams.ffi(), - std::ptr::addr_of_mut!(mem_ptr), - glwe_dimension.0 as u32, - polynomial_size.0 as u32, - big_lwe_dimension.0 as u32, - small_lwe_dimension.0 as u32, - ks_level.0 as u32, - ks_base_log.0 as u32, - pbs_level.0 as u32, - pbs_base_log.0 as u32, - grouping_factor.0 as u32, - num_possible_values, - num_blocks, - message_modulus.0 as u32, - carry_modulus.0 as u32, - pbs_type as u32, - true, - noise_reduction_type as u32, - ); - - cuda_compute_equality_selectors_64( - streams.ffi(), - ffi_out_structs.as_mut_ptr(), - &raw const ffi_in_struct, - num_blocks, - h_decomposed_cleartexts.as_ptr(), - mem_ptr, - bootstrapping_key.ptr.as_ptr(), - keyswitch_key.ptr.as_ptr(), - ); - - cleanup_cuda_compute_equality_selectors_64(streams.ffi(), std::ptr::addr_of_mut!(mem_ptr)); - - for (ct, ffi_struct) in lwe_array_out_list.iter_mut().zip(ffi_out_structs.iter()) { - update_noise_degree(&mut ct.0.ciphertext, ffi_struct); - } -} - -#[allow(clippy::too_many_arguments)] -/// # Safety -/// -/// - The data must not be moved or dropped while being used by the CUDA kernel. -/// - This function assumes exclusive access to the passed data; violating this may lead to -/// undefined behavior. 
-pub(crate) unsafe fn cuda_backend_create_possible_results< - T: UnsignedInteger, - B: Numeric, - R: CudaIntegerRadixCiphertext, ->( - streams: &CudaStreams, - lwe_array_out_list: &mut [R], - lwe_array_in_list: &[CudaBooleanBlock], - h_decomposed_cleartexts: &[u64], - num_blocks: u32, - message_modulus: MessageModulus, - carry_modulus: CarryModulus, - bootstrapping_key: &CudaVec, - keyswitch_key: &CudaVec, - glwe_dimension: GlweDimension, - polynomial_size: PolynomialSize, - big_lwe_dimension: LweDimension, - small_lwe_dimension: LweDimension, - ks_level: DecompositionLevelCount, - ks_base_log: DecompositionBaseLog, - pbs_level: DecompositionLevelCount, - pbs_base_log: DecompositionBaseLog, - pbs_type: PBSType, - grouping_factor: LweBskGroupingFactor, - ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>, -) { - assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0)); - assert_eq!(streams.gpu_indexes[0], keyswitch_key.gpu_index(0)); - - let noise_reduction_type = resolve_ms_noise_reduction_config(ms_noise_reduction_configuration); - - let mut ffi_out_degrees: Vec> = Vec::with_capacity(lwe_array_out_list.len()); - let mut ffi_out_noise_levels: Vec> = Vec::with_capacity(lwe_array_out_list.len()); - - let mut ffi_out_structs: Vec = lwe_array_out_list - .iter_mut() - .map(|ct| { - assert_eq!( - streams.gpu_indexes[0], - ct.as_ref().d_blocks.0.d_vec.gpu_index(0) - ); - let degrees: Vec = ct - .as_ref() - .info - .blocks - .iter() - .map(|b| b.degree.get()) - .collect(); - let noise_levels: Vec = ct - .as_ref() - .info - .blocks - .iter() - .map(|b| b.noise_level.0) - .collect(); - - ffi_out_degrees.push(degrees); - ffi_out_noise_levels.push(noise_levels); - - prepare_cuda_radix_ffi( - ct.as_ref(), - ffi_out_degrees.last_mut().unwrap(), - ffi_out_noise_levels.last_mut().unwrap(), - ) - }) - .collect(); - - let mut ffi_in_degrees: Vec> = Vec::with_capacity(lwe_array_in_list.len()); - let mut ffi_in_noise_levels: Vec> = 
Vec::with_capacity(lwe_array_in_list.len()); - - let ffi_in_structs: Vec = lwe_array_in_list - .iter() - .map(|boolean_block| { - assert_eq!( - streams.gpu_indexes[0], - boolean_block.0.ciphertext.d_blocks.0.d_vec.gpu_index(0) - ); - - let degrees = vec![boolean_block.0.ciphertext.info.blocks[0].degree.get()]; - let noise_levels = vec![boolean_block.0.ciphertext.info.blocks[0].noise_level.0]; - - ffi_in_degrees.push(degrees); - ffi_in_noise_levels.push(noise_levels); - - prepare_cuda_radix_ffi( - &boolean_block.0.ciphertext, - ffi_in_degrees.last_mut().unwrap(), - ffi_in_noise_levels.last_mut().unwrap(), - ) - }) - .collect(); - - let num_possible_values = lwe_array_in_list.len() as u32; - - let mut mem_ptr: *mut i8 = std::ptr::null_mut(); - scratch_cuda_create_possible_results_64( - streams.ffi(), - std::ptr::addr_of_mut!(mem_ptr), - glwe_dimension.0 as u32, - polynomial_size.0 as u32, - big_lwe_dimension.0 as u32, - small_lwe_dimension.0 as u32, - ks_level.0 as u32, - ks_base_log.0 as u32, - pbs_level.0 as u32, - pbs_base_log.0 as u32, - grouping_factor.0 as u32, - num_possible_values, - num_blocks, - message_modulus.0 as u32, - carry_modulus.0 as u32, - pbs_type as u32, - true, - noise_reduction_type as u32, - ); - - cuda_create_possible_results_64( - streams.ffi(), - ffi_out_structs.as_mut_ptr(), - ffi_in_structs.as_ptr(), - num_possible_values, - h_decomposed_cleartexts.as_ptr(), - num_blocks, - mem_ptr, - bootstrapping_key.ptr.as_ptr(), - keyswitch_key.ptr.as_ptr(), - ); - - cleanup_cuda_create_possible_results_64(streams.ffi(), std::ptr::addr_of_mut!(mem_ptr)); - - for (i, ct) in lwe_array_out_list.iter_mut().enumerate() { - update_noise_degree(ct.as_mut(), &ffi_out_structs[i]); - } -} - -#[allow(clippy::too_many_arguments)] -/// # Safety -/// -/// - The data must not be moved or dropped while being used by the CUDA kernel. -/// - This function assumes exclusive access to the passed data; violating this may lead to -/// undefined behavior. 
-pub(crate) unsafe fn cuda_backend_aggregate_one_hot_vector< - T: UnsignedInteger, - B: Numeric, - R: CudaIntegerRadixCiphertext, ->( - streams: &CudaStreams, - lwe_array_out: &mut R, - lwe_array_in_list: &[R], - message_modulus: MessageModulus, - carry_modulus: CarryModulus, - bootstrapping_key: &CudaVec, - keyswitch_key: &CudaVec, - glwe_dimension: GlweDimension, - polynomial_size: PolynomialSize, - big_lwe_dimension: LweDimension, - small_lwe_dimension: LweDimension, - ks_level: DecompositionLevelCount, - ks_base_log: DecompositionBaseLog, - pbs_level: DecompositionLevelCount, - pbs_base_log: DecompositionBaseLog, - pbs_type: PBSType, - grouping_factor: LweBskGroupingFactor, - ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>, -) { - assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0)); - assert_eq!(streams.gpu_indexes[0], keyswitch_key.gpu_index(0)); - assert_eq!( - streams.gpu_indexes[0], - lwe_array_out.as_ref().d_blocks.0.d_vec.gpu_index(0) - ); - - let noise_reduction_type = resolve_ms_noise_reduction_config(ms_noise_reduction_configuration); - - let mut ffi_out_degrees: Vec = lwe_array_out - .as_ref() - .info - .blocks - .iter() - .map(|b| b.degree.get()) - .collect(); - let mut ffi_out_noise_levels: Vec = lwe_array_out - .as_ref() - .info - .blocks - .iter() - .map(|b| b.noise_level.0) - .collect(); - - let mut ffi_out_struct = prepare_cuda_radix_ffi( - lwe_array_out.as_ref(), - &mut ffi_out_degrees, - &mut ffi_out_noise_levels, - ); - - let mut ffi_in_degrees: Vec> = Vec::with_capacity(lwe_array_in_list.len()); - let mut ffi_in_noise_levels: Vec> = Vec::with_capacity(lwe_array_in_list.len()); - - let ffi_in_structs: Vec = lwe_array_in_list - .iter() - .map(|ct| { - assert_eq!( - streams.gpu_indexes[0], - ct.as_ref().d_blocks.0.d_vec.gpu_index(0) - ); - let degrees: Vec = ct - .as_ref() - .info - .blocks - .iter() - .map(|b| b.degree.get()) - .collect(); - let noise_levels: Vec = ct - .as_ref() - .info - 
.blocks - .iter() - .map(|b| b.noise_level.0) - .collect(); - - ffi_in_degrees.push(degrees); - ffi_in_noise_levels.push(noise_levels); - - prepare_cuda_radix_ffi( - ct.as_ref(), - ffi_in_degrees.last_mut().unwrap(), - ffi_in_noise_levels.last_mut().unwrap(), - ) - }) - .collect(); - - let num_input_ciphertexts = lwe_array_in_list.len() as u32; - let num_blocks = lwe_array_in_list[0] - .as_ref() - .d_blocks - .lwe_ciphertext_count() - .0 as u32; - - let mut mem_ptr: *mut i8 = std::ptr::null_mut(); - scratch_cuda_aggregate_one_hot_vector_64( - streams.ffi(), - std::ptr::addr_of_mut!(mem_ptr), - glwe_dimension.0 as u32, - polynomial_size.0 as u32, - big_lwe_dimension.0 as u32, - small_lwe_dimension.0 as u32, - ks_level.0 as u32, - ks_base_log.0 as u32, - pbs_level.0 as u32, - pbs_base_log.0 as u32, - grouping_factor.0 as u32, - num_blocks, - num_input_ciphertexts, - message_modulus.0 as u32, - carry_modulus.0 as u32, - pbs_type as u32, - true, - noise_reduction_type as u32, - ); - - cuda_aggregate_one_hot_vector_64( - streams.ffi(), - &raw mut ffi_out_struct, - ffi_in_structs.as_ptr(), - num_input_ciphertexts, - num_blocks, - mem_ptr, - bootstrapping_key.ptr.as_ptr(), - keyswitch_key.ptr.as_ptr(), - ); - - cleanup_cuda_aggregate_one_hot_vector_64(streams.ffi(), std::ptr::addr_of_mut!(mem_ptr)); - - update_noise_degree(lwe_array_out.as_mut(), &ffi_out_struct); -} - #[allow(clippy::too_many_arguments)] /// # Safety /// @@ -8837,17 +8326,13 @@ pub(crate) unsafe fn cuda_backend_unchecked_match_value< T: UnsignedInteger, B: Numeric, R: CudaIntegerRadixCiphertext, + Clear: UnsignedInteger + DecomposableInto + CastInto + Sync + Send, >( streams: &CudaStreams, lwe_array_out_result: &mut R, lwe_array_out_boolean: &mut CudaBooleanBlock, lwe_array_in_ct: &CudaRadixCiphertext, - h_match_inputs: &[u64], - h_match_outputs: &[u64], - num_matches: u32, - num_input_blocks: u32, - num_output_packed_blocks: u32, - max_output_is_zero: bool, + matches: &MatchValues, message_modulus: 
MessageModulus, carry_modulus: CarryModulus, bootstrapping_key: &CudaVec, @@ -8885,6 +8370,50 @@ pub(crate) unsafe fn cuda_backend_unchecked_match_value< .gpu_index(0) ); + let num_input_blocks = lwe_array_in_ct.d_blocks.lwe_ciphertext_count().0 as u32; + let num_bits_in_message = message_modulus.0.ilog2(); + let h_match_inputs: Vec = matches + .get_values() + .par_iter() + .map(|(input, _output)| *input) + .flat_map(|input_value: Clear| { + BlockDecomposer::new(input_value, num_bits_in_message) + .take(num_input_blocks as usize) + .map(|block_value: Clear| block_value.cast_into()) + .collect::>() + }) + .collect(); + + let max_output_value = matches + .get_values() + .iter() + .copied() + .max_by(|(_, outputl), (_, outputr)| outputl.cmp(outputr)) + .expect("luts is not empty at this point") + .1; + + let num_output_unpacked_blocks = lwe_array_out_result + .as_ref() + .d_blocks + .lwe_ciphertext_count() + .0 as u32; + let num_output_packed_blocks = num_output_unpacked_blocks.div_ceil(2); + + let h_match_outputs: Vec = matches + .get_values() + .par_iter() + .map(|(_input, output)| *output) + .flat_map(|output_value: Clear| { + BlockDecomposer::new(output_value, 2 * num_bits_in_message) + .take(num_output_packed_blocks as usize) + .map(|block_value: Clear| block_value.cast_into()) + .collect::>() + }) + .collect(); + + let max_output_is_zero = max_output_value == Clear::ZERO; + let num_matches = matches.get_values().len() as u32; + let noise_reduction_type = resolve_ms_noise_reduction_config(ms_noise_reduction_configuration); let mut ffi_out_result_degrees: Vec = lwe_array_out_result @@ -8993,8 +8522,10 @@ pub(crate) unsafe fn cuda_backend_unchecked_match_value< /// - The data must not be moved or dropped while being used by the CUDA kernel. /// - This function assumes exclusive access to the passed data; violating this may lead to /// undefined behavior. 
-pub(crate) fn cuda_backend_get_unchecked_match_value_size_on_gpu( +pub(crate) fn cuda_backend_get_unchecked_match_value_size_on_gpu( streams: &CudaStreams, + ct: &CudaRadixCiphertext, + matches: &MatchValues, glwe_dimension: GlweDimension, polynomial_size: PolynomialSize, big_lwe_dimension: LweDimension, @@ -9007,12 +8538,34 @@ pub(crate) fn cuda_backend_get_unchecked_match_value_size_on_gpu( message_modulus: MessageModulus, carry_modulus: CarryModulus, pbs_type: PBSType, - num_matches: u32, - num_input_blocks: u32, - num_output_packed_blocks: u32, - max_output_is_zero: bool, ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>, -) -> u64 { +) -> u64 +where + Clear: UnsignedInteger + DecomposableInto + CastInto + CastInto + Sync + Send, +{ + let num_input_blocks = ct.d_blocks.lwe_ciphertext_count().0 as u32; + + let max_output_value = matches + .get_values() + .iter() + .copied() + .max_by(|(_, outputl), (_, outputr)| outputl.cmp(outputr)) + .expect("luts is not empty at this point") + .1; + + let num_bits_in_message = message_modulus.0.ilog2(); + let max_val_u64: u64 = max_output_value.cast_into(); + + let num_output_unpacked_blocks = if max_val_u64 == 0 { + 1 + } else { + (max_val_u64.ilog2() + 1).div_ceil(num_bits_in_message) + }; + + let num_output_packed_blocks = num_output_unpacked_blocks.div_ceil(2); + let max_output_is_zero = max_output_value == Clear::ZERO; + let num_matches = matches.get_values().len() as u32; + let noise_reduction_type = resolve_ms_noise_reduction_config(ms_noise_reduction_configuration); let mut mem_ptr: *mut i8 = std::ptr::null_mut(); @@ -9054,8 +8607,11 @@ pub(crate) fn cuda_backend_get_unchecked_match_value_size_on_gpu( /// - The data must not be moved or dropped while being used by the CUDA kernel. /// - This function assumes exclusive access to the passed data; violating this may lead to /// undefined behavior. 
-pub(crate) fn cuda_backend_get_unchecked_match_value_or_size_on_gpu( +pub(crate) fn cuda_backend_get_unchecked_match_value_or_size_on_gpu( streams: &CudaStreams, + ct: &CudaRadixCiphertext, + matches: &MatchValues, + or_value: Clear, glwe_dimension: GlweDimension, polynomial_size: PolynomialSize, big_lwe_dimension: LweDimension, @@ -9068,13 +8624,41 @@ pub(crate) fn cuda_backend_get_unchecked_match_value_or_size_on_gpu( message_modulus: MessageModulus, carry_modulus: CarryModulus, pbs_type: PBSType, - num_matches: u32, - num_input_blocks: u32, - num_match_packed_blocks: u32, - num_output_blocks: u32, - max_output_is_zero: bool, ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>, -) -> u64 { +) -> u64 +where + Clear: UnsignedInteger + DecomposableInto + CastInto + CastInto + Sync + Send, +{ + let num_input_blocks = ct.d_blocks.lwe_ciphertext_count().0 as u32; + let num_bits_in_message = message_modulus.0.ilog2(); + + let max_output_value = matches + .get_values() + .iter() + .copied() + .max_by(|(_, outputl), (_, outputr)| outputl.cmp(outputr)) + .expect("luts is not empty at this point") + .1; + + let max_val_u64: u64 = max_output_value.cast_into(); + let or_val_u64: u64 = or_value.cast_into(); + + let calc_blocks = |val: u64| -> u32 { + if val == 0 { + 1 + } else { + (val.ilog2() + 1).div_ceil(num_bits_in_message) + } + }; + + let num_blocks_match = calc_blocks(max_val_u64); + let num_blocks_or = calc_blocks(or_val_u64); + + let num_output_blocks = num_blocks_match.max(num_blocks_or); + let num_match_packed_blocks = num_blocks_match.div_ceil(2); + let max_output_is_zero = max_output_value == Clear::ZERO; + let num_matches = matches.get_values().len() as u32; + let noise_reduction_type = resolve_ms_noise_reduction_config(ms_noise_reduction_configuration); let mut mem_ptr: *mut i8 = std::ptr::null_mut(); @@ -9210,18 +8794,13 @@ pub(crate) unsafe fn cuda_backend_unchecked_match_value_or< T: UnsignedInteger, B: Numeric, R: 
CudaIntegerRadixCiphertext, + Clear: UnsignedInteger + DecomposableInto + CastInto + CastInto + Sync + Send, >( streams: &CudaStreams, lwe_array_out: &mut R, lwe_array_in_ct: &CudaRadixCiphertext, - h_match_inputs: &[u64], - h_match_outputs: &[u64], - h_or_value: &[u64], - num_matches: u32, - num_input_blocks: u32, - num_match_packed_blocks: u32, - num_final_blocks: u32, - max_output_is_zero: bool, + matches: &MatchValues, + or_value: Clear, message_modulus: MessageModulus, carry_modulus: CarryModulus, bootstrapping_key: &CudaVec, @@ -9238,8 +8817,72 @@ pub(crate) unsafe fn cuda_backend_unchecked_match_value_or< grouping_factor: LweBskGroupingFactor, ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>, ) { + assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0)); + assert_eq!(streams.gpu_indexes[0], keyswitch_key.gpu_index(0)); + assert_eq!( + streams.gpu_indexes[0], + lwe_array_in_ct.d_blocks.0.d_vec.gpu_index(0) + ); + assert_eq!( + streams.gpu_indexes[0], + lwe_array_out.as_ref().d_blocks.0.d_vec.gpu_index(0) + ); + let noise_reduction_type = resolve_ms_noise_reduction_config(ms_noise_reduction_configuration); + let num_input_blocks = lwe_array_in_ct.d_blocks.lwe_ciphertext_count().0 as u32; + let num_bits_in_message = message_modulus.0.ilog2(); + + let h_match_inputs: Vec = matches + .get_values() + .par_iter() + .map(|(input, _output)| *input) + .flat_map(|input_value: Clear| { + BlockDecomposer::new(input_value, num_bits_in_message) + .take(num_input_blocks as usize) + .map(|block_value: Clear| block_value.cast_into()) + .collect::>() + }) + .collect(); + + let max_output_value = matches + .get_values() + .iter() + .copied() + .max_by(|(_, outputl), (_, outputr)| outputl.cmp(outputr)) + .expect("luts is not empty at this point") + .1; + + let max_val_u64: u64 = max_output_value.cast_into(); + let num_blocks_match = if max_val_u64 == 0 { + 1 + } else { + (max_val_u64.ilog2() + 1).div_ceil(num_bits_in_message) + 
}; + let num_match_packed_blocks = num_blocks_match.div_ceil(2); + + let h_match_outputs: Vec = matches + .get_values() + .par_iter() + .map(|(_input, output)| *output) + .flat_map(|output_value: Clear| { + BlockDecomposer::new(output_value, 2 * num_bits_in_message) + .take(num_match_packed_blocks as usize) + .map(|block_value: Clear| block_value.cast_into()) + .collect::>() + }) + .collect(); + + let num_final_blocks = lwe_array_out.as_ref().d_blocks.lwe_ciphertext_count().0 as u32; + + let h_or_value: Vec = BlockDecomposer::new(or_value, num_bits_in_message) + .take(num_final_blocks as usize) + .map(|block_value: Clear| block_value.cast_into()) + .collect(); + + let max_output_is_zero = max_output_value == Clear::ZERO; + let num_matches = matches.get_values().len() as u32; + let mut ffi_out_degrees: Vec = lwe_array_out .as_ref() .info @@ -9320,3 +8963,1192 @@ pub(crate) unsafe fn cuda_backend_unchecked_match_value_or< update_noise_degree(lwe_array_out.as_mut(), &ffi_out_struct); } + +#[allow(clippy::too_many_arguments)] +/// # Safety +/// +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. 
+pub(crate) unsafe fn cuda_backend_unchecked_contains< + T: UnsignedInteger, + B: Numeric, + C: CudaIntegerRadixCiphertext, +>( + streams: &CudaStreams, + output: &mut CudaBooleanBlock, + inputs: &[C], + value: &CudaRadixCiphertext, + bootstrapping_key: &CudaVec, + keyswitch_key: &CudaVec, + message_modulus: MessageModulus, + carry_modulus: CarryModulus, + glwe_dimension: GlweDimension, + polynomial_size: PolynomialSize, + big_lwe_dimension: LweDimension, + small_lwe_dimension: LweDimension, + ks_level: DecompositionLevelCount, + ks_base_log: DecompositionBaseLog, + pbs_level: DecompositionLevelCount, + pbs_base_log: DecompositionBaseLog, + pbs_type: PBSType, + grouping_factor: LweBskGroupingFactor, + ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>, +) { + assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0)); + assert_eq!(streams.gpu_indexes[0], keyswitch_key.gpu_index(0)); + assert_eq!(streams.gpu_indexes[0], value.d_blocks.0.d_vec.gpu_index(0)); + + let num_inputs = inputs.len() as u32; + let num_blocks = value.d_blocks.lwe_ciphertext_count().0 as u32; + + let noise_reduction_type = resolve_ms_noise_reduction_config(ms_noise_reduction_configuration); + + let mut output_degrees = vec![output.0.ciphertext.info.blocks[0].degree.get()]; + let mut output_noise_levels = vec![output.0.ciphertext.info.blocks[0].noise_level.0]; + let mut ffi_output = prepare_cuda_radix_ffi( + &output.0.ciphertext, + &mut output_degrees, + &mut output_noise_levels, + ); + + let mut value_degrees: Vec = value.info.blocks.iter().map(|b| b.degree.get()).collect(); + let mut value_noise_levels: Vec = + value.info.blocks.iter().map(|b| b.noise_level.0).collect(); + let ffi_value = prepare_cuda_radix_ffi(value, &mut value_degrees, &mut value_noise_levels); + + let mut ffi_inputs_degrees: Vec> = Vec::with_capacity(inputs.len()); + let mut ffi_inputs_noise_levels: Vec> = Vec::with_capacity(inputs.len()); + let ffi_inputs: Vec = inputs + .iter() 
+ .map(|ct| { + let degrees = ct + .as_ref() + .info + .blocks + .iter() + .map(|b| b.degree.get()) + .collect(); + let noise_levels = ct + .as_ref() + .info + .blocks + .iter() + .map(|b| b.noise_level.0) + .collect(); + ffi_inputs_degrees.push(degrees); + ffi_inputs_noise_levels.push(noise_levels); + + prepare_cuda_radix_ffi( + ct.as_ref(), + ffi_inputs_degrees.last_mut().unwrap(), + ffi_inputs_noise_levels.last_mut().unwrap(), + ) + }) + .collect(); + + let mut mem_ptr: *mut i8 = std::ptr::null_mut(); + + scratch_cuda_unchecked_contains_64( + streams.ffi(), + std::ptr::addr_of_mut!(mem_ptr), + glwe_dimension.0 as u32, + polynomial_size.0 as u32, + big_lwe_dimension.0 as u32, + small_lwe_dimension.0 as u32, + ks_level.0 as u32, + ks_base_log.0 as u32, + pbs_level.0 as u32, + pbs_base_log.0 as u32, + grouping_factor.0 as u32, + num_inputs, + num_blocks, + message_modulus.0 as u32, + carry_modulus.0 as u32, + pbs_type as u32, + true, + noise_reduction_type as u32, + ); + + cuda_unchecked_contains_64( + streams.ffi(), + &raw mut ffi_output, + ffi_inputs.as_ptr(), + &raw const ffi_value, + num_inputs, + num_blocks, + mem_ptr, + bootstrapping_key.ptr.as_ptr(), + keyswitch_key.ptr.as_ptr(), + ); + + cleanup_cuda_unchecked_contains_64(streams.ffi(), std::ptr::addr_of_mut!(mem_ptr)); + + update_noise_degree(&mut output.0.ciphertext, &ffi_output); +} + +#[allow(clippy::too_many_arguments)] +/// # Safety +/// +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. 
+pub(crate) unsafe fn cuda_backend_unchecked_contains_clear< + T: UnsignedInteger, + B: Numeric, + C: CudaIntegerRadixCiphertext, + Clear: DecomposableInto, +>( + streams: &CudaStreams, + output: &mut CudaBooleanBlock, + inputs: &[C], + clear: Clear, + bootstrapping_key: &CudaVec, + keyswitch_key: &CudaVec, + message_modulus: MessageModulus, + carry_modulus: CarryModulus, + glwe_dimension: GlweDimension, + polynomial_size: PolynomialSize, + big_lwe_dimension: LweDimension, + small_lwe_dimension: LweDimension, + ks_level: DecompositionLevelCount, + ks_base_log: DecompositionBaseLog, + pbs_level: DecompositionLevelCount, + pbs_base_log: DecompositionBaseLog, + pbs_type: PBSType, + grouping_factor: LweBskGroupingFactor, + ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>, +) { + assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0)); + assert_eq!(streams.gpu_indexes[0], keyswitch_key.gpu_index(0)); + if !inputs.is_empty() { + assert_eq!( + streams.gpu_indexes[0], + inputs[0].as_ref().d_blocks.0.d_vec.gpu_index(0) + ); + } + + let num_inputs = inputs.len() as u32; + let num_blocks = inputs[0].as_ref().d_blocks.lwe_ciphertext_count().0 as u32; + let num_bits_in_message = message_modulus.0.ilog2(); + + let h_clear_blocks: Vec = BlockDecomposer::new(clear, num_bits_in_message) + .take(num_blocks as usize) + .map(|block_value| block_value.cast_into()) + .collect(); + + let noise_reduction_type = resolve_ms_noise_reduction_config(ms_noise_reduction_configuration); + + let mut output_degrees = vec![output.0.ciphertext.info.blocks[0].degree.get()]; + let mut output_noise_levels = vec![output.0.ciphertext.info.blocks[0].noise_level.0]; + let mut ffi_output = prepare_cuda_radix_ffi( + &output.0.ciphertext, + &mut output_degrees, + &mut output_noise_levels, + ); + + let mut ffi_inputs_degrees: Vec> = Vec::with_capacity(inputs.len()); + let mut ffi_inputs_noise_levels: Vec> = Vec::with_capacity(inputs.len()); + let ffi_inputs: Vec 
= inputs + .iter() + .map(|ct| { + let degrees = ct + .as_ref() + .info + .blocks + .iter() + .map(|b| b.degree.get()) + .collect(); + let noise_levels = ct + .as_ref() + .info + .blocks + .iter() + .map(|b| b.noise_level.0) + .collect(); + ffi_inputs_degrees.push(degrees); + ffi_inputs_noise_levels.push(noise_levels); + + prepare_cuda_radix_ffi( + ct.as_ref(), + ffi_inputs_degrees.last_mut().unwrap(), + ffi_inputs_noise_levels.last_mut().unwrap(), + ) + }) + .collect(); + + let mut mem_ptr: *mut i8 = std::ptr::null_mut(); + + scratch_cuda_unchecked_contains_clear_64( + streams.ffi(), + std::ptr::addr_of_mut!(mem_ptr), + glwe_dimension.0 as u32, + polynomial_size.0 as u32, + big_lwe_dimension.0 as u32, + small_lwe_dimension.0 as u32, + ks_level.0 as u32, + ks_base_log.0 as u32, + pbs_level.0 as u32, + pbs_base_log.0 as u32, + grouping_factor.0 as u32, + num_inputs, + num_blocks, + message_modulus.0 as u32, + carry_modulus.0 as u32, + pbs_type as u32, + true, + noise_reduction_type as u32, + ); + + cuda_unchecked_contains_clear_64( + streams.ffi(), + &raw mut ffi_output, + ffi_inputs.as_ptr(), + h_clear_blocks.as_ptr(), + num_inputs, + num_blocks, + mem_ptr, + bootstrapping_key.ptr.as_ptr(), + keyswitch_key.ptr.as_ptr(), + ); + + cleanup_cuda_unchecked_contains_clear_64(streams.ffi(), std::ptr::addr_of_mut!(mem_ptr)); + + update_noise_degree(&mut output.0.ciphertext, &ffi_output); +} + +#[allow(clippy::too_many_arguments)] +/// # Safety +/// +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. 
+pub(crate) unsafe fn cuda_backend_unchecked_is_in_clears< + T: UnsignedInteger, + B: Numeric, + Clear: UnsignedInteger + DecomposableInto + CastInto + Sync + Send, +>( + streams: &CudaStreams, + output: &mut CudaBooleanBlock, + input: &CudaRadixCiphertext, + clears: &[Clear], + bootstrapping_key: &CudaVec, + keyswitch_key: &CudaVec, + message_modulus: MessageModulus, + carry_modulus: CarryModulus, + glwe_dimension: GlweDimension, + polynomial_size: PolynomialSize, + big_lwe_dimension: LweDimension, + small_lwe_dimension: LweDimension, + ks_level: DecompositionLevelCount, + ks_base_log: DecompositionBaseLog, + pbs_level: DecompositionLevelCount, + pbs_base_log: DecompositionBaseLog, + pbs_type: PBSType, + grouping_factor: LweBskGroupingFactor, + ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>, +) { + assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0)); + assert_eq!(streams.gpu_indexes[0], keyswitch_key.gpu_index(0)); + assert_eq!(streams.gpu_indexes[0], input.d_blocks.0.d_vec.gpu_index(0)); + + let num_clears = clears.len() as u32; + let num_blocks = input.d_blocks.lwe_ciphertext_count().0 as u32; + let num_bits_in_message = message_modulus.0.ilog2(); + + let h_decomposed_cleartexts: Vec = clears + .par_iter() + .flat_map(|input_value| { + BlockDecomposer::new(*input_value, num_bits_in_message) + .take(num_blocks as usize) + .map(|block_value: Clear| block_value.cast_into()) + .collect::>() + }) + .collect::>(); + + let noise_reduction_type = resolve_ms_noise_reduction_config(ms_noise_reduction_configuration); + + let mut output_degrees = vec![output.0.ciphertext.info.blocks[0].degree.get()]; + let mut output_noise_levels = vec![output.0.ciphertext.info.blocks[0].noise_level.0]; + let mut ffi_output = prepare_cuda_radix_ffi( + &output.0.ciphertext, + &mut output_degrees, + &mut output_noise_levels, + ); + + let mut input_degrees: Vec = input.info.blocks.iter().map(|b| b.degree.get()).collect(); + let mut 
input_noise_levels: Vec = + input.info.blocks.iter().map(|b| b.noise_level.0).collect(); + let ffi_input = prepare_cuda_radix_ffi(input, &mut input_degrees, &mut input_noise_levels); + + let mut mem_ptr: *mut i8 = std::ptr::null_mut(); + + scratch_cuda_unchecked_is_in_clears_64( + streams.ffi(), + std::ptr::addr_of_mut!(mem_ptr), + glwe_dimension.0 as u32, + polynomial_size.0 as u32, + big_lwe_dimension.0 as u32, + small_lwe_dimension.0 as u32, + ks_level.0 as u32, + ks_base_log.0 as u32, + pbs_level.0 as u32, + pbs_base_log.0 as u32, + grouping_factor.0 as u32, + num_clears, + num_blocks, + message_modulus.0 as u32, + carry_modulus.0 as u32, + pbs_type as u32, + true, + noise_reduction_type as u32, + ); + + cuda_unchecked_is_in_clears_64( + streams.ffi(), + &raw mut ffi_output, + &raw const ffi_input, + h_decomposed_cleartexts.as_ptr(), + num_clears, + num_blocks, + mem_ptr, + bootstrapping_key.ptr.as_ptr(), + keyswitch_key.ptr.as_ptr(), + ); + + cleanup_cuda_unchecked_is_in_clears_64(streams.ffi(), std::ptr::addr_of_mut!(mem_ptr)); + + update_noise_degree(&mut output.0.ciphertext, &ffi_output); +} + +#[allow(clippy::too_many_arguments)] +/// # Safety +/// +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. 
+pub(crate) unsafe fn cuda_backend_compute_final_index_from_selectors< + T: UnsignedInteger, + B: Numeric, +>( + streams: &CudaStreams, + index_ct: &mut CudaRadixCiphertext, + match_ct: &mut CudaBooleanBlock, + selectors: &[CudaBooleanBlock], + bootstrapping_key: &CudaVec, + keyswitch_key: &CudaVec, + message_modulus: MessageModulus, + carry_modulus: CarryModulus, + glwe_dimension: GlweDimension, + polynomial_size: PolynomialSize, + big_lwe_dimension: LweDimension, + small_lwe_dimension: LweDimension, + ks_level: DecompositionLevelCount, + ks_base_log: DecompositionBaseLog, + pbs_level: DecompositionLevelCount, + pbs_base_log: DecompositionBaseLog, + pbs_type: PBSType, + grouping_factor: LweBskGroupingFactor, + ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>, +) { + assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0)); + assert_eq!(streams.gpu_indexes[0], keyswitch_key.gpu_index(0)); + + let num_inputs = selectors.len() as u32; + let num_blocks_index = index_ct.d_blocks.lwe_ciphertext_count().0 as u32; + + let noise_reduction_type = resolve_ms_noise_reduction_config(ms_noise_reduction_configuration); + + let mut index_degrees = index_ct + .info + .blocks + .iter() + .map(|b| b.degree.get()) + .collect(); + let mut index_noise_levels = index_ct + .info + .blocks + .iter() + .map(|b| b.noise_level.0) + .collect(); + let mut ffi_index = + prepare_cuda_radix_ffi(index_ct, &mut index_degrees, &mut index_noise_levels); + + let mut match_degrees = vec![match_ct.0.ciphertext.info.blocks[0].degree.get()]; + let mut match_noise_levels = vec![match_ct.0.ciphertext.info.blocks[0].noise_level.0]; + let mut ffi_match = prepare_cuda_radix_ffi( + &match_ct.0.ciphertext, + &mut match_degrees, + &mut match_noise_levels, + ); + + let mut ffi_selectors_degrees: Vec> = Vec::with_capacity(selectors.len()); + let mut ffi_selectors_noise_levels: Vec> = Vec::with_capacity(selectors.len()); + let ffi_selectors: Vec = selectors + .iter() 
+ .map(|ct| { + let degrees = vec![ct.0.ciphertext.info.blocks[0].degree.get()]; + let noise_levels = vec![ct.0.ciphertext.info.blocks[0].noise_level.0]; + ffi_selectors_degrees.push(degrees); + ffi_selectors_noise_levels.push(noise_levels); + + prepare_cuda_radix_ffi( + &ct.0.ciphertext, + ffi_selectors_degrees.last_mut().unwrap(), + ffi_selectors_noise_levels.last_mut().unwrap(), + ) + }) + .collect(); + + let mut mem_ptr: *mut i8 = std::ptr::null_mut(); + + scratch_cuda_compute_final_index_from_selectors_64( + streams.ffi(), + std::ptr::addr_of_mut!(mem_ptr), + glwe_dimension.0 as u32, + polynomial_size.0 as u32, + big_lwe_dimension.0 as u32, + small_lwe_dimension.0 as u32, + ks_level.0 as u32, + ks_base_log.0 as u32, + pbs_level.0 as u32, + pbs_base_log.0 as u32, + grouping_factor.0 as u32, + num_inputs, + num_blocks_index, + message_modulus.0 as u32, + carry_modulus.0 as u32, + pbs_type as u32, + true, + noise_reduction_type as u32, + ); + + cuda_compute_final_index_from_selectors_64( + streams.ffi(), + &raw mut ffi_index, + &raw mut ffi_match, + ffi_selectors.as_ptr(), + num_inputs, + num_blocks_index, + mem_ptr, + bootstrapping_key.ptr.as_ptr(), + keyswitch_key.ptr.as_ptr(), + ); + + cleanup_cuda_compute_final_index_from_selectors_64( + streams.ffi(), + std::ptr::addr_of_mut!(mem_ptr), + ); + + update_noise_degree(index_ct, &ffi_index); + update_noise_degree(&mut match_ct.0.ciphertext, &ffi_match); +} + +#[allow(clippy::too_many_arguments)] +/// # Safety +/// +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. 
+pub(crate) unsafe fn cuda_backend_unchecked_index_in_clears< + T: UnsignedInteger, + B: Numeric, + Clear: UnsignedInteger + DecomposableInto + CastInto + Sync + Send, +>( + streams: &CudaStreams, + index_ct: &mut CudaRadixCiphertext, + match_ct: &mut CudaBooleanBlock, + input: &CudaRadixCiphertext, + clears: &[Clear], + bootstrapping_key: &CudaVec, + keyswitch_key: &CudaVec, + message_modulus: MessageModulus, + carry_modulus: CarryModulus, + glwe_dimension: GlweDimension, + polynomial_size: PolynomialSize, + big_lwe_dimension: LweDimension, + small_lwe_dimension: LweDimension, + ks_level: DecompositionLevelCount, + ks_base_log: DecompositionBaseLog, + pbs_level: DecompositionLevelCount, + pbs_base_log: DecompositionBaseLog, + pbs_type: PBSType, + grouping_factor: LweBskGroupingFactor, + ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>, +) { + assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0)); + assert_eq!(streams.gpu_indexes[0], keyswitch_key.gpu_index(0)); + assert_eq!(streams.gpu_indexes[0], input.d_blocks.0.d_vec.gpu_index(0)); + + let num_clears = clears.len() as u32; + let num_blocks = input.d_blocks.lwe_ciphertext_count().0 as u32; + let num_blocks_index = index_ct.d_blocks.lwe_ciphertext_count().0 as u32; + let num_bits_in_message = message_modulus.0.ilog2(); + + let h_decomposed_cleartexts: Vec = clears + .par_iter() + .flat_map(|input_value| { + BlockDecomposer::new(*input_value, num_bits_in_message) + .take(num_blocks as usize) + .map(|block_value: Clear| block_value.cast_into()) + .collect::>() + }) + .collect::>(); + + let noise_reduction_type = resolve_ms_noise_reduction_config(ms_noise_reduction_configuration); + + let mut index_degrees = index_ct + .info + .blocks + .iter() + .map(|b| b.degree.get()) + .collect(); + let mut index_noise_levels = index_ct + .info + .blocks + .iter() + .map(|b| b.noise_level.0) + .collect(); + let mut ffi_index = + prepare_cuda_radix_ffi(index_ct, &mut 
index_degrees, &mut index_noise_levels); + + let mut match_degrees = vec![match_ct.0.ciphertext.info.blocks[0].degree.get()]; + let mut match_noise_levels = vec![match_ct.0.ciphertext.info.blocks[0].noise_level.0]; + let mut ffi_match = prepare_cuda_radix_ffi( + &match_ct.0.ciphertext, + &mut match_degrees, + &mut match_noise_levels, + ); + + let mut input_degrees: Vec = input.info.blocks.iter().map(|b| b.degree.get()).collect(); + let mut input_noise_levels: Vec = + input.info.blocks.iter().map(|b| b.noise_level.0).collect(); + let ffi_input = prepare_cuda_radix_ffi(input, &mut input_degrees, &mut input_noise_levels); + + let mut mem_ptr: *mut i8 = std::ptr::null_mut(); + + scratch_cuda_unchecked_index_in_clears_64( + streams.ffi(), + std::ptr::addr_of_mut!(mem_ptr), + glwe_dimension.0 as u32, + polynomial_size.0 as u32, + big_lwe_dimension.0 as u32, + small_lwe_dimension.0 as u32, + ks_level.0 as u32, + ks_base_log.0 as u32, + pbs_level.0 as u32, + pbs_base_log.0 as u32, + grouping_factor.0 as u32, + num_clears, + num_blocks, + num_blocks_index, + message_modulus.0 as u32, + carry_modulus.0 as u32, + pbs_type as u32, + true, + noise_reduction_type as u32, + ); + + cuda_unchecked_index_in_clears_64( + streams.ffi(), + &raw mut ffi_index, + &raw mut ffi_match, + &raw const ffi_input, + h_decomposed_cleartexts.as_ptr(), + num_clears, + num_blocks, + num_blocks_index, + mem_ptr, + bootstrapping_key.ptr.as_ptr(), + keyswitch_key.ptr.as_ptr(), + ); + + cleanup_cuda_unchecked_index_in_clears_64(streams.ffi(), std::ptr::addr_of_mut!(mem_ptr)); + + update_noise_degree(index_ct, &ffi_index); + update_noise_degree(&mut match_ct.0.ciphertext, &ffi_match); +} + +#[allow(clippy::too_many_arguments)] +/// # Safety +/// +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. 
+pub(crate) unsafe fn cuda_backend_unchecked_first_index_in_clears< + T: UnsignedInteger, + B: Numeric, + Clear: UnsignedInteger + DecomposableInto + CastInto + Hash + Sync + Send, +>( + streams: &CudaStreams, + index_ct: &mut CudaRadixCiphertext, + match_ct: &mut CudaBooleanBlock, + input: &CudaRadixCiphertext, + clears: &[Clear], + bootstrapping_key: &CudaVec, + keyswitch_key: &CudaVec, + message_modulus: MessageModulus, + carry_modulus: CarryModulus, + glwe_dimension: GlweDimension, + polynomial_size: PolynomialSize, + big_lwe_dimension: LweDimension, + small_lwe_dimension: LweDimension, + ks_level: DecompositionLevelCount, + ks_base_log: DecompositionBaseLog, + pbs_level: DecompositionLevelCount, + pbs_base_log: DecompositionBaseLog, + pbs_type: PBSType, + grouping_factor: LweBskGroupingFactor, + ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>, +) { + assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0)); + assert_eq!(streams.gpu_indexes[0], keyswitch_key.gpu_index(0)); + assert_eq!(streams.gpu_indexes[0], input.d_blocks.0.d_vec.gpu_index(0)); + + let num_bits_in_message = message_modulus.0.ilog2(); + let num_blocks = input.d_blocks.lwe_ciphertext_count().0 as u32; + let num_blocks_index = index_ct.d_blocks.lwe_ciphertext_count().0 as u32; + + let unique_elements: Vec<(usize, &Clear)> = clears + .iter() + .enumerate() + .unique_by(|&(_, value)| value) + .collect(); + + let num_unique = unique_elements.len() as u32; + + let h_unique_values: Vec = unique_elements + .par_iter() + .flat_map(|(_, input_value)| { + BlockDecomposer::new(**input_value, num_bits_in_message) + .take(num_blocks as usize) + .map(|block_value: Clear| block_value.cast_into()) + .collect::>() + }) + .collect(); + + let num_packed_blocks = (num_blocks_index as usize).div_ceil(2); + let bits_per_packed_block = 2 * num_bits_in_message; + + let h_unique_indices: Vec = unique_elements + .par_iter() + .flat_map(|(index, _)| { + let val = *index 
as u64; + (0..num_packed_blocks).into_par_iter().map(move |b| { + let shift = b as u32 * bits_per_packed_block; + if shift >= 64 { + 0 + } else { + (val >> shift) & ((1 << bits_per_packed_block) - 1) + } + }) + }) + .collect(); + + let noise_reduction_type = resolve_ms_noise_reduction_config(ms_noise_reduction_configuration); + + let mut index_degrees = index_ct + .info + .blocks + .iter() + .map(|b| b.degree.get()) + .collect(); + let mut index_noise_levels = index_ct + .info + .blocks + .iter() + .map(|b| b.noise_level.0) + .collect(); + let mut ffi_index = + prepare_cuda_radix_ffi(index_ct, &mut index_degrees, &mut index_noise_levels); + + let mut match_degrees = vec![match_ct.0.ciphertext.info.blocks[0].degree.get()]; + let mut match_noise_levels = vec![match_ct.0.ciphertext.info.blocks[0].noise_level.0]; + let mut ffi_match = prepare_cuda_radix_ffi( + &match_ct.0.ciphertext, + &mut match_degrees, + &mut match_noise_levels, + ); + + let mut input_degrees: Vec = input.info.blocks.iter().map(|b| b.degree.get()).collect(); + let mut input_noise_levels: Vec = + input.info.blocks.iter().map(|b| b.noise_level.0).collect(); + let ffi_input = prepare_cuda_radix_ffi(input, &mut input_degrees, &mut input_noise_levels); + + let mut mem_ptr: *mut i8 = std::ptr::null_mut(); + + scratch_cuda_unchecked_first_index_in_clears_64( + streams.ffi(), + std::ptr::addr_of_mut!(mem_ptr), + glwe_dimension.0 as u32, + polynomial_size.0 as u32, + big_lwe_dimension.0 as u32, + small_lwe_dimension.0 as u32, + ks_level.0 as u32, + ks_base_log.0 as u32, + pbs_level.0 as u32, + pbs_base_log.0 as u32, + grouping_factor.0 as u32, + num_unique, + num_blocks, + num_blocks_index, + message_modulus.0 as u32, + carry_modulus.0 as u32, + pbs_type as u32, + true, + noise_reduction_type as u32, + ); + + cuda_unchecked_first_index_in_clears_64( + streams.ffi(), + &raw mut ffi_index, + &raw mut ffi_match, + &raw const ffi_input, + h_unique_values.as_ptr(), + h_unique_indices.as_ptr(), + num_unique, + 
num_blocks, + num_blocks_index, + mem_ptr, + bootstrapping_key.ptr.as_ptr(), + keyswitch_key.ptr.as_ptr(), + ); + + cleanup_cuda_unchecked_first_index_in_clears_64(streams.ffi(), std::ptr::addr_of_mut!(mem_ptr)); + + update_noise_degree(index_ct, &ffi_index); + update_noise_degree(&mut match_ct.0.ciphertext, &ffi_match); +} + +#[allow(clippy::too_many_arguments)] +/// # Safety +/// +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. +pub(crate) unsafe fn cuda_backend_unchecked_first_index_of_clear< + T: UnsignedInteger, + B: Numeric, + C: CudaIntegerRadixCiphertext, + Clear: UnsignedInteger + DecomposableInto + CastInto + Sync + Send, +>( + streams: &CudaStreams, + index_ct: &mut CudaRadixCiphertext, + match_ct: &mut CudaBooleanBlock, + inputs: &[C], + clear: Clear, + bootstrapping_key: &CudaVec, + keyswitch_key: &CudaVec, + message_modulus: MessageModulus, + carry_modulus: CarryModulus, + glwe_dimension: GlweDimension, + polynomial_size: PolynomialSize, + big_lwe_dimension: LweDimension, + small_lwe_dimension: LweDimension, + ks_level: DecompositionLevelCount, + ks_base_log: DecompositionBaseLog, + pbs_level: DecompositionLevelCount, + pbs_base_log: DecompositionBaseLog, + pbs_type: PBSType, + grouping_factor: LweBskGroupingFactor, + ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>, +) { + assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0)); + assert_eq!(streams.gpu_indexes[0], keyswitch_key.gpu_index(0)); + + let num_inputs = inputs.len() as u32; + let num_blocks = inputs[0].as_ref().d_blocks.lwe_ciphertext_count().0 as u32; + let num_blocks_index = index_ct.d_blocks.lwe_ciphertext_count().0 as u32; + let num_bits_in_message = message_modulus.0.ilog2(); + + let h_clear_blocks: Vec = BlockDecomposer::new(clear, num_bits_in_message) + .take(num_blocks as usize) + 
.map(|block_value| block_value.cast_into()) + .collect(); + + let noise_reduction_type = resolve_ms_noise_reduction_config(ms_noise_reduction_configuration); + + let mut index_degrees = index_ct + .info + .blocks + .iter() + .map(|b| b.degree.get()) + .collect(); + let mut index_noise_levels = index_ct + .info + .blocks + .iter() + .map(|b| b.noise_level.0) + .collect(); + let mut ffi_index = + prepare_cuda_radix_ffi(index_ct, &mut index_degrees, &mut index_noise_levels); + + let mut match_degrees = vec![match_ct.0.ciphertext.info.blocks[0].degree.get()]; + let mut match_noise_levels = vec![match_ct.0.ciphertext.info.blocks[0].noise_level.0]; + let mut ffi_match = prepare_cuda_radix_ffi( + &match_ct.0.ciphertext, + &mut match_degrees, + &mut match_noise_levels, + ); + + let mut ffi_inputs_degrees: Vec> = Vec::with_capacity(inputs.len()); + let mut ffi_inputs_noise_levels: Vec> = Vec::with_capacity(inputs.len()); + let ffi_inputs: Vec = inputs + .iter() + .map(|ct| { + let degrees = ct + .as_ref() + .info + .blocks + .iter() + .map(|b| b.degree.get()) + .collect(); + let noise_levels = ct + .as_ref() + .info + .blocks + .iter() + .map(|b| b.noise_level.0) + .collect(); + ffi_inputs_degrees.push(degrees); + ffi_inputs_noise_levels.push(noise_levels); + + prepare_cuda_radix_ffi( + ct.as_ref(), + ffi_inputs_degrees.last_mut().unwrap(), + ffi_inputs_noise_levels.last_mut().unwrap(), + ) + }) + .collect(); + + let mut mem_ptr: *mut i8 = std::ptr::null_mut(); + + scratch_cuda_unchecked_first_index_of_clear_64( + streams.ffi(), + std::ptr::addr_of_mut!(mem_ptr), + glwe_dimension.0 as u32, + polynomial_size.0 as u32, + big_lwe_dimension.0 as u32, + small_lwe_dimension.0 as u32, + ks_level.0 as u32, + ks_base_log.0 as u32, + pbs_level.0 as u32, + pbs_base_log.0 as u32, + grouping_factor.0 as u32, + num_inputs, + num_blocks, + num_blocks_index, + message_modulus.0 as u32, + carry_modulus.0 as u32, + pbs_type as u32, + true, + noise_reduction_type as u32, + ); + + 
cuda_unchecked_first_index_of_clear_64( + streams.ffi(), + &raw mut ffi_index, + &raw mut ffi_match, + ffi_inputs.as_ptr(), + h_clear_blocks.as_ptr(), + num_inputs, + num_blocks, + num_blocks_index, + mem_ptr, + bootstrapping_key.ptr.as_ptr(), + keyswitch_key.ptr.as_ptr(), + ); + + cleanup_cuda_unchecked_first_index_of_clear_64(streams.ffi(), std::ptr::addr_of_mut!(mem_ptr)); + + update_noise_degree(index_ct, &ffi_index); + update_noise_degree(&mut match_ct.0.ciphertext, &ffi_match); +} + +#[allow(clippy::too_many_arguments)] +/// # Safety +/// +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. +pub(crate) unsafe fn cuda_backend_unchecked_first_index_of< + T: UnsignedInteger, + B: Numeric, + C: CudaIntegerRadixCiphertext, +>( + streams: &CudaStreams, + index_ct: &mut CudaRadixCiphertext, + match_ct: &mut CudaBooleanBlock, + inputs: &[C], + value: &CudaRadixCiphertext, + bootstrapping_key: &CudaVec, + keyswitch_key: &CudaVec, + message_modulus: MessageModulus, + carry_modulus: CarryModulus, + glwe_dimension: GlweDimension, + polynomial_size: PolynomialSize, + big_lwe_dimension: LweDimension, + small_lwe_dimension: LweDimension, + ks_level: DecompositionLevelCount, + ks_base_log: DecompositionBaseLog, + pbs_level: DecompositionLevelCount, + pbs_base_log: DecompositionBaseLog, + pbs_type: PBSType, + grouping_factor: LweBskGroupingFactor, + ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>, +) { + assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0)); + assert_eq!(streams.gpu_indexes[0], keyswitch_key.gpu_index(0)); + assert_eq!(streams.gpu_indexes[0], value.d_blocks.0.d_vec.gpu_index(0)); + + let num_inputs = inputs.len() as u32; + let num_blocks = value.d_blocks.lwe_ciphertext_count().0 as u32; + let num_blocks_index = index_ct.d_blocks.lwe_ciphertext_count().0 as u32; + 
+ let noise_reduction_type = resolve_ms_noise_reduction_config(ms_noise_reduction_configuration); + + let mut index_degrees = index_ct + .info + .blocks + .iter() + .map(|b| b.degree.get()) + .collect(); + let mut index_noise_levels = index_ct + .info + .blocks + .iter() + .map(|b| b.noise_level.0) + .collect(); + let mut ffi_index = + prepare_cuda_radix_ffi(index_ct, &mut index_degrees, &mut index_noise_levels); + + let mut match_degrees = vec![match_ct.0.ciphertext.info.blocks[0].degree.get()]; + let mut match_noise_levels = vec![match_ct.0.ciphertext.info.blocks[0].noise_level.0]; + let mut ffi_match = prepare_cuda_radix_ffi( + &match_ct.0.ciphertext, + &mut match_degrees, + &mut match_noise_levels, + ); + + let mut value_degrees: Vec = value.info.blocks.iter().map(|b| b.degree.get()).collect(); + let mut value_noise_levels: Vec = + value.info.blocks.iter().map(|b| b.noise_level.0).collect(); + let ffi_value = prepare_cuda_radix_ffi(value, &mut value_degrees, &mut value_noise_levels); + + let mut ffi_inputs_degrees: Vec> = Vec::with_capacity(inputs.len()); + let mut ffi_inputs_noise_levels: Vec> = Vec::with_capacity(inputs.len()); + let ffi_inputs: Vec = inputs + .iter() + .map(|ct| { + let degrees = ct + .as_ref() + .info + .blocks + .iter() + .map(|b| b.degree.get()) + .collect(); + let noise_levels = ct + .as_ref() + .info + .blocks + .iter() + .map(|b| b.noise_level.0) + .collect(); + ffi_inputs_degrees.push(degrees); + ffi_inputs_noise_levels.push(noise_levels); + + prepare_cuda_radix_ffi( + ct.as_ref(), + ffi_inputs_degrees.last_mut().unwrap(), + ffi_inputs_noise_levels.last_mut().unwrap(), + ) + }) + .collect(); + + let mut mem_ptr: *mut i8 = std::ptr::null_mut(); + + scratch_cuda_unchecked_first_index_of_64( + streams.ffi(), + std::ptr::addr_of_mut!(mem_ptr), + glwe_dimension.0 as u32, + polynomial_size.0 as u32, + big_lwe_dimension.0 as u32, + small_lwe_dimension.0 as u32, + ks_level.0 as u32, + ks_base_log.0 as u32, + pbs_level.0 as u32, + 
pbs_base_log.0 as u32, + grouping_factor.0 as u32, + num_inputs, + num_blocks, + num_blocks_index, + message_modulus.0 as u32, + carry_modulus.0 as u32, + pbs_type as u32, + true, + noise_reduction_type as u32, + ); + + cuda_unchecked_first_index_of_64( + streams.ffi(), + &raw mut ffi_index, + &raw mut ffi_match, + ffi_inputs.as_ptr(), + &raw const ffi_value, + num_inputs, + num_blocks, + num_blocks_index, + mem_ptr, + bootstrapping_key.ptr.as_ptr(), + keyswitch_key.ptr.as_ptr(), + ); + + cleanup_cuda_unchecked_first_index_of_64(streams.ffi(), std::ptr::addr_of_mut!(mem_ptr)); + + update_noise_degree(index_ct, &ffi_index); + update_noise_degree(&mut match_ct.0.ciphertext, &ffi_match); +} + +#[allow(clippy::too_many_arguments)] +/// # Safety +/// +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. +pub(crate) unsafe fn cuda_backend_unchecked_index_of< + T: UnsignedInteger, + B: Numeric, + C: CudaIntegerRadixCiphertext, +>( + streams: &CudaStreams, + index_ct: &mut CudaRadixCiphertext, + match_ct: &mut CudaBooleanBlock, + inputs: &[C], + value: &CudaRadixCiphertext, + bootstrapping_key: &CudaVec, + keyswitch_key: &CudaVec, + message_modulus: MessageModulus, + carry_modulus: CarryModulus, + glwe_dimension: GlweDimension, + polynomial_size: PolynomialSize, + big_lwe_dimension: LweDimension, + small_lwe_dimension: LweDimension, + ks_level: DecompositionLevelCount, + ks_base_log: DecompositionBaseLog, + pbs_level: DecompositionLevelCount, + pbs_base_log: DecompositionBaseLog, + pbs_type: PBSType, + grouping_factor: LweBskGroupingFactor, + ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>, +) { + assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0)); + assert_eq!(streams.gpu_indexes[0], keyswitch_key.gpu_index(0)); + assert_eq!(streams.gpu_indexes[0], 
value.d_blocks.0.d_vec.gpu_index(0)); + + let num_inputs = inputs.len() as u32; + let num_blocks = value.d_blocks.lwe_ciphertext_count().0 as u32; + let num_blocks_index = index_ct.d_blocks.lwe_ciphertext_count().0 as u32; + + let noise_reduction_type = resolve_ms_noise_reduction_config(ms_noise_reduction_configuration); + + let mut index_degrees = index_ct + .info + .blocks + .iter() + .map(|b| b.degree.get()) + .collect(); + let mut index_noise_levels = index_ct + .info + .blocks + .iter() + .map(|b| b.noise_level.0) + .collect(); + let mut ffi_index = + prepare_cuda_radix_ffi(index_ct, &mut index_degrees, &mut index_noise_levels); + + let mut match_degrees = vec![match_ct.0.ciphertext.info.blocks[0].degree.get()]; + let mut match_noise_levels = vec![match_ct.0.ciphertext.info.blocks[0].noise_level.0]; + let mut ffi_match = prepare_cuda_radix_ffi( + &match_ct.0.ciphertext, + &mut match_degrees, + &mut match_noise_levels, + ); + + let mut value_degrees: Vec = value.info.blocks.iter().map(|b| b.degree.get()).collect(); + let mut value_noise_levels: Vec = + value.info.blocks.iter().map(|b| b.noise_level.0).collect(); + let ffi_value = prepare_cuda_radix_ffi(value, &mut value_degrees, &mut value_noise_levels); + + let mut ffi_inputs_degrees: Vec> = Vec::with_capacity(inputs.len()); + let mut ffi_inputs_noise_levels: Vec> = Vec::with_capacity(inputs.len()); + let ffi_inputs: Vec = inputs + .iter() + .map(|ct| { + let degrees = ct + .as_ref() + .info + .blocks + .iter() + .map(|b| b.degree.get()) + .collect(); + let noise_levels = ct + .as_ref() + .info + .blocks + .iter() + .map(|b| b.noise_level.0) + .collect(); + ffi_inputs_degrees.push(degrees); + ffi_inputs_noise_levels.push(noise_levels); + + prepare_cuda_radix_ffi( + ct.as_ref(), + ffi_inputs_degrees.last_mut().unwrap(), + ffi_inputs_noise_levels.last_mut().unwrap(), + ) + }) + .collect(); + + let mut mem_ptr: *mut i8 = std::ptr::null_mut(); + + scratch_cuda_unchecked_index_of_64( + streams.ffi(), + 
std::ptr::addr_of_mut!(mem_ptr), + glwe_dimension.0 as u32, + polynomial_size.0 as u32, + big_lwe_dimension.0 as u32, + small_lwe_dimension.0 as u32, + ks_level.0 as u32, + ks_base_log.0 as u32, + pbs_level.0 as u32, + pbs_base_log.0 as u32, + grouping_factor.0 as u32, + num_inputs, + num_blocks, + num_blocks_index, + message_modulus.0 as u32, + carry_modulus.0 as u32, + pbs_type as u32, + true, + noise_reduction_type as u32, + ); + + cuda_unchecked_index_of_64( + streams.ffi(), + &raw mut ffi_index, + &raw mut ffi_match, + ffi_inputs.as_ptr(), + &raw const ffi_value, + num_inputs, + num_blocks, + num_blocks_index, + mem_ptr, + bootstrapping_key.ptr.as_ptr(), + keyswitch_key.ptr.as_ptr(), + ); + + cleanup_cuda_unchecked_index_of_64(streams.ffi(), std::ptr::addr_of_mut!(mem_ptr)); + + update_noise_degree(index_ct, &ffi_index); + update_noise_degree(&mut match_ct.0.ciphertext, &ffi_match); +} diff --git a/tfhe/src/integer/gpu/server_key/radix/mod.rs b/tfhe/src/integer/gpu/server_key/radix/mod.rs index e76ae5b1b..e0f7bcef3 100644 --- a/tfhe/src/integer/gpu/server_key/radix/mod.rs +++ b/tfhe/src/integer/gpu/server_key/radix/mod.rs @@ -18,7 +18,7 @@ use crate::integer::gpu::server_key::CudaBootstrappingKey; use crate::integer::gpu::{ cuda_backend_apply_bivariate_lut, cuda_backend_apply_many_univariate_lut, cuda_backend_apply_univariate_lut, cuda_backend_cast_to_unsigned, - cuda_backend_compute_prefix_sum_hillis_steele, cuda_backend_extend_radix_with_sign_msb, + cuda_backend_extend_radix_with_sign_msb, cuda_backend_extend_radix_with_trivial_zero_blocks_msb, cuda_backend_full_propagate_assign, cuda_backend_noise_squashing, cuda_backend_propagate_single_carry_assign, cuda_backend_trim_radix_blocks_lsb, cuda_backend_trim_radix_blocks_msb, CudaServerKey, PBSType, @@ -1094,134 +1094,6 @@ impl CudaServerKey { ciphertexts } - /// Applies the lookup table on the range of ciphertexts - /// - /// The output must have exactly block_range.len() blocks - pub(crate) fn 
compute_prefix_sum_hillis_steele( - &self, - output: &mut CudaRadixCiphertext, - generates_or_propagates: &mut CudaRadixCiphertext, - lut: &BivariateLookupTableOwned, - block_range: std::ops::Range, - streams: &CudaStreams, - ) { - if block_range.is_empty() { - return; - } - assert_eq!( - generates_or_propagates.d_blocks.lwe_dimension(), - output.d_blocks.lwe_dimension() - ); - - let lwe_dimension = generates_or_propagates.d_blocks.lwe_dimension(); - let lwe_size = lwe_dimension.to_lwe_size().0; - let num_blocks = block_range.len(); - - let mut generates_or_propagates_slice = generates_or_propagates - .d_blocks - .0 - .d_vec - .as_mut_slice(lwe_size * block_range.start..lwe_size * block_range.end, 0) - .unwrap(); - let mut generates_or_propagates_degrees = vec![0; num_blocks]; - let mut generates_or_propagates_noise_levels = vec![0; num_blocks]; - for (i, block_index) in (block_range.clone()).enumerate() { - generates_or_propagates_degrees[i] = - generates_or_propagates.info.blocks[block_index].degree.0; - generates_or_propagates_noise_levels[i] = generates_or_propagates.info.blocks - [block_index] - .noise_level - .0; - } - let mut output_slice = output - .d_blocks - .0 - .d_vec - .as_mut_slice(lwe_size * block_range.start..lwe_size * block_range.end, 0) - .unwrap(); - let mut output_degrees = vec![0_u64; num_blocks]; - let mut output_noise_levels = vec![0_u64; num_blocks]; - unsafe { - match &self.bootstrapping_key { - CudaBootstrappingKey::Classic(d_bsk) => { - cuda_backend_compute_prefix_sum_hillis_steele( - streams, - &mut output_slice, - &mut output_degrees, - &mut output_noise_levels, - &mut generates_or_propagates_slice, - &mut generates_or_propagates_degrees, - &mut generates_or_propagates_noise_levels, - lut.acc.acc.as_ref(), - lut.acc.degree.0, - &d_bsk.d_vec, - &self.key_switching_key.d_vec, - self.key_switching_key - .output_key_lwe_size() - .to_lwe_dimension(), - d_bsk.glwe_dimension, - d_bsk.polynomial_size, - 
self.key_switching_key.decomposition_level_count(), - self.key_switching_key.decomposition_base_log(), - d_bsk.decomp_level_count, - d_bsk.decomp_base_log, - num_blocks as u32, - self.message_modulus, - self.carry_modulus, - PBSType::Classical, - LweBskGroupingFactor(0), - d_bsk.ms_noise_reduction_configuration.as_ref(), - ); - } - CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { - cuda_backend_compute_prefix_sum_hillis_steele( - streams, - &mut output_slice, - &mut output_degrees, - &mut output_noise_levels, - &mut generates_or_propagates_slice, - &mut generates_or_propagates_degrees, - &mut generates_or_propagates_noise_levels, - lut.acc.acc.as_ref(), - lut.acc.degree.0, - &d_multibit_bsk.d_vec, - &self.key_switching_key.d_vec, - self.key_switching_key - .output_key_lwe_size() - .to_lwe_dimension(), - d_multibit_bsk.glwe_dimension, - d_multibit_bsk.polynomial_size, - self.key_switching_key.decomposition_level_count(), - self.key_switching_key.decomposition_base_log(), - d_multibit_bsk.decomp_level_count, - d_multibit_bsk.decomp_base_log, - num_blocks as u32, - self.message_modulus, - self.carry_modulus, - PBSType::MultiBit, - d_multibit_bsk.grouping_factor, - None, - ); - } - } - } - - for (i, info) in output.info.blocks[block_range.start..block_range.end] - .iter_mut() - .enumerate() - { - info.degree = Degree(output_degrees[i]); - info.noise_level = NoiseLevel(output_noise_levels[i]); - } - for (i, info) in generates_or_propagates.info.blocks[block_range.start..block_range.end] - .iter_mut() - .enumerate() - { - info.degree = Degree(generates_or_propagates_degrees[i]); - info.noise_level = NoiseLevel(generates_or_propagates_noise_levels[i]); - } - } - pub(crate) fn extend_radix_with_sign_msb( &self, ct: &T, diff --git a/tfhe/src/integer/gpu/server_key/radix/vector_find.rs b/tfhe/src/integer/gpu/server_key/radix/vector_find.rs index 9c45e5944..26a2b0546 100644 --- a/tfhe/src/integer/gpu/server_key/radix/vector_find.rs +++ 
b/tfhe/src/integer/gpu/server_key/radix/vector_find.rs @@ -1,98 +1,24 @@ -use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList; use crate::core_crypto::gpu::CudaStreams; use crate::core_crypto::prelude::{LweBskGroupingFactor, UnsignedInteger}; -use crate::integer::block_decomposition::{BlockDecomposer, Decomposable, DecomposableInto}; +use crate::integer::block_decomposition::DecomposableInto; use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock; -use crate::integer::gpu::ciphertext::info::{CudaBlockInfo, CudaRadixCiphertextInfo}; use crate::integer::gpu::ciphertext::{CudaIntegerRadixCiphertext, CudaUnsignedRadixCiphertext}; -use crate::integer::gpu::server_key::radix::CudaRadixCiphertext; use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey}; use crate::integer::gpu::{ - cuda_backend_aggregate_one_hot_vector, cuda_backend_compute_equality_selectors, - cuda_backend_create_possible_results, cuda_backend_get_unchecked_match_value_or_size_on_gpu, - cuda_backend_get_unchecked_match_value_size_on_gpu, cuda_backend_unchecked_match_value, + cuda_backend_compute_final_index_from_selectors, + cuda_backend_get_unchecked_match_value_or_size_on_gpu, + cuda_backend_get_unchecked_match_value_size_on_gpu, cuda_backend_unchecked_contains, + cuda_backend_unchecked_contains_clear, cuda_backend_unchecked_first_index_in_clears, + cuda_backend_unchecked_first_index_of, cuda_backend_unchecked_first_index_of_clear, + cuda_backend_unchecked_index_in_clears, cuda_backend_unchecked_index_of, + cuda_backend_unchecked_is_in_clears, cuda_backend_unchecked_match_value, cuda_backend_unchecked_match_value_or, PBSType, }; pub use crate::integer::server_key::radix_parallel::MatchValues; use crate::prelude::CastInto; -use itertools::Itertools; -use rayon::prelude::*; use std::hash::Hash; impl CudaServerKey { - #[allow(clippy::unused_self)] - pub(crate) fn convert_selectors_to_unsigned_radix_ciphertext( - &self, - selectors: &[CudaBooleanBlock], 
- streams: &CudaStreams, - ) -> CudaUnsignedRadixCiphertext { - if selectors.is_empty() { - return self.create_trivial_radix(0, 1, streams); - } - let packed_list = CudaLweCiphertextList::from_vec_cuda_lwe_ciphertexts_list( - selectors - .iter() - .map(|ciphertext| &ciphertext.0.ciphertext.d_blocks), - streams, - ); - let vec_block_info: Vec = selectors - .iter() - .flat_map(|ct| ct.0.ciphertext.info.blocks.clone()) - .collect(); - let radix_info = CudaRadixCiphertextInfo { - blocks: vec_block_info, - }; - CudaIntegerRadixCiphertext::from(CudaRadixCiphertext { - d_blocks: packed_list, - info: radix_info, - }) - } - - pub(crate) fn convert_unsigned_radix_ciphertext_to_selectors( - &self, - ct: &mut CudaUnsignedRadixCiphertext, - streams: &CudaStreams, - ) -> Vec { - let num_blocks = ct.as_ref().d_blocks.lwe_ciphertext_count().0; - let lwe_size = ct.as_ref().d_blocks.lwe_dimension().to_lwe_size().0; - let mut unpacked_selectors = Vec::::with_capacity(num_blocks); - for i in 0..num_blocks { - let mut radix_ct: CudaUnsignedRadixCiphertext = - self.create_trivial_radix(0, 1, streams); - let slice_in = ct - .as_mut() - .d_blocks - .0 - .d_vec - .as_mut_slice(i * lwe_size..(i + 1) * lwe_size, 0) - .unwrap(); - let mut slice_out = radix_ct - .as_mut() - .d_blocks - .0 - .d_vec - .as_mut_slice(0..lwe_size, 0) - .unwrap(); - unsafe { - slice_out.copy_from_gpu_async(&slice_in, streams, 0); - streams.synchronize(); - } - let boolean_block = CudaBooleanBlock::from_cuda_radix_ciphertext(radix_ct.into_inner()); - - unpacked_selectors.push(boolean_block); - } - unpacked_selectors - } - - /// `match` an input value to an output value - /// - /// - Input values are not required to span all possible values that `ct` could hold. 
- /// - /// - The output radix has a number of blocks that depends on the maximum possible output value - /// from the `MatchValues` - /// - /// Returns a boolean block that encrypts `true` if the input `ct` - /// matched one of the possible inputs pub fn unchecked_match_value( &self, ct: &CudaUnsignedRadixCiphertext, @@ -110,21 +36,6 @@ impl CudaServerKey { return (trivial_ct, trivial_bool); } - let num_input_blocks = ct.as_ref().d_blocks.lwe_ciphertext_count().0 as u32; - let num_bits_in_message = self.message_modulus.0.ilog2(); - - let h_match_inputs: Vec = matches - .get_values() - .par_iter() - .map(|(input, _output)| *input) - .flat_map(|input_value| { - BlockDecomposer::new(input_value, num_bits_in_message) - .take(num_input_blocks as usize) - .map(|block_value| block_value.cast_into()) - .collect::>() - }) - .collect::>(); - let max_output_value = matches .get_values() .iter() @@ -135,19 +46,6 @@ impl CudaServerKey { let num_output_unpacked_blocks = self.num_blocks_to_represent_unsigned_value(max_output_value); - let num_output_packed_blocks = num_output_unpacked_blocks.div_ceil(2) as u32; - - let h_match_outputs: Vec = matches - .get_values() - .par_iter() - .map(|(_input, output)| *output) - .flat_map(|output_value| { - BlockDecomposer::new(output_value, 2 * num_bits_in_message) - .take(num_output_packed_blocks as usize) - .map(|block_value| block_value.cast_into()) - .collect::>() - }) - .collect::>(); let mut result_ct: CudaUnsignedRadixCiphertext = self.create_trivial_zero_radix(num_output_unpacked_blocks, streams); @@ -155,9 +53,6 @@ impl CudaServerKey { self.create_trivial_zero_radix::(1, streams), ); - let max_output_is_zero = max_output_value == Clear::ZERO; - let num_matches = matches.get_values().len() as u32; - unsafe { match &self.bootstrapping_key { CudaBootstrappingKey::Classic(d_bsk) => { @@ -166,12 +61,7 @@ impl CudaServerKey { &mut result_ct, &mut result_bool, ct.as_ref(), - &h_match_inputs, - &h_match_outputs, - num_matches, - 
num_input_blocks, - num_output_packed_blocks, - max_output_is_zero, + matches, self.message_modulus, self.carry_modulus, &d_bsk.d_vec, @@ -199,12 +89,7 @@ impl CudaServerKey { &mut result_ct, &mut result_bool, ct.as_ref(), - &h_match_inputs, - &h_match_outputs, - num_matches, - num_input_blocks, - num_output_packed_blocks, - max_output_is_zero, + matches, self.message_modulus, self.carry_modulus, &d_multibit_bsk.d_vec, @@ -239,33 +124,19 @@ impl CudaServerKey { streams: &CudaStreams, ) -> u64 where - Clear: UnsignedInteger + DecomposableInto + CastInto, + Clear: + UnsignedInteger + DecomposableInto + CastInto + CastInto + Sync + Send, { if matches.get_values().is_empty() { return 0; } - let num_input_blocks = ct.as_ref().d_blocks.lwe_ciphertext_count().0 as u32; - - let max_output_value = matches - .get_values() - .iter() - .copied() - .max_by(|(_, outputl), (_, outputr)| outputl.cmp(outputr)) - .expect("luts is not empty at this point") - .1; - - let num_output_unpacked_blocks = - self.num_blocks_to_represent_unsigned_value(max_output_value); - let num_output_packed_blocks = num_output_unpacked_blocks.div_ceil(2) as u32; - - let max_output_is_zero = max_output_value == Clear::ZERO; - let num_matches = matches.get_values().len() as u32; - match &self.bootstrapping_key { CudaBootstrappingKey::Classic(d_bsk) => { cuda_backend_get_unchecked_match_value_size_on_gpu( streams, + ct.as_ref(), + matches, d_bsk.glwe_dimension, d_bsk.polynomial_size, self.key_switching_key @@ -282,16 +153,14 @@ impl CudaServerKey { self.message_modulus, self.carry_modulus, PBSType::Classical, - num_matches, - num_input_blocks, - num_output_packed_blocks, - max_output_is_zero, d_bsk.ms_noise_reduction_configuration.as_ref(), ) } CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { cuda_backend_get_unchecked_match_value_size_on_gpu( streams, + ct.as_ref(), + matches, d_multibit_bsk.glwe_dimension, d_multibit_bsk.polynomial_size, self.key_switching_key @@ -308,10 +177,6 @@ impl CudaServerKey { 
self.message_modulus, self.carry_modulus, PBSType::MultiBit, - num_matches, - num_input_blocks, - num_output_packed_blocks, - max_output_is_zero, None, ) } @@ -408,7 +273,7 @@ impl CudaServerKey { streams: &CudaStreams, ) -> CudaUnsignedRadixCiphertext where - Clear: UnsignedInteger + DecomposableInto + CastInto, + Clear: UnsignedInteger + DecomposableInto + CastInto + CastInto, { if matches.get_values().is_empty() { let num_blocks = self.num_blocks_to_represent_unsigned_value(or_value); @@ -417,21 +282,6 @@ impl CudaServerKey { return ct; } - let num_input_blocks = ct.as_ref().d_blocks.lwe_ciphertext_count().0 as u32; - let num_bits_in_message = self.message_modulus.0.ilog2(); - - let h_match_inputs: Vec = matches - .get_values() - .par_iter() - .map(|(input, _output)| *input) - .flat_map(|input_value| { - BlockDecomposer::new(input_value, num_bits_in_message) - .take(num_input_blocks as usize) - .map(|block_value| block_value.cast_into()) - .collect::>() - }) - .collect::>(); - let max_output_value_match = matches .get_values() .iter() @@ -444,31 +294,9 @@ impl CudaServerKey { let num_blocks_or = self.num_blocks_to_represent_unsigned_value(or_value); let final_num_blocks = num_blocks_match.max(num_blocks_or); - let num_match_packed_blocks = num_blocks_match.div_ceil(2) as u32; - - let h_match_outputs: Vec = matches - .get_values() - .par_iter() - .map(|(_input, output)| *output) - .flat_map(|output_value| { - BlockDecomposer::new(output_value, 2 * num_bits_in_message) - .take(num_match_packed_blocks as usize) - .map(|block_value| block_value.cast_into()) - .collect::>() - }) - .collect::>(); - - let h_or_value: Vec = BlockDecomposer::new(or_value, num_bits_in_message) - .take(final_num_blocks) - .map(|block_value| block_value.cast_into()) - .collect(); - let mut result: CudaUnsignedRadixCiphertext = self.create_trivial_zero_radix(final_num_blocks, streams); - let max_output_is_zero = max_output_value_match == Clear::ZERO; - let num_matches = 
matches.get_values().len() as u32; - unsafe { match &self.bootstrapping_key { CudaBootstrappingKey::Classic(d_bsk) => { @@ -476,14 +304,8 @@ impl CudaServerKey { streams, &mut result, ct.as_ref(), - &h_match_inputs, - &h_match_outputs, - &h_or_value, - num_matches, - num_input_blocks, - num_match_packed_blocks, - final_num_blocks as u32, - max_output_is_zero, + matches, + or_value, self.message_modulus, self.carry_modulus, &d_bsk.d_vec, @@ -510,14 +332,8 @@ impl CudaServerKey { streams, &mut result, ct.as_ref(), - &h_match_inputs, - &h_match_outputs, - &h_or_value, - num_matches, - num_input_blocks, - num_match_packed_blocks, - final_num_blocks as u32, - max_output_is_zero, + matches, + or_value, self.message_modulus, self.carry_modulus, &d_multibit_bsk.d_vec, @@ -553,35 +369,20 @@ impl CudaServerKey { streams: &CudaStreams, ) -> u64 where - Clear: UnsignedInteger + DecomposableInto + CastInto, + Clear: + UnsignedInteger + DecomposableInto + CastInto + CastInto + Sync + Send, { if matches.get_values().is_empty() { return 0; } - let num_input_blocks = ct.as_ref().d_blocks.lwe_ciphertext_count().0 as u32; - - let max_output_value_match = matches - .get_values() - .iter() - .copied() - .max_by(|(_, outputl), (_, outputr)| outputl.cmp(outputr)) - .expect("luts is not empty at this point") - .1; - - let num_blocks_match = self.num_blocks_to_represent_unsigned_value(max_output_value_match); - let num_blocks_or = self.num_blocks_to_represent_unsigned_value(or_value); - let final_num_blocks = num_blocks_match.max(num_blocks_or); - - let num_match_packed_blocks = num_blocks_match.div_ceil(2) as u32; - - let max_output_is_zero = max_output_value_match == Clear::ZERO; - let num_matches = matches.get_values().len() as u32; - match &self.bootstrapping_key { CudaBootstrappingKey::Classic(d_bsk) => { cuda_backend_get_unchecked_match_value_or_size_on_gpu( streams, + ct.as_ref(), + matches, + or_value, d_bsk.glwe_dimension, d_bsk.polynomial_size, self.key_switching_key @@ -598,17 
+399,15 @@ impl CudaServerKey { self.message_modulus, self.carry_modulus, PBSType::Classical, - num_matches, - num_input_blocks, - num_match_packed_blocks, - final_num_blocks as u32, - max_output_is_zero, d_bsk.ms_noise_reduction_configuration.as_ref(), ) } CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { cuda_backend_get_unchecked_match_value_or_size_on_gpu( streams, + ct.as_ref(), + matches, + or_value, d_multibit_bsk.glwe_dimension, d_multibit_bsk.polynomial_size, self.key_switching_key @@ -625,11 +424,6 @@ impl CudaServerKey { self.message_modulus, self.carry_modulus, PBSType::MultiBit, - num_matches, - num_input_blocks, - num_match_packed_blocks, - final_num_blocks as u32, - max_output_is_zero, None, ) } @@ -721,15 +515,71 @@ impl CudaServerKey { let d_ct: CudaUnsignedRadixCiphertext = self.create_trivial_radix(0, 1, streams); return CudaBooleanBlock::from_cuda_radix_ciphertext(d_ct.ciphertext); } - //Here It would be better to launch them in parallel maybe using different streams or - // packed them in a vector - let selectors = cts - .iter() - .map(|ct| self.eq(ct, value, streams)) - .collect::>(); - let packed_ct = self.convert_selectors_to_unsigned_radix_ciphertext(&selectors, streams); - self.unchecked_is_at_least_one_comparisons_block_true(&packed_ct, streams) + let mut result = CudaBooleanBlock::from_cuda_radix_ciphertext( + self.create_trivial_zero_radix::(1, streams) + .into_inner(), + ); + + unsafe { + match &self.bootstrapping_key { + CudaBootstrappingKey::Classic(d_bsk) => { + cuda_backend_unchecked_contains( + streams, + &mut result, + cts, + value.as_ref(), + &d_bsk.d_vec, + &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, + d_bsk.glwe_dimension, + d_bsk.polynomial_size, + self.key_switching_key + .input_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key + .output_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), 
+ d_bsk.decomp_level_count, + d_bsk.decomp_base_log, + PBSType::Classical, + LweBskGroupingFactor(0), + d_bsk.ms_noise_reduction_configuration.as_ref(), + ); + } + CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { + cuda_backend_unchecked_contains( + streams, + &mut result, + cts, + value.as_ref(), + &d_multibit_bsk.d_vec, + &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, + d_multibit_bsk.glwe_dimension, + d_multibit_bsk.polynomial_size, + self.key_switching_key + .input_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key + .output_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_multibit_bsk.decomp_level_count, + d_multibit_bsk.decomp_base_log, + PBSType::MultiBit, + d_multibit_bsk.grouping_factor, + None, + ); + } + } + } + result } /// Returns an encrypted `true` if the encrypted `value` is found in the encrypted slice @@ -821,13 +671,71 @@ impl CudaServerKey { ); return trivial_bool; } - let selectors = cts - .iter() - .map(|ct| self.scalar_eq(ct, clear, streams)) - .collect::>(); - let packed_ct = self.convert_selectors_to_unsigned_radix_ciphertext(&selectors, streams); - self.unchecked_is_at_least_one_comparisons_block_true(&packed_ct, streams) + let mut result = CudaBooleanBlock::from_cuda_radix_ciphertext( + self.create_trivial_zero_radix::(1, streams) + .into_inner(), + ); + + unsafe { + match &self.bootstrapping_key { + CudaBootstrappingKey::Classic(d_bsk) => { + cuda_backend_unchecked_contains_clear( + streams, + &mut result, + cts, + clear, + &d_bsk.d_vec, + &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, + d_bsk.glwe_dimension, + d_bsk.polynomial_size, + self.key_switching_key + .input_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key + .output_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key.decomposition_level_count(), + 
self.key_switching_key.decomposition_base_log(), + d_bsk.decomp_level_count, + d_bsk.decomp_base_log, + PBSType::Classical, + LweBskGroupingFactor(0), + d_bsk.ms_noise_reduction_configuration.as_ref(), + ); + } + CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { + cuda_backend_unchecked_contains_clear( + streams, + &mut result, + cts, + clear, + &d_multibit_bsk.d_vec, + &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, + d_multibit_bsk.glwe_dimension, + d_multibit_bsk.polynomial_size, + self.key_switching_key + .input_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key + .output_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_multibit_bsk.decomp_level_count, + d_multibit_bsk.decomp_base_log, + PBSType::MultiBit, + d_multibit_bsk.grouping_factor, + None, + ); + } + } + } + result } /// Returns an encrypted `true` if the clear `value` is found in the encrypted slice @@ -903,7 +811,7 @@ impl CudaServerKey { ) -> CudaBooleanBlock where T: CudaIntegerRadixCiphertext, - Clear: DecomposableInto + CastInto, + Clear: UnsignedInteger + DecomposableInto + CastInto + Sync + Send, { if clears.is_empty() { let trivial_ct: CudaUnsignedRadixCiphertext = self.create_trivial_radix(0, 1, streams); @@ -912,10 +820,69 @@ impl CudaServerKey { ); return trivial_bool; } - let selectors = self.compute_equality_selectors(ct, clears.par_iter().copied(), streams); - let blocks_ct = self.convert_selectors_to_unsigned_radix_ciphertext(&selectors, streams); - self.unchecked_is_at_least_one_comparisons_block_true(&blocks_ct, streams) + let ct_res: CudaUnsignedRadixCiphertext = self.create_trivial_radix(0, 1, streams); + let mut boolean_res = CudaBooleanBlock::from_cuda_radix_ciphertext(ct_res.into_inner()); + + unsafe { + match &self.bootstrapping_key { + CudaBootstrappingKey::Classic(d_bsk) => { + cuda_backend_unchecked_is_in_clears( + streams, + &mut 
boolean_res, + ct.as_ref(), + clears, + &d_bsk.d_vec, + &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, + d_bsk.glwe_dimension, + d_bsk.polynomial_size, + self.key_switching_key + .input_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key + .output_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_bsk.decomp_level_count, + d_bsk.decomp_base_log, + PBSType::Classical, + LweBskGroupingFactor(0), + d_bsk.ms_noise_reduction_configuration.as_ref(), + ); + } + CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { + cuda_backend_unchecked_is_in_clears( + streams, + &mut boolean_res, + ct.as_ref(), + clears, + &d_multibit_bsk.d_vec, + &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, + d_multibit_bsk.glwe_dimension, + d_multibit_bsk.polynomial_size, + self.key_switching_key + .input_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key + .output_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_multibit_bsk.decomp_level_count, + d_multibit_bsk.decomp_base_log, + PBSType::MultiBit, + d_multibit_bsk.grouping_factor, + None, + ); + } + } + } + boolean_res } /// Returns an encrypted `true` if the encrypted `value` is found in the clear slice @@ -963,7 +930,7 @@ impl CudaServerKey { ) -> CudaBooleanBlock where T: CudaIntegerRadixCiphertext, - Clear: DecomposableInto + CastInto, + Clear: UnsignedInteger + DecomposableInto + CastInto + Sync + Send, { let mut tmp_ct; let ct = if ct.block_carries_are_empty() { @@ -983,7 +950,6 @@ impl CudaServerKey { /// /// - clear values in the slice must be unique (otherwise use /// [Self::unchecked_first_index_in_clears]) - /// - If the encrypted value is not in the clear slice, the returned index is 0 pub fn unchecked_index_in_clears( &self, ct: &T, @@ -992,7 +958,7 @@ impl 
CudaServerKey { ) -> (CudaUnsignedRadixCiphertext, CudaBooleanBlock) where T: CudaIntegerRadixCiphertext, - Clear: DecomposableInto + CastInto, + Clear: UnsignedInteger + DecomposableInto + CastInto + Sync + Send, { if clears.is_empty() { let trivial_ct2: CudaUnsignedRadixCiphertext = self.create_trivial_radix( @@ -1006,8 +972,80 @@ impl CudaServerKey { ); return (trivial_ct2, trivial_bool); } - let selectors = self.compute_equality_selectors(ct, clears.par_iter().copied(), streams); - self.compute_final_index_from_selectors(selectors, streams) + + let num_clears = clears.len(); + let num_blocks_index = + (num_clears.ilog2() + 1).div_ceil(self.message_modulus.0.ilog2()) as usize; + + let mut index_ct: CudaUnsignedRadixCiphertext = + self.create_trivial_zero_radix(num_blocks_index, streams); + + let trivial_bool = + self.create_trivial_zero_radix::(1, streams); + let mut match_ct = CudaBooleanBlock::from_cuda_radix_ciphertext(trivial_bool.into_inner()); + + unsafe { + match &self.bootstrapping_key { + CudaBootstrappingKey::Classic(d_bsk) => { + cuda_backend_unchecked_index_in_clears( + streams, + index_ct.as_mut(), + &mut match_ct, + ct.as_ref(), + clears, + &d_bsk.d_vec, + &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, + d_bsk.glwe_dimension, + d_bsk.polynomial_size, + self.key_switching_key + .input_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key + .output_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_bsk.decomp_level_count, + d_bsk.decomp_base_log, + PBSType::Classical, + LweBskGroupingFactor(0), + d_bsk.ms_noise_reduction_configuration.as_ref(), + ); + } + CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { + cuda_backend_unchecked_index_in_clears( + streams, + index_ct.as_mut(), + &mut match_ct, + ct.as_ref(), + clears, + &d_multibit_bsk.d_vec, + &self.key_switching_key.d_vec, + self.message_modulus, + 
self.carry_modulus, + d_multibit_bsk.glwe_dimension, + d_multibit_bsk.polynomial_size, + self.key_switching_key + .input_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key + .output_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_multibit_bsk.decomp_level_count, + d_multibit_bsk.decomp_base_log, + PBSType::MultiBit, + d_multibit_bsk.grouping_factor, + None, + ); + } + } + } + + (index_ct, match_ct) } /// Returns the encrypted index of the encrypted `value` in the clear slice @@ -1063,7 +1101,7 @@ impl CudaServerKey { ) -> (CudaUnsignedRadixCiphertext, CudaBooleanBlock) where T: CudaIntegerRadixCiphertext, - Clear: DecomposableInto + CastInto, + Clear: UnsignedInteger + DecomposableInto + CastInto + Sync + Send, { let mut tmp_ct; let ct = if ct.block_carries_are_empty() { @@ -1093,7 +1131,7 @@ impl CudaServerKey { ) -> (CudaUnsignedRadixCiphertext, CudaBooleanBlock) where T: CudaIntegerRadixCiphertext, - Clear: DecomposableInto + CastInto + Hash, + Clear: UnsignedInteger + DecomposableInto + CastInto + Hash + Sync + Send, { if clears.is_empty() { let trivial_ct2: CudaUnsignedRadixCiphertext = self.create_trivial_radix( @@ -1107,34 +1145,79 @@ impl CudaServerKey { ); return (trivial_ct2, trivial_bool); } - let unique_clears = clears - .iter() - .copied() - .enumerate() - .unique_by(|&(_, value)| value) - .collect::>(); - let selectors = self.compute_equality_selectors( - ct, - unique_clears.par_iter().copied().map(|(_, value)| value), - streams, - ); - let selectors2 = self.convert_selectors_to_unsigned_radix_ciphertext(&selectors, streams); let num_blocks_result = (clears.len().ilog2() + 1).div_ceil(self.message_modulus.0.ilog2()) as usize; - let possible_values = self.create_possible_results( - num_blocks_result, - selectors - .into_par_iter() - .zip(unique_clears.into_par_iter().map(|(index, _)| index as u64)), - streams, - ); + let mut index_ct: 
CudaUnsignedRadixCiphertext = + self.create_trivial_zero_radix(num_blocks_result, streams); - let out_ct = self.aggregate_one_hot_vector(&possible_values, streams); + let trivial_bool = + self.create_trivial_zero_radix::(1, streams); + let mut match_ct = CudaBooleanBlock::from_cuda_radix_ciphertext(trivial_bool.into_inner()); - let block = self.unchecked_is_at_least_one_comparisons_block_true(&selectors2, streams); - (out_ct, block) + unsafe { + match &self.bootstrapping_key { + CudaBootstrappingKey::Classic(d_bsk) => { + cuda_backend_unchecked_first_index_in_clears( + streams, + index_ct.as_mut(), + &mut match_ct, + ct.as_ref(), + clears, + &d_bsk.d_vec, + &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, + d_bsk.glwe_dimension, + d_bsk.polynomial_size, + self.key_switching_key + .input_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key + .output_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_bsk.decomp_level_count, + d_bsk.decomp_base_log, + PBSType::Classical, + LweBskGroupingFactor(0), + d_bsk.ms_noise_reduction_configuration.as_ref(), + ); + } + CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { + cuda_backend_unchecked_first_index_in_clears( + streams, + index_ct.as_mut(), + &mut match_ct, + ct.as_ref(), + clears, + &d_multibit_bsk.d_vec, + &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, + d_multibit_bsk.glwe_dimension, + d_multibit_bsk.polynomial_size, + self.key_switching_key + .input_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key + .output_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_multibit_bsk.decomp_level_count, + d_multibit_bsk.decomp_base_log, + PBSType::MultiBit, + d_multibit_bsk.grouping_factor, + None, + ); + } + } + } + + (index_ct, match_ct) } /// Returns the 
encrypted index of the _first_ occurrence of encrypted `value` in the clear @@ -1190,7 +1273,7 @@ impl CudaServerKey { ) -> (CudaUnsignedRadixCiphertext, CudaBooleanBlock) where T: CudaIntegerRadixCiphertext, - Clear: DecomposableInto + CastInto + Hash, + Clear: UnsignedInteger + DecomposableInto + CastInto + Hash + Sync + Send, { let mut tmp_ct; let ct = if ct.block_carries_are_empty() { @@ -1205,13 +1288,6 @@ impl CudaServerKey { self.unchecked_first_index_in_clears(ct, clears, streams) } - /// Returns the encrypted index of the of encrypted `value` in the ciphertext slice - /// also, it returns an encrypted boolean that is `true` if the encrypted value was found. - /// - /// # Notes - /// - /// - clear values in the slice must be unique (otherwise use [Self::unchecked_first_index_of]) - /// - If the encrypted value is not in the encrypted slice, the returned index is 0 pub fn unchecked_index_of( &self, cts: &[T], @@ -1228,12 +1304,79 @@ impl CudaServerKey { ); return (trivial_ct, trivial_bool); } - let selectors = cts - .iter() - .map(|ct| self.eq(ct, value, streams)) - .collect::>(); - self.compute_final_index_from_selectors(selectors, streams) + let num_inputs = cts.len(); + let num_blocks_index = + (num_inputs.ilog2() + 1).div_ceil(self.message_modulus.0.ilog2()) as usize; + + let mut index_ct: CudaUnsignedRadixCiphertext = + self.create_trivial_zero_radix(num_blocks_index, streams); + + let trivial_bool: CudaUnsignedRadixCiphertext = self.create_trivial_zero_radix(1, streams); + let mut match_ct = CudaBooleanBlock::from_cuda_radix_ciphertext(trivial_bool.into_inner()); + + unsafe { + match &self.bootstrapping_key { + CudaBootstrappingKey::Classic(d_bsk) => { + cuda_backend_unchecked_index_of( + streams, + index_ct.as_mut(), + &mut match_ct, + cts, + value.as_ref(), + &d_bsk.d_vec, + &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, + d_bsk.glwe_dimension, + d_bsk.polynomial_size, + self.key_switching_key + .input_key_lwe_size() + 
.to_lwe_dimension(), + self.key_switching_key + .output_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_bsk.decomp_level_count, + d_bsk.decomp_base_log, + PBSType::Classical, + LweBskGroupingFactor(0), + d_bsk.ms_noise_reduction_configuration.as_ref(), + ); + } + CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { + cuda_backend_unchecked_index_of( + streams, + index_ct.as_mut(), + &mut match_ct, + cts, + value.as_ref(), + &d_multibit_bsk.d_vec, + &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, + d_multibit_bsk.glwe_dimension, + d_multibit_bsk.polynomial_size, + self.key_switching_key + .input_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key + .output_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_multibit_bsk.decomp_level_count, + d_multibit_bsk.decomp_base_log, + PBSType::MultiBit, + d_multibit_bsk.grouping_factor, + None, + ); + } + } + } + + (index_ct, match_ct) } /// Returns the encrypted index of the of encrypted `value` in the ciphertext slice @@ -1352,7 +1495,7 @@ impl CudaServerKey { .map(|ct| self.scalar_eq(ct, clear, streams)) .collect::>(); - self.compute_final_index_from_selectors(selectors, streams) + self.compute_final_index_from_selectors(&selectors, streams) } /// Returns the encrypted index of the of clear `value` in the ciphertext slice @@ -1446,7 +1589,7 @@ impl CudaServerKey { ) -> (CudaUnsignedRadixCiphertext, CudaBooleanBlock) where T: CudaIntegerRadixCiphertext, - Clear: DecomposableInto + CastInto, + Clear: UnsignedInteger + DecomposableInto + CastInto + Sync + Send, { if cts.is_empty() { let trivial_ct: CudaUnsignedRadixCiphertext = self.create_trivial_radix(0, 1, streams); @@ -1455,34 +1598,80 @@ impl CudaServerKey { ); return (trivial_ct, trivial_bool); } + + let num_inputs = cts.len(); let 
num_blocks_result = - (cts.len().ilog2() + 1).div_ceil(self.message_modulus.0.ilog2()) as usize; + (num_inputs.ilog2() + 1).div_ceil(self.message_modulus.0.ilog2()) as usize; - let selectors = cts - .iter() - .map(|ct| self.scalar_eq(ct, clear, streams)) - .collect::>(); + let mut index_ct: CudaUnsignedRadixCiphertext = + self.create_trivial_zero_radix(num_blocks_result, streams); - let packed_selectors = - self.convert_selectors_to_unsigned_radix_ciphertext(&selectors, streams); - let mut only_first_selectors = self.only_keep_first_true(packed_selectors, streams); + let trivial_bool = + self.create_trivial_zero_radix::(1, streams); + let mut match_ct = CudaBooleanBlock::from_cuda_radix_ciphertext(trivial_bool.into_inner()); - let unpacked_selectors = - self.convert_unsigned_radix_ciphertext_to_selectors(&mut only_first_selectors, streams); + unsafe { + match &self.bootstrapping_key { + CudaBootstrappingKey::Classic(d_bsk) => { + cuda_backend_unchecked_first_index_of_clear( + streams, + index_ct.as_mut(), + &mut match_ct, + cts, + clear, + &d_bsk.d_vec, + &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, + d_bsk.glwe_dimension, + d_bsk.polynomial_size, + self.key_switching_key + .input_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key + .output_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_bsk.decomp_level_count, + d_bsk.decomp_base_log, + PBSType::Classical, + LweBskGroupingFactor(0), + d_bsk.ms_noise_reduction_configuration.as_ref(), + ); + } + CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { + cuda_backend_unchecked_first_index_of_clear( + streams, + index_ct.as_mut(), + &mut match_ct, + cts, + clear, + &d_multibit_bsk.d_vec, + &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, + d_multibit_bsk.glwe_dimension, + d_multibit_bsk.polynomial_size, + self.key_switching_key + .input_key_lwe_size() + 
.to_lwe_dimension(), + self.key_switching_key + .output_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_multibit_bsk.decomp_level_count, + d_multibit_bsk.decomp_base_log, + PBSType::MultiBit, + d_multibit_bsk.grouping_factor, + None, + ); + } + } + } - let possible_values = self.create_possible_results( - num_blocks_result, - unpacked_selectors - .into_par_iter() - .enumerate() - .map(|(i, v)| (v, i as u64)), - streams, - ); - let out_ct = self.aggregate_one_hot_vector(&possible_values, streams); - - let block = - self.unchecked_is_at_least_one_comparisons_block_true(&only_first_selectors, streams); - (out_ct, block) + (index_ct, match_ct) } /// Returns the encrypted index of the _first_ occurrence of clear `value` in the ciphertext @@ -1539,7 +1728,7 @@ impl CudaServerKey { ) -> (CudaUnsignedRadixCiphertext, CudaBooleanBlock) where T: CudaIntegerRadixCiphertext, - Clear: DecomposableInto + CastInto, + Clear: UnsignedInteger + DecomposableInto + CastInto + Sync + Send, { let mut tmp_cts = Vec::::with_capacity(cts.len()); @@ -1584,35 +1773,79 @@ impl CudaServerKey { return (trivial_ct, trivial_bool); } + let num_inputs = cts.len(); let num_blocks_result = - (cts.len().ilog2() + 1).div_ceil(self.message_modulus.0.ilog2()) as usize; + (num_inputs.ilog2() + 1).div_ceil(self.message_modulus.0.ilog2()) as usize; - let selectors = cts - .iter() - .map(|ct| self.eq(ct, value, streams)) - .collect::>(); + let mut index_ct: CudaUnsignedRadixCiphertext = + self.create_trivial_zero_radix(num_blocks_result, streams); - let packed_selectors = - self.convert_selectors_to_unsigned_radix_ciphertext(&selectors, streams); + let trivial_bool = + self.create_trivial_zero_radix::(1, streams); + let mut match_ct = CudaBooleanBlock::from_cuda_radix_ciphertext(trivial_bool.into_inner()); - let mut only_first_selectors = self.only_keep_first_true(packed_selectors, streams); + unsafe { + match 
&self.bootstrapping_key { + CudaBootstrappingKey::Classic(d_bsk) => { + cuda_backend_unchecked_first_index_of( + streams, + index_ct.as_mut(), + &mut match_ct, + cts, + value.as_ref(), + &d_bsk.d_vec, + &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, + d_bsk.glwe_dimension, + d_bsk.polynomial_size, + self.key_switching_key + .input_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key + .output_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_bsk.decomp_level_count, + d_bsk.decomp_base_log, + PBSType::Classical, + LweBskGroupingFactor(0), + d_bsk.ms_noise_reduction_configuration.as_ref(), + ); + } + CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { + cuda_backend_unchecked_first_index_of( + streams, + index_ct.as_mut(), + &mut match_ct, + cts, + value.as_ref(), + &d_multibit_bsk.d_vec, + &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, + d_multibit_bsk.glwe_dimension, + d_multibit_bsk.polynomial_size, + self.key_switching_key + .input_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key + .output_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_multibit_bsk.decomp_level_count, + d_multibit_bsk.decomp_base_log, + PBSType::MultiBit, + d_multibit_bsk.grouping_factor, + None, + ); + } + } + } - let unpacked_selectors = - self.convert_unsigned_radix_ciphertext_to_selectors(&mut only_first_selectors, streams); - - let possible_values = self.create_possible_results( - num_blocks_result, - unpacked_selectors - .into_par_iter() - .enumerate() - .map(|(i, v)| (v, i as u64)), - streams, - ); - let out_ct = self.aggregate_one_hot_vector(&possible_values, streams); - - let block = - self.unchecked_is_at_least_one_comparisons_block_true(&only_first_selectors, streams); - (out_ct, block) + (index_ct, match_ct) 
} /// Returns the encrypted index of the _first_ occurrence of encrypted `value` in the ciphertext @@ -1702,106 +1935,32 @@ impl CudaServerKey { fn compute_final_index_from_selectors( &self, - selectors: Vec, + selectors: &[CudaBooleanBlock], streams: &CudaStreams, ) -> (CudaUnsignedRadixCiphertext, CudaBooleanBlock) { - let num_blocks_result = - (selectors.len().ilog2() + 1).div_ceil(self.message_modulus.0.ilog2()) as usize; + let num_inputs = selectors.len(); + let num_blocks_index = + (num_inputs.ilog2() + 1).div_ceil(self.message_modulus.0.ilog2()) as usize; - let selectors2 = self.convert_selectors_to_unsigned_radix_ciphertext(&selectors, streams); - let possible_values = self.create_possible_results( - num_blocks_result, - selectors - .into_par_iter() - .enumerate() - .map(|(i, v)| (v, i as u64)), - streams, - ); - let one_hot_vector = self.aggregate_one_hot_vector(&possible_values, streams); + let mut index_ct: CudaUnsignedRadixCiphertext = + self.create_trivial_zero_radix(num_blocks_index, streams); - let block = self.unchecked_is_at_least_one_comparisons_block_true(&selectors2, streams); - - (one_hot_vector, block) - } - - /// Computes the vector of selectors from an input iterator of clear values and an encrypted - /// value - /// - /// Given an iterator of clear values, and an encrypted radix ciphertext, - /// this method will return a vector of encrypted boolean values where - /// each value is either 1 if the ct is equal to the corresponding clear in the iterator - /// otherwise it will be 0. - /// On the GPU after applying many luts the result is stored differently than on the CPU. 
- /// If we have 4 many luts result is stored contiguosly in memory as follows: - /// [result many lut 1][result many lut 2][result many lut 3][result many lut 4] - /// In this case we need to jump between the results of the many luts to build the final result - /// - /// Requires ct to have empty carries - fn compute_equality_selectors( - &self, - ct: &T, - possible_input_values: Iter, - streams: &CudaStreams, - ) -> Vec - where - T: CudaIntegerRadixCiphertext, - Iter: ParallelIterator, - Clear: Decomposable + CastInto + Send + Sync, - { - assert!( - ct.block_carries_are_empty(), - "internal error: ciphertext carries must be empty" - ); - assert!( - self.carry_modulus.0 >= self.message_modulus.0, - "This function uses many LUTs in a way that requires to have at least as much carry \ - space as message space ({:?} vs {:?})", - self.carry_modulus, - self.message_modulus - ); - - let num_bits_in_message = self.message_modulus.0.ilog2(); - let num_blocks = ct.as_ref().d_blocks.lwe_ciphertext_count().0 as u32; - - let clear_values: Vec = possible_input_values.collect(); - let num_possible_values = clear_values.len() as u32; - - if num_possible_values == 0 { - return vec![]; - } - - let h_decomposed_cleartexts: Vec = clear_values - .into_par_iter() - .flat_map(|input_value| { - BlockDecomposer::new(input_value, num_bits_in_message) - .take(num_blocks as usize) - .map(|block_value| block_value.cast_into() as u64) - .collect::>() - }) - .collect::>(); - - let mut result_vec: Vec = (0..num_possible_values) - .map(|_| { - CudaBooleanBlock( - self.create_trivial_zero_radix::(1, streams), - ) - }) - .collect(); + let trivial_bool = + self.create_trivial_zero_radix::(1, streams); + let mut match_ct = CudaBooleanBlock::from_cuda_radix_ciphertext(trivial_bool.into_inner()); unsafe { match &self.bootstrapping_key { CudaBootstrappingKey::Classic(d_bsk) => { - cuda_backend_compute_equality_selectors( + cuda_backend_compute_final_index_from_selectors( streams, - &mut result_vec, - 
ct.as_ref(), - &h_decomposed_cleartexts, - num_possible_values, - num_blocks, - self.message_modulus, - self.carry_modulus, + index_ct.as_mut(), + &mut match_ct, + selectors, &d_bsk.d_vec, &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, d_bsk.glwe_dimension, d_bsk.polynomial_size, self.key_switching_key @@ -1820,17 +1979,15 @@ impl CudaServerKey { ); } CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { - cuda_backend_compute_equality_selectors( + cuda_backend_compute_final_index_from_selectors( streams, - &mut result_vec, - ct.as_ref(), - &h_decomposed_cleartexts, - num_possible_values, - num_blocks, - self.message_modulus, - self.carry_modulus, + index_ct.as_mut(), + &mut match_ct, + selectors, &d_multibit_bsk.d_vec, &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, d_multibit_bsk.glwe_dimension, d_multibit_bsk.polynomial_size, self.key_switching_key @@ -1851,259 +2008,6 @@ impl CudaServerKey { } } - result_vec - } - - /// Creates a vector of radix ciphertext from an iterator that associates encrypted boolean - /// values to clear values. - /// - /// The elements of the resulting vector are zero if the corresponding BooleanBlock encrypted 0, - /// otherwise it encrypts the associated clear value. - /// - /// This is only really useful if only one of the boolean block is known to be non-zero. 
- /// - /// `num_blocks`: number of blocks (unpacked) needed to represent the biggest clear value - /// - /// - Resulting radix ciphertexts have their block packed, thus they will have ceil (numb_blocks - /// / 2) elements - fn create_possible_results( - &self, - num_blocks: usize, - possible_outputs: Iter, - streams: &CudaStreams, - ) -> Vec - where - T: CudaIntegerRadixCiphertext, - Iter: ParallelIterator, - Clear: Decomposable + CastInto + Send + Sync, - { - assert!( - self.carry_modulus.0 >= self.message_modulus.0, - "As this function packs blocks, it requires to have at least as much carry \ - space as message space ({:?} vs {:?})", - self.carry_modulus, - self.message_modulus - ); - - let num_bits_in_message = self.message_modulus.0.ilog2(); - let num_packed_blocks = num_blocks.div_ceil(2) as u32; - - let collected_outputs: Vec<(CudaBooleanBlock, Clear)> = possible_outputs.collect(); - let num_possible_values = collected_outputs.len(); - - if num_possible_values == 0 { - return vec![]; - } - - let (selectors, clear_values): (Vec, Vec) = - collected_outputs.into_iter().unzip(); - - let h_decomposed_cleartexts: Vec = clear_values - .into_par_iter() - .flat_map(|input_value| { - BlockDecomposer::new(input_value, 2 * num_bits_in_message) - .take(num_packed_blocks as usize) - .map(|block_value| block_value.cast_into() as u64) - .collect::>() - }) - .collect::>(); - - let mut result_vec: Vec = (0..num_possible_values) - .map(|_| self.create_trivial_zero_radix(num_packed_blocks as usize, streams)) - .collect(); - - unsafe { - match &self.bootstrapping_key { - CudaBootstrappingKey::Classic(d_bsk) => { - cuda_backend_create_possible_results( - streams, - &mut result_vec, - &selectors, - &h_decomposed_cleartexts, - num_packed_blocks, - self.message_modulus, - self.carry_modulus, - &d_bsk.d_vec, - &self.key_switching_key.d_vec, - d_bsk.glwe_dimension, - d_bsk.polynomial_size, - self.key_switching_key - .input_key_lwe_size() - .to_lwe_dimension(), - 
self.key_switching_key - .output_key_lwe_size() - .to_lwe_dimension(), - self.key_switching_key.decomposition_level_count(), - self.key_switching_key.decomposition_base_log(), - d_bsk.decomp_level_count, - d_bsk.decomp_base_log, - PBSType::Classical, - LweBskGroupingFactor(0), - d_bsk.ms_noise_reduction_configuration.as_ref(), - ); - } - CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { - cuda_backend_create_possible_results( - streams, - &mut result_vec, - &selectors, - &h_decomposed_cleartexts, - num_packed_blocks, - self.message_modulus, - self.carry_modulus, - &d_multibit_bsk.d_vec, - &self.key_switching_key.d_vec, - d_multibit_bsk.glwe_dimension, - d_multibit_bsk.polynomial_size, - self.key_switching_key - .input_key_lwe_size() - .to_lwe_dimension(), - self.key_switching_key - .output_key_lwe_size() - .to_lwe_dimension(), - self.key_switching_key.decomposition_level_count(), - self.key_switching_key.decomposition_base_log(), - d_multibit_bsk.decomp_level_count, - d_multibit_bsk.decomp_base_log, - PBSType::MultiBit, - d_multibit_bsk.grouping_factor, - None, - ); - } - } - } - - result_vec - } - - /// Aggregate/combines a vec of one-hot vector of radix ciphertexts - /// (i.e. at most one of the vector element is non-zero) into single ciphertext - /// containing the non-zero value. - /// - /// The elements in the one hot vector have their block packed. 
- /// - /// The returned result has non packed blocks - fn aggregate_one_hot_vector(&self, one_hot_vector: &[T], streams: &CudaStreams) -> T - where - T: CudaIntegerRadixCiphertext, - { - if one_hot_vector.is_empty() { - return self.create_trivial_zero_radix(0, streams); - } - - let num_packed_blocks = one_hot_vector[0].as_ref().d_blocks.lwe_ciphertext_count().0; - let mut output_ct: T = self.create_trivial_zero_radix(num_packed_blocks * 2, streams); - - unsafe { - match &self.bootstrapping_key { - CudaBootstrappingKey::Classic(d_bsk) => { - cuda_backend_aggregate_one_hot_vector( - streams, - &mut output_ct, - one_hot_vector, - self.message_modulus, - self.carry_modulus, - &d_bsk.d_vec, - &self.key_switching_key.d_vec, - d_bsk.glwe_dimension, - d_bsk.polynomial_size, - self.key_switching_key - .input_key_lwe_size() - .to_lwe_dimension(), - self.key_switching_key - .output_key_lwe_size() - .to_lwe_dimension(), - self.key_switching_key.decomposition_level_count(), - self.key_switching_key.decomposition_base_log(), - d_bsk.decomp_level_count, - d_bsk.decomp_base_log, - PBSType::Classical, - LweBskGroupingFactor(0), - d_bsk.ms_noise_reduction_configuration.as_ref(), - ); - } - CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { - cuda_backend_aggregate_one_hot_vector( - streams, - &mut output_ct, - one_hot_vector, - self.message_modulus, - self.carry_modulus, - &d_multibit_bsk.d_vec, - &self.key_switching_key.d_vec, - d_multibit_bsk.glwe_dimension, - d_multibit_bsk.polynomial_size, - self.key_switching_key - .input_key_lwe_size() - .to_lwe_dimension(), - self.key_switching_key - .output_key_lwe_size() - .to_lwe_dimension(), - self.key_switching_key.decomposition_level_count(), - self.key_switching_key.decomposition_base_log(), - d_multibit_bsk.decomp_level_count, - d_multibit_bsk.decomp_base_log, - PBSType::MultiBit, - d_multibit_bsk.grouping_factor, - None, - ); - } - } - } - - output_ct - } - - /// Only keeps at most one Ciphertext that encrypts 1 - /// - /// Given a 
Vec of Ciphertexts where each Ciphertext encrypts 0 or 1 - /// This function will return a Vec of Ciphertext where at most one encryption of 1 is present - /// The first encryption of one is kept - fn only_keep_first_true(&self, values: T, streams: &CudaStreams) -> T - where - T: CudaIntegerRadixCiphertext, - { - let num_ct_blocks = values.as_ref().d_blocks.lwe_ciphertext_count().0; - if num_ct_blocks <= 1 { - return values; - } - const ALREADY_SEEN: u64 = 2; - let lut_fn = self.generate_lookup_table_bivariate(|current, previous| { - if previous == 1 || previous == ALREADY_SEEN { - ALREADY_SEEN - } else { - current - } - }); - - let mut first_true: T = self.create_trivial_zero_radix(num_ct_blocks, streams); - - let mut clone_ct = values.duplicate(streams); - self.compute_prefix_sum_hillis_steele( - first_true.as_mut(), - clone_ct.as_mut(), - &lut_fn, - 0..num_ct_blocks, - streams, - ); - - let lut = self.generate_lookup_table(|x| { - let x = x % self.message_modulus.0; - if x == ALREADY_SEEN { - 0 - } else { - x - } - }); - - let cloned_ct = first_true.duplicate(streams); - self.apply_lookup_table( - first_true.as_mut(), - cloned_ct.as_ref(), - &lut, - 0..num_ct_blocks, - streams, - ); - first_true + (index_ct, match_ct) } }