mirror of
https://github.com/zama-ai/tfhe-rs.git
synced 2026-04-28 03:01:21 -04:00
Compare commits
1 Commits
am/chore/m
...
edm/match_
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
960cd20050 |
@@ -753,8 +753,9 @@ uint64_t scratch_cuda_unchecked_match_value_64_async(
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_matches, uint32_t num_input_blocks,
|
||||
uint32_t num_output_packed_blocks, uint32_t max_output_is_zero,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
uint32_t match_parallelism, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_unchecked_match_value_64_async(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out_result,
|
||||
@@ -793,9 +794,9 @@ uint64_t scratch_cuda_unchecked_match_value_or_64_async(
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_matches, uint32_t num_input_blocks,
|
||||
uint32_t num_match_packed_blocks, uint32_t num_final_blocks,
|
||||
uint32_t max_output_is_zero, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
uint32_t max_output_is_zero, uint32_t match_parallelism,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_unchecked_match_value_or_64_async(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
@@ -812,9 +813,9 @@ uint64_t scratch_cuda_unchecked_contains_64_async(
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_inputs, uint32_t num_blocks, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
uint32_t num_inputs, uint32_t num_blocks, uint32_t match_parallelism,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_unchecked_contains_64_async(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *output,
|
||||
@@ -832,9 +833,9 @@ uint64_t scratch_cuda_unchecked_contains_clear_64_async(
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_inputs, uint32_t num_blocks, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
uint32_t num_inputs, uint32_t num_blocks, uint32_t match_parallelism,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_unchecked_contains_clear_64_async(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output,
|
||||
@@ -850,9 +851,9 @@ uint64_t scratch_cuda_unchecked_is_in_clears_64_async(
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_clears, uint32_t num_blocks, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
uint32_t num_clears, uint32_t num_blocks, uint32_t match_parallelism,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_unchecked_is_in_clears_64_async(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *output,
|
||||
@@ -871,8 +872,9 @@ uint64_t scratch_cuda_unchecked_index_in_clears_64_async(
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_clears, uint32_t num_blocks, uint32_t num_blocks_index,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
uint32_t match_parallelism, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_unchecked_index_in_clears_64_async(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *index_ct,
|
||||
@@ -890,8 +892,9 @@ uint64_t scratch_cuda_unchecked_first_index_in_clears_64_async(
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_unique, uint32_t num_blocks, uint32_t num_blocks_index,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
uint32_t match_parallelism, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_unchecked_first_index_in_clears_64_async(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *index_ct,
|
||||
@@ -913,8 +916,9 @@ uint64_t scratch_cuda_unchecked_first_index_of_clear_64_async(
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
uint32_t match_parallelism, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_unchecked_first_index_of_clear_64_async(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *index_ct,
|
||||
@@ -932,8 +936,9 @@ uint64_t scratch_cuda_unchecked_first_index_of_64_async(
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
uint32_t match_parallelism, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_unchecked_first_index_of_64_async(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *index_ct,
|
||||
@@ -951,8 +956,9 @@ uint64_t scratch_cuda_unchecked_index_of_64_async(
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
uint32_t match_parallelism, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_unchecked_index_of_64_async(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *index_ct,
|
||||
@@ -972,16 +978,16 @@ uint64_t scratch_cuda_unchecked_index_of_clear_64_async(
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
uint32_t match_parallelism, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_unchecked_index_of_clear_64_async(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *index_ct,
|
||||
CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *inputs,
|
||||
const void *d_scalar_blocks, bool is_scalar_obviously_bigger,
|
||||
uint32_t num_inputs, uint32_t num_blocks, uint32_t num_scalar_blocks,
|
||||
uint32_t num_blocks_index, int8_t *mem, void *const *bsks,
|
||||
void *const *ksks);
|
||||
const uint64_t *h_clear_val, bool is_scalar_obviously_bigger,
|
||||
uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
|
||||
int8_t *mem, void *const *bsks, void *const *ksks);
|
||||
|
||||
void cleanup_cuda_unchecked_index_of_clear_64(CudaStreamsFFI streams,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -7,8 +7,9 @@ uint64_t scratch_cuda_unchecked_match_value_64_async(
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_matches, uint32_t num_input_blocks,
|
||||
uint32_t num_output_packed_blocks, uint32_t max_output_is_zero,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
|
||||
uint32_t match_parallelism, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
big_lwe_dimension, small_lwe_dimension, ks_level,
|
||||
@@ -18,7 +19,7 @@ uint64_t scratch_cuda_unchecked_match_value_64_async(
|
||||
return scratch_cuda_unchecked_match_value<uint64_t>(
|
||||
CudaStreams(streams), (int_unchecked_match_buffer<uint64_t> **)mem_ptr,
|
||||
params, num_matches, num_input_blocks, num_output_packed_blocks,
|
||||
max_output_is_zero, allocate_gpu_memory);
|
||||
max_output_is_zero, match_parallelism, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cuda_unchecked_match_value_64_async(
|
||||
@@ -62,9 +63,9 @@ uint64_t scratch_cuda_unchecked_match_value_or_64_async(
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_matches, uint32_t num_input_blocks,
|
||||
uint32_t num_match_packed_blocks, uint32_t num_final_blocks,
|
||||
uint32_t max_output_is_zero, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type) {
|
||||
uint32_t max_output_is_zero, uint32_t match_parallelism,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
big_lwe_dimension, small_lwe_dimension, ks_level,
|
||||
@@ -75,7 +76,7 @@ uint64_t scratch_cuda_unchecked_match_value_or_64_async(
|
||||
CudaStreams(streams),
|
||||
(int_unchecked_match_value_or_buffer<uint64_t> **)mem_ptr, params,
|
||||
num_matches, num_input_blocks, num_match_packed_blocks, num_final_blocks,
|
||||
max_output_is_zero, allocate_gpu_memory);
|
||||
max_output_is_zero, match_parallelism, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cuda_unchecked_match_value_or_64_async(
|
||||
@@ -111,9 +112,9 @@ uint64_t scratch_cuda_unchecked_contains_64_async(
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_inputs, uint32_t num_blocks, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type) {
|
||||
uint32_t num_inputs, uint32_t num_blocks, uint32_t match_parallelism,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
big_lwe_dimension, small_lwe_dimension, ks_level,
|
||||
@@ -122,7 +123,7 @@ uint64_t scratch_cuda_unchecked_contains_64_async(
|
||||
|
||||
return scratch_cuda_unchecked_contains<uint64_t>(
|
||||
CudaStreams(streams), (int_unchecked_contains_buffer<uint64_t> **)mem_ptr,
|
||||
params, num_inputs, num_blocks, allocate_gpu_memory);
|
||||
params, num_inputs, num_blocks, match_parallelism, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cuda_unchecked_contains_64_async(CudaStreamsFFI streams,
|
||||
@@ -161,9 +162,9 @@ uint64_t scratch_cuda_unchecked_contains_clear_64_async(
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_inputs, uint32_t num_blocks, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type) {
|
||||
uint32_t num_inputs, uint32_t num_blocks, uint32_t match_parallelism,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
big_lwe_dimension, small_lwe_dimension, ks_level,
|
||||
@@ -173,7 +174,7 @@ uint64_t scratch_cuda_unchecked_contains_clear_64_async(
|
||||
return scratch_cuda_unchecked_contains_clear<uint64_t>(
|
||||
CudaStreams(streams),
|
||||
(int_unchecked_contains_clear_buffer<uint64_t> **)mem_ptr, params,
|
||||
num_inputs, num_blocks, allocate_gpu_memory);
|
||||
num_inputs, num_blocks, match_parallelism, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cuda_unchecked_contains_clear_64_async(
|
||||
@@ -206,9 +207,9 @@ uint64_t scratch_cuda_unchecked_is_in_clears_64_async(
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_clears, uint32_t num_blocks, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type) {
|
||||
uint32_t num_clears, uint32_t num_blocks, uint32_t match_parallelism,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
big_lwe_dimension, small_lwe_dimension, ks_level,
|
||||
@@ -218,7 +219,7 @@ uint64_t scratch_cuda_unchecked_is_in_clears_64_async(
|
||||
return scratch_cuda_unchecked_is_in_clears<uint64_t>(
|
||||
CudaStreams(streams),
|
||||
(int_unchecked_is_in_clears_buffer<uint64_t> **)mem_ptr, params,
|
||||
num_clears, num_blocks, allocate_gpu_memory);
|
||||
num_clears, num_blocks, match_parallelism, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cuda_unchecked_is_in_clears_64_async(
|
||||
@@ -252,8 +253,9 @@ uint64_t scratch_cuda_unchecked_index_in_clears_64_async(
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_clears, uint32_t num_blocks, uint32_t num_blocks_index,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
|
||||
uint32_t match_parallelism, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
big_lwe_dimension, small_lwe_dimension, ks_level,
|
||||
@@ -263,7 +265,8 @@ uint64_t scratch_cuda_unchecked_index_in_clears_64_async(
|
||||
return scratch_cuda_unchecked_index_in_clears<uint64_t>(
|
||||
CudaStreams(streams),
|
||||
(int_unchecked_index_in_clears_buffer<uint64_t> **)mem_ptr, params,
|
||||
num_clears, num_blocks, num_blocks_index, allocate_gpu_memory);
|
||||
num_clears, num_blocks, num_blocks_index, match_parallelism,
|
||||
allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cuda_unchecked_index_in_clears_64_async(
|
||||
@@ -304,8 +307,9 @@ uint64_t scratch_cuda_unchecked_first_index_in_clears_64_async(
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_unique, uint32_t num_blocks, uint32_t num_blocks_index,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
|
||||
uint32_t match_parallelism, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
big_lwe_dimension, small_lwe_dimension, ks_level,
|
||||
@@ -315,7 +319,8 @@ uint64_t scratch_cuda_unchecked_first_index_in_clears_64_async(
|
||||
return scratch_cuda_unchecked_first_index_in_clears<uint64_t>(
|
||||
CudaStreams(streams),
|
||||
(int_unchecked_first_index_in_clears_buffer<uint64_t> **)mem_ptr, params,
|
||||
num_unique, num_blocks, num_blocks_index, allocate_gpu_memory);
|
||||
num_unique, num_blocks, num_blocks_index, match_parallelism,
|
||||
allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cuda_unchecked_first_index_in_clears_64_async(
|
||||
@@ -356,8 +361,9 @@ uint64_t scratch_cuda_unchecked_first_index_of_clear_64_async(
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
|
||||
uint32_t match_parallelism, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
big_lwe_dimension, small_lwe_dimension, ks_level,
|
||||
@@ -367,7 +373,8 @@ uint64_t scratch_cuda_unchecked_first_index_of_clear_64_async(
|
||||
return scratch_cuda_unchecked_first_index_of_clear<uint64_t>(
|
||||
CudaStreams(streams),
|
||||
(int_unchecked_first_index_of_clear_buffer<uint64_t> **)mem_ptr, params,
|
||||
num_inputs, num_blocks, num_blocks_index, allocate_gpu_memory);
|
||||
num_inputs, num_blocks, num_blocks_index, match_parallelism,
|
||||
allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cuda_unchecked_first_index_of_clear_64_async(
|
||||
@@ -408,8 +415,9 @@ uint64_t scratch_cuda_unchecked_first_index_of_64_async(
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
|
||||
uint32_t match_parallelism, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
big_lwe_dimension, small_lwe_dimension, ks_level,
|
||||
@@ -419,7 +427,8 @@ uint64_t scratch_cuda_unchecked_first_index_of_64_async(
|
||||
return scratch_cuda_unchecked_first_index_of<uint64_t>(
|
||||
CudaStreams(streams),
|
||||
(int_unchecked_first_index_of_buffer<uint64_t> **)mem_ptr, params,
|
||||
num_inputs, num_blocks, num_blocks_index, allocate_gpu_memory);
|
||||
num_inputs, num_blocks, num_blocks_index, match_parallelism,
|
||||
allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cuda_unchecked_first_index_of_64_async(
|
||||
@@ -460,8 +469,9 @@ uint64_t scratch_cuda_unchecked_index_of_64_async(
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
|
||||
uint32_t match_parallelism, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
big_lwe_dimension, small_lwe_dimension, ks_level,
|
||||
@@ -470,7 +480,8 @@ uint64_t scratch_cuda_unchecked_index_of_64_async(
|
||||
|
||||
return scratch_cuda_unchecked_index_of<uint64_t>(
|
||||
CudaStreams(streams), (int_unchecked_index_of_buffer<uint64_t> **)mem_ptr,
|
||||
params, num_inputs, num_blocks, num_blocks_index, allocate_gpu_memory);
|
||||
params, num_inputs, num_blocks, num_blocks_index, match_parallelism,
|
||||
allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cuda_unchecked_index_of_64_async(CudaStreamsFFI streams,
|
||||
@@ -513,8 +524,9 @@ uint64_t scratch_cuda_unchecked_index_of_clear_64_async(
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
|
||||
uint32_t match_parallelism, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
big_lwe_dimension, small_lwe_dimension, ks_level,
|
||||
@@ -524,16 +536,16 @@ uint64_t scratch_cuda_unchecked_index_of_clear_64_async(
|
||||
return scratch_cuda_unchecked_index_of_clear<uint64_t>(
|
||||
CudaStreams(streams),
|
||||
(int_unchecked_index_of_clear_buffer<uint64_t> **)mem_ptr, params,
|
||||
num_inputs, num_blocks, num_blocks_index, allocate_gpu_memory);
|
||||
num_inputs, num_blocks, num_blocks_index, match_parallelism,
|
||||
allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cuda_unchecked_index_of_clear_64_async(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *index_ct,
|
||||
CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *inputs,
|
||||
const void *d_scalar_blocks, bool is_scalar_obviously_bigger,
|
||||
uint32_t num_inputs, uint32_t num_blocks, uint32_t num_scalar_blocks,
|
||||
uint32_t num_blocks_index, int8_t *mem, void *const *bsks,
|
||||
void *const *ksks) {
|
||||
const uint64_t *h_clear_val, bool is_scalar_obviously_bigger,
|
||||
uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
|
||||
int8_t *mem, void *const *bsks, void *const *ksks) {
|
||||
PANIC_IF_FALSE(index_ct != inputs, "Output and input pointers must be "
|
||||
"different for out-of-place operations");
|
||||
PANIC_IF_FALSE(match_ct != inputs, "Output and input pointers must be "
|
||||
@@ -543,9 +555,8 @@ void cuda_unchecked_index_of_clear_64_async(
|
||||
"out-of-place operations");
|
||||
|
||||
host_unchecked_index_of_clear<uint64_t>(
|
||||
CudaStreams(streams), index_ct, match_ct, inputs,
|
||||
(const uint64_t *)d_scalar_blocks, is_scalar_obviously_bigger, num_inputs,
|
||||
num_blocks, num_scalar_blocks, num_blocks_index,
|
||||
CudaStreams(streams), index_ct, match_ct, inputs, h_clear_val,
|
||||
is_scalar_obviously_bigger, num_inputs, num_blocks, num_blocks_index,
|
||||
(int_unchecked_index_of_clear_buffer<uint64_t> *)mem, bsks,
|
||||
(uint64_t *const *)ksks);
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1711,6 +1711,7 @@ unsafe extern "C" {
|
||||
num_input_blocks: u32,
|
||||
num_output_packed_blocks: u32,
|
||||
max_output_is_zero: u32,
|
||||
match_parallelism: u32,
|
||||
message_modulus: u32,
|
||||
carry_modulus: u32,
|
||||
pbs_type: PBS_TYPE,
|
||||
@@ -1794,6 +1795,7 @@ unsafe extern "C" {
|
||||
num_match_packed_blocks: u32,
|
||||
num_final_blocks: u32,
|
||||
max_output_is_zero: u32,
|
||||
match_parallelism: u32,
|
||||
message_modulus: u32,
|
||||
carry_modulus: u32,
|
||||
pbs_type: PBS_TYPE,
|
||||
@@ -1835,6 +1837,7 @@ unsafe extern "C" {
|
||||
grouping_factor: u32,
|
||||
num_inputs: u32,
|
||||
num_blocks: u32,
|
||||
match_parallelism: u32,
|
||||
message_modulus: u32,
|
||||
carry_modulus: u32,
|
||||
pbs_type: PBS_TYPE,
|
||||
@@ -1873,6 +1876,7 @@ unsafe extern "C" {
|
||||
grouping_factor: u32,
|
||||
num_inputs: u32,
|
||||
num_blocks: u32,
|
||||
match_parallelism: u32,
|
||||
message_modulus: u32,
|
||||
carry_modulus: u32,
|
||||
pbs_type: PBS_TYPE,
|
||||
@@ -1914,6 +1918,7 @@ unsafe extern "C" {
|
||||
grouping_factor: u32,
|
||||
num_clears: u32,
|
||||
num_blocks: u32,
|
||||
match_parallelism: u32,
|
||||
message_modulus: u32,
|
||||
carry_modulus: u32,
|
||||
pbs_type: PBS_TYPE,
|
||||
@@ -1956,6 +1961,7 @@ unsafe extern "C" {
|
||||
num_clears: u32,
|
||||
num_blocks: u32,
|
||||
num_blocks_index: u32,
|
||||
match_parallelism: u32,
|
||||
message_modulus: u32,
|
||||
carry_modulus: u32,
|
||||
pbs_type: PBS_TYPE,
|
||||
@@ -2000,6 +2006,7 @@ unsafe extern "C" {
|
||||
num_unique: u32,
|
||||
num_blocks: u32,
|
||||
num_blocks_index: u32,
|
||||
match_parallelism: u32,
|
||||
message_modulus: u32,
|
||||
carry_modulus: u32,
|
||||
pbs_type: PBS_TYPE,
|
||||
@@ -2054,6 +2061,7 @@ unsafe extern "C" {
|
||||
num_inputs: u32,
|
||||
num_blocks: u32,
|
||||
num_blocks_index: u32,
|
||||
match_parallelism: u32,
|
||||
message_modulus: u32,
|
||||
carry_modulus: u32,
|
||||
pbs_type: PBS_TYPE,
|
||||
@@ -2098,6 +2106,7 @@ unsafe extern "C" {
|
||||
num_inputs: u32,
|
||||
num_blocks: u32,
|
||||
num_blocks_index: u32,
|
||||
match_parallelism: u32,
|
||||
message_modulus: u32,
|
||||
carry_modulus: u32,
|
||||
pbs_type: PBS_TYPE,
|
||||
@@ -2142,6 +2151,7 @@ unsafe extern "C" {
|
||||
num_inputs: u32,
|
||||
num_blocks: u32,
|
||||
num_blocks_index: u32,
|
||||
match_parallelism: u32,
|
||||
message_modulus: u32,
|
||||
carry_modulus: u32,
|
||||
pbs_type: PBS_TYPE,
|
||||
@@ -2183,6 +2193,7 @@ unsafe extern "C" {
|
||||
num_inputs: u32,
|
||||
num_blocks: u32,
|
||||
num_blocks_index: u32,
|
||||
match_parallelism: u32,
|
||||
message_modulus: u32,
|
||||
carry_modulus: u32,
|
||||
pbs_type: PBS_TYPE,
|
||||
@@ -2196,11 +2207,10 @@ unsafe extern "C" {
|
||||
index_ct: *mut CudaRadixCiphertextFFI,
|
||||
match_ct: *mut CudaRadixCiphertextFFI,
|
||||
inputs: *const CudaRadixCiphertextFFI,
|
||||
d_scalar_blocks: *const ffi::c_void,
|
||||
h_clear_val: *const u64,
|
||||
is_scalar_obviously_bigger: bool,
|
||||
num_inputs: u32,
|
||||
num_blocks: u32,
|
||||
num_scalar_blocks: u32,
|
||||
num_blocks_index: u32,
|
||||
mem: *mut i8,
|
||||
bsks: *const *mut ffi::c_void,
|
||||
|
||||
@@ -1265,7 +1265,7 @@ where
|
||||
let size = cuda_key
|
||||
.key
|
||||
.key
|
||||
.get_unchecked_match_value_size_on_gpu(&ct_on_gpu, matches, streams);
|
||||
.get_unchecked_match_value_size_on_gpu(&ct_on_gpu, matches, 1, streams);
|
||||
Ok(size)
|
||||
}
|
||||
#[cfg(feature = "hpu")]
|
||||
|
||||
@@ -8336,6 +8336,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_match_value<
|
||||
pbs_base_log: DecompositionBaseLog,
|
||||
pbs_type: PBSType,
|
||||
grouping_factor: LweBskGroupingFactor,
|
||||
match_parallelism: u32,
|
||||
ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
|
||||
) {
|
||||
assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0));
|
||||
@@ -8480,6 +8481,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_match_value<
|
||||
num_input_blocks,
|
||||
num_output_packed_blocks,
|
||||
max_output_is_zero as u32,
|
||||
match_parallelism,
|
||||
u32::try_from(message_modulus.0).unwrap(),
|
||||
u32::try_from(carry_modulus.0).unwrap(),
|
||||
pbs_type as u32,
|
||||
@@ -8531,6 +8533,7 @@ pub(crate) fn cuda_backend_get_unchecked_match_value_size_on_gpu<Clear>(
|
||||
message_modulus: MessageModulus,
|
||||
carry_modulus: CarryModulus,
|
||||
pbs_type: PBSType,
|
||||
match_parallelism: u32,
|
||||
ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
|
||||
) -> u64
|
||||
where
|
||||
@@ -8579,6 +8582,7 @@ where
|
||||
num_input_blocks,
|
||||
num_output_packed_blocks,
|
||||
max_output_is_zero as u32,
|
||||
match_parallelism,
|
||||
u32::try_from(message_modulus.0).unwrap(),
|
||||
u32::try_from(carry_modulus.0).unwrap(),
|
||||
pbs_type as u32,
|
||||
@@ -8673,6 +8677,7 @@ where
|
||||
num_match_packed_blocks,
|
||||
num_output_blocks,
|
||||
max_output_is_zero as u32,
|
||||
1,
|
||||
u32::try_from(message_modulus.0).unwrap(),
|
||||
u32::try_from(carry_modulus.0).unwrap(),
|
||||
pbs_type as u32,
|
||||
@@ -8808,6 +8813,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_match_value_or<
|
||||
pbs_base_log: DecompositionBaseLog,
|
||||
pbs_type: PBSType,
|
||||
grouping_factor: LweBskGroupingFactor,
|
||||
match_parallelism: u32,
|
||||
ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
|
||||
) {
|
||||
assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0));
|
||||
@@ -8935,6 +8941,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_match_value_or<
|
||||
num_match_packed_blocks,
|
||||
num_final_blocks,
|
||||
max_output_is_zero as u32,
|
||||
match_parallelism,
|
||||
u32::try_from(message_modulus.0).unwrap(),
|
||||
u32::try_from(carry_modulus.0).unwrap(),
|
||||
pbs_type as u32,
|
||||
@@ -8988,6 +8995,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_contains<
|
||||
pbs_base_log: DecompositionBaseLog,
|
||||
pbs_type: PBSType,
|
||||
grouping_factor: LweBskGroupingFactor,
|
||||
match_parallelism: u32,
|
||||
ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
|
||||
) {
|
||||
assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0));
|
||||
@@ -9058,6 +9066,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_contains<
|
||||
u32::try_from(grouping_factor.0).unwrap(),
|
||||
num_inputs,
|
||||
num_blocks,
|
||||
match_parallelism,
|
||||
u32::try_from(message_modulus.0).unwrap(),
|
||||
u32::try_from(carry_modulus.0).unwrap(),
|
||||
pbs_type as u32,
|
||||
@@ -9112,6 +9121,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_contains_clear<
|
||||
pbs_base_log: DecompositionBaseLog,
|
||||
pbs_type: PBSType,
|
||||
grouping_factor: LweBskGroupingFactor,
|
||||
match_parallelism: u32,
|
||||
ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
|
||||
) {
|
||||
assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0));
|
||||
@@ -9188,6 +9198,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_contains_clear<
|
||||
u32::try_from(grouping_factor.0).unwrap(),
|
||||
num_inputs,
|
||||
num_blocks,
|
||||
match_parallelism,
|
||||
u32::try_from(message_modulus.0).unwrap(),
|
||||
u32::try_from(carry_modulus.0).unwrap(),
|
||||
pbs_type as u32,
|
||||
@@ -9241,6 +9252,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_is_in_clears<
|
||||
pbs_base_log: DecompositionBaseLog,
|
||||
pbs_type: PBSType,
|
||||
grouping_factor: LweBskGroupingFactor,
|
||||
match_parallelism: u32,
|
||||
ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
|
||||
) {
|
||||
assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0));
|
||||
@@ -9292,6 +9304,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_is_in_clears<
|
||||
u32::try_from(grouping_factor.0).unwrap(),
|
||||
num_clears,
|
||||
num_blocks,
|
||||
match_parallelism,
|
||||
u32::try_from(message_modulus.0).unwrap(),
|
||||
u32::try_from(carry_modulus.0).unwrap(),
|
||||
pbs_type as u32,
|
||||
@@ -9346,6 +9359,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_index_in_clears<
|
||||
pbs_base_log: DecompositionBaseLog,
|
||||
pbs_type: PBSType,
|
||||
grouping_factor: LweBskGroupingFactor,
|
||||
match_parallelism: u32,
|
||||
ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
|
||||
) {
|
||||
assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0));
|
||||
@@ -9414,6 +9428,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_index_in_clears<
|
||||
num_clears,
|
||||
num_blocks,
|
||||
num_blocks_index,
|
||||
match_parallelism,
|
||||
u32::try_from(message_modulus.0).unwrap(),
|
||||
u32::try_from(carry_modulus.0).unwrap(),
|
||||
pbs_type as u32,
|
||||
@@ -9471,6 +9486,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_first_index_in_clears<
|
||||
pbs_base_log: DecompositionBaseLog,
|
||||
pbs_type: PBSType,
|
||||
grouping_factor: LweBskGroupingFactor,
|
||||
match_parallelism: u32,
|
||||
ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
|
||||
) {
|
||||
assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0));
|
||||
@@ -9564,6 +9580,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_first_index_in_clears<
|
||||
num_unique,
|
||||
num_blocks,
|
||||
num_blocks_index,
|
||||
match_parallelism,
|
||||
u32::try_from(message_modulus.0).unwrap(),
|
||||
u32::try_from(carry_modulus.0).unwrap(),
|
||||
pbs_type as u32,
|
||||
@@ -9623,6 +9640,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_first_index_of_clear<
|
||||
pbs_base_log: DecompositionBaseLog,
|
||||
pbs_type: PBSType,
|
||||
grouping_factor: LweBskGroupingFactor,
|
||||
match_parallelism: u32,
|
||||
ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
|
||||
) {
|
||||
assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0));
|
||||
@@ -9710,6 +9728,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_first_index_of_clear<
|
||||
num_inputs,
|
||||
num_blocks,
|
||||
num_blocks_index,
|
||||
match_parallelism,
|
||||
u32::try_from(message_modulus.0).unwrap(),
|
||||
u32::try_from(carry_modulus.0).unwrap(),
|
||||
pbs_type as u32,
|
||||
@@ -9767,6 +9786,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_first_index_of<
|
||||
pbs_base_log: DecompositionBaseLog,
|
||||
pbs_type: PBSType,
|
||||
grouping_factor: LweBskGroupingFactor,
|
||||
match_parallelism: u32,
|
||||
ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
|
||||
) {
|
||||
assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0));
|
||||
@@ -9854,6 +9874,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_first_index_of<
|
||||
num_inputs,
|
||||
num_blocks,
|
||||
num_blocks_index,
|
||||
match_parallelism,
|
||||
u32::try_from(message_modulus.0).unwrap(),
|
||||
u32::try_from(carry_modulus.0).unwrap(),
|
||||
pbs_type as u32,
|
||||
@@ -9911,6 +9932,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_index_of<
|
||||
pbs_base_log: DecompositionBaseLog,
|
||||
pbs_type: PBSType,
|
||||
grouping_factor: LweBskGroupingFactor,
|
||||
match_parallelism: u32,
|
||||
ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
|
||||
) {
|
||||
assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0));
|
||||
@@ -9998,6 +10020,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_index_of<
|
||||
num_inputs,
|
||||
num_blocks,
|
||||
num_blocks_index,
|
||||
match_parallelism,
|
||||
u32::try_from(message_modulus.0).unwrap(),
|
||||
u32::try_from(carry_modulus.0).unwrap(),
|
||||
pbs_type as u32,
|
||||
@@ -10056,6 +10079,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_index_of_clear<
|
||||
pbs_base_log: DecompositionBaseLog,
|
||||
pbs_type: PBSType,
|
||||
grouping_factor: LweBskGroupingFactor,
|
||||
match_parallelism: u32,
|
||||
ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
|
||||
) {
|
||||
assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0));
|
||||
@@ -10076,9 +10100,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_index_of_clear<
|
||||
.is_some_and(|sub_slice| sub_slice.iter().any(|&scalar_block| scalar_block != 0));
|
||||
|
||||
scalar_blocks.truncate(num_blocks_in_ct as usize);
|
||||
let num_scalar_blocks = u32::try_from(scalar_blocks.len()).unwrap();
|
||||
|
||||
let d_scalar_blocks: CudaVec<u64> = CudaVec::from_cpu_async(&scalar_blocks, streams, 0);
|
||||
scalar_blocks.resize(num_blocks_in_ct as usize, 0u64);
|
||||
|
||||
let noise_reduction_type = resolve_ms_noise_reduction_config(ms_noise_reduction_configuration);
|
||||
|
||||
@@ -10152,6 +10174,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_index_of_clear<
|
||||
num_inputs,
|
||||
num_blocks_in_ct,
|
||||
num_blocks_index,
|
||||
match_parallelism,
|
||||
u32::try_from(message_modulus.0).unwrap(),
|
||||
u32::try_from(carry_modulus.0).unwrap(),
|
||||
pbs_type as u32,
|
||||
@@ -10164,11 +10187,10 @@ pub(crate) unsafe fn cuda_backend_unchecked_index_of_clear<
|
||||
&raw mut ffi_index,
|
||||
&raw mut ffi_match,
|
||||
ffi_inputs.as_ptr(),
|
||||
d_scalar_blocks.as_c_ptr(0),
|
||||
scalar_blocks.as_ptr(),
|
||||
is_scalar_obviously_bigger,
|
||||
num_inputs,
|
||||
num_blocks_in_ct,
|
||||
num_scalar_blocks,
|
||||
num_blocks_index,
|
||||
mem_ptr,
|
||||
bootstrapping_key.ptr.as_ptr(),
|
||||
|
||||
@@ -27,16 +27,22 @@ impl CudaServerKey {
|
||||
streams: &CudaStreams,
|
||||
) -> (CudaUnsignedRadixCiphertext, CudaBooleanBlock)
|
||||
where
|
||||
Clear: UnsignedInteger + DecomposableInto<u64> + CastInto<usize>,
|
||||
Clear:
|
||||
UnsignedInteger + DecomposableInto<u64> + CastInto<usize> + CastInto<u64> + Sync + Send,
|
||||
{
|
||||
if matches.get_values().is_empty() {
|
||||
let trivial_ct: CudaUnsignedRadixCiphertext = self.create_trivial_radix(0, 1, streams);
|
||||
let trivial_bool = CudaBooleanBlock::from_cuda_radix_ciphertext(
|
||||
trivial_ct.duplicate(streams).into_inner(),
|
||||
let num_matches = matches.get_values().len();
|
||||
|
||||
if num_matches == 0 {
|
||||
let result_ct: CudaUnsignedRadixCiphertext = self.create_trivial_zero_radix(1, streams);
|
||||
let result_bool: CudaBooleanBlock = CudaBooleanBlock(
|
||||
self.create_trivial_zero_radix::<CudaUnsignedRadixCiphertext>(1, streams),
|
||||
);
|
||||
return (trivial_ct, trivial_bool);
|
||||
return (result_ct, result_bool);
|
||||
}
|
||||
|
||||
let match_parallelism = num_matches as u32;
|
||||
|
||||
let num_bits_in_message = self.message_modulus.0.ilog2();
|
||||
let max_output_value = matches
|
||||
.get_values()
|
||||
.iter()
|
||||
@@ -44,13 +50,17 @@ impl CudaServerKey {
|
||||
.max_by(|(_, outputl), (_, outputr)| outputl.cmp(outputr))
|
||||
.expect("luts is not empty at this point")
|
||||
.1;
|
||||
let max_val_u64: u64 = max_output_value.cast_into();
|
||||
let num_output_unpacked_blocks = if max_val_u64 == 0 {
|
||||
1
|
||||
} else {
|
||||
(max_val_u64.ilog2() + 1).div_ceil(num_bits_in_message)
|
||||
};
|
||||
|
||||
let num_output_unpacked_blocks =
|
||||
self.num_blocks_to_represent_unsigned_value(max_output_value);
|
||||
let mut result: CudaUnsignedRadixCiphertext =
|
||||
self.create_trivial_zero_radix(num_output_unpacked_blocks as usize, streams);
|
||||
|
||||
let mut result_ct: CudaUnsignedRadixCiphertext =
|
||||
self.create_trivial_zero_radix(num_output_unpacked_blocks, streams);
|
||||
let mut result_bool: CudaBooleanBlock = CudaBooleanBlock(
|
||||
let mut boolean_result: CudaBooleanBlock = CudaBooleanBlock(
|
||||
self.create_trivial_zero_radix::<CudaUnsignedRadixCiphertext>(1, streams),
|
||||
);
|
||||
|
||||
@@ -63,8 +73,8 @@ impl CudaServerKey {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
cuda_backend_unchecked_match_value(
|
||||
streams,
|
||||
&mut result_ct,
|
||||
&mut result_bool,
|
||||
&mut result,
|
||||
&mut boolean_result,
|
||||
ct.as_ref(),
|
||||
matches,
|
||||
self.message_modulus,
|
||||
@@ -81,14 +91,15 @@ impl CudaServerKey {
|
||||
d_bsk.decomp_base_log,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
match_parallelism,
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
cuda_backend_unchecked_match_value(
|
||||
streams,
|
||||
&mut result_ct,
|
||||
&mut result_bool,
|
||||
&mut result,
|
||||
&mut boolean_result,
|
||||
ct.as_ref(),
|
||||
matches,
|
||||
self.message_modulus,
|
||||
@@ -105,19 +116,21 @@ impl CudaServerKey {
|
||||
d_multibit_bsk.decomp_base_log,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
match_parallelism,
|
||||
None,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
(result_ct, result_bool)
|
||||
(result, boolean_result)
|
||||
}
|
||||
|
||||
pub fn get_unchecked_match_value_size_on_gpu<Clear>(
|
||||
&self,
|
||||
ct: &CudaUnsignedRadixCiphertext,
|
||||
matches: &MatchValues<Clear>,
|
||||
match_parallelism: u32,
|
||||
streams: &CudaStreams,
|
||||
) -> u64
|
||||
where
|
||||
@@ -150,6 +163,7 @@ impl CudaServerKey {
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
PBSType::Classical,
|
||||
match_parallelism,
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
)
|
||||
}
|
||||
@@ -170,6 +184,7 @@ impl CudaServerKey {
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
PBSType::MultiBit,
|
||||
match_parallelism,
|
||||
None,
|
||||
)
|
||||
}
|
||||
@@ -294,6 +309,8 @@ impl CudaServerKey {
|
||||
panic!("Only the standard atomic pattern is supported on GPU")
|
||||
};
|
||||
|
||||
let match_parallelism = matches.get_values().len() as u32;
|
||||
|
||||
unsafe {
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
@@ -317,6 +334,7 @@ impl CudaServerKey {
|
||||
d_bsk.decomp_base_log,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
match_parallelism,
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
);
|
||||
}
|
||||
@@ -341,6 +359,7 @@ impl CudaServerKey {
|
||||
d_multibit_bsk.decomp_base_log,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
match_parallelism,
|
||||
None,
|
||||
);
|
||||
}
|
||||
@@ -509,6 +528,8 @@ impl CudaServerKey {
|
||||
panic!("Only the standard atomic pattern is supported on GPU")
|
||||
};
|
||||
|
||||
let match_parallelism = cts.len() as u32;
|
||||
|
||||
unsafe {
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
@@ -531,6 +552,7 @@ impl CudaServerKey {
|
||||
d_bsk.decomp_base_log,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
match_parallelism,
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
);
|
||||
}
|
||||
@@ -554,6 +576,7 @@ impl CudaServerKey {
|
||||
d_multibit_bsk.decomp_base_log,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
match_parallelism,
|
||||
None,
|
||||
);
|
||||
}
|
||||
@@ -660,6 +683,8 @@ impl CudaServerKey {
|
||||
panic!("Only the standard atomic pattern is supported on GPU")
|
||||
};
|
||||
|
||||
let match_parallelism = cts.len() as u32;
|
||||
|
||||
unsafe {
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
@@ -682,6 +707,7 @@ impl CudaServerKey {
|
||||
d_bsk.decomp_base_log,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
match_parallelism,
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
);
|
||||
}
|
||||
@@ -705,6 +731,7 @@ impl CudaServerKey {
|
||||
d_multibit_bsk.decomp_base_log,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
match_parallelism,
|
||||
None,
|
||||
);
|
||||
}
|
||||
@@ -802,6 +829,8 @@ impl CudaServerKey {
|
||||
panic!("Only the standard atomic pattern is supported on GPU")
|
||||
};
|
||||
|
||||
let match_parallelism = clears.len() as u32;
|
||||
|
||||
unsafe {
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
@@ -824,6 +853,7 @@ impl CudaServerKey {
|
||||
d_bsk.decomp_base_log,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
match_parallelism,
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
);
|
||||
}
|
||||
@@ -847,6 +877,7 @@ impl CudaServerKey {
|
||||
d_multibit_bsk.decomp_base_log,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
match_parallelism,
|
||||
None,
|
||||
);
|
||||
}
|
||||
@@ -957,6 +988,8 @@ impl CudaServerKey {
|
||||
panic!("Only the standard atomic pattern is supported on GPU")
|
||||
};
|
||||
|
||||
let match_parallelism = clears.len() as u32;
|
||||
|
||||
unsafe {
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
@@ -980,6 +1013,7 @@ impl CudaServerKey {
|
||||
d_bsk.decomp_base_log,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
match_parallelism,
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
);
|
||||
}
|
||||
@@ -1004,6 +1038,7 @@ impl CudaServerKey {
|
||||
d_multibit_bsk.decomp_base_log,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
match_parallelism,
|
||||
None,
|
||||
);
|
||||
}
|
||||
@@ -1124,6 +1159,8 @@ impl CudaServerKey {
|
||||
panic!("Only the standard atomic pattern is supported on GPU")
|
||||
};
|
||||
|
||||
let match_parallelism = clears.len() as u32;
|
||||
|
||||
unsafe {
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
@@ -1147,6 +1184,7 @@ impl CudaServerKey {
|
||||
d_bsk.decomp_base_log,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
match_parallelism,
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
);
|
||||
}
|
||||
@@ -1171,6 +1209,7 @@ impl CudaServerKey {
|
||||
d_multibit_bsk.decomp_base_log,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
match_parallelism,
|
||||
None,
|
||||
);
|
||||
}
|
||||
@@ -1278,6 +1317,8 @@ impl CudaServerKey {
|
||||
panic!("Only the standard atomic pattern is supported on GPU")
|
||||
};
|
||||
|
||||
let match_parallelism = cts.len() as u32;
|
||||
|
||||
unsafe {
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
@@ -1301,6 +1342,7 @@ impl CudaServerKey {
|
||||
d_bsk.decomp_base_log,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
match_parallelism,
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
);
|
||||
}
|
||||
@@ -1325,6 +1367,7 @@ impl CudaServerKey {
|
||||
d_multibit_bsk.decomp_base_log,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
match_parallelism,
|
||||
None,
|
||||
);
|
||||
}
|
||||
@@ -1461,6 +1504,8 @@ impl CudaServerKey {
|
||||
panic!("Only the standard atomic pattern is supported on GPU")
|
||||
};
|
||||
|
||||
let match_parallelism = cts.len() as u32;
|
||||
|
||||
unsafe {
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
@@ -1484,6 +1529,7 @@ impl CudaServerKey {
|
||||
d_bsk.decomp_base_log,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
match_parallelism,
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
);
|
||||
}
|
||||
@@ -1508,6 +1554,7 @@ impl CudaServerKey {
|
||||
d_multibit_bsk.decomp_base_log,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
match_parallelism,
|
||||
None,
|
||||
);
|
||||
}
|
||||
@@ -1633,6 +1680,8 @@ impl CudaServerKey {
|
||||
panic!("Only the standard atomic pattern is supported on GPU")
|
||||
};
|
||||
|
||||
let match_parallelism = cts.len() as u32;
|
||||
|
||||
unsafe {
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
@@ -1656,6 +1705,7 @@ impl CudaServerKey {
|
||||
d_bsk.decomp_base_log,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
match_parallelism,
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
);
|
||||
}
|
||||
@@ -1680,6 +1730,7 @@ impl CudaServerKey {
|
||||
d_multibit_bsk.decomp_base_log,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
match_parallelism,
|
||||
None,
|
||||
);
|
||||
}
|
||||
@@ -1803,6 +1854,8 @@ impl CudaServerKey {
|
||||
panic!("Only the standard atomic pattern is supported on GPU")
|
||||
};
|
||||
|
||||
let match_parallelism = cts.len() as u32;
|
||||
|
||||
unsafe {
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
@@ -1826,6 +1879,7 @@ impl CudaServerKey {
|
||||
d_bsk.decomp_base_log,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
match_parallelism,
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
);
|
||||
}
|
||||
@@ -1850,6 +1904,7 @@ impl CudaServerKey {
|
||||
d_multibit_bsk.decomp_base_log,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
match_parallelism,
|
||||
None,
|
||||
);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user