Compare commits

...

1 Commits

Author SHA1 Message Date
Enzo Di Maria
960cd20050 fix(gpu): fix match value parallelization 2026-04-27 15:56:02 +02:00
8 changed files with 1105 additions and 962 deletions

View File

@@ -753,8 +753,9 @@ uint64_t scratch_cuda_unchecked_match_value_64_async(
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_matches, uint32_t num_input_blocks,
uint32_t num_output_packed_blocks, uint32_t max_output_is_zero,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
uint32_t match_parallelism, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
void cuda_unchecked_match_value_64_async(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out_result,
@@ -793,9 +794,9 @@ uint64_t scratch_cuda_unchecked_match_value_or_64_async(
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_matches, uint32_t num_input_blocks,
uint32_t num_match_packed_blocks, uint32_t num_final_blocks,
uint32_t max_output_is_zero, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
uint32_t max_output_is_zero, uint32_t match_parallelism,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
void cuda_unchecked_match_value_or_64_async(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
@@ -812,9 +813,9 @@ uint64_t scratch_cuda_unchecked_contains_64_async(
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_inputs, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
uint32_t num_inputs, uint32_t num_blocks, uint32_t match_parallelism,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
void cuda_unchecked_contains_64_async(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *output,
@@ -832,9 +833,9 @@ uint64_t scratch_cuda_unchecked_contains_clear_64_async(
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_inputs, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
uint32_t num_inputs, uint32_t num_blocks, uint32_t match_parallelism,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
void cuda_unchecked_contains_clear_64_async(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output,
@@ -850,9 +851,9 @@ uint64_t scratch_cuda_unchecked_is_in_clears_64_async(
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_clears, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
uint32_t num_clears, uint32_t num_blocks, uint32_t match_parallelism,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
void cuda_unchecked_is_in_clears_64_async(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *output,
@@ -871,8 +872,9 @@ uint64_t scratch_cuda_unchecked_index_in_clears_64_async(
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_clears, uint32_t num_blocks, uint32_t num_blocks_index,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
uint32_t match_parallelism, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
void cuda_unchecked_index_in_clears_64_async(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *index_ct,
@@ -890,8 +892,9 @@ uint64_t scratch_cuda_unchecked_first_index_in_clears_64_async(
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_unique, uint32_t num_blocks, uint32_t num_blocks_index,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
uint32_t match_parallelism, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
void cuda_unchecked_first_index_in_clears_64_async(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *index_ct,
@@ -913,8 +916,9 @@ uint64_t scratch_cuda_unchecked_first_index_of_clear_64_async(
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
uint32_t match_parallelism, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
void cuda_unchecked_first_index_of_clear_64_async(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *index_ct,
@@ -932,8 +936,9 @@ uint64_t scratch_cuda_unchecked_first_index_of_64_async(
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
uint32_t match_parallelism, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
void cuda_unchecked_first_index_of_64_async(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *index_ct,
@@ -951,8 +956,9 @@ uint64_t scratch_cuda_unchecked_index_of_64_async(
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
uint32_t match_parallelism, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
void cuda_unchecked_index_of_64_async(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *index_ct,
@@ -972,16 +978,16 @@ uint64_t scratch_cuda_unchecked_index_of_clear_64_async(
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
uint32_t match_parallelism, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
void cuda_unchecked_index_of_clear_64_async(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *index_ct,
CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *inputs,
const void *d_scalar_blocks, bool is_scalar_obviously_bigger,
uint32_t num_inputs, uint32_t num_blocks, uint32_t num_scalar_blocks,
uint32_t num_blocks_index, int8_t *mem, void *const *bsks,
void *const *ksks);
const uint64_t *h_clear_val, bool is_scalar_obviously_bigger,
uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
int8_t *mem, void *const *bsks, void *const *ksks);
void cleanup_cuda_unchecked_index_of_clear_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);

File diff suppressed because it is too large Load Diff

View File

@@ -7,8 +7,9 @@ uint64_t scratch_cuda_unchecked_match_value_64_async(
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_matches, uint32_t num_input_blocks,
uint32_t num_output_packed_blocks, uint32_t max_output_is_zero,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
uint32_t match_parallelism, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -18,7 +19,7 @@ uint64_t scratch_cuda_unchecked_match_value_64_async(
return scratch_cuda_unchecked_match_value<uint64_t>(
CudaStreams(streams), (int_unchecked_match_buffer<uint64_t> **)mem_ptr,
params, num_matches, num_input_blocks, num_output_packed_blocks,
max_output_is_zero, allocate_gpu_memory);
max_output_is_zero, match_parallelism, allocate_gpu_memory);
}
void cuda_unchecked_match_value_64_async(
@@ -62,9 +63,9 @@ uint64_t scratch_cuda_unchecked_match_value_or_64_async(
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_matches, uint32_t num_input_blocks,
uint32_t num_match_packed_blocks, uint32_t num_final_blocks,
uint32_t max_output_is_zero, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
uint32_t max_output_is_zero, uint32_t match_parallelism,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -75,7 +76,7 @@ uint64_t scratch_cuda_unchecked_match_value_or_64_async(
CudaStreams(streams),
(int_unchecked_match_value_or_buffer<uint64_t> **)mem_ptr, params,
num_matches, num_input_blocks, num_match_packed_blocks, num_final_blocks,
max_output_is_zero, allocate_gpu_memory);
max_output_is_zero, match_parallelism, allocate_gpu_memory);
}
void cuda_unchecked_match_value_or_64_async(
@@ -111,9 +112,9 @@ uint64_t scratch_cuda_unchecked_contains_64_async(
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_inputs, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
uint32_t num_inputs, uint32_t num_blocks, uint32_t match_parallelism,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -122,7 +123,7 @@ uint64_t scratch_cuda_unchecked_contains_64_async(
return scratch_cuda_unchecked_contains<uint64_t>(
CudaStreams(streams), (int_unchecked_contains_buffer<uint64_t> **)mem_ptr,
params, num_inputs, num_blocks, allocate_gpu_memory);
params, num_inputs, num_blocks, match_parallelism, allocate_gpu_memory);
}
void cuda_unchecked_contains_64_async(CudaStreamsFFI streams,
@@ -161,9 +162,9 @@ uint64_t scratch_cuda_unchecked_contains_clear_64_async(
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_inputs, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
uint32_t num_inputs, uint32_t num_blocks, uint32_t match_parallelism,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -173,7 +174,7 @@ uint64_t scratch_cuda_unchecked_contains_clear_64_async(
return scratch_cuda_unchecked_contains_clear<uint64_t>(
CudaStreams(streams),
(int_unchecked_contains_clear_buffer<uint64_t> **)mem_ptr, params,
num_inputs, num_blocks, allocate_gpu_memory);
num_inputs, num_blocks, match_parallelism, allocate_gpu_memory);
}
void cuda_unchecked_contains_clear_64_async(
@@ -206,9 +207,9 @@ uint64_t scratch_cuda_unchecked_is_in_clears_64_async(
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_clears, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
uint32_t num_clears, uint32_t num_blocks, uint32_t match_parallelism,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -218,7 +219,7 @@ uint64_t scratch_cuda_unchecked_is_in_clears_64_async(
return scratch_cuda_unchecked_is_in_clears<uint64_t>(
CudaStreams(streams),
(int_unchecked_is_in_clears_buffer<uint64_t> **)mem_ptr, params,
num_clears, num_blocks, allocate_gpu_memory);
num_clears, num_blocks, match_parallelism, allocate_gpu_memory);
}
void cuda_unchecked_is_in_clears_64_async(
@@ -252,8 +253,9 @@ uint64_t scratch_cuda_unchecked_index_in_clears_64_async(
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_clears, uint32_t num_blocks, uint32_t num_blocks_index,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
uint32_t match_parallelism, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -263,7 +265,8 @@ uint64_t scratch_cuda_unchecked_index_in_clears_64_async(
return scratch_cuda_unchecked_index_in_clears<uint64_t>(
CudaStreams(streams),
(int_unchecked_index_in_clears_buffer<uint64_t> **)mem_ptr, params,
num_clears, num_blocks, num_blocks_index, allocate_gpu_memory);
num_clears, num_blocks, num_blocks_index, match_parallelism,
allocate_gpu_memory);
}
void cuda_unchecked_index_in_clears_64_async(
@@ -304,8 +307,9 @@ uint64_t scratch_cuda_unchecked_first_index_in_clears_64_async(
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_unique, uint32_t num_blocks, uint32_t num_blocks_index,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
uint32_t match_parallelism, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -315,7 +319,8 @@ uint64_t scratch_cuda_unchecked_first_index_in_clears_64_async(
return scratch_cuda_unchecked_first_index_in_clears<uint64_t>(
CudaStreams(streams),
(int_unchecked_first_index_in_clears_buffer<uint64_t> **)mem_ptr, params,
num_unique, num_blocks, num_blocks_index, allocate_gpu_memory);
num_unique, num_blocks, num_blocks_index, match_parallelism,
allocate_gpu_memory);
}
void cuda_unchecked_first_index_in_clears_64_async(
@@ -356,8 +361,9 @@ uint64_t scratch_cuda_unchecked_first_index_of_clear_64_async(
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
uint32_t match_parallelism, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -367,7 +373,8 @@ uint64_t scratch_cuda_unchecked_first_index_of_clear_64_async(
return scratch_cuda_unchecked_first_index_of_clear<uint64_t>(
CudaStreams(streams),
(int_unchecked_first_index_of_clear_buffer<uint64_t> **)mem_ptr, params,
num_inputs, num_blocks, num_blocks_index, allocate_gpu_memory);
num_inputs, num_blocks, num_blocks_index, match_parallelism,
allocate_gpu_memory);
}
void cuda_unchecked_first_index_of_clear_64_async(
@@ -408,8 +415,9 @@ uint64_t scratch_cuda_unchecked_first_index_of_64_async(
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
uint32_t match_parallelism, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -419,7 +427,8 @@ uint64_t scratch_cuda_unchecked_first_index_of_64_async(
return scratch_cuda_unchecked_first_index_of<uint64_t>(
CudaStreams(streams),
(int_unchecked_first_index_of_buffer<uint64_t> **)mem_ptr, params,
num_inputs, num_blocks, num_blocks_index, allocate_gpu_memory);
num_inputs, num_blocks, num_blocks_index, match_parallelism,
allocate_gpu_memory);
}
void cuda_unchecked_first_index_of_64_async(
@@ -460,8 +469,9 @@ uint64_t scratch_cuda_unchecked_index_of_64_async(
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
uint32_t match_parallelism, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -470,7 +480,8 @@ uint64_t scratch_cuda_unchecked_index_of_64_async(
return scratch_cuda_unchecked_index_of<uint64_t>(
CudaStreams(streams), (int_unchecked_index_of_buffer<uint64_t> **)mem_ptr,
params, num_inputs, num_blocks, num_blocks_index, allocate_gpu_memory);
params, num_inputs, num_blocks, num_blocks_index, match_parallelism,
allocate_gpu_memory);
}
void cuda_unchecked_index_of_64_async(CudaStreamsFFI streams,
@@ -513,8 +524,9 @@ uint64_t scratch_cuda_unchecked_index_of_clear_64_async(
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
uint32_t match_parallelism, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -524,16 +536,16 @@ uint64_t scratch_cuda_unchecked_index_of_clear_64_async(
return scratch_cuda_unchecked_index_of_clear<uint64_t>(
CudaStreams(streams),
(int_unchecked_index_of_clear_buffer<uint64_t> **)mem_ptr, params,
num_inputs, num_blocks, num_blocks_index, allocate_gpu_memory);
num_inputs, num_blocks, num_blocks_index, match_parallelism,
allocate_gpu_memory);
}
void cuda_unchecked_index_of_clear_64_async(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *index_ct,
CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *inputs,
const void *d_scalar_blocks, bool is_scalar_obviously_bigger,
uint32_t num_inputs, uint32_t num_blocks, uint32_t num_scalar_blocks,
uint32_t num_blocks_index, int8_t *mem, void *const *bsks,
void *const *ksks) {
const uint64_t *h_clear_val, bool is_scalar_obviously_bigger,
uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
int8_t *mem, void *const *bsks, void *const *ksks) {
PANIC_IF_FALSE(index_ct != inputs, "Output and input pointers must be "
"different for out-of-place operations");
PANIC_IF_FALSE(match_ct != inputs, "Output and input pointers must be "
@@ -543,9 +555,8 @@ void cuda_unchecked_index_of_clear_64_async(
"out-of-place operations");
host_unchecked_index_of_clear<uint64_t>(
CudaStreams(streams), index_ct, match_ct, inputs,
(const uint64_t *)d_scalar_blocks, is_scalar_obviously_bigger, num_inputs,
num_blocks, num_scalar_blocks, num_blocks_index,
CudaStreams(streams), index_ct, match_ct, inputs, h_clear_val,
is_scalar_obviously_bigger, num_inputs, num_blocks, num_blocks_index,
(int_unchecked_index_of_clear_buffer<uint64_t> *)mem, bsks,
(uint64_t *const *)ksks);
}

File diff suppressed because it is too large Load Diff

View File

@@ -1711,6 +1711,7 @@ unsafe extern "C" {
num_input_blocks: u32,
num_output_packed_blocks: u32,
max_output_is_zero: u32,
match_parallelism: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: PBS_TYPE,
@@ -1794,6 +1795,7 @@ unsafe extern "C" {
num_match_packed_blocks: u32,
num_final_blocks: u32,
max_output_is_zero: u32,
match_parallelism: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: PBS_TYPE,
@@ -1835,6 +1837,7 @@ unsafe extern "C" {
grouping_factor: u32,
num_inputs: u32,
num_blocks: u32,
match_parallelism: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: PBS_TYPE,
@@ -1873,6 +1876,7 @@ unsafe extern "C" {
grouping_factor: u32,
num_inputs: u32,
num_blocks: u32,
match_parallelism: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: PBS_TYPE,
@@ -1914,6 +1918,7 @@ unsafe extern "C" {
grouping_factor: u32,
num_clears: u32,
num_blocks: u32,
match_parallelism: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: PBS_TYPE,
@@ -1956,6 +1961,7 @@ unsafe extern "C" {
num_clears: u32,
num_blocks: u32,
num_blocks_index: u32,
match_parallelism: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: PBS_TYPE,
@@ -2000,6 +2006,7 @@ unsafe extern "C" {
num_unique: u32,
num_blocks: u32,
num_blocks_index: u32,
match_parallelism: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: PBS_TYPE,
@@ -2054,6 +2061,7 @@ unsafe extern "C" {
num_inputs: u32,
num_blocks: u32,
num_blocks_index: u32,
match_parallelism: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: PBS_TYPE,
@@ -2098,6 +2106,7 @@ unsafe extern "C" {
num_inputs: u32,
num_blocks: u32,
num_blocks_index: u32,
match_parallelism: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: PBS_TYPE,
@@ -2142,6 +2151,7 @@ unsafe extern "C" {
num_inputs: u32,
num_blocks: u32,
num_blocks_index: u32,
match_parallelism: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: PBS_TYPE,
@@ -2183,6 +2193,7 @@ unsafe extern "C" {
num_inputs: u32,
num_blocks: u32,
num_blocks_index: u32,
match_parallelism: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: PBS_TYPE,
@@ -2196,11 +2207,10 @@ unsafe extern "C" {
index_ct: *mut CudaRadixCiphertextFFI,
match_ct: *mut CudaRadixCiphertextFFI,
inputs: *const CudaRadixCiphertextFFI,
d_scalar_blocks: *const ffi::c_void,
h_clear_val: *const u64,
is_scalar_obviously_bigger: bool,
num_inputs: u32,
num_blocks: u32,
num_scalar_blocks: u32,
num_blocks_index: u32,
mem: *mut i8,
bsks: *const *mut ffi::c_void,

View File

@@ -1265,7 +1265,7 @@ where
let size = cuda_key
.key
.key
.get_unchecked_match_value_size_on_gpu(&ct_on_gpu, matches, streams);
.get_unchecked_match_value_size_on_gpu(&ct_on_gpu, matches, 1, streams);
Ok(size)
}
#[cfg(feature = "hpu")]

View File

@@ -8336,6 +8336,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_match_value<
pbs_base_log: DecompositionBaseLog,
pbs_type: PBSType,
grouping_factor: LweBskGroupingFactor,
match_parallelism: u32,
ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
) {
assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0));
@@ -8480,6 +8481,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_match_value<
num_input_blocks,
num_output_packed_blocks,
max_output_is_zero as u32,
match_parallelism,
u32::try_from(message_modulus.0).unwrap(),
u32::try_from(carry_modulus.0).unwrap(),
pbs_type as u32,
@@ -8531,6 +8533,7 @@ pub(crate) fn cuda_backend_get_unchecked_match_value_size_on_gpu<Clear>(
message_modulus: MessageModulus,
carry_modulus: CarryModulus,
pbs_type: PBSType,
match_parallelism: u32,
ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
) -> u64
where
@@ -8579,6 +8582,7 @@ where
num_input_blocks,
num_output_packed_blocks,
max_output_is_zero as u32,
match_parallelism,
u32::try_from(message_modulus.0).unwrap(),
u32::try_from(carry_modulus.0).unwrap(),
pbs_type as u32,
@@ -8673,6 +8677,7 @@ where
num_match_packed_blocks,
num_output_blocks,
max_output_is_zero as u32,
1,
u32::try_from(message_modulus.0).unwrap(),
u32::try_from(carry_modulus.0).unwrap(),
pbs_type as u32,
@@ -8808,6 +8813,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_match_value_or<
pbs_base_log: DecompositionBaseLog,
pbs_type: PBSType,
grouping_factor: LweBskGroupingFactor,
match_parallelism: u32,
ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
) {
assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0));
@@ -8935,6 +8941,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_match_value_or<
num_match_packed_blocks,
num_final_blocks,
max_output_is_zero as u32,
match_parallelism,
u32::try_from(message_modulus.0).unwrap(),
u32::try_from(carry_modulus.0).unwrap(),
pbs_type as u32,
@@ -8988,6 +8995,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_contains<
pbs_base_log: DecompositionBaseLog,
pbs_type: PBSType,
grouping_factor: LweBskGroupingFactor,
match_parallelism: u32,
ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
) {
assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0));
@@ -9058,6 +9066,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_contains<
u32::try_from(grouping_factor.0).unwrap(),
num_inputs,
num_blocks,
match_parallelism,
u32::try_from(message_modulus.0).unwrap(),
u32::try_from(carry_modulus.0).unwrap(),
pbs_type as u32,
@@ -9112,6 +9121,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_contains_clear<
pbs_base_log: DecompositionBaseLog,
pbs_type: PBSType,
grouping_factor: LweBskGroupingFactor,
match_parallelism: u32,
ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
) {
assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0));
@@ -9188,6 +9198,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_contains_clear<
u32::try_from(grouping_factor.0).unwrap(),
num_inputs,
num_blocks,
match_parallelism,
u32::try_from(message_modulus.0).unwrap(),
u32::try_from(carry_modulus.0).unwrap(),
pbs_type as u32,
@@ -9241,6 +9252,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_is_in_clears<
pbs_base_log: DecompositionBaseLog,
pbs_type: PBSType,
grouping_factor: LweBskGroupingFactor,
match_parallelism: u32,
ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
) {
assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0));
@@ -9292,6 +9304,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_is_in_clears<
u32::try_from(grouping_factor.0).unwrap(),
num_clears,
num_blocks,
match_parallelism,
u32::try_from(message_modulus.0).unwrap(),
u32::try_from(carry_modulus.0).unwrap(),
pbs_type as u32,
@@ -9346,6 +9359,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_index_in_clears<
pbs_base_log: DecompositionBaseLog,
pbs_type: PBSType,
grouping_factor: LweBskGroupingFactor,
match_parallelism: u32,
ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
) {
assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0));
@@ -9414,6 +9428,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_index_in_clears<
num_clears,
num_blocks,
num_blocks_index,
match_parallelism,
u32::try_from(message_modulus.0).unwrap(),
u32::try_from(carry_modulus.0).unwrap(),
pbs_type as u32,
@@ -9471,6 +9486,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_first_index_in_clears<
pbs_base_log: DecompositionBaseLog,
pbs_type: PBSType,
grouping_factor: LweBskGroupingFactor,
match_parallelism: u32,
ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
) {
assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0));
@@ -9564,6 +9580,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_first_index_in_clears<
num_unique,
num_blocks,
num_blocks_index,
match_parallelism,
u32::try_from(message_modulus.0).unwrap(),
u32::try_from(carry_modulus.0).unwrap(),
pbs_type as u32,
@@ -9623,6 +9640,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_first_index_of_clear<
pbs_base_log: DecompositionBaseLog,
pbs_type: PBSType,
grouping_factor: LweBskGroupingFactor,
match_parallelism: u32,
ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
) {
assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0));
@@ -9710,6 +9728,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_first_index_of_clear<
num_inputs,
num_blocks,
num_blocks_index,
match_parallelism,
u32::try_from(message_modulus.0).unwrap(),
u32::try_from(carry_modulus.0).unwrap(),
pbs_type as u32,
@@ -9767,6 +9786,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_first_index_of<
pbs_base_log: DecompositionBaseLog,
pbs_type: PBSType,
grouping_factor: LweBskGroupingFactor,
match_parallelism: u32,
ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
) {
assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0));
@@ -9854,6 +9874,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_first_index_of<
num_inputs,
num_blocks,
num_blocks_index,
match_parallelism,
u32::try_from(message_modulus.0).unwrap(),
u32::try_from(carry_modulus.0).unwrap(),
pbs_type as u32,
@@ -9911,6 +9932,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_index_of<
pbs_base_log: DecompositionBaseLog,
pbs_type: PBSType,
grouping_factor: LweBskGroupingFactor,
match_parallelism: u32,
ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
) {
assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0));
@@ -9998,6 +10020,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_index_of<
num_inputs,
num_blocks,
num_blocks_index,
match_parallelism,
u32::try_from(message_modulus.0).unwrap(),
u32::try_from(carry_modulus.0).unwrap(),
pbs_type as u32,
@@ -10056,6 +10079,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_index_of_clear<
pbs_base_log: DecompositionBaseLog,
pbs_type: PBSType,
grouping_factor: LweBskGroupingFactor,
match_parallelism: u32,
ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
) {
assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0));
@@ -10076,9 +10100,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_index_of_clear<
.is_some_and(|sub_slice| sub_slice.iter().any(|&scalar_block| scalar_block != 0));
scalar_blocks.truncate(num_blocks_in_ct as usize);
let num_scalar_blocks = u32::try_from(scalar_blocks.len()).unwrap();
let d_scalar_blocks: CudaVec<u64> = CudaVec::from_cpu_async(&scalar_blocks, streams, 0);
scalar_blocks.resize(num_blocks_in_ct as usize, 0u64);
let noise_reduction_type = resolve_ms_noise_reduction_config(ms_noise_reduction_configuration);
@@ -10152,6 +10174,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_index_of_clear<
num_inputs,
num_blocks_in_ct,
num_blocks_index,
match_parallelism,
u32::try_from(message_modulus.0).unwrap(),
u32::try_from(carry_modulus.0).unwrap(),
pbs_type as u32,
@@ -10164,11 +10187,10 @@ pub(crate) unsafe fn cuda_backend_unchecked_index_of_clear<
&raw mut ffi_index,
&raw mut ffi_match,
ffi_inputs.as_ptr(),
d_scalar_blocks.as_c_ptr(0),
scalar_blocks.as_ptr(),
is_scalar_obviously_bigger,
num_inputs,
num_blocks_in_ct,
num_scalar_blocks,
num_blocks_index,
mem_ptr,
bootstrapping_key.ptr.as_ptr(),

View File

@@ -27,16 +27,22 @@ impl CudaServerKey {
streams: &CudaStreams,
) -> (CudaUnsignedRadixCiphertext, CudaBooleanBlock)
where
Clear: UnsignedInteger + DecomposableInto<u64> + CastInto<usize>,
Clear:
UnsignedInteger + DecomposableInto<u64> + CastInto<usize> + CastInto<u64> + Sync + Send,
{
if matches.get_values().is_empty() {
let trivial_ct: CudaUnsignedRadixCiphertext = self.create_trivial_radix(0, 1, streams);
let trivial_bool = CudaBooleanBlock::from_cuda_radix_ciphertext(
trivial_ct.duplicate(streams).into_inner(),
let num_matches = matches.get_values().len();
if num_matches == 0 {
let result_ct: CudaUnsignedRadixCiphertext = self.create_trivial_zero_radix(1, streams);
let result_bool: CudaBooleanBlock = CudaBooleanBlock(
self.create_trivial_zero_radix::<CudaUnsignedRadixCiphertext>(1, streams),
);
return (trivial_ct, trivial_bool);
return (result_ct, result_bool);
}
let match_parallelism = num_matches as u32;
let num_bits_in_message = self.message_modulus.0.ilog2();
let max_output_value = matches
.get_values()
.iter()
@@ -44,13 +50,17 @@ impl CudaServerKey {
.max_by(|(_, outputl), (_, outputr)| outputl.cmp(outputr))
.expect("luts is not empty at this point")
.1;
let max_val_u64: u64 = max_output_value.cast_into();
let num_output_unpacked_blocks = if max_val_u64 == 0 {
1
} else {
(max_val_u64.ilog2() + 1).div_ceil(num_bits_in_message)
};
let num_output_unpacked_blocks =
self.num_blocks_to_represent_unsigned_value(max_output_value);
let mut result: CudaUnsignedRadixCiphertext =
self.create_trivial_zero_radix(num_output_unpacked_blocks as usize, streams);
let mut result_ct: CudaUnsignedRadixCiphertext =
self.create_trivial_zero_radix(num_output_unpacked_blocks, streams);
let mut result_bool: CudaBooleanBlock = CudaBooleanBlock(
let mut boolean_result: CudaBooleanBlock = CudaBooleanBlock(
self.create_trivial_zero_radix::<CudaUnsignedRadixCiphertext>(1, streams),
);
@@ -63,8 +73,8 @@ impl CudaServerKey {
CudaBootstrappingKey::Classic(d_bsk) => {
cuda_backend_unchecked_match_value(
streams,
&mut result_ct,
&mut result_bool,
&mut result,
&mut boolean_result,
ct.as_ref(),
matches,
self.message_modulus,
@@ -81,14 +91,15 @@ impl CudaServerKey {
d_bsk.decomp_base_log,
PBSType::Classical,
LweBskGroupingFactor(0),
match_parallelism,
d_bsk.ms_noise_reduction_configuration.as_ref(),
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
cuda_backend_unchecked_match_value(
streams,
&mut result_ct,
&mut result_bool,
&mut result,
&mut boolean_result,
ct.as_ref(),
matches,
self.message_modulus,
@@ -105,19 +116,21 @@ impl CudaServerKey {
d_multibit_bsk.decomp_base_log,
PBSType::MultiBit,
d_multibit_bsk.grouping_factor,
match_parallelism,
None,
);
}
}
}
(result_ct, result_bool)
(result, boolean_result)
}
pub fn get_unchecked_match_value_size_on_gpu<Clear>(
&self,
ct: &CudaUnsignedRadixCiphertext,
matches: &MatchValues<Clear>,
match_parallelism: u32,
streams: &CudaStreams,
) -> u64
where
@@ -150,6 +163,7 @@ impl CudaServerKey {
self.message_modulus,
self.carry_modulus,
PBSType::Classical,
match_parallelism,
d_bsk.ms_noise_reduction_configuration.as_ref(),
)
}
@@ -170,6 +184,7 @@ impl CudaServerKey {
self.message_modulus,
self.carry_modulus,
PBSType::MultiBit,
match_parallelism,
None,
)
}
@@ -294,6 +309,8 @@ impl CudaServerKey {
panic!("Only the standard atomic pattern is supported on GPU")
};
let match_parallelism = matches.get_values().len() as u32;
unsafe {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
@@ -317,6 +334,7 @@ impl CudaServerKey {
d_bsk.decomp_base_log,
PBSType::Classical,
LweBskGroupingFactor(0),
match_parallelism,
d_bsk.ms_noise_reduction_configuration.as_ref(),
);
}
@@ -341,6 +359,7 @@ impl CudaServerKey {
d_multibit_bsk.decomp_base_log,
PBSType::MultiBit,
d_multibit_bsk.grouping_factor,
match_parallelism,
None,
);
}
@@ -509,6 +528,8 @@ impl CudaServerKey {
panic!("Only the standard atomic pattern is supported on GPU")
};
let match_parallelism = cts.len() as u32;
unsafe {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
@@ -531,6 +552,7 @@ impl CudaServerKey {
d_bsk.decomp_base_log,
PBSType::Classical,
LweBskGroupingFactor(0),
match_parallelism,
d_bsk.ms_noise_reduction_configuration.as_ref(),
);
}
@@ -554,6 +576,7 @@ impl CudaServerKey {
d_multibit_bsk.decomp_base_log,
PBSType::MultiBit,
d_multibit_bsk.grouping_factor,
match_parallelism,
None,
);
}
@@ -660,6 +683,8 @@ impl CudaServerKey {
panic!("Only the standard atomic pattern is supported on GPU")
};
let match_parallelism = cts.len() as u32;
unsafe {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
@@ -682,6 +707,7 @@ impl CudaServerKey {
d_bsk.decomp_base_log,
PBSType::Classical,
LweBskGroupingFactor(0),
match_parallelism,
d_bsk.ms_noise_reduction_configuration.as_ref(),
);
}
@@ -705,6 +731,7 @@ impl CudaServerKey {
d_multibit_bsk.decomp_base_log,
PBSType::MultiBit,
d_multibit_bsk.grouping_factor,
match_parallelism,
None,
);
}
@@ -802,6 +829,8 @@ impl CudaServerKey {
panic!("Only the standard atomic pattern is supported on GPU")
};
let match_parallelism = clears.len() as u32;
unsafe {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
@@ -824,6 +853,7 @@ impl CudaServerKey {
d_bsk.decomp_base_log,
PBSType::Classical,
LweBskGroupingFactor(0),
match_parallelism,
d_bsk.ms_noise_reduction_configuration.as_ref(),
);
}
@@ -847,6 +877,7 @@ impl CudaServerKey {
d_multibit_bsk.decomp_base_log,
PBSType::MultiBit,
d_multibit_bsk.grouping_factor,
match_parallelism,
None,
);
}
@@ -957,6 +988,8 @@ impl CudaServerKey {
panic!("Only the standard atomic pattern is supported on GPU")
};
let match_parallelism = clears.len() as u32;
unsafe {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
@@ -980,6 +1013,7 @@ impl CudaServerKey {
d_bsk.decomp_base_log,
PBSType::Classical,
LweBskGroupingFactor(0),
match_parallelism,
d_bsk.ms_noise_reduction_configuration.as_ref(),
);
}
@@ -1004,6 +1038,7 @@ impl CudaServerKey {
d_multibit_bsk.decomp_base_log,
PBSType::MultiBit,
d_multibit_bsk.grouping_factor,
match_parallelism,
None,
);
}
@@ -1124,6 +1159,8 @@ impl CudaServerKey {
panic!("Only the standard atomic pattern is supported on GPU")
};
let match_parallelism = clears.len() as u32;
unsafe {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
@@ -1147,6 +1184,7 @@ impl CudaServerKey {
d_bsk.decomp_base_log,
PBSType::Classical,
LweBskGroupingFactor(0),
match_parallelism,
d_bsk.ms_noise_reduction_configuration.as_ref(),
);
}
@@ -1171,6 +1209,7 @@ impl CudaServerKey {
d_multibit_bsk.decomp_base_log,
PBSType::MultiBit,
d_multibit_bsk.grouping_factor,
match_parallelism,
None,
);
}
@@ -1278,6 +1317,8 @@ impl CudaServerKey {
panic!("Only the standard atomic pattern is supported on GPU")
};
let match_parallelism = cts.len() as u32;
unsafe {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
@@ -1301,6 +1342,7 @@ impl CudaServerKey {
d_bsk.decomp_base_log,
PBSType::Classical,
LweBskGroupingFactor(0),
match_parallelism,
d_bsk.ms_noise_reduction_configuration.as_ref(),
);
}
@@ -1325,6 +1367,7 @@ impl CudaServerKey {
d_multibit_bsk.decomp_base_log,
PBSType::MultiBit,
d_multibit_bsk.grouping_factor,
match_parallelism,
None,
);
}
@@ -1461,6 +1504,8 @@ impl CudaServerKey {
panic!("Only the standard atomic pattern is supported on GPU")
};
let match_parallelism = cts.len() as u32;
unsafe {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
@@ -1484,6 +1529,7 @@ impl CudaServerKey {
d_bsk.decomp_base_log,
PBSType::Classical,
LweBskGroupingFactor(0),
match_parallelism,
d_bsk.ms_noise_reduction_configuration.as_ref(),
);
}
@@ -1508,6 +1554,7 @@ impl CudaServerKey {
d_multibit_bsk.decomp_base_log,
PBSType::MultiBit,
d_multibit_bsk.grouping_factor,
match_parallelism,
None,
);
}
@@ -1633,6 +1680,8 @@ impl CudaServerKey {
panic!("Only the standard atomic pattern is supported on GPU")
};
let match_parallelism = cts.len() as u32;
unsafe {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
@@ -1656,6 +1705,7 @@ impl CudaServerKey {
d_bsk.decomp_base_log,
PBSType::Classical,
LweBskGroupingFactor(0),
match_parallelism,
d_bsk.ms_noise_reduction_configuration.as_ref(),
);
}
@@ -1680,6 +1730,7 @@ impl CudaServerKey {
d_multibit_bsk.decomp_base_log,
PBSType::MultiBit,
d_multibit_bsk.grouping_factor,
match_parallelism,
None,
);
}
@@ -1803,6 +1854,8 @@ impl CudaServerKey {
panic!("Only the standard atomic pattern is supported on GPU")
};
let match_parallelism = cts.len() as u32;
unsafe {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
@@ -1826,6 +1879,7 @@ impl CudaServerKey {
d_bsk.decomp_base_log,
PBSType::Classical,
LweBskGroupingFactor(0),
match_parallelism,
d_bsk.ms_noise_reduction_configuration.as_ref(),
);
}
@@ -1850,6 +1904,7 @@ impl CudaServerKey {
d_multibit_bsk.decomp_base_log,
PBSType::MultiBit,
d_multibit_bsk.grouping_factor,
match_parallelism,
None,
);
}