diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h index 6aa437374..e3ec9d274 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h @@ -1066,8 +1066,18 @@ template struct int_noise_squashing_lut { release_radix_ciphertext_async(streams[0], gpu_indexes[0], tmp_lwe_before_ks, gpu_memory_allocated); for (int i = 0; i < pbs_buffer.size(); i++) { - cleanup_cuda_programmable_bootstrap_128(streams[i], gpu_indexes[i], - &pbs_buffer[i]); + switch (params.pbs_type) { + case MULTI_BIT: + cleanup_cuda_multi_bit_programmable_bootstrap_128( + streams[i], gpu_indexes[i], &pbs_buffer[i]); + break; + case CLASSICAL: + cleanup_cuda_programmable_bootstrap_128(streams[i], gpu_indexes[i], + &pbs_buffer[i]); + break; + default: + PANIC("Cuda error (PBS): unknown PBS type. ") + } cuda_synchronize_stream(streams[i], gpu_indexes[i]); } if (lwe_aligned_gather_vec.size() > 0) { diff --git a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h index 68f2e2719..6679ed09c 100644 --- a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h +++ b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h @@ -47,12 +47,11 @@ uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64( void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128( void *stream, uint32_t gpu_index, void *lwe_array_out, void const *lwe_output_indexes, void const *lut_vector, - void const *lut_vector_indexes, void const *lwe_array_in, - void const *lwe_input_indexes, void const *bootstrapping_key, - int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, - uint32_t lut_stride); + void const *lwe_array_in, void const *lwe_input_indexes, + void const *bootstrapping_key, int8_t *mem_ptr, uint32_t lwe_dimension, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, + uint32_t base_log, uint32_t level_count, uint32_t num_samples, + uint32_t num_many_lut, uint32_t lut_stride); void cleanup_cuda_multi_bit_programmable_bootstrap_128(void *stream, const uint32_t gpu_index, diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh index 52d58f1ff..d64722c2d 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh @@ -347,18 +347,12 @@ void execute_pbs_async( auto current_lwe_input_indexes = get_variant_element(lwe_input_indexes, i); - int gpu_offset = - get_gpu_offset(input_lwe_ciphertext_count, i, gpu_count); - auto d_lut_vector_indexes = - lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset); - cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128( streams[i], gpu_indexes[i], current_lwe_array_out, - current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes, - current_lwe_array_in, current_lwe_input_indexes, - bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension, - polynomial_size, grouping_factor, base_log, level_count, - num_inputs_on_gpu, num_many_lut, lut_stride); + current_lwe_output_indexes, lut_vec[i], current_lwe_array_in, + current_lwe_input_indexes, bootstrapping_keys[i], pbs_buffer[i], + lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, + base_log, level_count, num_inputs_on_gpu, num_many_lut, lut_stride); } break; case CLASSICAL: diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cu index eedcaa87a..12de09b80 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cu +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cu @@ -120,8 +120,8 @@ template void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128( void *stream, uint32_t gpu_index, __uint128_t *lwe_array_out, InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector, - InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in, - InputTorus const *lwe_input_indexes, __uint128_t const *bootstrapping_key, + InputTorus const *lwe_array_in, InputTorus const *lwe_input_indexes, + __uint128_t const *bootstrapping_key, pbs_buffer_128 *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, @@ -131,45 +131,45 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128( case 256: host_multi_bit_programmable_bootstrap_128>( static_cast(stream), gpu_index, lwe_array_out, - lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, - lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, - lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_many_lut, lut_stride); + lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes, + bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, + polynomial_size, grouping_factor, base_log, level_count, num_samples, + num_many_lut, lut_stride); break; case 512: host_multi_bit_programmable_bootstrap_128>( static_cast(stream), gpu_index, lwe_array_out, - lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, - lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, - lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_many_lut, lut_stride); + lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes, + bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, + polynomial_size, grouping_factor, base_log, level_count, num_samples, + num_many_lut, lut_stride); break; case 1024: host_multi_bit_programmable_bootstrap_128>( static_cast(stream), gpu_index, lwe_array_out, - lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, - lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, - lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_many_lut, lut_stride); + lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes, + bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, + polynomial_size, grouping_factor, base_log, level_count, num_samples, + num_many_lut, lut_stride); break; case 2048: host_multi_bit_programmable_bootstrap_128>( static_cast(stream), gpu_index, lwe_array_out, - lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, - lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, - lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_many_lut, lut_stride); + lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes, + bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, + polynomial_size, grouping_factor, base_log, level_count, num_samples, + num_many_lut, lut_stride); break; case 4096: host_multi_bit_programmable_bootstrap_128>( static_cast(stream), gpu_index, lwe_array_out, - lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, - lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, - lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_many_lut, lut_stride); + lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes, + bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, + polynomial_size, grouping_factor, base_log, level_count, num_samples, + num_many_lut, lut_stride); break; default: PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported " @@ -182,8 +182,8 @@ template void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128( void *stream, uint32_t gpu_index, __uint128_t *lwe_array_out, InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector, - InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in, - InputTorus const *lwe_input_indexes, __uint128_t const *bootstrapping_key, + InputTorus const *lwe_array_in, InputTorus const *lwe_input_indexes, + __uint128_t const *bootstrapping_key, pbs_buffer_128 *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, @@ -194,46 +194,46 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128( host_cg_multi_bit_programmable_bootstrap_128>( static_cast(stream), gpu_index, lwe_array_out, - lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, - lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, - lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_many_lut, lut_stride); + lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes, + bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, + polynomial_size, grouping_factor, base_log, level_count, num_samples, + num_many_lut, lut_stride); break; case 512: host_cg_multi_bit_programmable_bootstrap_128>( static_cast(stream), gpu_index, lwe_array_out, - lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, - lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, - lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_many_lut, lut_stride); + lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes, + bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, + polynomial_size, grouping_factor, base_log, level_count, num_samples, + num_many_lut, lut_stride); break; case 1024: host_cg_multi_bit_programmable_bootstrap_128>( static_cast(stream), gpu_index, lwe_array_out, - lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, - lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, - lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_many_lut, lut_stride); + lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes, + bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, + polynomial_size, grouping_factor, base_log, level_count, num_samples, + num_many_lut, lut_stride); break; case 2048: host_cg_multi_bit_programmable_bootstrap_128>( static_cast(stream), gpu_index, lwe_array_out, - lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, - lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, - lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_many_lut, lut_stride); + lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes, + bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, + polynomial_size, grouping_factor, base_log, level_count, num_samples, + num_many_lut, lut_stride); break; case 4096: host_cg_multi_bit_programmable_bootstrap_128>( static_cast(stream), gpu_index, lwe_array_out, - lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, - lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, - lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_many_lut, lut_stride); + lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes, + bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, + polynomial_size, grouping_factor, base_log, level_count, num_samples, + num_many_lut, lut_stride); break; default: PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported " @@ -245,12 +245,11 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128( void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128( void *stream, uint32_t gpu_index, void *lwe_array_out, void const *lwe_output_indexes, void const *lut_vector, - void const *lut_vector_indexes, void const *lwe_array_in, - void const *lwe_input_indexes, void const *bootstrapping_key, - int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut, - uint32_t lut_stride) { + void const *lwe_array_in, void const *lwe_input_indexes, + void const *bootstrapping_key, int8_t *mem_ptr, uint32_t lwe_dimension, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, + uint32_t base_log, uint32_t level_count, uint32_t num_samples, + uint32_t num_many_lut, uint32_t lut_stride) { if (base_log > 64) PANIC("Cuda error (multi-bit PBS): base log should be <= 64") @@ -263,7 +262,6 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128( uint64_t>(stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out), static_cast(lwe_output_indexes), static_cast(lut_vector), - static_cast(lut_vector_indexes), static_cast(lwe_array_in), static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, @@ -276,7 +274,6 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128( stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out), static_cast(lwe_output_indexes), static_cast(lut_vector), - static_cast(lut_vector_indexes), static_cast(lwe_array_in), static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cuh index bac5bd8c2..f421f6ebb 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cuh @@ -136,7 +136,6 @@ __global__ void __launch_bounds__(params::degree / params::opt) const InputTorus *__restrict__ lwe_array_in, const InputTorus *__restrict__ lwe_input_indexes, const __uint128_t *__restrict__ lut_vector, - const InputTorus *__restrict__ lut_vector_indexes, __uint128_t *global_accumulator, double *global_accumulator_fft, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, @@ -169,8 +168,7 @@ __global__ void __launch_bounds__(params::degree / params::opt) auto block_lwe_array_in = &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)]; - auto block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.x] * - params::degree * (glwe_dimension + 1)]; + auto block_lut_vector = lut_vector; auto global_slice = &global_accumulator[(blockIdx.y + blockIdx.x * (glwe_dimension + 1)) * @@ -368,7 +366,6 @@ __global__ void __launch_bounds__(params::degree / params::opt) __uint128_t *lwe_array_out, const InputTorus *__restrict__ lwe_output_indexes, const __uint128_t *__restrict__ lut_vector, - const InputTorus *__restrict__ lut_vector_indexes, const InputTorus *__restrict__ lwe_array_in, const InputTorus *__restrict__ lwe_input_indexes, const double *__restrict__ keybundle_array, double *join_buffer, @@ -409,8 +406,7 @@ __global__ void __launch_bounds__(params::degree / params::opt) auto block_lwe_array_in = &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)]; - auto block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.x] * - params::degree * (glwe_dimension + 1)]; + auto block_lut_vector = lut_vector; auto block_join_buffer = &join_buffer[blockIdx.x * level_count * (glwe_dimension + 1) * @@ -591,8 +587,7 @@ __host__ void execute_compute_keybundle_128( template __host__ void execute_step_one_128( cudaStream_t stream, uint32_t gpu_index, __uint128_t const *lut_vector, - InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in, - InputTorus const *lwe_input_indexes, + InputTorus const *lwe_array_in, InputTorus const *lwe_input_indexes, pbs_buffer_128 *buffer, uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count) { @@ -618,27 +613,26 @@ __host__ void execute_step_one_128( device_multi_bit_programmable_bootstrap_accumulate_step_one_128< InputTorus, params, NOSM, is_first_iter> <<>>( - lwe_array_in, lwe_input_indexes, lut_vector, lut_vector_indexes, - global_accumulator, global_accumulator_fft, lwe_dimension, - glwe_dimension, polynomial_size, base_log, level_count, d_mem, + lwe_array_in, lwe_input_indexes, lut_vector, global_accumulator, + global_accumulator_fft, lwe_dimension, glwe_dimension, + polynomial_size, base_log, level_count, d_mem, full_sm_accumulate_step_one); else if (max_shared_memory < full_sm_accumulate_step_one) device_multi_bit_programmable_bootstrap_accumulate_step_one_128< InputTorus, params, PARTIALSM, is_first_iter> <<>>(lwe_array_in, lwe_input_indexes, lut_vector, - lut_vector_indexes, global_accumulator, - global_accumulator_fft, lwe_dimension, glwe_dimension, - polynomial_size, base_log, level_count, d_mem, - partial_sm_accumulate_step_one); + global_accumulator, global_accumulator_fft, lwe_dimension, + glwe_dimension, polynomial_size, base_log, level_count, + d_mem, partial_sm_accumulate_step_one); else device_multi_bit_programmable_bootstrap_accumulate_step_one_128< InputTorus, params, FULLSM, is_first_iter> <<>>(lwe_array_in, lwe_input_indexes, lut_vector, - lut_vector_indexes, global_accumulator, - global_accumulator_fft, lwe_dimension, glwe_dimension, - polynomial_size, base_log, level_count, d_mem, 0); + global_accumulator, global_accumulator_fft, lwe_dimension, + glwe_dimension, polynomial_size, base_log, level_count, + d_mem, 0); check_cuda_error(cudaGetLastError()); } @@ -691,8 +685,8 @@ template __host__ void host_multi_bit_programmable_bootstrap_128( cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out, InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector, - InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in, - InputTorus const *lwe_input_indexes, __uint128_t const *bootstrapping_key, + InputTorus const *lwe_array_in, InputTorus const *lwe_input_indexes, + __uint128_t const *bootstrapping_key, pbs_buffer_128 *buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, @@ -717,14 +711,14 @@ __host__ void host_multi_bit_programmable_bootstrap_128( (j + lwe_offset) + 1 == (lwe_dimension / grouping_factor); if (is_first_iter) { execute_step_one_128( - stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in, - lwe_input_indexes, buffer, num_samples, lwe_dimension, - glwe_dimension, polynomial_size, base_log, level_count); + stream, gpu_index, lut_vector, lwe_array_in, lwe_input_indexes, + buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, + base_log, level_count); } else { execute_step_one_128( - stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in, - lwe_input_indexes, buffer, num_samples, lwe_dimension, - glwe_dimension, polynomial_size, base_log, level_count); + stream, gpu_index, lut_vector, lwe_array_in, lwe_input_indexes, + buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, + base_log, level_count); } if (is_last_iter) { @@ -745,9 +739,8 @@ __host__ void host_multi_bit_programmable_bootstrap_128( template __host__ void execute_cg_external_product_loop_128( cudaStream_t stream, uint32_t gpu_index, __uint128_t const *lut_vector, - InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in, - InputTorus const *lwe_input_indexes, __uint128_t *lwe_array_out, - InputTorus const *lwe_output_indexes, + InputTorus const *lwe_array_in, InputTorus const *lwe_input_indexes, + __uint128_t *lwe_array_out, InputTorus const *lwe_output_indexes, pbs_buffer_128 *buffer, uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, @@ -780,46 +773,45 @@ __host__ void execute_cg_external_product_loop_128( auto global_accumulator = buffer->global_accumulator; auto join_buffer = buffer->global_join_buffer; - void *kernel_args[22]; + void *kernel_args[21]; kernel_args[0] = &lwe_array_out; kernel_args[1] = &lwe_output_indexes; kernel_args[2] = &lut_vector; - kernel_args[3] = &lut_vector_indexes; - kernel_args[4] = &lwe_array_in; - kernel_args[5] = &lwe_input_indexes; - kernel_args[6] = &keybundle_fft; - kernel_args[7] = &join_buffer; - kernel_args[8] = &global_accumulator; - kernel_args[9] = &lwe_dimension; - kernel_args[10] = &glwe_dimension; - kernel_args[11] = &polynomial_size; - kernel_args[12] = &base_log; - kernel_args[13] = &level_count; - kernel_args[14] = &grouping_factor; - kernel_args[15] = &lwe_offset; - kernel_args[16] = &chunk_size; - kernel_args[17] = &keybundle_size_per_input; - kernel_args[18] = &d_mem; - kernel_args[20] = &num_many_lut; - kernel_args[21] = &lut_stride; + kernel_args[3] = &lwe_array_in; + kernel_args[4] = &lwe_input_indexes; + kernel_args[5] = &keybundle_fft; + kernel_args[6] = &join_buffer; + kernel_args[7] = &global_accumulator; + kernel_args[8] = &lwe_dimension; + kernel_args[9] = &glwe_dimension; + kernel_args[10] = &polynomial_size; + kernel_args[11] = &base_log; + kernel_args[12] = &level_count; + kernel_args[13] = &grouping_factor; + kernel_args[14] = &lwe_offset; + kernel_args[15] = &chunk_size; + kernel_args[16] = &keybundle_size_per_input; + kernel_args[17] = &d_mem; + kernel_args[19] = &num_many_lut; + kernel_args[20] = &lut_stride; dim3 grid_accumulate(num_samples, glwe_dimension + 1, level_count); dim3 thds(polynomial_size / params::opt, 1, 1); if (max_shared_memory < partial_dm) { - kernel_args[19] = &full_dm; + kernel_args[18] = &full_dm; check_cuda_error(cudaLaunchCooperativeKernel( (void *)device_multi_bit_programmable_bootstrap_cg_accumulate_128< InputTorus, params, NOSM>, grid_accumulate, thds, (void **)kernel_args, 0, stream)); } else if (max_shared_memory < full_dm) { - kernel_args[19] = &partial_dm; + kernel_args[18] = &partial_dm; check_cuda_error(cudaLaunchCooperativeKernel( (void *)device_multi_bit_programmable_bootstrap_cg_accumulate_128< InputTorus, params, PARTIALSM>, grid_accumulate, thds, (void **)kernel_args, partial_sm, stream)); } else { - kernel_args[19] = &no_dm; + kernel_args[18] = &no_dm; check_cuda_error(cudaLaunchCooperativeKernel( (void *)device_multi_bit_programmable_bootstrap_cg_accumulate_128< InputTorus, params, FULLSM>, @@ -831,8 +823,8 @@ template __host__ void host_cg_multi_bit_programmable_bootstrap_128( cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out, InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector, - InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in, - InputTorus const *lwe_input_indexes, __uint128_t const *bootstrapping_key, + InputTorus const *lwe_array_in, InputTorus const *lwe_input_indexes, + __uint128_t const *bootstrapping_key, pbs_buffer_128 *buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, @@ -851,11 +843,10 @@ __host__ void host_cg_multi_bit_programmable_bootstrap_128( // Accumulate execute_cg_external_product_loop_128( - stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in, - lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer, - num_samples, lwe_dimension, glwe_dimension, polynomial_size, - grouping_factor, base_log, level_count, lwe_offset, num_many_lut, - lut_stride); + stream, gpu_index, lut_vector, lwe_array_in, lwe_input_indexes, + lwe_array_out, lwe_output_indexes, buffer, num_samples, lwe_dimension, + glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, + lwe_offset, num_many_lut, lut_stride); } } diff --git a/backends/tfhe-cuda-backend/src/bindings.rs b/backends/tfhe-cuda-backend/src/bindings.rs index b664958a1..b061b5c16 100644 --- a/backends/tfhe-cuda-backend/src/bindings.rs +++ b/backends/tfhe-cuda-backend/src/bindings.rs @@ -2698,7 +2698,6 @@ unsafe extern "C" { lwe_array_out: *mut ffi::c_void, lwe_output_indexes: *const ffi::c_void, lut_vector: *const ffi::c_void, - lut_vector_indexes: *const ffi::c_void, lwe_array_in: *const ffi::c_void, lwe_input_indexes: *const ffi::c_void, bootstrapping_key: *const ffi::c_void, diff --git a/tfhe-benchmark/benches/high_level_api/noise_squash.rs b/tfhe-benchmark/benches/high_level_api/noise_squash.rs index 54d1bbf54..9378255db 100644 --- a/tfhe-benchmark/benches/high_level_api/noise_squash.rs +++ b/tfhe-benchmark/benches/high_level_api/noise_squash.rs @@ -1,8 +1,12 @@ +#[cfg(all(not(feature = "hpu"), not(feature = "gpu")))] +use benchmark::params_aliases::BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128; + #[cfg(feature = "gpu")] use benchmark::params_aliases::{ BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128, BENCH_COMP_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128, BENCH_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128, + BENCH_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128, BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128, }; #[cfg(not(feature = "gpu"))] @@ -25,8 +29,12 @@ use tfhe::prelude::*; #[cfg(feature = "gpu")] use tfhe::core_crypto::gpu::get_number_of_gpus; +use tfhe::shortint::parameters::{ + CompressionParameters, NoiseSquashingCompressionParameters, NoiseSquashingParameters, +}; +use tfhe::shortint::PBSParameters; #[cfg(feature = "gpu")] -use tfhe::{set_server_key, GpuIndex}; +use tfhe::GpuIndex; use tfhe::{ ClientKey, CompressedCiphertextListBuilder, CompressedServerKey, CompressedSquashedNoiseCiphertextListBuilder, FheUint10, FheUint12, FheUint128, FheUint14, @@ -36,18 +44,42 @@ use tfhe::{ fn bench_sns_only_fhe_type( c: &mut Criterion, - client_key: &ClientKey, + params: ( + PBSParameters, + NoiseSquashingParameters, + NoiseSquashingCompressionParameters, + CompressionParameters, + ), type_name: &str, num_bits: usize, ) where FheType: FheEncrypt + Send + Sync, FheType: SquashNoise, { + let (param, noise_param, _, _) = params; + + use tfhe::{set_server_key, ConfigBuilder}; + let config = ConfigBuilder::with_custom_parameters(param) + .enable_noise_squashing(noise_param) + .build(); + let client_key = ClientKey::generate(config); + let compressed_sks = CompressedServerKey::new(&client_key); + + #[cfg(feature = "gpu")] + set_server_key(compressed_sks.decompress_to_gpu()); + + #[cfg(all(not(feature = "hpu"), not(feature = "gpu")))] + { + let decompressed_sks = compressed_sks.decompress(); + rayon::broadcast(|_| set_server_key(decompressed_sks.clone())); + set_server_key(decompressed_sks); + } + let mut bench_group = c.benchmark_group(type_name); let bench_id_prefix = if cfg!(feature = "gpu") { - "hlapi::cuda" + format!("hlapi::cuda::{}", noise_param.name()) } else { - "hlapi" + "hlapi".to_string() }; let bench_id_suffix = format!("noise_squash::{type_name}"); @@ -60,9 +92,9 @@ fn bench_sns_only_fhe_type( bench_id = format!("{bench_id_prefix}::{bench_id_suffix}"); #[cfg(feature = "gpu")] - configure_gpu(client_key); + configure_gpu(&client_key); - let input = FheType::encrypt(rng.gen(), client_key); + let input = FheType::encrypt(rng.gen(), &client_key); bench_group.bench_function(&bench_id, |b| { b.iter(|| { @@ -82,7 +114,7 @@ fn bench_sns_only_fhe_type( bench_group.throughput(Throughput::Elements(elements)); println!("elements: {elements}"); let gpu_count = get_number_of_gpus() as usize; - let compressed_server_key = CompressedServerKey::new(client_key); + let compressed_server_key = CompressedServerKey::new(&client_key); let sks_vec = (0..gpu_count) .map(|i| { compressed_server_key.decompress_to_specific_gpu(GpuIndex::new(i as u32)) @@ -92,7 +124,7 @@ fn bench_sns_only_fhe_type( bench_group.bench_function(&bench_id, |b| { let encrypt_values = || { (0..elements) - .map(|_| FheType::encrypt(rng.gen(), client_key)) + .map(|_| FheType::encrypt(rng.gen(), &client_key)) .collect::>() }; @@ -118,7 +150,7 @@ fn bench_sns_only_fhe_type( bench_group.bench_function(&bench_id, |b| { let encrypt_values = || { (0..elements) - .map(|_| FheType::encrypt(rng.gen(), client_key)) + .map(|_| FheType::encrypt(rng.gen(), &client_key)) .collect::>() }; @@ -150,7 +182,12 @@ fn bench_sns_only_fhe_type( fn bench_decomp_sns_comp_fhe_type( c: &mut Criterion, - client_key: &ClientKey, + params: ( + PBSParameters, + NoiseSquashingParameters, + NoiseSquashingCompressionParameters, + CompressionParameters, + ), type_name: &str, num_bits: usize, ) where @@ -158,11 +195,32 @@ fn bench_decomp_sns_comp_fhe_type( FheType: SquashNoise + Tagged + HlExpandable + HlCompressible, ::Output: HlSquashedNoiseCompressible, { + let (param, noise_param, comp_noise_param, comp_param) = params; + + use tfhe::{set_server_key, ConfigBuilder}; + let config = ConfigBuilder::with_custom_parameters(param) + .enable_noise_squashing(noise_param) + .enable_noise_squashing_compression(comp_noise_param) + .enable_compression(comp_param) + .build(); + let client_key = ClientKey::generate(config); + let compressed_sks = CompressedServerKey::new(&client_key); + + #[cfg(feature = "gpu")] + set_server_key(compressed_sks.decompress_to_gpu()); + + #[cfg(all(not(feature = "hpu"), not(feature = "gpu")))] + { + let decompressed_sks = compressed_sks.decompress(); + rayon::broadcast(|_| set_server_key(decompressed_sks.clone())); + set_server_key(decompressed_sks); + } + let mut bench_group = c.benchmark_group(type_name); let bench_id_prefix = if cfg!(feature = "gpu") { - "hlapi::cuda" + format!("hlapi::cuda::{}", noise_param.name()) } else { - "hlapi" + "hlapi".to_string() }; let bench_id_suffix = format!("decomp_noise_squash_comp::{type_name}"); @@ -175,9 +233,9 @@ fn bench_decomp_sns_comp_fhe_type( bench_id = format!("{bench_id_prefix}::{bench_id_suffix}"); #[cfg(feature = "gpu")] - configure_gpu(client_key); + configure_gpu(&client_key); - let input = FheType::encrypt(rng.gen(), client_key); + let input = FheType::encrypt(rng.gen(), &client_key); let mut builder = CompressedCiphertextListBuilder::new(); builder.push(input); @@ -205,7 +263,7 @@ fn bench_decomp_sns_comp_fhe_type( bench_group.throughput(Throughput::Elements(elements)); println!("elements: {elements}"); let gpu_count = get_number_of_gpus() as usize; - let compressed_server_key = CompressedServerKey::new(client_key); + let compressed_server_key = CompressedServerKey::new(&client_key); let sks_vec = (0..gpu_count) .map(|i| { compressed_server_key.decompress_to_specific_gpu(GpuIndex::new(i as u32)) @@ -216,7 +274,7 @@ fn bench_decomp_sns_comp_fhe_type( let compressed_values = || { (0..elements) .map(|_| { - let input = FheType::encrypt(rng.gen(), client_key); + let input = FheType::encrypt(rng.gen(), &client_key); let mut builder = CompressedCiphertextListBuilder::new(); builder.push(input); builder.build().unwrap() @@ -254,7 +312,7 @@ fn bench_decomp_sns_comp_fhe_type( let compressed_values = || { (0..elements) .map(|_| { - let input = FheType::encrypt(rng.gen(), client_key); + let input = FheType::encrypt(rng.gen(), &client_key); let mut builder = CompressedCiphertextListBuilder::new(); builder.push(input); builder.build().unwrap() @@ -296,8 +354,10 @@ fn bench_decomp_sns_comp_fhe_type( macro_rules! bench_sns_only_type { ($fhe_type:ident) => { ::paste::paste! { - fn [](c: &mut Criterion, cks: &ClientKey) { - bench_sns_only_fhe_type::<$fhe_type>(c, cks, stringify!($fhe_type), $fhe_type::num_bits()); + fn [](c: &mut Criterion, params: &[(PBSParameters, NoiseSquashingParameters, NoiseSquashingCompressionParameters, CompressionParameters)]) { + for param in params { + bench_sns_only_fhe_type::<$fhe_type>(c, *param, stringify!($fhe_type), $fhe_type::num_bits()); + } } } }; @@ -306,8 +366,10 @@ macro_rules! bench_sns_only_type { macro_rules! bench_decomp_sns_comp_type { ($fhe_type:ident) => { ::paste::paste! { - fn [](c: &mut Criterion, cks: &ClientKey) { - bench_decomp_sns_comp_fhe_type::<$fhe_type>(c, cks, stringify!($fhe_type), $fhe_type::num_bits()); + fn [](c: &mut Criterion, params: &[(PBSParameters, NoiseSquashingParameters, NoiseSquashingCompressionParameters, CompressionParameters)]) { + for param in params { + bench_decomp_sns_comp_fhe_type::<$fhe_type>(c, *param, stringify!($fhe_type), $fhe_type::num_bits()); + } } } }; @@ -330,65 +392,55 @@ bench_decomp_sns_comp_type!(FheUint64); fn main() { #[cfg(feature = "hpu")] panic!("Noise squashing is not supported on HPU"); - #[cfg(all(not(feature = "hpu"), not(feature = "gpu")))] - let cks = { - use benchmark::params_aliases::BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128; - use tfhe::{set_server_key, ConfigBuilder}; - let config = ConfigBuilder::with_custom_parameters( - BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128, - ) - .enable_noise_squashing(BENCH_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128) - .enable_noise_squashing_compression( - BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128, - ) - .enable_compression(BENCH_COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128) - .build(); - let cks = ClientKey::generate(config); - let compressed_sks = CompressedServerKey::new(&cks); - let decompressed_sks = compressed_sks.decompress(); - rayon::broadcast(|_| set_server_key(decompressed_sks.clone())); - set_server_key(decompressed_sks); - cks - }; - #[cfg(feature = "gpu")] - let cks = { - use tfhe::{set_server_key, ConfigBuilder}; - let config = ConfigBuilder::with_custom_parameters( - BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128, - ) - .enable_noise_squashing( - BENCH_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128, - ) - .enable_noise_squashing_compression( - BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128, - ) - .enable_compression( - BENCH_COMP_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128, - ) - .build(); - let cks = ClientKey::generate(config); - let compressed_sks = CompressedServerKey::new(&cks); + let params: Vec<( + PBSParameters, + NoiseSquashingParameters, + NoiseSquashingCompressionParameters, + CompressionParameters, + )> = { + #[cfg(all(not(feature = "hpu"), not(feature = "gpu")))] + { + vec![( + BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128.into(), + BENCH_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128, + BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128, + BENCH_COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128, + )] + } - set_server_key(compressed_sks.decompress_to_gpu()); - cks + #[cfg(feature = "gpu")] + { + vec![( + BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128.into(), + BENCH_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128, + BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128, + BENCH_COMP_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128, + ), ( + BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128.into(), + BENCH_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128, + BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128, + BENCH_COMP_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128, + ), + ] + } }; let mut c = Criterion::default().configure_from_args(); - bench_sns_only_fhe_uint2(&mut c, &cks); - bench_sns_only_fhe_uint4(&mut c, &cks); - bench_sns_only_fhe_uint6(&mut c, &cks); - bench_sns_only_fhe_uint8(&mut c, &cks); - bench_sns_only_fhe_uint10(&mut c, &cks); - bench_sns_only_fhe_uint12(&mut c, &cks); - bench_sns_only_fhe_uint14(&mut c, &cks); - bench_sns_only_fhe_uint16(&mut c, &cks); - bench_sns_only_fhe_uint32(&mut c, &cks); - bench_sns_only_fhe_uint64(&mut c, &cks); - bench_sns_only_fhe_uint128(&mut c, &cks); + bench_sns_only_fhe_uint2(&mut c, params.as_slice()); + bench_sns_only_fhe_uint4(&mut c, params.as_slice()); + bench_sns_only_fhe_uint6(&mut c, params.as_slice()); + bench_sns_only_fhe_uint8(&mut c, params.as_slice()); + bench_sns_only_fhe_uint10(&mut c, params.as_slice()); + bench_sns_only_fhe_uint12(&mut c, params.as_slice()); + bench_sns_only_fhe_uint14(&mut c, params.as_slice()); + bench_sns_only_fhe_uint16(&mut c, params.as_slice()); + bench_sns_only_fhe_uint32(&mut c, params.as_slice()); + bench_sns_only_fhe_uint64(&mut c, params.as_slice()); + bench_sns_only_fhe_uint128(&mut c, params.as_slice()); - bench_decomp_sns_comp_fhe_uint64(&mut c, &cks); + bench_decomp_sns_comp_fhe_uint64(&mut c, params.as_slice()); c.final_summary(); } diff --git a/tfhe-benchmark/src/params_aliases.rs b/tfhe-benchmark/src/params_aliases.rs index cddfe454e..9b66c81c0 100644 --- a/tfhe-benchmark/src/params_aliases.rs +++ b/tfhe-benchmark/src/params_aliases.rs @@ -7,6 +7,7 @@ pub mod shortint_params_aliases { ClassicPBSParameters, CompactPublicKeyEncryptionParameters, CompressionParameters, MultiBitPBSParameters, NoiseSquashingCompressionParameters, NoiseSquashingParameters, ShortintKeySwitchingParameters, + NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128, }; // KS PBS Gaussian pub const BENCH_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128: ClassicPBSParameters = @@ -143,6 +144,10 @@ pub mod shortint_params_aliases { NoiseSquashingParameters = V1_4_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128; + pub const BENCH_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128: + NoiseSquashingParameters = + NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128; + pub const BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128: NoiseSquashingCompressionParameters = V1_4_NOISE_SQUASHING_COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128; diff --git a/tfhe/src/core_crypto/gpu/algorithms/test/lwe_multi_bit_programmable_bootstrapping_128.rs b/tfhe/src/core_crypto/gpu/algorithms/test/lwe_multi_bit_programmable_bootstrapping_128.rs index 72d33eecf..aa695839a 100644 --- a/tfhe/src/core_crypto/gpu/algorithms/test/lwe_multi_bit_programmable_bootstrapping_128.rs +++ b/tfhe/src/core_crypto/gpu/algorithms/test/lwe_multi_bit_programmable_bootstrapping_128.rs @@ -140,13 +140,8 @@ fn execute_multibit_bootstrap_u128( let d_accumulator = CudaGlweCiphertextList::from_glwe_ciphertext(&accumulator, &stream); - let test_vector_indexes: Vec = vec![0; par_lwe_list.lwe_ciphertext_count().0]; - - let mut d_test_vector_indexes = - unsafe { CudaVec::::new_async(number_of_messages, &stream, 0) }; - unsafe { - d_test_vector_indexes.copy_from_cpu_async(&test_vector_indexes, &stream, 0) - }; + // We initialize it so cargo won't complain, but we don't use it internally + let d_test_vector_indexes = unsafe { CudaVec::::new_async(1, &stream, 0) }; let num_blocks = d_lwe_ciphertext_in.lwe_ciphertext_count().0; let lwe_indexes_usize: Vec = (0..num_blocks).collect_vec(); diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs index a5c7e5ef1..2d64f17c7 100644 --- a/tfhe/src/core_crypto/gpu/mod.rs +++ b/tfhe/src/core_crypto/gpu/mod.rs @@ -399,7 +399,6 @@ pub unsafe fn programmable_bootstrap_multi_bit_async< lwe_array_out.as_mut_c_ptr(0), output_indexes.as_c_ptr(0), test_vector.as_c_ptr(0), - test_vector_indexes.as_c_ptr(0), lwe_array_in.as_c_ptr(0), input_indexes.as_c_ptr(0), bootstrapping_key.as_c_ptr(0), diff --git a/tfhe/src/high_level_api/tests/noise_squashing.rs b/tfhe/src/high_level_api/tests/noise_squashing.rs index b1f422a85..41b6e9a25 100644 --- a/tfhe/src/high_level_api/tests/noise_squashing.rs +++ b/tfhe/src/high_level_api/tests/noise_squashing.rs @@ -4,9 +4,12 @@ use crate::high_level_api::{ }; use crate::integer::U256; use crate::set_server_key; +#[cfg(feature = "gpu")] use crate::shortint::parameters::{ NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128, NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128, +}; +use crate::shortint::parameters::{ NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128, PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128, }; diff --git a/tfhe/src/integer/gpu/server_key/mod.rs b/tfhe/src/integer/gpu/server_key/mod.rs index 6de1f9b2f..e40a6380e 100644 --- a/tfhe/src/integer/gpu/server_key/mod.rs +++ b/tfhe/src/integer/gpu/server_key/mod.rs @@ -32,8 +32,8 @@ pub enum CudaBootstrappingKey { impl CudaBootstrappingKey { pub(crate) fn output_lwe_dimension(&self) -> LweDimension { match self { - CudaBootstrappingKey::Classic(bsk) => bsk.output_lwe_dimension(), - CudaBootstrappingKey::MultiBit(mb_bsk) => mb_bsk.output_lwe_dimension(), + Self::Classic(bsk) => bsk.output_lwe_dimension(), + Self::MultiBit(mb_bsk) => mb_bsk.output_lwe_dimension(), } } } diff --git a/tfhe/src/shortint/keycache.rs b/tfhe/src/shortint/keycache.rs index dc111066d..c7f58adef 100644 --- a/tfhe/src/shortint/keycache.rs +++ b/tfhe/src/shortint/keycache.rs @@ -487,7 +487,9 @@ fn cpke_params_default_name(params: &CompactPublicKeyEncryptionParameters) -> St } named_params_impl!( NoiseSquashingParameters => - V1_4_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,V1_4_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128, + V1_4_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128, + V1_4_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128, + V1_4_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128, ); named_params_impl!( NoiseSquashingCompressionParameters =>