diff --git a/Makefile b/Makefile
index f00eebb84..f60f9ad3b 100644
--- a/Makefile
+++ b/Makefile
@@ -1291,6 +1291,13 @@ bench_tfhe_zk_pok: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench -p tfhe-zk-pok --
 
+.PHONY: bench_hlapi_noise_squash # Run benchmarks for noise squash operation
+bench_hlapi_noise_squash: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench hlapi-noise-squash \
+	--features=shortint,integer,internal-keycache,pbs-stats,nightly-avx512 -p $(TFHE_SPEC) --
+
 #
 # Utility tools
 #
diff --git a/tfhe/Cargo.toml b/tfhe/Cargo.toml
index 52f86227c..995485582 100644
--- a/tfhe/Cargo.toml
+++ b/tfhe/Cargo.toml
@@ -252,6 +252,12 @@ path = "benches/high_level_api/erc20.rs"
 harness = false
 required-features = ["integer", "internal-keycache"]
 
+[[bench]]
+name = "hlapi-noise-squash"
+path = "benches/high_level_api/noise_squash.rs"
+harness = false
+required-features = ["shortint", "integer", "internal-keycache"]
+
 [[bench]]
 name = "keygen"
 path = "benches/keygen/bench.rs"
diff --git a/tfhe/benches/high_level_api/noise_squash.rs b/tfhe/benches/high_level_api/noise_squash.rs
new file mode 100644
index 000000000..68de8baee
--- /dev/null
+++ b/tfhe/benches/high_level_api/noise_squash.rs
@@ -0,0 +1,174 @@
+//! Benchmarks for the noise squashing operation of the high-level API.
+
+#[path = "../utilities.rs"]
+mod utilities;
+
+use criterion::{Criterion, Throughput};
+use rand::prelude::*;
+use rand::thread_rng;
+use rayon::prelude::*;
+use tfhe::keycache::NamedParam;
+use tfhe::prelude::*;
+
+use crate::utilities::{
+    get_bench_type, throughput_num_threads, write_to_json, BenchmarkType, OperatorType,
+};
+#[cfg(feature = "gpu")]
+use tfhe::core_crypto::gpu::get_number_of_gpus;
+use tfhe::shortint::parameters::v1_1::V1_3_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
+use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
+#[cfg(feature = "gpu")]
+use tfhe::{set_server_key, GpuIndex};
+use tfhe::{
+    ClientKey, CompressedServerKey, FheUint10, FheUint12, FheUint128, FheUint14, FheUint16,
+    FheUint2, FheUint32, FheUint4, FheUint6, FheUint64, FheUint8,
+};
+
+/// Benchmarks `squash_noise` for one high-level FHE unsigned integer type.
+///
+/// Depending on the value returned by `get_bench_type`, this measures either the latency of a
+/// single call or the throughput of a batch of calls run through a rayon parallel iterator.
+/// The benchmark metadata is then recorded with `write_to_json`.
+///
+/// * `c` - Criterion instance the benchmark group is created on.
+/// * `client_key` - key used to encrypt the random inputs and to read the parameters.
+/// * `type_name` - name of the benchmark group, also appended to the benchmark id.
+/// * `num_bits` - bit width of `FheType`, used to size the batch and recorded as the bit size.
+fn bench_fhe_type<FheType>(
+    c: &mut Criterion,
+    client_key: &ClientKey,
+    type_name: &str,
+    num_bits: usize,
+) where
+    // NOTE(review): the clear-text type `u128` is a reconstruction - confirm against the tree.
+    FheType: FheEncrypt<u128, ClientKey> + SquashNoise + Send + Sync,
+{
+    let mut bench_group = c.benchmark_group(type_name);
+    let bench_id_prefix = if cfg!(feature = "gpu") {
+        "hlapi::cuda"
+    } else {
+        "hlapi"
+    };
+    let bench_id_suffix = format!("noise_squash::{type_name}");
+    let params = client_key.computation_parameters();
+
+    let mut rng = thread_rng();
+
+    let bench_id = match get_bench_type() {
+        BenchmarkType::Latency => {
+            let bench_id = format!("{bench_id_prefix}::{bench_id_suffix}");
+            let input = FheType::encrypt(rng.gen(), client_key);
+
+            bench_group.bench_function(&bench_id, |b| {
+                b.iter(|| {
+                    // Keep the result observable so the call cannot be optimized away.
+                    let _ = std::hint::black_box(input.squash_noise());
+                })
+            });
+            bench_id
+        }
+        BenchmarkType::Throughput => {
+            let bench_id = format!("{bench_id_prefix}::throughput::{bench_id_suffix}");
+            let block_bits =
+                (params.message_modulus().0 * params.carry_modulus().0).ilog2() as usize;
+            let num_blocks = num_bits.div_ceil(block_bits);
+
+            let elements = throughput_num_threads(num_blocks, 1);
+            bench_group.throughput(Throughput::Elements(elements));
+            bench_group.bench_function(&bench_id, |b| {
+                // Setup closure of `iter_batched`: builds a fresh batch of encrypted inputs.
+                let encrypt_values = || {
+                    (0..elements)
+                        .map(|_| FheType::encrypt(rng.gen(), client_key))
+                        .collect::<Vec<_>>()
+                };
+
+                b.iter_batched(
+                    encrypt_values,
+                    |inputs| {
+                        inputs.par_iter().for_each(|input| {
+                            let _ = std::hint::black_box(input.squash_noise());
+                        })
+                    },
+                    criterion::BatchSize::SmallInput,
+                )
+            });
+            bench_id
+        }
+    };
+
+    // NOTE(review): the turbofish arguments are a reconstruction - confirm against the tree.
+    write_to_json::<u64, _>(
+        &bench_id,
+        params,
+        params.name(),
+        "noise_squash",
+        &OperatorType::Atomic,
+        // Record the width of the benchmarked type rather than a fixed 64.
+        u32::try_from(num_bits).expect("bit width of an FHE integer fits in u32"),
+        vec![],
+    );
+}
+
+/// Defines `bench_<snake_case type name>`, which runs `bench_fhe_type` for the given type.
+macro_rules! bench_type {
+    ($fhe_type:ident) => {
+        ::paste::paste! {
+            fn [<bench_ $fhe_type:snake>](c: &mut Criterion, cks: &ClientKey) {
+                bench_fhe_type::<$fhe_type>(c, cks, stringify!($fhe_type), $fhe_type::num_bits());
+            }
+        }
+    };
+}
+
+bench_type!(FheUint2);
+bench_type!(FheUint4);
+bench_type!(FheUint6);
+bench_type!(FheUint8);
+bench_type!(FheUint10);
+bench_type!(FheUint12);
+bench_type!(FheUint14);
+bench_type!(FheUint16);
+bench_type!(FheUint32);
+bench_type!(FheUint64);
+bench_type!(FheUint128);
+
+/// Generates the keys, installs the server key, then runs the benchmark of every integer width.
+fn main() {
+    #[cfg(feature = "hpu")]
+    panic!("Noise squashing is not supported on HPU");
+    let cks = {
+        use tfhe::{set_server_key, ConfigBuilder};
+        let config =
+            ConfigBuilder::with_custom_parameters(PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128)
+                .enable_noise_squashing(
+                    V1_3_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
+                )
+                .build();
+        let cks = ClientKey::generate(config);
+        let compressed_sks = CompressedServerKey::new(&cks);
+
+        let decompressed_sks = compressed_sks.decompress();
+        // Install the server key on every rayon worker thread and on the current thread.
+        rayon::broadcast(|_| set_server_key(decompressed_sks.clone()));
+        set_server_key(decompressed_sks);
+        cks
+    };
+
+    let mut c = Criterion::default().configure_from_args();
+
+    // Every generated benchmark is run, so none of the `bench_type!` functions is dead code.
+    bench_fhe_uint2(&mut c, &cks);
+    bench_fhe_uint4(&mut c, &cks);
+    bench_fhe_uint6(&mut c, &cks);
+    bench_fhe_uint8(&mut c, &cks);
+    bench_fhe_uint10(&mut c, &cks);
+    bench_fhe_uint12(&mut c, &cks);
+    bench_fhe_uint14(&mut c, &cks);
+    bench_fhe_uint16(&mut c, &cks);
+    bench_fhe_uint32(&mut c, &cks);
+    bench_fhe_uint64(&mut c, &cks);
+    bench_fhe_uint128(&mut c, &cks);
+
+    c.final_summary();
+}
diff --git a/tfhe/benches/utilities.rs b/tfhe/benches/utilities.rs
index 53a7eb8f1..c54a11ba8 100644
--- a/tfhe/benches/utilities.rs
+++ b/tfhe/benches/utilities.rs
@@ -628,8 +628,8 @@ pub fn throughput_num_threads(num_block: usize, op_pbs_count: u64) -> u64 {
         let total_num_sm = H100_PCIE_SM_COUNT * get_number_of_gpus();
         let operation_loading = ((total_num_sm as u64 / op_pbs_count) as f64).max(minimum_loading);
         let elements = (total_num_sm as f64 * block_multiplicator * operation_loading) as u64;
-        elements.min(1500) // This threshold is useful for operation with both a small number of
-                           // block and low PBs count.
+        elements.min(200) // This threshold is useful for operation with both a small number of
+                          // block and low PBs count.
     }
     #[cfg(not(feature = "gpu"))]
     {
diff --git a/tfhe/src/shortint/parameters/v1_1/noise_squashing/p_fail_2_minus_128/mod.rs b/tfhe/src/shortint/parameters/v1_1/noise_squashing/p_fail_2_minus_128/mod.rs
index 5950e1c61..0ccceedf1 100644
--- a/tfhe/src/shortint/parameters/v1_1/noise_squashing/p_fail_2_minus_128/mod.rs
+++ b/tfhe/src/shortint/parameters/v1_1/noise_squashing/p_fail_2_minus_128/mod.rs
@@ -22,3 +22,26 @@ pub const V1_1_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128:
     carry_modulus: CarryModulus(4),
     ciphertext_modulus: CoreCiphertextModulus::<u128>::new_native(),
 };
+
+/// Noise squashing parameters with message modulus 4 and carry modulus 4.
+///
+/// NOTE(review): named `V1_3` but defined in the `v1_1` module - confirm the placement.
+pub const V1_3_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128:
+NoiseSquashingParameters = NoiseSquashingParameters {
+    glwe_dimension: GlweDimension(2),
+    polynomial_size: PolynomialSize(2048),
+    glwe_noise_distribution: DynamicDistribution::new_t_uniform(30),
+    decomp_base_log: DecompositionBaseLog(24),
+    decomp_level_count: DecompositionLevelCount(3),
+    modulus_switch_noise_reduction_params: Some(
+        ModulusSwitchNoiseReductionParams {
+            modulus_switch_zeros_count: LweCiphertextCount(1449),
+            ms_bound: NoiseEstimationMeasureBound(288230376151711744f64),
+            ms_r_sigma_factor: RSigmaFactor(13.179852282053789f64),
+            ms_input_variance: Variance(2.63039184094559E-7f64),
+        },
+    ),
+    message_modulus: MessageModulus(4),
+    carry_modulus: CarryModulus(4),
+    ciphertext_modulus: CoreCiphertextModulus::<u128>::new_native(),
+};