From 4920db22b910915aca972022a6d50db84fae2a8a Mon Sep 17 00:00:00 2001
From: Agnes Leroy
Date: Fri, 29 Aug 2025 15:08:06 +0200
Subject: [PATCH] WIP: add cpu noise squash benchmarks latency and throughput
 on fheuint64

---
Review notes (text between "---" and the diffstat is not part of the commit message):
- Line breaks and angle-bracket generics were lost in this copy of the patch and have been
  reconstructed. Confirm `FheEncrypt<u128, ClientKey>` and `write_to_json::<u64, _>` against
  the tree. Indentation of context lines and Makefile tabs are reconstructed as well.
- `write_to_json` now receives `num_bits as u32` instead of a hard-coded 64, so the recorded
  bit size stays correct once the other `bench_type!` instances are enabled.
- noise_squash.rs gained doc comments, so the blob id on its `index` line is stale.
- Still open: `main` only defines `cks` when neither `gpu` nor `hpu` is enabled, and the
  utilities.rs hunk lowers the element cap (1500 -> 200) in the `get_number_of_gpus()` path
  for every caller of `throughput_num_threads`.

 Makefile                                      |   7 +
 tfhe-benchmark/Cargo.toml                     |   6 +
 .../benches/high_level_api/noise_squash.rs    | 169 ++++++++++++++++++
 tfhe-benchmark/src/params_aliases.rs          |   4 +
 tfhe-benchmark/src/utilities.rs               |   4 +-
 5 files changed, 188 insertions(+), 2 deletions(-)
 create mode 100644 tfhe-benchmark/benches/high_level_api/noise_squash.rs

diff --git a/Makefile b/Makefile
index ba49eb67d..7a49701a7 100644
--- a/Makefile
+++ b/Makefile
@@ -1467,6 +1467,13 @@ bench_tfhe_zk_pok: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench -p tfhe-zk-pok --
 
+.PHONY: bench_hlapi_noise_squash # Run benchmarks for noise squash operation
+bench_hlapi_noise_squash: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench hlapi-noise-squash \
+	--features=shortint,integer,internal-keycache,pbs-stats,nightly-avx512 -p tfhe-benchmark --
+
 #
 # Utility tools
 #
diff --git a/tfhe-benchmark/Cargo.toml b/tfhe-benchmark/Cargo.toml
index 1668a29f1..e601a7a7d 100644
--- a/tfhe-benchmark/Cargo.toml
+++ b/tfhe-benchmark/Cargo.toml
@@ -84,6 +84,12 @@ path = "benches/high_level_api/dex.rs"
 harness = false
 required-features = ["integer", "internal-keycache"]
 
+[[bench]]
+name = "hlapi-noise-squash"
+path = "benches/high_level_api/noise_squash.rs"
+harness = false
+required-features = ["shortint", "integer", "internal-keycache"]
+
 [[bench]]
 name = "glwe_packing_compression-integer-bench"
 path = "benches/integer/glwe_packing_compression.rs"
diff --git a/tfhe-benchmark/benches/high_level_api/noise_squash.rs b/tfhe-benchmark/benches/high_level_api/noise_squash.rs
new file mode 100644
index 000000000..1443839bd
--- /dev/null
+++ b/tfhe-benchmark/benches/high_level_api/noise_squash.rs
@@ -0,0 +1,169 @@
+#[cfg(feature = "gpu")]
+use benchmark::params_aliases::BENCH_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
+#[cfg(feature = "gpu")]
+use benchmark::params_aliases::BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
+#[cfg(feature = "gpu")]
+use benchmark::utilities::configure_gpu;
+use benchmark::utilities::{
+    get_bench_type, throughput_num_threads, write_to_json, BenchmarkType, OperatorType,
+};
+use criterion::{Criterion, Throughput};
+use rand::prelude::*;
+use rand::thread_rng;
+use rayon::prelude::*;
+use tfhe::keycache::NamedParam;
+use tfhe::prelude::*;
+
+#[cfg(feature = "gpu")]
+use tfhe::core_crypto::gpu::get_number_of_gpus;
+#[cfg(feature = "gpu")]
+use tfhe::{set_server_key, GpuIndex};
+use tfhe::{
+    ClientKey, CompressedServerKey, FheUint10, FheUint12, FheUint128, FheUint14, FheUint16,
+    FheUint2, FheUint32, FheUint4, FheUint6, FheUint64, FheUint8,
+};
+use benchmark::params_aliases::*;
+
+/// Benchmarks `squash_noise` for one high-level FHE integer type.
+///
+/// Latency mode squashes one pre-encrypted input per iteration. Throughput mode (CPU builds
+/// only) squashes a batch of inputs in parallel with rayon, sized by `throughput_num_threads`.
+/// The bench id and parameters are then written via `write_to_json` with `num_bits` bits.
+fn bench_fhe_type<FheType>(
+    c: &mut Criterion,
+    client_key: &ClientKey,
+    type_name: &str,
+    num_bits: usize,
+) where
+    FheType: FheEncrypt<u128, ClientKey> + Send + Sync,
+    FheType: SquashNoise,
+{
+    let mut bench_group = c.benchmark_group(type_name);
+    let bench_id_prefix = if cfg!(feature = "gpu") {
+        "hlapi::cuda"
+    } else {
+        "hlapi"
+    };
+    let bench_id_suffix = format!("noise_squash::{type_name}");
+
+    let mut rng = thread_rng();
+
+    let bench_id;
+
+    match get_bench_type() {
+        BenchmarkType::Latency => {
+            bench_id = format!("{bench_id_prefix}::{bench_id_suffix}");
+
+            let input = FheType::encrypt(rng.gen(), client_key);
+
+            bench_group.bench_function(&bench_id, |b| {
+                b.iter(|| {
+                    let _ = input.squash_noise();
+                })
+            });
+        }
+        BenchmarkType::Throughput => {
+            bench_id = format!("{bench_id_prefix}::throughput::{bench_id_suffix}");
+            let params = client_key.computation_parameters();
+            let num_blocks = num_bits
+                .div_ceil((params.message_modulus().0 * params.carry_modulus().0).ilog2() as usize);
+
+            #[cfg(all(not(feature = "hpu"), not(feature = "gpu")))]
+            {
+                let elements = throughput_num_threads(num_blocks, 1);
+                bench_group.throughput(Throughput::Elements(elements));
+                println!("elements: {elements}");
+                bench_group.bench_function(&bench_id, |b| {
+                    let encrypt_values = || {
+                        (0..elements)
+                            .map(|_| FheType::encrypt(rng.gen(), client_key))
+                            .collect::<Vec<_>>()
+                    };
+
+                    b.iter_batched(
+                        encrypt_values,
+                        |inputs| {
+                            inputs.par_iter().for_each(|input| {
+                                let _ = input.squash_noise();
+                            })
+                        },
+                        criterion::BatchSize::SmallInput,
+                    )
+                });
+            }
+        }
+    }
+    let params = client_key.computation_parameters();
+
+    write_to_json::<u64, _>(
+        &bench_id,
+        params,
+        params.name(),
+        "noise_squash",
+        &OperatorType::Atomic,
+        num_bits as u32,
+        vec![],
+    );
+}
+
+/// Defines `bench_<snake_case type>(c, cks)` wrapping `bench_fhe_type` for the given FHE type.
+macro_rules! bench_type {
+    ($fhe_type:ident) => {
+        ::paste::paste! {
+            fn [<bench_ $fhe_type:snake>](c: &mut Criterion, cks: &ClientKey) {
+                bench_fhe_type::<$fhe_type>(c, cks, stringify!($fhe_type), $fhe_type::num_bits());
+            }
+        }
+    };
+}
+
+bench_type!(FheUint2);
+bench_type!(FheUint4);
+bench_type!(FheUint6);
+bench_type!(FheUint8);
+bench_type!(FheUint10);
+bench_type!(FheUint12);
+bench_type!(FheUint14);
+bench_type!(FheUint16);
+bench_type!(FheUint32);
+bench_type!(FheUint64);
+bench_type!(FheUint128);
+
+/// Generates CPU keys with noise squashing enabled, registers the server key via
+/// `set_server_key` both directly and through `rayon::broadcast`, then runs the benchmarks.
+fn main() {
+    #[cfg(feature = "hpu")]
+    panic!("Noise squashing is not supported on HPU");
+    #[cfg(all(not(feature = "hpu"), not(feature = "gpu")))]
+    let cks = {
+        use tfhe::{set_server_key, ConfigBuilder};
+        let config = ConfigBuilder::with_custom_parameters(
+            BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
+        )
+        .enable_noise_squashing(BENCH_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128)
+        .build();
+        let cks = ClientKey::generate(config);
+        let compressed_sks = CompressedServerKey::new(&cks);
+
+        let decompressed_sks = compressed_sks.decompress();
+        rayon::broadcast(|_| set_server_key(decompressed_sks.clone()));
+        set_server_key(decompressed_sks);
+        cks
+    };
+
+    let mut c = Criterion::default().configure_from_args();
+
+    // bench_fhe_uint2(&mut c, &cks);
+    // bench_fhe_uint4(&mut c, &cks);
+    // bench_fhe_uint6(&mut c, &cks);
+    // bench_fhe_uint8(&mut c, &cks);
+    // bench_fhe_uint10(&mut c, &cks);
+    // bench_fhe_uint12(&mut c, &cks);
+    // bench_fhe_uint14(&mut c, &cks);
+    // bench_fhe_uint16(&mut c, &cks);
+    // bench_fhe_uint32(&mut c, &cks);
+    bench_fhe_uint64(&mut c, &cks);
+    // bench_fhe_uint128(&mut c, &cks);
+
+    c.final_summary();
+}
diff --git a/tfhe-benchmark/src/params_aliases.rs b/tfhe-benchmark/src/params_aliases.rs
index 032f133e5..00c7bbd84 100644
--- a/tfhe-benchmark/src/params_aliases.rs
+++ b/tfhe-benchmark/src/params_aliases.rs
@@ -139,6 +139,10 @@ pub mod shortint_params_aliases {
         NoiseSquashingParameters =
         V1_3_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
 
+    pub const BENCH_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128:
+        NoiseSquashingParameters =
+        V1_3_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
+
     #[cfg(feature = "hpu")]
     // KS PBS Gaussian for Hpu
     pub const BENCH_HPU_PARAM_MESSAGE_2_CARRY_2_KS32_PBS_GAUSSIAN_2M64: KeySwitch32PBSParameters =
diff --git a/tfhe-benchmark/src/utilities.rs b/tfhe-benchmark/src/utilities.rs
index 873740311..d2159fdd9 100644
--- a/tfhe-benchmark/src/utilities.rs
+++ b/tfhe-benchmark/src/utilities.rs
@@ -397,8 +397,8 @@ pub fn throughput_num_threads(num_block: usize, op_pbs_count: u64) -> u64 {
             let total_num_sm = H100_PCIE_SM_COUNT * get_number_of_gpus();
             let operation_loading = ((total_num_sm as u64 / op_pbs_count) as f64).max(minimum_loading);
             let elements = (total_num_sm as f64 * block_multiplicator * operation_loading) as u64;
-            elements.min(1500) // This threshold is useful for operation with both a small number of
-                               // block and low PBs count.
+            elements.min(200) // This threshold is useful for operation with both a small number of
+                              // block and low PBs count.
         }
         #[cfg(feature = "hpu")]
         {