WIP: add cpu noise squash benchmarks latency and throughput on fheuint64

This commit is contained in:
Agnes Leroy
2025-08-29 15:08:06 +02:00
committed by David Testé
parent 7483ed61f5
commit 76316dfb2a
5 changed files with 191 additions and 2 deletions

View File

@@ -1291,6 +1291,13 @@ bench_tfhe_zk_pok: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench -p tfhe-zk-pok --
# Runs only the `hlapi-noise-squash` bench target of $(TFHE_SPEC).
# BENCH_TYPE is exported to the cargo invocation as __TFHE_RS_BENCH_TYPE.
.PHONY: bench_hlapi_noise_squash # Run benchmarks for noise squash operation
bench_hlapi_noise_squash: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench hlapi-noise-squash \
--features=shortint,integer,internal-keycache,pbs-stats,nightly-avx512 -p $(TFHE_SPEC) --
#
# Utility tools
#

View File

@@ -252,6 +252,12 @@ path = "benches/high_level_api/erc20.rs"
harness = false
required-features = ["integer", "internal-keycache"]
# Noise squashing benchmark on high-level API types.
# `harness = false`: the bench file provides its own `main`.
[[bench]]
name = "hlapi-noise-squash"
path = "benches/high_level_api/noise_squash.rs"
harness = false
required-features = ["shortint", "integer", "internal-keycache"]
[[bench]]
name = "keygen"
path = "benches/keygen/bench.rs"

View File

@@ -0,0 +1,156 @@
#[path = "../utilities.rs"]
mod utilities;
use criterion::{Criterion, Throughput};
use rand::prelude::*;
use rand::thread_rng;
use rayon::prelude::*;
use tfhe::keycache::NamedParam;
use tfhe::prelude::*;
use crate::utilities::{
get_bench_type, throughput_num_threads, write_to_json, BenchmarkType, OperatorType,
};
#[cfg(feature = "gpu")]
use tfhe::core_crypto::gpu::get_number_of_gpus;
use tfhe::shortint::parameters::v1_1::V1_3_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
#[cfg(feature = "gpu")]
use tfhe::{set_server_key, GpuIndex};
use tfhe::{
ClientKey, CompressedServerKey, FheUint10, FheUint12, FheUint128, FheUint14, FheUint16,
FheUint2, FheUint32, FheUint4, FheUint6, FheUint64, FheUint8,
};
/// Benchmarks `squash_noise` for one high-level FHE integer type.
///
/// The benchmark flavour is selected by `get_bench_type()`:
/// * `Latency`: a single ciphertext is encrypted once and `squash_noise` is timed on it.
/// * `Throughput`: a batch of freshly encrypted ciphertexts is built for every measurement
///   (outside the timed section) and `squash_noise` is run on all of them through rayon's
///   parallel iterator.
///
/// Once the measurement is done, the benchmark is recorded with `write_to_json`.
///
/// # Arguments
/// * `c` - Criterion instance the benchmark group is registered on.
/// * `client_key` - Key used to encrypt the inputs and to read the computation parameters.
/// * `type_name` - Name of `FheType`; used as the group name and inside the benchmark id.
/// * `num_bits` - Bit width of `FheType`; used to derive the number of blocks for the
///   throughput batch size and reported as the bit size of the benchmark record.
///
/// # Panics
/// Panics if `num_bits` does not fit in a `u32`.
fn bench_fhe_type<FheType>(
    c: &mut Criterion,
    client_key: &ClientKey,
    type_name: &str,
    num_bits: usize,
) where
    FheType: FheEncrypt<u128, ClientKey> + Send + Sync,
    FheType: SquashNoise,
{
    let mut bench_group = c.benchmark_group(type_name);
    let bench_id_prefix = if cfg!(feature = "gpu") {
        "hlapi::cuda"
    } else {
        "hlapi"
    };
    let bench_id_suffix = format!("noise_squash::{type_name}");
    let params = client_key.computation_parameters();
    let mut rng = thread_rng();

    // Each arm runs its benchmark and yields the id it was registered under, so the id can be
    // reused for the JSON record below.
    let bench_id = match get_bench_type() {
        BenchmarkType::Latency => {
            let bench_id = format!("{bench_id_prefix}::{bench_id_suffix}");
            let input = FheType::encrypt(rng.gen(), client_key);
            bench_group.bench_function(&bench_id, |b| {
                b.iter(|| {
                    let _ = input.squash_noise();
                })
            });
            bench_id
        }
        BenchmarkType::Throughput => {
            let bench_id = format!("{bench_id_prefix}::throughput::{bench_id_suffix}");
            // Number of blocks needed to hold `num_bits` bits, given the bits available per block.
            let num_blocks = num_bits
                .div_ceil((params.message_modulus().0 * params.carry_modulus().0).ilog2() as usize);
            let elements = throughput_num_threads(num_blocks, 1);
            bench_group.throughput(Throughput::Elements(elements));
            println!("elements: {elements}");
            bench_group.bench_function(&bench_id, |b| {
                // Setup routine: executed by criterion outside of the timed section.
                let encrypt_values = || {
                    (0..elements)
                        .map(|_| FheType::encrypt(rng.gen(), client_key))
                        .collect::<Vec<_>>()
                };
                b.iter_batched(
                    encrypt_values,
                    |inputs| {
                        inputs.par_iter().for_each(|input| {
                            let _ = input.squash_noise();
                        })
                    },
                    criterion::BatchSize::SmallInput,
                )
            });
            bench_id
        }
    };

    // Report the real width of the benchmarked type rather than a fixed 64: this function is
    // generic over every FheUint size.
    let bit_size = u32::try_from(num_bits).expect("bit width of an FHE integer fits in u32");
    write_to_json::<u64, _>(
        &bench_id,
        params,
        params.name(),
        "noise_squash",
        &OperatorType::Atomic,
        bit_size,
        vec![],
    );

    bench_group.finish();
}
// Generates, for a given FHE type identifier, a `bench_<snake_case type name>` function that
// forwards to `bench_fhe_type` with the type's name and bit width.
macro_rules! bench_type {
    ($ty:ident) => {
        ::paste::paste! {
            fn [<bench_ $ty:snake>](c: &mut Criterion, cks: &ClientKey) {
                let type_name = stringify!($ty);
                let num_bits = $ty::num_bits();
                bench_fhe_type::<$ty>(c, cks, type_name, num_bits);
            }
        }
    };
}
// Instantiate one `bench_fhe_uint*` wrapper per FheUint type imported at the top of this file.
bench_type!(FheUint2);
bench_type!(FheUint4);
bench_type!(FheUint6);
bench_type!(FheUint8);
bench_type!(FheUint10);
bench_type!(FheUint12);
bench_type!(FheUint14);
bench_type!(FheUint16);
bench_type!(FheUint32);
bench_type!(FheUint64);
bench_type!(FheUint128);
/// Benchmark entry point.
///
/// Generates a client key with noise squashing enabled, installs the matching server key on
/// every rayon worker thread and on the current thread, then runs the selected benchmarks.
fn main() {
    use tfhe::{set_server_key, ConfigBuilder};

    #[cfg(feature = "hpu")]
    panic!("Noise squashing is not supported on HPU");

    let config =
        ConfigBuilder::with_custom_parameters(PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128)
            .enable_noise_squashing(
                V1_3_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
            )
            .build();
    let client_key = ClientKey::generate(config);

    let server_key = CompressedServerKey::new(&client_key).decompress();
    // The throughput benchmark runs on rayon workers, so each of them gets its own copy of the
    // server key before the current thread takes ownership of the original.
    rayon::broadcast(|_| set_server_key(server_key.clone()));
    set_server_key(server_key);

    let mut criterion = Criterion::default().configure_from_args();

    // Only the FheUint64 benchmark is enabled for now. Wrappers for the other widths
    // (`bench_fhe_uint2` ... `bench_fhe_uint32`, `bench_fhe_uint128`) are generated by
    // `bench_type!` and can be called here in the same way.
    bench_fhe_uint64(&mut criterion, &client_key);

    criterion.final_summary();
}

View File

@@ -628,8 +628,8 @@ pub fn throughput_num_threads(num_block: usize, op_pbs_count: u64) -> u64 {
let total_num_sm = H100_PCIE_SM_COUNT * get_number_of_gpus();
let operation_loading = ((total_num_sm as u64 / op_pbs_count) as f64).max(minimum_loading);
let elements = (total_num_sm as f64 * block_multiplicator * operation_loading) as u64;
elements.min(1500) // This threshold is useful for operation with both a small number of
// block and low PBs count.
elements.min(200) // This threshold is useful for operation with both a small number of
// block and low PBs count.
}
#[cfg(not(feature = "gpu"))]
{

View File

@@ -22,3 +22,23 @@ pub const V1_1_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128:
carry_modulus: CarryModulus(4),
ciphertext_modulus: CoreCiphertextModulus::<u128>::new_native(),
};
/// Noise squashing parameters for blocks with a message modulus of 4 and a carry modulus of 4,
/// expressed over the native `u128` ciphertext modulus.
///
/// NOTE(review): the constant carries a `V1_3` prefix but is defined next to the `V1_1`
/// noise squashing parameters — confirm this is the intended module.
pub const V1_3_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128:
NoiseSquashingParameters = NoiseSquashingParameters {
glwe_dimension: GlweDimension(2),
polynomial_size: PolynomialSize(2048),
glwe_noise_distribution: DynamicDistribution::new_t_uniform(30),
decomp_base_log: DecompositionBaseLog(24),
decomp_level_count: DecompositionLevelCount(3),
modulus_switch_noise_reduction_params: Some(
ModulusSwitchNoiseReductionParams {
modulus_switch_zeros_count: LweCiphertextCount(1449),
// 288230376151711744 == 2^58
ms_bound: NoiseEstimationMeasureBound(288230376151711744f64),
ms_r_sigma_factor: RSigmaFactor(13.179852282053789f64),
ms_input_variance: Variance(2.63039184094559E-7f64),
},
),
message_modulus: MessageModulus(4),
carry_modulus: CarryModulus(4),
ciphertext_modulus: CoreCiphertextModulus::<u128>::new_native(),
};