From 4920db22b910915aca972022a6d50db84fae2a8a Mon Sep 17 00:00:00 2001
From: Agnes Leroy <agnes.leroy@zama.ai>
Date: Fri, 29 Aug 2025 15:08:06 +0200
Subject: [PATCH] WIP: add cpu noise squash benchmarks latency and throughput
 on fheuint64

---
 Makefile                                      |   7 +
 tfhe-benchmark/Cargo.toml                     |   6 +
 .../benches/high_level_api/noise_squash.rs    | 161 ++++++++++++++++++
 tfhe-benchmark/src/params_aliases.rs          |   4 +
 tfhe-benchmark/src/utilities.rs               |   4 +-
 5 files changed, 180 insertions(+), 2 deletions(-)
 create mode 100644 tfhe-benchmark/benches/high_level_api/noise_squash.rs
diff --git a/Makefile b/Makefile
index ba49eb67d..7a49701a7 100644
--- a/Makefile
+++ b/Makefile
@@ -1467,6 +1467,13 @@ bench_tfhe_zk_pok: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench -p tfhe-zk-pok --
 
+.PHONY: bench_hlapi_noise_squash # Run benchmarks for noise squash operation
+bench_hlapi_noise_squash: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench hlapi-noise-squash \
+	--features=shortint,integer,internal-keycache,pbs-stats,nightly-avx512 -p tfhe-benchmark --
+
 #
 # Utility tools
 #
diff --git a/tfhe-benchmark/Cargo.toml b/tfhe-benchmark/Cargo.toml
index 1668a29f1..e601a7a7d 100644
--- a/tfhe-benchmark/Cargo.toml
+++ b/tfhe-benchmark/Cargo.toml
@@ -84,6 +84,12 @@ path = "benches/high_level_api/dex.rs"
 harness = false
 required-features = ["integer", "internal-keycache"]
 
+[[bench]]
+name = "hlapi-noise-squash"
+path = "benches/high_level_api/noise_squash.rs"
+harness = false
+required-features = ["shortint", "integer", "internal-keycache"]
+
 [[bench]]
 name = "glwe_packing_compression-integer-bench"
 path = "benches/integer/glwe_packing_compression.rs"
diff --git a/tfhe-benchmark/benches/high_level_api/noise_squash.rs b/tfhe-benchmark/benches/high_level_api/noise_squash.rs
new file mode 100644
index 000000000..1443839bd
--- /dev/null
+++ b/tfhe-benchmark/benches/high_level_api/noise_squash.rs
@@ -0,0 +1,161 @@
+#[cfg(feature = "gpu")]
+use benchmark::params_aliases::BENCH_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
+#[cfg(feature = "gpu")]
+use benchmark::params_aliases::BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
+#[cfg(feature = "gpu")]
+use benchmark::utilities::configure_gpu;
+use benchmark::utilities::{
+    get_bench_type, throughput_num_threads, write_to_json, BenchmarkType, OperatorType,
+};
+use criterion::{Criterion, Throughput};
+use rand::prelude::*;
+use rand::thread_rng;
+use rayon::prelude::*;
+use tfhe::keycache::NamedParam;
+use tfhe::prelude::*;
+
+#[cfg(feature = "gpu")]
+use tfhe::core_crypto::gpu::get_number_of_gpus;
+#[cfg(feature = "gpu")]
+use tfhe::{set_server_key, GpuIndex};
+use tfhe::{
+    ClientKey, CompressedServerKey, FheUint10, FheUint12, FheUint128, FheUint14, FheUint16,
+    FheUint2, FheUint32, FheUint4, FheUint6, FheUint64, FheUint8,
+};
+use benchmark::params_aliases::*;
+
+/// Benchmarks `squash_noise` on one high-level FHE integer type.
+///
+/// Runs the latency or throughput variant selected by `get_bench_type()`, then hands the
+/// bench id and parameters to `write_to_json`. `num_bits` is the bit width of `FheType`: it
+/// sizes the throughput batch and is the bit size passed to `write_to_json`.
+fn bench_fhe_type<FheType>(
+    c: &mut Criterion,
+    client_key: &ClientKey,
+    type_name: &str,
+    num_bits: usize,
+) where
+    FheType: FheEncrypt<u128, ClientKey> + SquashNoise + Send + Sync,
+{
+    let mut bench_group = c.benchmark_group(type_name);
+    let bench_id_prefix = if cfg!(feature = "gpu") {
+        "hlapi::cuda"
+    } else {
+        "hlapi"
+    };
+    let bench_id_suffix = format!("noise_squash::{type_name}");
+    let params = client_key.computation_parameters();
+    let mut rng = thread_rng();
+    let bench_id;
+
+    match get_bench_type() {
+        BenchmarkType::Latency => {
+            bench_id = format!("{bench_id_prefix}::{bench_id_suffix}");
+            let input = FheType::encrypt(rng.gen(), client_key);
+            bench_group.bench_function(&bench_id, |b| {
+                b.iter(|| {
+                    let _ = input.squash_noise();
+                })
+            });
+        }
+        BenchmarkType::Throughput => {
+            bench_id = format!("{bench_id_prefix}::throughput::{bench_id_suffix}");
+            let num_blocks = num_bits
+                .div_ceil((params.message_modulus().0 * params.carry_modulus().0).ilog2() as usize);
+
+            // Throughput is only measured on CPU builds (neither `gpu` nor `hpu` enabled).
+            #[cfg(all(not(feature = "hpu"), not(feature = "gpu")))]
+            {
+                let elements = throughput_num_threads(num_blocks, 1);
+                bench_group.throughput(Throughput::Elements(elements));
+                println!("elements: {elements}");
+                bench_group.bench_function(&bench_id, |b| {
+                    let encrypt_values = || {
+                        (0..elements)
+                            .map(|_| FheType::encrypt(rng.gen(), client_key))
+                            .collect::<Vec<_>>()
+                    };
+
+                    b.iter_batched(
+                        encrypt_values,
+                        |inputs| {
+                            inputs.par_iter().for_each(|input| {
+                                let _ = input.squash_noise();
+                            })
+                        },
+                        criterion::BatchSize::SmallInput,
+                    )
+                });
+            }
+        }
+    }
+
+    write_to_json::<u64, _>(
+        &bench_id,
+        params,
+        params.name(),
+        "noise_squash",
+        &OperatorType::Atomic,
+        num_bits as u32, // report the benchmarked type's width instead of a hard-coded 64
+        vec![],
+    );
+}
+
+macro_rules! bench_type { // defines one `bench_<snake_case type name>(c, cks)` wrapper fn
+    ($fhe_type:ident) => {
+        ::paste::paste! { // `[<..>]` builds the fn name, e.g. `FheUint64` -> `bench_fhe_uint64`
+            fn [<bench_ $fhe_type:snake>](c: &mut Criterion, cks: &ClientKey) {
+                bench_fhe_type::<$fhe_type>(c, cks, stringify!($fhe_type), $fhe_type::num_bits());
+            }
+        }
+    };
+}
+
+bench_type!(FheUint2);
+bench_type!(FheUint4);
+bench_type!(FheUint6);
+bench_type!(FheUint8);
+bench_type!(FheUint10);
+bench_type!(FheUint12);
+bench_type!(FheUint14);
+bench_type!(FheUint16);
+bench_type!(FheUint32);
+bench_type!(FheUint64);
+bench_type!(FheUint128);
+
+/// Generates a CPU client key with noise squashing enabled, sets the decompressed server
+/// key via `rayon::broadcast` and on the calling thread, then runs the enabled benchmark.
+fn main() {
+    #[cfg(feature = "hpu")]
+    panic!("Noise squashing is not supported on HPU");
+
+    // NOTE(review): `cks` is only bound when neither `hpu` nor `gpu` is enabled, although it
+    // is used unconditionally further down.
+    #[cfg(all(not(feature = "hpu"), not(feature = "gpu")))]
+    let cks = {
+        use tfhe::{set_server_key, ConfigBuilder};
+
+        let client_key = ClientKey::generate(
+            ConfigBuilder::with_custom_parameters(
+                BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
+            )
+            .enable_noise_squashing(
+                BENCH_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
+            )
+            .build(),
+        );
+        let server_key = CompressedServerKey::new(&client_key).decompress();
+        // Clone for the `rayon::broadcast` closure, then move the original into this thread.
+        rayon::broadcast(|_| set_server_key(server_key.clone()));
+        set_server_key(server_key);
+        client_key
+    };
+
+    let mut criterion = Criterion::default().configure_from_args();
+
+    // Only the 64-bit benchmark is enabled; the other `bench_fhe_uint*` functions generated
+    // above (2 to 32 bits, and 128 bits) take the same arguments.
+    bench_fhe_uint64(&mut criterion, &cks);
+
+    criterion.final_summary();
+}
diff --git a/tfhe-benchmark/src/params_aliases.rs b/tfhe-benchmark/src/params_aliases.rs
index 032f133e5..00c7bbd84 100644
--- a/tfhe-benchmark/src/params_aliases.rs
+++ b/tfhe-benchmark/src/params_aliases.rs
@@ -139,6 +139,10 @@ pub mod shortint_params_aliases {
         NoiseSquashingParameters =
         V1_3_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
 
+    pub const BENCH_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128:
+        NoiseSquashingParameters =
+        V1_3_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
+
     #[cfg(feature = "hpu")]
     // KS PBS Gaussian for Hpu
     pub const BENCH_HPU_PARAM_MESSAGE_2_CARRY_2_KS32_PBS_GAUSSIAN_2M64: KeySwitch32PBSParameters =
diff --git a/tfhe-benchmark/src/utilities.rs b/tfhe-benchmark/src/utilities.rs
index 873740311..d2159fdd9 100644
--- a/tfhe-benchmark/src/utilities.rs
+++ b/tfhe-benchmark/src/utilities.rs
@@ -397,8 +397,8 @@ pub fn throughput_num_threads(num_block: usize, op_pbs_count: u64) -> u64 {
         let total_num_sm = H100_PCIE_SM_COUNT * get_number_of_gpus();
         let operation_loading = ((total_num_sm as u64 / op_pbs_count) as f64).max(minimum_loading);
         let elements = (total_num_sm as f64 * block_multiplicator * operation_loading) as u64;
-        elements.min(1500) // This threshold is useful for operation with both a small number of
-                           // block and low PBs count.
+        elements.min(200) // This threshold is useful for operation with both a small number of
+                          // block and low PBs count.
     }
     #[cfg(feature = "hpu")]
     {