chore(bench): move benchmarks to their own crate

This is done to speed up compilation by avoiding
recompiling tfhe each time a modification is made in a benchmark
file.
David Testé
2025-05-06 15:08:24 +02:00
committed by David Testé
parent d197a2aa73
commit 67ec4a28c1
33 changed files with 1166 additions and 1776 deletions

tfhe-benchmark/Cargo.toml

@@ -0,0 +1,157 @@
[package]
name = "tfhe-benchmark"
version = "0.1.0"
edition = "2021"
homepage = "https://zama.ai/"
documentation = "https://docs.zama.ai/tfhe-rs"
repository = "https://github.com/zama-ai/tfhe-rs"
license = "BSD-3-Clause-Clear"
description = "tfhe-benchmark: Performances measurements facility for tfhe-rs."
rust-version = "1.84"
publish = false
[lib]
name = "benchmark"
path = "src/lib.rs"
[dependencies]
bincode = "1.3.3"
# clap has to be pinned as its minimum supported rust version
# changes often between minor releases, which breaks our CI
clap = { version = "=4.4.4", features = ["derive"] }
criterion = "0.5.1"
dyn-stack = { workspace = true, features = ["default"] }
itertools = "0.14"
serde = { version = "1.0", default-features = false }
serde_json = "1.0.94"
paste = "1.0.7"
rand = { workspace = true }
rayon = { workspace = true }
tfhe = { path = "../tfhe" }
tfhe-csprng = { path = "../tfhe-csprng" }
[features]
boolean = ["tfhe/boolean"]
shortint = ["tfhe/shortint"]
integer = ["shortint", "tfhe/integer"]
gpu = ["tfhe/gpu"]
internal-keycache = ["tfhe/internal-keycache"]
nightly-avx512 = ["tfhe/nightly-avx512"]
pbs-stats = ["tfhe/pbs-stats"]
zk-pok = ["tfhe/zk-pok"]
[[bench]]
name = "boolean-bench"
path = "benches/boolean/bench.rs"
harness = false
required-features = ["boolean", "internal-keycache"]
[[bench]]
name = "shortint-bench"
path = "benches/shortint/bench.rs"
harness = false
required-features = ["shortint", "internal-keycache"]
[[bench]]
name = "oprf-shortint-bench"
path = "benches/shortint/oprf.rs"
harness = false
required-features = ["shortint", "internal-keycache"]
[[bench]]
name = "glwe_packing_compression-shortint-bench"
path = "benches/shortint/glwe_packing_compression.rs"
harness = false
required-features = ["shortint", "internal-keycache"]
[[bench]]
name = "hlapi"
path = "benches/high_level_api/bench.rs"
harness = false
required-features = ["integer", "internal-keycache"]
[[bench]]
name = "hlapi-erc20"
path = "benches/high_level_api/erc20.rs"
harness = false
required-features = ["integer", "internal-keycache"]
[[bench]]
name = "hlapi-dex"
path = "benches/high_level_api/dex.rs"
harness = false
required-features = ["integer", "internal-keycache"]
[[bench]]
name = "glwe_packing_compression-integer-bench"
path = "benches/integer/glwe_packing_compression.rs"
harness = false
required-features = ["integer", "pbs-stats", "internal-keycache"]
[[bench]]
name = "integer-bench"
path = "benches/integer/bench.rs"
harness = false
required-features = ["integer", "pbs-stats", "internal-keycache"]
[[bench]]
name = "integer-signed-bench"
path = "benches/integer/signed_bench.rs"
harness = false
required-features = ["integer", "pbs-stats", "internal-keycache"]
[[bench]]
name = "zk-pke-bench"
path = "benches/integer/zk_pke.rs"
harness = false
required-features = ["integer", "zk-pok", "pbs-stats", "internal-keycache"]
[[bench]]
name = "ks-bench"
path = "benches/core_crypto/ks_bench.rs"
harness = false
required-features = ["shortint", "internal-keycache"]
[[bench]]
name = "pbs-bench"
path = "benches/core_crypto/pbs_bench.rs"
harness = false
required-features = ["boolean", "shortint", "internal-keycache"]
[[bench]]
name = "ks-pbs-bench"
path = "benches/core_crypto/ks_pbs_bench.rs"
harness = false
required-features = ["shortint", "internal-keycache"]
[[bench]]
name = "modulus_switch_noise_reduction"
path = "benches/core_crypto/modulus_switch_noise_reduction.rs"
harness = false
required-features = ["shortint"]
[[bench]]
name = "pbs128-bench"
path = "benches/core_crypto/pbs128_bench.rs"
harness = false
required-features = ["shortint"]
[[bin]]
name = "boolean_key_sizes"
path = "src/bin/boolean_key_sizes.rs"
required-features = ["boolean", "internal-keycache"]
[[bin]]
name = "shortint_key_sizes"
path = "src/bin/shortint_key_sizes.rs"
required-features = ["shortint", "internal-keycache"]
[[bin]]
name = "hlapi_compact_pk_ct_sizes"
path = "src/bin/hlapi_compact_pk_ct_sizes.rs"
required-features = ["integer", "internal-keycache"]
[[bin]]
name = "wasm_benchmarks_parser"
path = "src/bin/wasm_benchmarks_parser.rs"
required-features = ["shortint", "internal-keycache"]

tfhe-benchmark/LICENSE

@@ -0,0 +1,28 @@
BSD 3-Clause Clear License
Copyright © 2025 ZAMA.
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the documentation and/or other
materials provided with the distribution.
3. Neither the name of ZAMA nor the names of its contributors may be used to endorse
or promote products derived from this software without specific prior written permission.
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY THIS LICENSE.
THIS SOFTWARE IS PROVIDED BY THE ZAMA AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
ZAMA OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

tfhe-benchmark/benches/boolean/bench.rs

@@ -0,0 +1,108 @@
use benchmark::utilities::{write_to_json, CryptoParametersRecord, OperatorType};
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use tfhe::boolean::client_key::ClientKey;
use tfhe::boolean::parameters::{
BooleanParameters, DEFAULT_PARAMETERS, DEFAULT_PARAMETERS_KS_PBS,
PARAMETERS_ERROR_PROB_2_POW_MINUS_165, PARAMETERS_ERROR_PROB_2_POW_MINUS_165_KS_PBS,
TFHE_LIB_PARAMETERS,
};
use tfhe::boolean::prelude::BinaryBooleanGates;
use tfhe::boolean::server_key::ServerKey;
criterion_group!(
gates_benches,
bench_default_parameters,
bench_default_parameters_ks_pbs,
bench_low_prob_parameters,
bench_low_prob_parameters_ks_pbs,
bench_tfhe_lib_parameters,
);
criterion_main!(gates_benches);
/// Helper function to write boolean benchmark parameters to disk in JSON format.
pub fn write_to_json_boolean<T: Into<CryptoParametersRecord<u32>>>(
bench_id: &str,
params: T,
params_alias: impl Into<String>,
display_name: impl Into<String>,
) {
write_to_json(
bench_id,
params,
params_alias,
display_name,
&OperatorType::Atomic,
1,
vec![1],
);
}
// Put all `bench_function` calls in one place
// so that key generation only runs once per parameter set, saving time.
fn benches(c: &mut Criterion, params: BooleanParameters, parameter_name: &str) {
let mut bench_group = c.benchmark_group("gates_benches");
let cks = ClientKey::new(&params);
let sks = ServerKey::new(&cks);
let ct1 = cks.encrypt(true);
let ct2 = cks.encrypt(false);
let ct3 = cks.encrypt(true);
let id = format!("AND::{parameter_name}");
bench_group.bench_function(&id, |b| b.iter(|| black_box(sks.and(&ct1, &ct2))));
write_to_json_boolean(&id, params, parameter_name, "and");
let id = format!("NAND::{parameter_name}");
bench_group.bench_function(&id, |b| b.iter(|| black_box(sks.nand(&ct1, &ct2))));
write_to_json_boolean(&id, params, parameter_name, "nand");
let id = format!("OR::{parameter_name}");
bench_group.bench_function(&id, |b| b.iter(|| black_box(sks.or(&ct1, &ct2))));
write_to_json_boolean(&id, params, parameter_name, "or");
let id = format!("XOR::{parameter_name}");
bench_group.bench_function(&id, |b| b.iter(|| black_box(sks.xor(&ct1, &ct2))));
write_to_json_boolean(&id, params, parameter_name, "xor");
let id = format!("XNOR::{parameter_name}");
bench_group.bench_function(&id, |b| b.iter(|| black_box(sks.xnor(&ct1, &ct2))));
write_to_json_boolean(&id, params, parameter_name, "xnor");
let id = format!("NOT::{parameter_name}");
bench_group.bench_function(&id, |b| b.iter(|| black_box(sks.not(&ct1))));
write_to_json_boolean(&id, params, parameter_name, "not");
let id = format!("MUX::{parameter_name}");
bench_group.bench_function(&id, |b| b.iter(|| black_box(sks.mux(&ct1, &ct2, &ct3))));
write_to_json_boolean(&id, params, parameter_name, "mux");
}
fn bench_default_parameters(c: &mut Criterion) {
benches(c, DEFAULT_PARAMETERS, "DEFAULT_PARAMETERS");
}
fn bench_default_parameters_ks_pbs(c: &mut Criterion) {
benches(c, DEFAULT_PARAMETERS_KS_PBS, "DEFAULT_PARAMETERS_KS_PBS");
}
fn bench_low_prob_parameters(c: &mut Criterion) {
benches(
c,
PARAMETERS_ERROR_PROB_2_POW_MINUS_165,
"PARAMETERS_ERROR_PROB_2_POW_MINUS_165",
);
}
fn bench_low_prob_parameters_ks_pbs(c: &mut Criterion) {
benches(
c,
PARAMETERS_ERROR_PROB_2_POW_MINUS_165_KS_PBS,
"PARAMETERS_ERROR_PROB_2_POW_MINUS_165_KS_PBS",
);
}
fn bench_tfhe_lib_parameters(c: &mut Criterion) {
benches(c, TFHE_LIB_PARAMETERS, "TFHE_LIB_PARAMETERS");
}

tfhe-benchmark/benches/core_crypto/ks_bench.rs

@@ -0,0 +1,834 @@
#[cfg(feature = "boolean")]
use benchmark::params::benchmark_32bits_parameters;
use benchmark::params::{
benchmark_compression_parameters, benchmark_parameters, multi_bit_benchmark_parameters,
};
use benchmark::utilities::{
get_bench_type, throughput_num_threads, write_to_json, BenchmarkType, CryptoParametersRecord,
OperatorType,
};
use criterion::{black_box, Criterion, Throughput};
use rayon::prelude::*;
use serde::Serialize;
use std::env;
use tfhe::core_crypto::prelude::*;
// TODO Refactor KS, PBS and KS-PBS benchmarks into a single generic function.
fn keyswitch<Scalar: UnsignedTorus + CastInto<usize> + Serialize>(
criterion: &mut Criterion,
parameters: &[(String, CryptoParametersRecord<Scalar>)],
) {
let bench_name = "core_crypto::keyswitch";
let mut bench_group = criterion.benchmark_group(bench_name);
// Create the PRNG
let mut seeder = new_seeder();
let seeder = seeder.as_mut();
let mut encryption_generator =
EncryptionRandomGenerator::<DefaultRandomGenerator>::new(seeder.seed(), seeder);
let mut secret_generator = SecretRandomGenerator::<DefaultRandomGenerator>::new(seeder.seed());
for (name, params) in parameters.iter() {
let lwe_dimension = params.lwe_dimension.unwrap();
let glwe_dimension = params.glwe_dimension.unwrap();
let polynomial_size = params.polynomial_size.unwrap();
let ks_decomp_base_log = params.ks_base_log.unwrap();
let ks_decomp_level_count = params.ks_level.unwrap();
let lwe_sk =
allocate_and_generate_new_binary_lwe_secret_key(lwe_dimension, &mut secret_generator);
let glwe_sk = allocate_and_generate_new_binary_glwe_secret_key(
glwe_dimension,
polynomial_size,
&mut secret_generator,
);
let big_lwe_sk = glwe_sk.into_lwe_secret_key();
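// Key switching key from the large LWE secret key (derived from the GLWE key) down to the small LWE key.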
let ksk_big_to_small = allocate_and_generate_new_lwe_keyswitch_key(
&big_lwe_sk,
&lwe_sk,
ks_decomp_base_log,
ks_decomp_level_count,
params.lwe_noise_distribution.unwrap(),
params.ciphertext_modulus.unwrap(),
&mut encryption_generator,
);
let bench_id;
match get_bench_type() {
BenchmarkType::Latency => {
let ct = allocate_and_encrypt_new_lwe_ciphertext(
&big_lwe_sk,
Plaintext(Scalar::ONE),
params.lwe_noise_distribution.unwrap(),
params.ciphertext_modulus.unwrap(),
&mut encryption_generator,
);
let mut output_ct = LweCiphertext::new(
Scalar::ZERO,
lwe_sk.lwe_dimension().to_lwe_size(),
params.ciphertext_modulus.unwrap(),
);
bench_id = format!("{bench_name}::{name}");
{
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
keyswitch_lwe_ciphertext(&ksk_big_to_small, &ct, &mut output_ct);
black_box(&mut output_ct);
})
});
}
}
BenchmarkType::Throughput => {
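// Throughput mode: encrypt a fresh batch of independent ciphertexts per measurement and key switch them in parallel with rayon.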
bench_id = format!("{bench_name}::throughput::{name}");
let blocks: usize = 1;
let elements = throughput_num_threads(blocks, 1); // FIXME This number of elements does not saturate the target machine
bench_group.throughput(Throughput::Elements(elements));
bench_group.bench_function(&bench_id, |b| {
let setup_encrypted_values = || {
let input_cts = (0..elements)
.map(|_| {
allocate_and_encrypt_new_lwe_ciphertext(
&big_lwe_sk,
Plaintext(Scalar::ONE),
params.lwe_noise_distribution.unwrap(),
params.ciphertext_modulus.unwrap(),
&mut encryption_generator,
)
})
.collect::<Vec<_>>();
let output_cts = (0..elements)
.map(|_| {
LweCiphertext::new(
Scalar::ZERO,
lwe_sk.lwe_dimension().to_lwe_size(),
params.ciphertext_modulus.unwrap(),
)
})
.collect::<Vec<_>>();
(input_cts, output_cts)
};
b.iter_batched(
setup_encrypted_values,
|(input_cts, mut output_cts)| {
input_cts
.par_iter()
.zip(output_cts.par_iter_mut())
.for_each(|(input_ct, output_ct)| {
keyswitch_lwe_ciphertext(
&ksk_big_to_small,
input_ct,
output_ct,
);
})
},
criterion::BatchSize::SmallInput,
)
});
}
};
let bit_size = (params.message_modulus.unwrap_or(2) as u32).ilog2();
write_to_json(
&bench_id,
*params,
name,
"ks",
&OperatorType::Atomic,
bit_size,
vec![bit_size],
);
}
}
fn packing_keyswitch<Scalar, F>(
criterion: &mut Criterion,
bench_name: &str,
parameters: &[(String, CryptoParametersRecord<Scalar>)],
ks_op: F,
) where
Scalar: UnsignedTorus + CastInto<usize> + Serialize,
F: Fn(
&LwePackingKeyswitchKey<Vec<Scalar>>,
&LweCiphertextList<Vec<Scalar>>,
&mut GlweCiphertext<Vec<Scalar>>,
) + Sync
+ Send,
{
let bench_name = format!("core_crypto::{bench_name}");
let mut bench_group = criterion.benchmark_group(&bench_name);
// Create the PRNG
let mut seeder = new_seeder();
let seeder = seeder.as_mut();
let mut encryption_generator =
EncryptionRandomGenerator::<DefaultRandomGenerator>::new(seeder.seed(), seeder);
let mut secret_generator = SecretRandomGenerator::<DefaultRandomGenerator>::new(seeder.seed());
for (name, params) in parameters.iter() {
let lwe_dimension = params.lwe_dimension.unwrap();
let packing_glwe_dimension = params.packing_ks_glwe_dimension.unwrap();
let packing_polynomial_size = params.packing_ks_polynomial_size.unwrap();
let packing_ks_decomp_base_log = params.packing_ks_base_log.unwrap();
let packing_ks_decomp_level_count = params.packing_ks_level.unwrap();
let ciphertext_modulus = params.ciphertext_modulus.unwrap();
let count = params.lwe_per_glwe.unwrap();
let lwe_sk =
allocate_and_generate_new_binary_lwe_secret_key(lwe_dimension, &mut secret_generator);
let glwe_sk = allocate_and_generate_new_binary_glwe_secret_key(
packing_glwe_dimension,
packing_polynomial_size,
&mut secret_generator,
);
let pksk = allocate_and_generate_new_lwe_packing_keyswitch_key(
&lwe_sk,
&glwe_sk,
packing_ks_decomp_base_log,
packing_ks_decomp_level_count,
params.packing_ks_key_noise_distribution.unwrap(),
ciphertext_modulus,
&mut encryption_generator,
);
let bench_id;
match get_bench_type() {
BenchmarkType::Latency => {
let mut input_lwe_list = LweCiphertextList::new(
Scalar::ZERO,
lwe_sk.lwe_dimension().to_lwe_size(),
count,
ciphertext_modulus,
);
let plaintext_list = PlaintextList::new(
Scalar::ZERO,
PlaintextCount(input_lwe_list.lwe_ciphertext_count().0),
);
encrypt_lwe_ciphertext_list(
&lwe_sk,
&mut input_lwe_list,
&plaintext_list,
params.lwe_noise_distribution.unwrap(),
&mut encryption_generator,
);
let mut output_glwe = GlweCiphertext::new(
Scalar::ZERO,
glwe_sk.glwe_dimension().to_glwe_size(),
glwe_sk.polynomial_size(),
ciphertext_modulus,
);
bench_id = format!("{bench_name}::{name}");
{
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
ks_op(&pksk, &input_lwe_list, &mut output_glwe);
black_box(&mut output_glwe);
})
});
}
}
BenchmarkType::Throughput => {
bench_id = format!("{bench_name}::throughput::{name}");
let blocks: usize = 1;
let elements = throughput_num_threads(blocks, 1);
bench_group.throughput(Throughput::Elements(elements));
bench_group.bench_function(&bench_id, |b| {
let setup_encrypted_values = || {
let input_lwe_lists = (0..elements)
.map(|_| {
let mut input_lwe_list = LweCiphertextList::new(
Scalar::ZERO,
lwe_sk.lwe_dimension().to_lwe_size(),
count,
ciphertext_modulus,
);
let plaintext_list = PlaintextList::new(
Scalar::ZERO,
PlaintextCount(input_lwe_list.lwe_ciphertext_count().0),
);
encrypt_lwe_ciphertext_list(
&lwe_sk,
&mut input_lwe_list,
&plaintext_list,
params.lwe_noise_distribution.unwrap(),
&mut encryption_generator,
);
input_lwe_list
})
.collect::<Vec<_>>();
let output_glwes = (0..elements)
.map(|_| {
GlweCiphertext::new(
Scalar::ZERO,
glwe_sk.glwe_dimension().to_glwe_size(),
glwe_sk.polynomial_size(),
ciphertext_modulus,
)
})
.collect::<Vec<_>>();
(input_lwe_lists, output_glwes)
};
b.iter_batched(
setup_encrypted_values,
|(input_lwe_lists, mut output_glwes)| {
input_lwe_lists
.par_iter()
.zip(output_glwes.par_iter_mut())
.for_each(|(input_lwe_list, output_glwe)| {
ks_op(&pksk, input_lwe_list, output_glwe);
})
},
criterion::BatchSize::SmallInput,
)
});
}
};
let bit_size = (params.message_modulus.unwrap_or(2) as u32).ilog2();
write_to_json(
&bench_id,
*params,
name,
"packing_ks",
&OperatorType::Atomic,
bit_size,
vec![bit_size],
);
}
}
#[cfg(feature = "gpu")]
mod cuda {
use benchmark::params::{benchmark_parameters, multi_bit_benchmark_parameters};
use benchmark::utilities::{
cuda_local_keys_core, cuda_local_streams_core, get_bench_type, throughput_num_threads,
write_to_json, BenchmarkType, CpuKeys, CpuKeysBuilder, CryptoParametersRecord, CudaIndexes,
CudaLocalKeys, OperatorType,
};
use criterion::{black_box, Criterion, Throughput};
use rayon::prelude::*;
use serde::Serialize;
use tfhe::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
use tfhe::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
use tfhe::core_crypto::gpu::{
cuda_keyswitch_lwe_ciphertext, cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext_64,
get_number_of_gpus, CudaStreams,
};
use tfhe::core_crypto::prelude::*;
fn cuda_keyswitch<Scalar: UnsignedTorus + CastInto<usize> + CastFrom<u64> + Serialize>(
criterion: &mut Criterion,
parameters: &[(String, CryptoParametersRecord<Scalar>)],
) {
let bench_name = "core_crypto::cuda::keyswitch";
let mut bench_group = criterion.benchmark_group(bench_name);
// Create the PRNG
let mut seeder = new_seeder();
let seeder = seeder.as_mut();
let mut encryption_generator =
EncryptionRandomGenerator::<DefaultRandomGenerator>::new(seeder.seed(), seeder);
let mut secret_generator =
SecretRandomGenerator::<DefaultRandomGenerator>::new(seeder.seed());
for (name, params) in parameters.iter() {
let lwe_dimension = params.lwe_dimension.unwrap();
let glwe_dimension = params.glwe_dimension.unwrap();
let polynomial_size = params.polynomial_size.unwrap();
let ks_decomp_base_log = params.ks_base_log.unwrap();
let ks_decomp_level_count = params.ks_level.unwrap();
let lwe_sk = allocate_and_generate_new_binary_lwe_secret_key(
lwe_dimension,
&mut secret_generator,
);
let glwe_sk = allocate_and_generate_new_binary_glwe_secret_key(
glwe_dimension,
polynomial_size,
&mut secret_generator,
);
let big_lwe_sk = glwe_sk.into_lwe_secret_key();
let ksk_big_to_small = allocate_and_generate_new_lwe_keyswitch_key(
&big_lwe_sk,
&lwe_sk,
ks_decomp_base_log,
ks_decomp_level_count,
params.lwe_noise_distribution.unwrap(),
CiphertextModulus::new_native(),
&mut encryption_generator,
);
let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new()
.keyswitch_key(ksk_big_to_small)
.build();
let bench_id;
match get_bench_type() {
BenchmarkType::Latency => {
let streams = CudaStreams::new_multi_gpu();
let gpu_keys = CudaLocalKeys::from_cpu_keys(&cpu_keys, None, &streams);
let ct = allocate_and_encrypt_new_lwe_ciphertext(
&big_lwe_sk,
Plaintext(Scalar::ONE),
params.lwe_noise_distribution.unwrap(),
CiphertextModulus::new_native(),
&mut encryption_generator,
);
let mut ct_gpu = CudaLweCiphertextList::from_lwe_ciphertext(&ct, &streams);
let output_ct = LweCiphertext::new(
Scalar::ZERO,
lwe_sk.lwe_dimension().to_lwe_size(),
CiphertextModulus::new_native(),
);
let mut output_ct_gpu =
CudaLweCiphertextList::from_lwe_ciphertext(&output_ct, &streams);
let h_indexes = [Scalar::ZERO];
let cuda_indexes = CudaIndexes::new(&h_indexes, &streams, 0);
bench_id = format!("{bench_name}::{name}");
{
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
cuda_keyswitch_lwe_ciphertext(
gpu_keys.ksk.as_ref().unwrap(),
&ct_gpu,
&mut output_ct_gpu,
&cuda_indexes.d_input,
&cuda_indexes.d_output,
&streams,
);
black_box(&mut ct_gpu);
})
});
}
}
BenchmarkType::Throughput => {
let gpu_keys_vec = cuda_local_keys_core(&cpu_keys, None);
let gpu_count = get_number_of_gpus() as usize;
bench_id = format!("{bench_name}::throughput::{name}");
let blocks: usize = 1;
let elements = throughput_num_threads(blocks, 1);
let elements_per_stream = elements as usize / gpu_count;
bench_group.throughput(Throughput::Elements(elements));
bench_group.sample_size(50);
bench_group.bench_function(&bench_id, |b| {
let setup_encrypted_values = || {
let local_streams = cuda_local_streams_core();
let plaintext_list = PlaintextList::new(
Scalar::ZERO,
PlaintextCount(elements_per_stream),
);
let input_cts = (0..gpu_count)
.map(|i| {
let mut input_ct_list = LweCiphertextList::new(
Scalar::ZERO,
big_lwe_sk.lwe_dimension().to_lwe_size(),
LweCiphertextCount(elements_per_stream),
params.ciphertext_modulus.unwrap(),
);
encrypt_lwe_ciphertext_list(
&big_lwe_sk,
&mut input_ct_list,
&plaintext_list,
params.lwe_noise_distribution.unwrap(),
&mut encryption_generator,
);
let input_ks_list = LweCiphertextList::from_container(
input_ct_list.into_container(),
big_lwe_sk.lwe_dimension().to_lwe_size(),
params.ciphertext_modulus.unwrap(),
);
CudaLweCiphertextList::from_lwe_ciphertext_list(
&input_ks_list,
&local_streams[i],
)
})
.collect::<Vec<_>>();
let output_cts = (0..gpu_count)
.map(|i| {
let output_ct_list = LweCiphertextList::new(
Scalar::ZERO,
lwe_sk.lwe_dimension().to_lwe_size(),
LweCiphertextCount(elements_per_stream),
params.ciphertext_modulus.unwrap(),
);
CudaLweCiphertextList::from_lwe_ciphertext_list(
&output_ct_list,
&local_streams[i],
)
})
.collect::<Vec<_>>();
let h_indexes = (0..(elements / gpu_count as u64))
.map(CastFrom::cast_from)
.collect::<Vec<_>>();
let cuda_indexes_vec = (0..gpu_count)
.map(|i| CudaIndexes::new(&h_indexes, &local_streams[i], 0))
.collect::<Vec<_>>();
local_streams.iter().for_each(|stream| stream.synchronize());
(input_cts, output_cts, cuda_indexes_vec, local_streams)
};
b.iter_batched(
setup_encrypted_values,
|(input_cts, mut output_cts, cuda_indexes_vec, local_streams)| {
(0..gpu_count)
.into_par_iter()
.zip(input_cts.par_iter())
.zip(output_cts.par_iter_mut())
.zip(local_streams.par_iter())
.for_each(|(((i, input_ct), output_ct), local_stream)| {
cuda_keyswitch_lwe_ciphertext(
gpu_keys_vec[i].ksk.as_ref().unwrap(),
input_ct,
output_ct,
&cuda_indexes_vec[i].d_input,
&cuda_indexes_vec[i].d_output,
local_stream,
);
})
},
criterion::BatchSize::SmallInput,
)
});
}
};
let bit_size = (params.message_modulus.unwrap_or(2) as u32).ilog2();
write_to_json(
&bench_id,
*params,
name,
"ks",
&OperatorType::Atomic,
bit_size,
vec![bit_size],
);
}
}
fn cuda_packing_keyswitch<
Scalar: UnsignedTorus + CastInto<usize> + CastFrom<u64> + Serialize,
>(
criterion: &mut Criterion,
parameters: &[(String, CryptoParametersRecord<Scalar>)],
) {
let bench_name = "core_crypto::cuda::packing_keyswitch";
let mut bench_group = criterion.benchmark_group(bench_name);
// Create the PRNG
let mut seeder = new_seeder();
let seeder = seeder.as_mut();
let mut encryption_generator =
EncryptionRandomGenerator::<DefaultRandomGenerator>::new(seeder.seed(), seeder);
let mut secret_generator =
SecretRandomGenerator::<DefaultRandomGenerator>::new(seeder.seed());
for (name, params) in parameters.iter() {
let lwe_dimension = params.lwe_dimension.unwrap();
let glwe_dimension = params.glwe_dimension.unwrap();
let polynomial_size = params.polynomial_size.unwrap();
let ks_decomp_base_log = params.ks_base_log.unwrap();
let ks_decomp_level_count = params.ks_level.unwrap();
let glwe_noise_distribution = params.glwe_noise_distribution.unwrap();
let ciphertext_modulus = params.ciphertext_modulus.unwrap();
let lwe_sk = allocate_and_generate_new_binary_lwe_secret_key(
lwe_dimension,
&mut secret_generator,
);
let glwe_sk = allocate_and_generate_new_binary_glwe_secret_key(
glwe_dimension,
polynomial_size,
&mut secret_generator,
);
let pksk = allocate_and_generate_new_lwe_packing_keyswitch_key(
&lwe_sk,
&glwe_sk,
ks_decomp_base_log,
ks_decomp_level_count,
glwe_noise_distribution,
ciphertext_modulus,
&mut encryption_generator,
);
let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new().packing_keyswitch_key(pksk).build();
let bench_id;
match get_bench_type() {
BenchmarkType::Latency => {
let streams = CudaStreams::new_multi_gpu();
let gpu_keys = CudaLocalKeys::from_cpu_keys(&cpu_keys, None, &streams);
let mut input_ct_list = LweCiphertextList::new(
Scalar::ZERO,
lwe_sk.lwe_dimension().to_lwe_size(),
LweCiphertextCount(glwe_sk.polynomial_size().0),
ciphertext_modulus,
);
let plaintext_list = PlaintextList::new(
Scalar::ZERO,
PlaintextCount(input_ct_list.lwe_ciphertext_count().0),
);
encrypt_lwe_ciphertext_list(
&lwe_sk,
&mut input_ct_list,
&plaintext_list,
params.lwe_noise_distribution.unwrap(),
&mut encryption_generator,
);
let mut d_input_lwe_list =
CudaLweCiphertextList::from_lwe_ciphertext_list(&input_ct_list, &streams);
let mut d_output_glwe = CudaGlweCiphertextList::new(
glwe_sk.glwe_dimension(),
glwe_sk.polynomial_size(),
GlweCiphertextCount(1),
ciphertext_modulus,
&streams,
);
streams.synchronize();
bench_id = format!("{bench_name}::{name}");
{
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext_64(
gpu_keys.pksk.as_ref().unwrap(),
&d_input_lwe_list,
&mut d_output_glwe,
&streams,
);
black_box(&mut d_input_lwe_list);
})
});
}
}
BenchmarkType::Throughput => {
let gpu_keys_vec = cuda_local_keys_core(&cpu_keys, None);
let gpu_count = get_number_of_gpus() as usize;
bench_id = format!("{bench_name}::throughput::{name}");
let blocks: usize = 1;
let elements = throughput_num_threads(blocks, 1);
let elements_per_stream = elements as usize / gpu_count;
bench_group.throughput(Throughput::Elements(elements));
bench_group.sample_size(50);
bench_group.bench_function(&bench_id, |b| {
let setup_encrypted_values = || {
let local_streams = cuda_local_streams_core();
let plaintext_list = PlaintextList::new(
Scalar::ZERO,
PlaintextCount(elements_per_stream),
);
let input_lwe_lists = (0..gpu_count)
.map(|i| {
let mut input_ct_list = LweCiphertextList::new(
Scalar::ZERO,
lwe_sk.lwe_dimension().to_lwe_size(),
LweCiphertextCount(glwe_sk.polynomial_size().0),
ciphertext_modulus,
);
encrypt_lwe_ciphertext_list(
&lwe_sk,
&mut input_ct_list,
&plaintext_list,
params.lwe_noise_distribution.unwrap(),
&mut encryption_generator,
);
CudaLweCiphertextList::from_lwe_ciphertext_list(
&input_ct_list,
&local_streams[i],
)
})
.collect::<Vec<_>>();
let output_glwe_list = (0..gpu_count)
.map(|i| {
CudaGlweCiphertextList::new(
glwe_sk.glwe_dimension(),
glwe_sk.polynomial_size(),
GlweCiphertextCount(1),
ciphertext_modulus,
&local_streams[i],
)
})
.collect::<Vec<_>>();
local_streams.iter().for_each(|stream| stream.synchronize());
(input_lwe_lists, output_glwe_list, local_streams)
};
b.iter_batched(
setup_encrypted_values,
|(input_lwe_lists, mut output_glwe_lists, local_streams)| {
(0..gpu_count)
.into_par_iter()
.zip(input_lwe_lists.par_iter())
.zip(output_glwe_lists.par_iter_mut())
.zip(local_streams.par_iter())
.for_each(
|(
((i, input_lwe_list), output_glwe_list),
local_stream,
)| {
cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext_64(
gpu_keys_vec[i].pksk.as_ref().unwrap(),
input_lwe_list,
output_glwe_list,
local_stream,
);
},
)
},
criterion::BatchSize::SmallInput,
)
});
}
};
let bit_size = (params.message_modulus.unwrap_or(2) as u32).ilog2();
write_to_json(
&bench_id,
*params,
name,
"packing_ks",
&OperatorType::Atomic,
bit_size,
vec![bit_size],
);
}
}
pub fn cuda_ks_group() {
let mut criterion: Criterion<_> =
(Criterion::default().sample_size(2000)).configure_from_args();
cuda_keyswitch(&mut criterion, &benchmark_parameters());
cuda_packing_keyswitch(&mut criterion, &benchmark_parameters());
}
pub fn cuda_multi_bit_ks_group() {
let mut criterion: Criterion<_> =
(Criterion::default().sample_size(2000)).configure_from_args();
cuda_keyswitch(&mut criterion, &multi_bit_benchmark_parameters());
cuda_packing_keyswitch(&mut criterion, &multi_bit_benchmark_parameters());
}
}
#[cfg(feature = "gpu")]
use cuda::{cuda_ks_group, cuda_multi_bit_ks_group};
pub fn ks_group() {
let mut criterion: Criterion<_> = (Criterion::default()
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(60)))
.configure_from_args();
keyswitch(&mut criterion, &benchmark_parameters());
#[cfg(feature = "boolean")]
keyswitch(&mut criterion, &benchmark_32bits_parameters());
}
pub fn multi_bit_ks_group() {
let mut criterion: Criterion<_> = (Criterion::default()
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(60)))
.configure_from_args();
keyswitch(&mut criterion, &multi_bit_benchmark_parameters());
}
pub fn packing_ks_group() {
let mut criterion: Criterion<_> = (Criterion::default()
.sample_size(10)
.measurement_time(std::time::Duration::from_secs(30)))
.configure_from_args();
packing_keyswitch(
&mut criterion,
"packing_keyswitch",
&benchmark_compression_parameters(),
keyswitch_lwe_ciphertext_list_and_pack_in_glwe_ciphertext,
);
packing_keyswitch(
&mut criterion,
"par_packing_keyswitch",
&benchmark_compression_parameters(),
par_keyswitch_lwe_ciphertext_list_and_pack_in_glwe_ciphertext,
);
}
#[cfg(feature = "gpu")]
fn go_through_gpu_bench_groups(val: &str) {
match val.to_lowercase().as_str() {
"classical" => cuda_ks_group(),
"multi_bit" => cuda_multi_bit_ks_group(),
_ => panic!("unknown benchmark operations flavor"),
};
}
#[cfg(not(feature = "gpu"))]
fn go_through_cpu_bench_groups(val: &str) {
match val.to_lowercase().as_str() {
"classical" => {
ks_group();
packing_ks_group()
}
"multi_bit" => multi_bit_ks_group(),
_ => panic!("unknown benchmark operations flavor"),
}
}
fn main() {
match env::var("__TFHE_RS_PARAM_TYPE") {
Ok(val) => {
#[cfg(feature = "gpu")]
go_through_gpu_bench_groups(&val);
#[cfg(not(feature = "gpu"))]
go_through_cpu_bench_groups(&val);
}
Err(_) => {
ks_group();
packing_ks_group()
}
};
Criterion::default().configure_from_args().final_summary();
}
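Aside on the structure of the benches above: every CPU benchmark in this file switches on `get_bench_type()` between a latency mode, which times a single operation with `Bencher::iter`, and a throughput mode, which rebuilds a batch of independent inputs in `iter_batched` and processes the whole batch in parallel with rayon while reporting `Throughput::Elements`. A minimal sketch of that pattern on plain integers follows; `heavy_op`, the group name and the element count are illustrative placeholders, not part of this diff.

use criterion::{black_box, criterion_group, criterion_main, BatchSize, Criterion, Throughput};
use rayon::prelude::*;

// Stand-in for an expensive operation such as a key switch (illustrative only).
fn heavy_op(x: u64) -> u64 {
    (0..1_000u64).fold(x, |acc, i| acc.wrapping_mul(6364136223846793005).wrapping_add(i))
}

fn latency_and_throughput(c: &mut Criterion) {
    let mut group = c.benchmark_group("sketch");

    // Latency: one operation per timed iteration.
    group.bench_function("latency", |b| b.iter(|| black_box(heavy_op(black_box(3)))));

    // Throughput: rebuild a batch of independent inputs outside the timed section,
    // then process the whole batch in parallel.
    let elements: u64 = 256;
    group.throughput(Throughput::Elements(elements));
    group.bench_function("throughput", |b| {
        b.iter_batched(
            || (0..elements).collect::<Vec<u64>>(),
            |inputs| inputs.par_iter().map(|&x| heavy_op(x)).collect::<Vec<_>>(),
            BatchSize::SmallInput,
        )
    });
    group.finish();
}

criterion_group!(sketch_group, latency_and_throughput);
criterion_main!(sketch_group);

Using `iter_batched` with `BatchSize::SmallInput` keeps the per-batch setup out of the measured time, which is the same reason the benches above rebuild their ciphertext batches inside `setup_encrypted_values` rather than in the timed closure.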

File diff suppressed because it is too large

tfhe-benchmark/benches/core_crypto/modulus_switch_noise_reduction.rs

@@ -0,0 +1,89 @@
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use modulus_switch_noise_reduction::improve_lwe_ciphertext_modulus_switch_noise_for_binary_key;
use tfhe::core_crypto::commons::parameters::{NoiseEstimationMeasureBound, RSigmaFactor};
use tfhe::core_crypto::prelude::*;
fn modulus_switch_noise_reduction(c: &mut Criterion) {
// TODO: use shortint params
let lwe_dimension = LweDimension(918);
let noise_distribution = DynamicDistribution::new_t_uniform(46);
let ciphertext_modulus = CiphertextModulus::new_native();
let bound = NoiseEstimationMeasureBound((1_u64 << (64 - 1 - 4 - 1)) as f64);
let r_sigma_factor = RSigmaFactor(14.658999256586121);
let log_modulus = PolynomialSize(2048).to_blind_rotation_input_modulus_log();
let input_variance = Variance(0.);
for count in [10, 50, 100, 1_000, 10_000, 100_000] {
let mut boxed_seeder = new_seeder();
let seeder = boxed_seeder.as_mut();
let mut secret_generator =
SecretRandomGenerator::<DefaultRandomGenerator>::new(seeder.seed());
let mut encryption_generator =
EncryptionRandomGenerator::<DefaultRandomGenerator>::new(seeder.seed(), seeder);
let sk =
allocate_and_generate_new_binary_lwe_secret_key(lwe_dimension, &mut secret_generator);
let clean_lwe = allocate_and_encrypt_new_lwe_ciphertext(
&sk,
Plaintext(0),
noise_distribution,
ciphertext_modulus,
&mut encryption_generator,
);
let mut encryptions_of_zero = LweCiphertextList::new(
0,
lwe_dimension.to_lwe_size(),
LweCiphertextCount(count),
ciphertext_modulus,
);
let plaintext_list = PlaintextList::new(0, PlaintextCount(count));
encrypt_lwe_ciphertext_list(
&sk,
&mut encryptions_of_zero,
&plaintext_list,
noise_distribution,
&mut encryption_generator,
);
let mut lwe =
LweCiphertext::new(0_u64, sk.lwe_dimension().to_lwe_size(), ciphertext_modulus);
let bench_name = "modulus_switch_noise_reduction";
let mut bench_group = c.benchmark_group(bench_name);
bench_group
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(5));
let bench_name = format!("modulus_switch_noise_reduction_{count}");
bench_group.bench_function(&bench_name, |b| {
b.iter(|| {
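// Reset the working ciphertext from the clean copy so every iteration starts from the same input.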
lwe.as_mut().copy_from_slice(clean_lwe.as_ref());
improve_lwe_ciphertext_modulus_switch_noise_for_binary_key(
&mut lwe,
&encryptions_of_zero,
r_sigma_factor,
bound,
input_variance,
log_modulus,
);
black_box(&lwe);
});
});
}
}
criterion_group!(
modulus_switch_noise_reduction2,
modulus_switch_noise_reduction
);
criterion_main!(modulus_switch_noise_reduction2);

tfhe-benchmark/benches/core_crypto/pbs128_bench.rs

@@ -0,0 +1,484 @@
use benchmark::params_aliases::{
BENCH_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
};
use benchmark::utilities::{write_to_json, CryptoParametersRecord, OperatorType};
use criterion::{black_box, Criterion};
use dyn_stack::PodStack;
use tfhe::core_crypto::fft_impl::fft128::crypto::bootstrap::bootstrap_scratch;
use tfhe::core_crypto::prelude::*;
use tfhe::keycache::NamedParam;
fn pbs_128(c: &mut Criterion) {
let bench_name = "core_crypto::pbs128";
let mut bench_group = c.benchmark_group(bench_name);
bench_group
.sample_size(10)
.measurement_time(std::time::Duration::from_secs(30));
type InputScalar = u64;
type OutputScalar = u128;
let noise_params = BENCH_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
let base_params = BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
let lwe_dimension = base_params.lwe_dimension; // From PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128
let glwe_dimension = noise_params.glwe_dimension;
let polynomial_size = noise_params.polynomial_size;
let lwe_noise_distribution = base_params.lwe_noise_distribution;
let glwe_noise_distribution = noise_params.glwe_noise_distribution;
let pbs_base_log = noise_params.decomp_base_log;
let pbs_level = noise_params.decomp_level_count;
let input_ciphertext_modulus = base_params.ciphertext_modulus;
let output_ciphertext_modulus = noise_params.ciphertext_modulus;
let mut boxed_seeder = new_seeder();
let seeder = boxed_seeder.as_mut();
let mut secret_generator = SecretRandomGenerator::<DefaultRandomGenerator>::new(seeder.seed());
let mut encryption_generator =
EncryptionRandomGenerator::<DefaultRandomGenerator>::new(seeder.seed(), seeder);
let input_lwe_secret_key =
LweSecretKey::generate_new_binary(lwe_dimension, &mut secret_generator);
let output_glwe_secret_key = GlweSecretKey::<Vec<OutputScalar>>::generate_new_binary(
glwe_dimension,
polynomial_size,
&mut secret_generator,
);
let output_lwe_secret_key = output_glwe_secret_key.clone().into_lwe_secret_key();
let mut bsk = LweBootstrapKey::new(
OutputScalar::ZERO,
glwe_dimension.to_glwe_size(),
polynomial_size,
pbs_base_log,
pbs_level,
lwe_dimension,
output_ciphertext_modulus,
);
par_generate_lwe_bootstrap_key(
&input_lwe_secret_key,
&output_glwe_secret_key,
&mut bsk,
glwe_noise_distribution,
&mut encryption_generator,
);
let mut fourier_bsk = Fourier128LweBootstrapKey::new(
lwe_dimension,
glwe_dimension.to_glwe_size(),
polynomial_size,
pbs_base_log,
pbs_level,
);
convert_standard_lwe_bootstrap_key_to_fourier_128(&bsk, &mut fourier_bsk);
let message_modulus: InputScalar = 1 << 4;
let input_message: InputScalar = 3;
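// Scale the message into the most significant bits, keeping one bit of padding.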
let delta: InputScalar = (1 << (InputScalar::BITS - 1)) / message_modulus;
let plaintext = Plaintext(input_message * delta);
let lwe_ciphertext_in: LweCiphertextOwned<InputScalar> =
allocate_and_encrypt_new_lwe_ciphertext(
&input_lwe_secret_key,
plaintext,
lwe_noise_distribution,
input_ciphertext_modulus,
&mut encryption_generator,
);
let accumulator: GlweCiphertextOwned<OutputScalar> = GlweCiphertextOwned::new(
OutputScalar::ONE,
glwe_dimension.to_glwe_size(),
polynomial_size,
output_ciphertext_modulus,
);
let mut out_pbs_ct: LweCiphertext<Vec<OutputScalar>> = LweCiphertext::new(
OutputScalar::ZERO,
output_lwe_secret_key.lwe_dimension().to_lwe_size(),
output_ciphertext_modulus,
);
let fft = Fft128::new(polynomial_size);
let fft = fft.as_view();
let mut buffers = vec![
0u8;
bootstrap_scratch::<OutputScalar>(
fourier_bsk.glwe_size(),
fourier_bsk.polynomial_size(),
fft
)
.unwrap()
.unaligned_bytes_required()
];
let id = format!("{bench_name}::{}", noise_params.name());
bench_group.bench_function(&id, |b| {
b.iter(|| {
fourier_bsk.bootstrap(
&mut out_pbs_ct,
&lwe_ciphertext_in,
&accumulator,
fft,
PodStack::new(&mut buffers),
);
black_box(&mut out_pbs_ct);
});
});
// TODO Add throughput benchmark case
let params_record = CryptoParametersRecord {
lwe_dimension: Some(lwe_dimension),
glwe_dimension: Some(glwe_dimension),
polynomial_size: Some(polynomial_size),
lwe_noise_distribution: Some(lwe_noise_distribution),
glwe_noise_distribution: Some(base_params.glwe_noise_distribution),
pbs_base_log: Some(pbs_base_log),
pbs_level: Some(pbs_level),
ciphertext_modulus: Some(input_ciphertext_modulus),
..Default::default()
};
let bit_size = (message_modulus as u32).ilog2();
write_to_json(
&id,
params_record,
noise_params.name(),
"pbs",
&OperatorType::Atomic,
bit_size,
vec![bit_size],
);
}
#[cfg(feature = "gpu")]
mod cuda {
use benchmark::utilities::{
cuda_local_keys_core, cuda_local_streams_core, get_bench_type, throughput_num_threads,
write_to_json, BenchmarkType, CpuKeys, CpuKeysBuilder, CryptoParametersRecord,
CudaLocalKeys, OperatorType,
};
use criterion::{black_box, Criterion, Throughput};
use rayon::prelude::*;
use tfhe::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
use tfhe::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
use tfhe::core_crypto::gpu::{
cuda_programmable_bootstrap_128_lwe_ciphertext, get_number_of_gpus, CudaStreams,
};
use tfhe::core_crypto::prelude::*;
use tfhe::shortint::engine::ShortintEngine;
use tfhe::shortint::parameters::ModulusSwitchNoiseReductionParams;
use tfhe::shortint::server_key::ModulusSwitchNoiseReductionKey;
fn cuda_pbs_128(c: &mut Criterion) {
let bench_name = "core_crypto::cuda::pbs128";
let mut bench_group = c.benchmark_group(bench_name);
bench_group
.sample_size(10)
.measurement_time(std::time::Duration::from_secs(30));
type Scalar = u128;
let lwe_dimension = LweDimension(879);
let glwe_dimension = GlweDimension(2);
let polynomial_size = PolynomialSize(2048);
let lwe_noise_distribution = DynamicDistribution::new_t_uniform(46);
let lwe_noise_distribution_u128: DynamicDistribution<u128> =
DynamicDistribution::new_t_uniform(46);
let glwe_noise_distribution = DynamicDistribution::new_t_uniform(30);
let pbs_base_log = DecompositionBaseLog(24);
let pbs_level = DecompositionLevelCount(3);
let ciphertext_modulus = CiphertextModulus::new_native();
let ct_modulus_u64: CiphertextModulus<u64> = CiphertextModulus::new_native();
let modulus_switch_noise_reduction_params = ModulusSwitchNoiseReductionParams {
modulus_switch_zeros_count: LweCiphertextCount(1449),
ms_bound: NoiseEstimationMeasureBound(288230376151711744f64),
ms_r_sigma_factor: RSigmaFactor(13.179852282053789f64),
ms_input_variance: Variance(2.63039184094559E-7f64),
};
let params_name = "PARAMS_SWITCH_SQUASH";
let mut boxed_seeder = new_seeder();
let seeder = boxed_seeder.as_mut();
let mut secret_generator =
SecretRandomGenerator::<DefaultRandomGenerator>::new(seeder.seed());
let mut encryption_generator =
EncryptionRandomGenerator::<DefaultRandomGenerator>::new(seeder.seed(), seeder);
let input_lwe_secret_key =
LweSecretKey::generate_new_binary(lwe_dimension, &mut secret_generator);
let input_lwe_secret_key_u128 = LweSecretKey::from_container(
input_lwe_secret_key
.as_ref()
.iter()
.copied()
.map(|x| x as u128)
.collect::<Vec<_>>(),
);
let output_glwe_secret_key = GlweSecretKey::<Vec<Scalar>>::generate_new_binary(
glwe_dimension,
polynomial_size,
&mut secret_generator,
);
let output_lwe_secret_key = output_glwe_secret_key.clone().into_lwe_secret_key();
let bsk = LweBootstrapKey::new(
Scalar::ZERO,
glwe_dimension.to_glwe_size(),
polynomial_size,
pbs_base_log,
pbs_level,
lwe_dimension,
ciphertext_modulus,
);
let mut engine = ShortintEngine::new();
let modulus_switch_noise_reduction_key = Some(ModulusSwitchNoiseReductionKey::new(
modulus_switch_noise_reduction_params,
&input_lwe_secret_key,
&mut engine,
CiphertextModulus::new_native(),
lwe_noise_distribution,
));
let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new().bootstrap_key(bsk).build();
let message_modulus: Scalar = 1 << 4;
let input_message: Scalar = 3;
let delta: Scalar = (1 << (Scalar::BITS - 1)) / message_modulus;
let plaintext = Plaintext(input_message * delta);
let bench_id;
match get_bench_type() {
BenchmarkType::Latency => {
let streams = CudaStreams::new_multi_gpu();
let gpu_keys = CudaLocalKeys::from_cpu_keys(
&cpu_keys,
modulus_switch_noise_reduction_key.as_ref(),
&streams,
);
let lwe_ciphertext_in: LweCiphertextOwned<Scalar> =
allocate_and_encrypt_new_lwe_ciphertext(
&input_lwe_secret_key_u128,
plaintext,
lwe_noise_distribution_u128,
ciphertext_modulus,
&mut encryption_generator,
);
let lwe_ciphertext_in_gpu =
CudaLweCiphertextList::from_lwe_ciphertext(&lwe_ciphertext_in, &streams);
let accumulator: GlweCiphertextOwned<Scalar> = GlweCiphertextOwned::new(
Scalar::ONE,
glwe_dimension.to_glwe_size(),
polynomial_size,
ciphertext_modulus,
);
let accumulator_gpu =
CudaGlweCiphertextList::from_glwe_ciphertext(&accumulator, &streams);
let out_pbs_ct = LweCiphertext::new(
Scalar::ZERO,
output_lwe_secret_key.lwe_dimension().to_lwe_size(),
ciphertext_modulus,
);
let mut out_pbs_ct_gpu =
CudaLweCiphertextList::from_lwe_ciphertext(&out_pbs_ct, &streams);
bench_id = format!("{bench_name}::{params_name}");
{
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
cuda_programmable_bootstrap_128_lwe_ciphertext(
&lwe_ciphertext_in_gpu,
&mut out_pbs_ct_gpu,
&accumulator_gpu,
LweCiphertextCount(1),
gpu_keys.bsk.as_ref().unwrap(),
&streams,
);
black_box(&mut out_pbs_ct_gpu);
})
});
}
}
BenchmarkType::Throughput => {
let gpu_keys_vec =
cuda_local_keys_core(&cpu_keys, modulus_switch_noise_reduction_key.as_ref());
let gpu_count = get_number_of_gpus() as usize;
bench_id = format!("{bench_name}::throughput::{params_name}");
let blocks: usize = 1;
let elements = throughput_num_threads(blocks, 1);
let elements_per_stream = elements as usize / gpu_count;
bench_group.throughput(Throughput::Elements(elements));
bench_group.bench_function(&bench_id, |b| {
let setup_encrypted_values = || {
let local_streams = cuda_local_streams_core();
let plaintext_list =
PlaintextList::new(Scalar::ZERO, PlaintextCount(elements_per_stream));
let input_cts = (0..gpu_count)
.map(|i| {
let mut input_ct_list = LweCiphertextList::new(
Scalar::ZERO,
input_lwe_secret_key.lwe_dimension().to_lwe_size(),
LweCiphertextCount(elements_per_stream),
ciphertext_modulus,
);
encrypt_lwe_ciphertext_list(
&input_lwe_secret_key_u128,
&mut input_ct_list,
&plaintext_list,
lwe_noise_distribution_u128,
&mut encryption_generator,
);
CudaLweCiphertextList::from_lwe_ciphertext_list(
&input_ct_list,
&local_streams[i],
)
})
.collect::<Vec<_>>();
let accumulators = (0..gpu_count)
.map(|i| {
let accumulator = GlweCiphertextOwned::new(
Scalar::ONE,
glwe_dimension.to_glwe_size(),
polynomial_size,
ciphertext_modulus,
);
CudaGlweCiphertextList::from_glwe_ciphertext(
&accumulator,
&local_streams[i],
)
})
.collect::<Vec<_>>();
// Allocate the LweCiphertext to store the result of the PBS
let output_cts = (0..gpu_count)
.map(|i| {
let output_ct_list = LweCiphertextList::new(
Scalar::ZERO,
output_lwe_secret_key.lwe_dimension().to_lwe_size(),
LweCiphertextCount(elements_per_stream),
ciphertext_modulus,
);
CudaLweCiphertextList::from_lwe_ciphertext_list(
&output_ct_list,
&local_streams[i],
)
})
.collect::<Vec<_>>();
local_streams.iter().for_each(|stream| stream.synchronize());
(input_cts, output_cts, accumulators, local_streams)
};
b.iter_batched(
setup_encrypted_values,
|(input_cts, mut output_cts, accumulators, local_streams)| {
(0..gpu_count)
.into_par_iter()
.zip(input_cts.par_iter())
.zip(output_cts.par_iter_mut())
.zip(accumulators.par_iter())
.zip(local_streams.par_iter())
.for_each(
|((((i, input_ct), output_ct), accumulator), local_stream)| {
cuda_programmable_bootstrap_128_lwe_ciphertext(
input_ct,
output_ct,
accumulator,
LweCiphertextCount(1),
gpu_keys_vec[i].bsk.as_ref().unwrap(),
local_stream,
);
},
)
},
criterion::BatchSize::SmallInput,
);
});
}
};
let params_record = CryptoParametersRecord {
lwe_dimension: Some(lwe_dimension),
glwe_dimension: Some(glwe_dimension),
polynomial_size: Some(polynomial_size),
lwe_noise_distribution: Some(lwe_noise_distribution),
glwe_noise_distribution: Some(glwe_noise_distribution),
pbs_base_log: Some(pbs_base_log),
pbs_level: Some(pbs_level),
ciphertext_modulus: Some(ct_modulus_u64),
..Default::default()
};
let bit_size = (message_modulus as u32).ilog2();
write_to_json(
&bench_id,
params_record,
params_name,
"pbs",
&OperatorType::Atomic,
bit_size,
vec![bit_size],
);
}
pub fn cuda_pbs128_group() {
let mut criterion: Criterion<_> = Criterion::default().configure_from_args();
cuda_pbs_128(&mut criterion);
}
}
#[cfg(feature = "gpu")]
use cuda::cuda_pbs128_group;
pub fn pbs128_group() {
let mut criterion: Criterion<_> = Criterion::default().configure_from_args();
pbs_128(&mut criterion);
}
#[cfg(feature = "gpu")]
fn go_through_gpu_bench_groups() {
cuda_pbs128_group();
}
#[cfg(not(feature = "gpu"))]
fn go_through_cpu_bench_groups() {
pbs128_group();
}
fn main() {
#[cfg(feature = "gpu")]
go_through_gpu_bench_groups();
#[cfg(not(feature = "gpu"))]
go_through_cpu_bench_groups();
Criterion::default().configure_from_args().final_summary();
}

File diff suppressed because it is too large

tfhe-benchmark/benches/high_level_api/bench.rs

@@ -0,0 +1,134 @@
use benchmark::params_aliases::BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
use criterion::{black_box, Criterion};
use rand::prelude::*;
use std::fmt::Write;
use std::ops::*;
use tfhe::prelude::*;
use tfhe::{
set_server_key, ClientKey, CompressedServerKey, ConfigBuilder, FheUint10, FheUint12,
FheUint128, FheUint14, FheUint16, FheUint2, FheUint32, FheUint4, FheUint6, FheUint64, FheUint8,
};
fn bench_fhe_type<FheType>(c: &mut Criterion, client_key: &ClientKey, type_name: &str)
where
FheType: FheEncrypt<u128, ClientKey>,
for<'a> &'a FheType: Add<&'a FheType, Output = FheType>
+ Sub<&'a FheType, Output = FheType>
+ Mul<&'a FheType, Output = FheType>
+ BitAnd<&'a FheType, Output = FheType>
+ BitOr<&'a FheType, Output = FheType>
+ BitXor<&'a FheType, Output = FheType>
+ Shl<&'a FheType, Output = FheType>
+ Shr<&'a FheType, Output = FheType>
+ RotateLeft<&'a FheType, Output = FheType>
+ RotateRight<&'a FheType, Output = FheType>
+ OverflowingAdd<&'a FheType, Output = FheType>
+ OverflowingSub<&'a FheType, Output = FheType>,
{
let mut bench_group = c.benchmark_group(type_name);
let mut rng = thread_rng();
let lhs = FheType::encrypt(rng.gen(), client_key);
let rhs = FheType::encrypt(rng.gen(), client_key);
let mut name = String::with_capacity(255);
write!(name, "add({type_name}, {type_name})").unwrap();
bench_group.bench_function(&name, |b| b.iter(|| black_box(&lhs + &rhs)));
name.clear();
write!(name, "overflowing_add({type_name}, {type_name})").unwrap();
bench_group.bench_function(&name, |b| {
b.iter(|| black_box((&lhs).overflowing_add(&rhs)))
});
name.clear();
write!(name, "overflowing_sub({type_name}, {type_name})").unwrap();
bench_group.bench_function(&name, |b| b.iter(|| black_box(lhs.overflowing_sub(&rhs))));
name.clear();
write!(name, "sub({type_name}, {type_name})").unwrap();
bench_group.bench_function(&name, |b| b.iter(|| black_box(&lhs - &rhs)));
name.clear();
write!(name, "mul({type_name}, {type_name})").unwrap();
bench_group.bench_function(&name, |b| b.iter(|| black_box(&lhs * &rhs)));
name.clear();
write!(name, "bitand({type_name}, {type_name})").unwrap();
bench_group.bench_function(&name, |b| b.iter(|| black_box(&lhs & &rhs)));
name.clear();
write!(name, "bitor({type_name}, {type_name})").unwrap();
bench_group.bench_function(&name, |b| b.iter(|| black_box(&lhs | &rhs)));
name.clear();
write!(name, "bitxor({type_name}, {type_name})").unwrap();
bench_group.bench_function(&name, |b| b.iter(|| black_box(&lhs ^ &rhs)));
name.clear();
write!(name, "shl({type_name}, {type_name})").unwrap();
bench_group.bench_function(&name, |b| b.iter(|| black_box(&lhs << &rhs)));
name.clear();
write!(name, "shr({type_name}, {type_name})").unwrap();
bench_group.bench_function(&name, |b| b.iter(|| black_box(&lhs >> &rhs)));
name.clear();
write!(name, "rotl({type_name}, {type_name})").unwrap();
bench_group.bench_function(&name, |b| b.iter(|| black_box((&lhs).rotate_left(&rhs))));
name.clear();
write!(name, "rotr({type_name}, {type_name})").unwrap();
bench_group.bench_function(&name, |b| b.iter(|| black_box((&lhs).rotate_right(&rhs))));
name.clear();
}
macro_rules! bench_type {
($fhe_type:ident) => {
::paste::paste! {
fn [<bench_ $fhe_type:snake>](c: &mut Criterion, cks: &ClientKey) {
bench_fhe_type::<$fhe_type>(c, cks, stringify!($fhe_type));
}
}
};
}
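// For reference (not part of this diff): with paste's `:snake` case conversion,
// `bench_type!(FheUint64)` is expected to expand to roughly
//
//     fn bench_fhe_uint64(c: &mut Criterion, cks: &ClientKey) {
//         bench_fhe_type::<FheUint64>(c, cks, "FheUint64");
//     }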
bench_type!(FheUint2);
bench_type!(FheUint4);
bench_type!(FheUint6);
bench_type!(FheUint8);
bench_type!(FheUint10);
bench_type!(FheUint12);
bench_type!(FheUint14);
bench_type!(FheUint16);
bench_type!(FheUint32);
bench_type!(FheUint64);
bench_type!(FheUint128);
fn main() {
let config =
ConfigBuilder::with_custom_parameters(BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128)
.build();
let cks = ClientKey::generate(config);
let compressed_sks = CompressedServerKey::new(&cks);
set_server_key(compressed_sks.decompress());
let mut c = Criterion::default().configure_from_args();
bench_fhe_uint2(&mut c, &cks);
bench_fhe_uint4(&mut c, &cks);
bench_fhe_uint6(&mut c, &cks);
bench_fhe_uint8(&mut c, &cks);
bench_fhe_uint10(&mut c, &cks);
bench_fhe_uint12(&mut c, &cks);
bench_fhe_uint14(&mut c, &cks);
bench_fhe_uint16(&mut c, &cks);
bench_fhe_uint32(&mut c, &cks);
bench_fhe_uint64(&mut c, &cks);
bench_fhe_uint128(&mut c, &cks);
c.final_summary();
}

tfhe-benchmark/benches/high_level_api/dex.rs

@@ -0,0 +1,539 @@
#[cfg(feature = "gpu")]
use benchmark::params_aliases::BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
#[cfg(not(feature = "gpu"))]
use benchmark::params_aliases::BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
#[cfg(feature = "gpu")]
use benchmark::utilities::configure_gpu;
use benchmark::utilities::{write_to_json, OperatorType};
use criterion::measurement::WallTime;
use criterion::{BenchmarkGroup, Criterion};
use rand::prelude::*;
use rand::thread_rng;
use std::ops::{Add, Div, Mul, Sub};
use tfhe::keycache::NamedParam;
use tfhe::prelude::*;
#[cfg(not(feature = "gpu"))]
use tfhe::{set_server_key, CompressedServerKey};
use tfhe::{ClientKey, ConfigBuilder, FheBool, FheUint128, FheUint64};
pub(crate) fn transfer_whitepaper<FheType>(
from_amount: &FheType,
to_amount: &FheType,
amount: &FheType,
) -> (FheType, FheType)
where
FheType: Add<Output = FheType> + for<'a> FheOrd<&'a FheType>,
FheBool: IfThenElse<FheType>,
for<'a> &'a FheType: Add<Output = FheType> + Sub<Output = FheType>,
{
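// Compute both updated balances, then keep them only if the sender has enough funds;
// otherwise the original balances are returned unchanged.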
let has_enough_funds = (from_amount).ge(amount);
let mut new_to_amount = to_amount + amount;
new_to_amount = has_enough_funds.if_then_else(&new_to_amount, to_amount);
let mut new_from_amount = from_amount - amount;
new_from_amount = has_enough_funds.if_then_else(&new_from_amount, from_amount);
(new_from_amount, new_to_amount)
}
#[allow(clippy::too_many_arguments)]
fn swap_request<FheType>(
from_balance_0: &FheType,
from_balance_1: &FheType,
current_dex_balance_0: &FheType,
current_dex_balance_1: &FheType,
to_balance_0: &FheType,
to_balance_1: &FheType,
total_dex_token_0_in: &FheType,
total_dex_token_1_in: &FheType,
amount0: &FheType,
amount1: &FheType,
) -> (FheType, FheType, FheType, FheType)
where
FheType: Add<Output = FheType> + for<'a> FheOrd<&'a FheType> + Clone,
FheBool: IfThenElse<FheType>,
for<'a> &'a FheType: Add<Output = FheType> + Sub<Output = FheType>,
{
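// Run the two transfers from the `from_balance_*` accounts into the DEX balances (each one is a
// no-op when funds are insufficient), then credit whatever was actually sent to the pending
// amounts and the running per-token totals.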
let (_, new_current_balance_0) =
transfer_whitepaper(from_balance_0, current_dex_balance_0, amount0);
let (_, new_current_balance_1) =
transfer_whitepaper(from_balance_1, current_dex_balance_1, amount1);
let sent0 = &new_current_balance_0 - current_dex_balance_0;
let sent1 = &new_current_balance_1 - current_dex_balance_1;
let pending_0_in = to_balance_0 + &sent0;
let pending_total_token_0_in = total_dex_token_0_in + &sent0;
let pending_1_in = to_balance_1 + &sent1;
let pending_total_token_1_in = total_dex_token_1_in + &sent1;
(
pending_0_in,
pending_total_token_0_in,
pending_1_in,
pending_total_token_1_in,
)
}
#[allow(clippy::too_many_arguments)]
fn swap_claim<FheType, BigFheType>(
pending_0_in: &FheType,
pending_1_in: &FheType,
total_dex_token_0_in: u64,
total_dex_token_1_in: u64,
total_dex_token_0_out: u64,
total_dex_token_1_out: u64,
old_balance_0: &FheType,
old_balance_1: &FheType,
current_dex_balance_0: &FheType,
current_dex_balance_1: &FheType,
) -> (FheType, FheType)
where
FheType: CastFrom<FheBool>
+ for<'a> FheOrd<&'a FheType>
+ CastFrom<BigFheType>
+ Clone
+ Add<Output = FheType>,
BigFheType: CastFrom<FheType> + Mul<u128, Output = BigFheType> + Div<u128, Output = BigFheType>,
FheBool: IfThenElse<FheType>,
for<'a> &'a FheType: Add<Output = FheType> + Sub<Output = FheType>,
{
let mut new_balance_0 = old_balance_0.clone();
let mut new_balance_1 = old_balance_1.clone();
if total_dex_token_1_in != 0 {
let big_pending_1_in = BigFheType::cast_from(pending_1_in.clone());
let big_amount_0_out =
(big_pending_1_in * total_dex_token_0_out as u128) / total_dex_token_1_in as u128;
let amount_0_out = FheType::cast_from(big_amount_0_out);
let (_, new_balance_0_tmp) =
transfer_whitepaper(current_dex_balance_0, old_balance_0, &amount_0_out);
new_balance_0 = new_balance_0_tmp;
}
if total_dex_token_0_in != 0 {
let big_pending_0_in = BigFheType::cast_from(pending_0_in.clone());
let big_amount_1_out =
(big_pending_0_in * total_dex_token_1_out as u128) / total_dex_token_0_in as u128;
let amount_1_out = FheType::cast_from(big_amount_1_out);
let (_, new_balance_1_tmp) =
transfer_whitepaper(current_dex_balance_1, old_balance_1, &amount_1_out);
new_balance_1 = new_balance_1_tmp;
}
(new_balance_0, new_balance_1)
}
#[cfg(feature = "pbs-stats")]
mod pbs_stats {
use super::*;
use std::fs::{File, OpenOptions};
use std::io::Write;
use std::path::Path;
fn write_result(file: &mut File, name: &str, value: usize) {
let line = format!("{name},{value}\n");
let error_message = format!("cannot write {name} result into file");
file.write_all(line.as_bytes()).expect(&error_message);
}
pub fn print_swap_request_pbs_counts<FheType, F>(
client_key: &ClientKey,
type_name: &str,
swap_request_func: F,
) where
FheType: FheEncrypt<u64, ClientKey>,
F: for<'a> Fn(
&'a FheType,
&'a FheType,
&'a FheType,
&'a FheType,
&'a FheType,
&'a FheType,
&'a FheType,
&'a FheType,
&'a FheType,
&'a FheType,
) -> (FheType, FheType, FheType, FheType),
{
let mut rng = thread_rng();
let from_balance_0 = FheType::encrypt(rng.gen::<u64>(), client_key);
let from_balance_1 = FheType::encrypt(rng.gen::<u64>(), client_key);
let current_dex_balance_0 = FheType::encrypt(rng.gen::<u64>(), client_key);
let current_dex_balance_1 = FheType::encrypt(rng.gen::<u64>(), client_key);
let to_balance_0 = FheType::encrypt(rng.gen::<u64>(), client_key);
let to_balance_1 = FheType::encrypt(rng.gen::<u64>(), client_key);
let total_dex_token_0 = FheType::encrypt(rng.gen::<u64>(), client_key);
let total_dex_token_1 = FheType::encrypt(rng.gen::<u64>(), client_key);
let amount_0 = FheType::encrypt(rng.gen::<u64>(), client_key);
let amount_1 = FheType::encrypt(rng.gen::<u64>(), client_key);
#[cfg(feature = "gpu")]
configure_gpu(client_key);
tfhe::reset_pbs_count();
let (_, _, _, _) = swap_request_func(
&from_balance_0,
&from_balance_1,
&current_dex_balance_0,
&current_dex_balance_1,
&to_balance_0,
&to_balance_1,
&total_dex_token_0,
&total_dex_token_1,
&amount_0,
&amount_1,
);
let count = tfhe::get_pbs_count();
println!("ERC20 swap request/::{type_name}: {count} PBS");
let params = client_key.computation_parameters();
let test_name = if cfg!(feature = "gpu") {
format!("hlapi::cuda::dex::swap_request::pbs_count::{type_name}")
} else {
format!("hlapi::dex::swap_request::pbs_count::{type_name}")
};
let results_file = Path::new("dex_swap_request_pbs_count.csv");
if !results_file.exists() {
File::create(results_file).expect("create results file failed");
}
let mut file = OpenOptions::new()
.append(true)
.open(results_file)
.expect("cannot open results file");
write_result(&mut file, &test_name, count as usize);
write_to_json::<u64, _>(
&test_name,
params,
params.name(),
"pbs-count",
&OperatorType::Atomic,
0,
vec![],
);
}
pub fn print_swap_claim_pbs_counts<FheType, F>(
client_key: &ClientKey,
type_name: &str,
swap_claim_func: F,
) where
FheType: FheEncrypt<u64, ClientKey>,
F: for<'a> Fn(
&'a FheType,
&'a FheType,
u64,
u64,
u64,
u64,
&'a FheType,
&'a FheType,
&'a FheType,
&'a FheType,
) -> (FheType, FheType),
{
let mut rng = thread_rng();
let pending_0_in = FheType::encrypt(rng.gen::<u64>(), client_key);
let pending_1_in = FheType::encrypt(rng.gen::<u64>(), client_key);
let total_dex_token_0_in = rng.gen::<u64>();
let total_dex_token_1_in = rng.gen::<u64>();
let total_dex_token_0_out = rng.gen::<u64>();
let total_dex_token_1_out = rng.gen::<u64>();
let old_balance_0 = FheType::encrypt(rng.gen::<u64>(), client_key);
let old_balance_1 = FheType::encrypt(rng.gen::<u64>(), client_key);
let current_dex_balance_0 = FheType::encrypt(rng.gen::<u64>(), client_key);
let current_dex_balance_1 = FheType::encrypt(rng.gen::<u64>(), client_key);
#[cfg(feature = "gpu")]
configure_gpu(client_key);
tfhe::reset_pbs_count();
let (_, _) = swap_claim_func(
&pending_0_in,
&pending_1_in,
total_dex_token_0_in,
total_dex_token_1_in,
total_dex_token_0_out,
total_dex_token_1_out,
&old_balance_0,
&old_balance_1,
&current_dex_balance_0,
&current_dex_balance_1,
);
let count = tfhe::get_pbs_count();
println!("ERC20 swap claim/::{type_name}: {count} PBS");
let params = client_key.computation_parameters();
let test_name = if cfg!(feature = "gpu") {
format!("hlapi::cuda::dex::swap_claim::pbs_count::{type_name}")
} else {
format!("hlapi::dex::swap_claim::pbs_count::{type_name}")
};
let results_file = Path::new("dex_swap_claim_pbs_count.csv");
if !results_file.exists() {
File::create(results_file).expect("create results file failed");
}
let mut file = OpenOptions::new()
.append(true)
.open(results_file)
.expect("cannot open results file");
write_result(&mut file, &test_name, count as usize);
write_to_json::<u64, _>(
&test_name,
params,
params.name(),
"pbs-count",
&OperatorType::Atomic,
0,
vec![],
);
}
}
fn bench_swap_request_latency<FheType, F>(
c: &mut BenchmarkGroup<'_, WallTime>,
client_key: &ClientKey,
bench_name: &str,
type_name: &str,
fn_name: &str,
swap_request_func: F,
) where
FheType: FheEncrypt<u64, ClientKey>,
F: for<'a> Fn(
&'a FheType,
&'a FheType,
&'a FheType,
&'a FheType,
&'a FheType,
&'a FheType,
&'a FheType,
&'a FheType,
&'a FheType,
&'a FheType,
) -> (FheType, FheType, FheType, FheType),
{
#[cfg(feature = "gpu")]
configure_gpu(client_key);
let bench_id = format!("{bench_name}::{fn_name}::{type_name}");
c.bench_function(&bench_id, |b| {
let mut rng = thread_rng();
let from_balance_0 = FheType::encrypt(rng.gen::<u64>(), client_key);
let from_balance_1 = FheType::encrypt(rng.gen::<u64>(), client_key);
let current_balance_0 = FheType::encrypt(rng.gen::<u64>(), client_key);
let current_balance_1 = FheType::encrypt(rng.gen::<u64>(), client_key);
let to_balance_0 = FheType::encrypt(rng.gen::<u64>(), client_key);
let to_balance_1 = FheType::encrypt(rng.gen::<u64>(), client_key);
let total_token_0 = FheType::encrypt(rng.gen::<u64>(), client_key);
let total_token_1 = FheType::encrypt(rng.gen::<u64>(), client_key);
let amount_0 = FheType::encrypt(rng.gen::<u64>(), client_key);
let amount_1 = FheType::encrypt(rng.gen::<u64>(), client_key);
b.iter(|| {
let (_, _, _, _) = swap_request_func(
&from_balance_0,
&from_balance_1,
&current_balance_0,
&current_balance_1,
&to_balance_0,
&to_balance_1,
&total_token_0,
&total_token_1,
&amount_0,
&amount_1,
);
})
});
let params = client_key.computation_parameters();
write_to_json::<u64, _>(
&bench_id,
params,
params.name(),
"dex-swap-request",
&OperatorType::Atomic,
64,
vec![],
);
}
fn bench_swap_claim_latency<FheType, F>(
c: &mut BenchmarkGroup<'_, WallTime>,
client_key: &ClientKey,
bench_name: &str,
type_name: &str,
fn_name: &str,
swap_claim_func: F,
) where
FheType: FheEncrypt<u64, ClientKey>,
F: for<'a> Fn(
&'a FheType,
&'a FheType,
u64,
u64,
u64,
u64,
&'a FheType,
&'a FheType,
&'a FheType,
&'a FheType,
) -> (FheType, FheType),
{
#[cfg(feature = "gpu")]
configure_gpu(client_key);
let bench_id = format!("{bench_name}::{fn_name}::{type_name}");
c.bench_function(&bench_id, |b| {
let mut rng = thread_rng();
let pending_0_in = FheType::encrypt(rng.gen::<u64>(), client_key);
let pending_1_in = FheType::encrypt(rng.gen::<u64>(), client_key);
let total_token_0_in = rng.gen::<u64>();
let total_token_1_in = rng.gen::<u64>();
let total_token_0_out = rng.gen::<u64>();
let total_token_1_out = rng.gen::<u64>();
let old_balance_0 = FheType::encrypt(rng.gen::<u64>(), client_key);
let old_balance_1 = FheType::encrypt(rng.gen::<u64>(), client_key);
let current_balance_0 = FheType::encrypt(rng.gen::<u64>(), client_key);
let current_balance_1 = FheType::encrypt(rng.gen::<u64>(), client_key);
b.iter(|| {
let (_, _) = swap_claim_func(
&pending_0_in,
&pending_1_in,
total_token_0_in,
total_token_1_in,
total_token_0_out,
total_token_1_out,
&old_balance_0,
&old_balance_1,
&current_balance_0,
&current_balance_1,
);
})
});
let params = client_key.computation_parameters();
write_to_json::<u64, _>(
&bench_id,
params,
params.name(),
"dex-swap-claim",
&OperatorType::Atomic,
64,
vec![],
);
}
#[cfg(feature = "pbs-stats")]
use crate::pbs_stats::print_swap_claim_pbs_counts;
#[cfg(feature = "pbs-stats")]
use crate::pbs_stats::print_swap_request_pbs_counts;
#[cfg(not(feature = "gpu"))]
fn main() {
let params = BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
let config = ConfigBuilder::with_custom_parameters(params).build();
let cks = ClientKey::generate(config);
let compressed_sks = CompressedServerKey::new(&cks);
let sks = compressed_sks.decompress();
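    // Install the server key on every rayon worker thread, then on the main thread.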
rayon::broadcast(|_| set_server_key(sks.clone()));
set_server_key(sks);
let mut c = Criterion::default().sample_size(10).configure_from_args();
let bench_name = "hlapi::dex";
    // FheUint64 PBS counts
    // We only run each operation once: since every input is encrypted,
    // the PBS count is always the same.
#[cfg(feature = "pbs-stats")]
{
print_swap_request_pbs_counts(&cks, "FheUint64", swap_request::<FheUint64>);
print_swap_claim_pbs_counts(&cks, "FheUint64", swap_claim::<FheUint64, FheUint128>);
}
// FheUint64 latency
{
let mut group = c.benchmark_group(bench_name);
bench_swap_request_latency(
&mut group,
&cks,
bench_name,
"FheUint64",
"swap_request",
swap_request::<FheUint64>,
);
bench_swap_claim_latency(
&mut group,
&cks,
bench_name,
"FheUint64",
"swap_claim",
swap_claim::<FheUint64, FheUint128>,
);
group.finish();
}
c.final_summary();
}
#[cfg(feature = "gpu")]
fn main() {
let params = BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
let config = ConfigBuilder::with_custom_parameters(params).build();
let cks = ClientKey::generate(config);
let mut c = Criterion::default().sample_size(10).configure_from_args();
let bench_name = "hlapi::cuda::dex";
    // FheUint64 PBS counts
    // We only run each operation once: since every input is encrypted,
    // the PBS count is always the same.
#[cfg(feature = "pbs-stats")]
{
print_swap_request_pbs_counts(&cks, "FheUint64", swap_request::<FheUint64>);
print_swap_claim_pbs_counts(&cks, "FheUint64", swap_claim::<FheUint64, FheUint128>);
}
// FheUint64 latency
{
let mut group = c.benchmark_group(bench_name);
bench_swap_request_latency(
&mut group,
&cks,
bench_name,
"FheUint64",
"swap_request",
swap_request::<FheUint64>,
);
bench_swap_claim_latency(
&mut group,
&cks,
bench_name,
"FheUint64",
"swap_claim",
swap_claim::<FheUint64, FheUint128>,
);
group.finish();
}
c.final_summary();
}

View File

@@ -0,0 +1,595 @@
#[cfg(feature = "gpu")]
use benchmark::params_aliases::BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
#[cfg(not(feature = "gpu"))]
use benchmark::params_aliases::BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
#[cfg(feature = "gpu")]
use benchmark::utilities::configure_gpu;
use benchmark::utilities::{write_to_json, OperatorType};
use criterion::measurement::WallTime;
use criterion::{BenchmarkGroup, Criterion, Throughput};
use rand::prelude::*;
use rand::thread_rng;
use rayon::prelude::*;
use std::ops::{Add, Mul, Sub};
use tfhe::keycache::NamedParam;
use tfhe::prelude::*;
#[cfg(feature = "gpu")]
use tfhe::GpuIndex;
use tfhe::{set_server_key, ClientKey, CompressedServerKey, ConfigBuilder, FheBool, FheUint64};
/// Transfer as written in the original FHEvm white-paper:
/// it uses a comparison to check that the sender has enough funds,
/// and cmuxes based on the comparison result
pub fn transfer_whitepaper<FheType>(
from_amount: &FheType,
to_amount: &FheType,
amount: &FheType,
) -> (FheType, FheType)
where
FheType: Add<Output = FheType> + for<'a> FheOrd<&'a FheType>,
FheBool: IfThenElse<FheType>,
for<'a> &'a FheType: Add<Output = FheType> + Sub<Output = FheType>,
{
let has_enough_funds = (from_amount).ge(amount);
let mut new_to_amount = to_amount + amount;
new_to_amount = has_enough_funds.if_then_else(&new_to_amount, to_amount);
let mut new_from_amount = from_amount - amount;
new_from_amount = has_enough_funds.if_then_else(&new_from_amount, from_amount);
(new_from_amount, new_to_amount)
}
/// This one also uses a comparison, but it leverages the 'boolean' multiplication
/// instead of cmuxes, so it is faster
fn transfer_no_cmux<FheType>(
from_amount: &FheType,
to_amount: &FheType,
amount: &FheType,
) -> (FheType, FheType)
where
FheType: Add<Output = FheType> + CastFrom<FheBool> + for<'a> FheOrd<&'a FheType>,
FheBool: IfThenElse<FheType>,
for<'a> &'a FheType:
Add<Output = FheType> + Sub<Output = FheType> + Mul<FheType, Output = FheType>,
{
let has_enough_funds = (from_amount).ge(amount);
let amount = amount * FheType::cast_from(has_enough_funds);
let new_to_amount = to_amount + &amount;
let new_from_amount = from_amount - &amount;
(new_from_amount, new_to_amount)
}
/// This one uses an overflowing sub to remove the need for a comparison;
/// it also uses the 'boolean' multiplication
fn transfer_overflow<FheType>(
from_amount: &FheType,
to_amount: &FheType,
amount: &FheType,
) -> (FheType, FheType)
where
FheType: CastFrom<FheBool> + for<'a> FheOrd<&'a FheType>,
FheBool: IfThenElse<FheType>,
for<'a> &'a FheType: Add<FheType, Output = FheType>
+ OverflowingSub<&'a FheType, Output = FheType>
+ Mul<FheType, Output = FheType>,
{
let (new_from, did_not_have_enough) = (from_amount).overflowing_sub(amount);
let new_from_amount = did_not_have_enough.if_then_else(from_amount, &new_from);
let had_enough_funds = !did_not_have_enough;
let new_to_amount = to_amount + (amount * FheType::cast_from(had_enough_funds));
(new_from_amount, new_to_amount)
}
/// This one uses both overflowing_add/sub to check that
/// the sender has enough funds and that the receiver will not overflow its balance
fn transfer_safe<FheType>(
from_amount: &FheType,
to_amount: &FheType,
amount: &FheType,
) -> (FheType, FheType)
where
for<'a> &'a FheType: OverflowingSub<&'a FheType, Output = FheType>
+ OverflowingAdd<&'a FheType, Output = FheType>,
FheBool: IfThenElse<FheType>,
{
let (new_from, did_not_have_enough_funds) = (from_amount).overflowing_sub(amount);
let (new_to, did_not_have_enough_space) = (to_amount).overflowing_add(amount);
let something_not_ok = did_not_have_enough_funds | did_not_have_enough_space;
let new_from_amount = something_not_ok.if_then_else(from_amount, &new_from);
let new_to_amount = something_not_ok.if_then_else(to_amount, &new_to);
(new_from_amount, new_to_amount)
}
#[cfg(feature = "pbs-stats")]
mod pbs_stats {
use super::*;
use std::fs::{File, OpenOptions};
use std::io::Write;
use std::path::Path;
fn write_result(file: &mut File, name: &str, value: usize) {
let line = format!("{name},{value}\n");
let error_message = format!("cannot write {name} result into file");
file.write_all(line.as_bytes()).expect(&error_message);
}
pub fn print_transfer_pbs_counts<FheType, F>(
client_key: &ClientKey,
type_name: &str,
fn_name: &str,
transfer_func: F,
) where
FheType: FheEncrypt<u64, ClientKey>,
F: for<'a> Fn(&'a FheType, &'a FheType, &'a FheType) -> (FheType, FheType),
{
let mut rng = thread_rng();
let from_amount = FheType::encrypt(rng.gen::<u64>(), client_key);
let to_amount = FheType::encrypt(rng.gen::<u64>(), client_key);
let amount = FheType::encrypt(rng.gen::<u64>(), client_key);
#[cfg(feature = "gpu")]
configure_gpu(client_key);
tfhe::reset_pbs_count();
let (_, _) = transfer_func(&from_amount, &to_amount, &amount);
let count = tfhe::get_pbs_count();
println!("ERC20 transfer/{fn_name}::{type_name}: {count} PBS");
let params = client_key.computation_parameters();
let test_name = if cfg!(feature = "gpu") {
format!("hlapi::cuda::erc20::pbs_count::{fn_name}::{type_name}")
} else {
format!("hlapi::erc20::pbs_count::{fn_name}::{type_name}")
};
let results_file = Path::new("erc20_pbs_count.csv");
if !results_file.exists() {
File::create(results_file).expect("create results file failed");
}
let mut file = OpenOptions::new()
.append(true)
.open(results_file)
.expect("cannot open results file");
write_result(&mut file, &test_name, count as usize);
write_to_json::<u64, _>(
&test_name,
params,
params.name(),
"pbs-count",
&OperatorType::Atomic,
0,
vec![],
);
}
}
fn bench_transfer_latency<FheType, F>(
c: &mut BenchmarkGroup<'_, WallTime>,
client_key: &ClientKey,
bench_name: &str,
type_name: &str,
fn_name: &str,
transfer_func: F,
) where
FheType: FheEncrypt<u64, ClientKey>,
F: for<'a> Fn(&'a FheType, &'a FheType, &'a FheType) -> (FheType, FheType),
{
#[cfg(feature = "gpu")]
configure_gpu(client_key);
let bench_id = format!("{bench_name}::{fn_name}::{type_name}");
c.bench_function(&bench_id, |b| {
let mut rng = thread_rng();
let from_amount = FheType::encrypt(rng.gen::<u64>(), client_key);
let to_amount = FheType::encrypt(rng.gen::<u64>(), client_key);
let amount = FheType::encrypt(rng.gen::<u64>(), client_key);
b.iter(|| {
let (_, _) = transfer_func(&from_amount, &to_amount, &amount);
})
});
let params = client_key.computation_parameters();
write_to_json::<u64, _>(
&bench_id,
params,
params.name(),
"erc20-transfer",
&OperatorType::Atomic,
64,
vec![],
);
}
#[cfg(not(feature = "gpu"))]
fn bench_transfer_throughput<FheType, F>(
group: &mut BenchmarkGroup<'_, WallTime>,
client_key: &ClientKey,
bench_name: &str,
type_name: &str,
fn_name: &str,
transfer_func: F,
) where
FheType: FheEncrypt<u64, ClientKey> + Send + Sync,
F: for<'a> Fn(&'a FheType, &'a FheType, &'a FheType) -> (FheType, FheType) + Sync,
{
let mut rng = thread_rng();
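    // Batches of 10, 100 and 500 independent transfers are executed in parallel with rayon.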
for num_elems in [10, 100, 500] {
group.throughput(Throughput::Elements(num_elems));
let bench_id =
format!("{bench_name}::throughput::{fn_name}::{type_name}::{num_elems}_elems");
group.bench_with_input(&bench_id, &num_elems, |b, &num_elems| {
let from_amounts = (0..num_elems)
.map(|_| FheType::encrypt(rng.gen::<u64>(), client_key))
.collect::<Vec<_>>();
let to_amounts = (0..num_elems)
.map(|_| FheType::encrypt(rng.gen::<u64>(), client_key))
.collect::<Vec<_>>();
let amounts = (0..num_elems)
.map(|_| FheType::encrypt(rng.gen::<u64>(), client_key))
.collect::<Vec<_>>();
b.iter(|| {
from_amounts
.par_iter()
.zip(to_amounts.par_iter().zip(amounts.par_iter()))
.for_each(|(from_amount, (to_amount, amount))| {
let (_, _) = transfer_func(from_amount, to_amount, amount);
})
})
});
let params = client_key.computation_parameters();
write_to_json::<u64, _>(
&bench_id,
params,
params.name(),
"erc20-transfer",
&OperatorType::Atomic,
64,
vec![],
);
}
}
#[cfg(feature = "gpu")]
fn cuda_bench_transfer_throughput<FheType, F>(
group: &mut BenchmarkGroup<'_, WallTime>,
client_key: &ClientKey,
bench_name: &str,
type_name: &str,
fn_name: &str,
transfer_func: F,
) where
FheType: FheEncrypt<u64, ClientKey> + Send + Sync,
F: for<'a> Fn(&'a FheType, &'a FheType, &'a FheType) -> (FheType, FheType) + Sync,
{
let mut rng = thread_rng();
let num_gpus = get_number_of_gpus() as u64;
let compressed_server_key = CompressedServerKey::new(client_key);
let sks_vec = (0..num_gpus)
.map(|i| compressed_server_key.decompress_to_specific_gpu(GpuIndex::new(i as u32)))
.collect::<Vec<_>>();
for num_elems in [10 * num_gpus, 100 * num_gpus, 500 * num_gpus] {
group.throughput(Throughput::Elements(num_elems));
let bench_id =
format!("{bench_name}::throughput::{fn_name}::{type_name}::{num_elems}_elems");
group.bench_with_input(&bench_id, &num_elems, |b, &num_elems| {
let from_amounts = (0..num_elems)
.map(|_| FheType::encrypt(rng.gen::<u64>(), client_key))
.collect::<Vec<_>>();
let to_amounts = (0..num_elems)
.map(|_| FheType::encrypt(rng.gen::<u64>(), client_key))
.collect::<Vec<_>>();
let amounts = (0..num_elems)
.map(|_| FheType::encrypt(rng.gen::<u64>(), client_key))
.collect::<Vec<_>>();
let num_streams_per_gpu = 8; // Hard coded stream value for FheUint64
let chunk_size = (num_elems / num_gpus) as usize;
b.iter(|| {
from_amounts
                    .par_chunks(chunk_size) // Split the data into one chunk per GPU
.zip(
to_amounts
.par_chunks(chunk_size)
.zip(amounts.par_chunks(chunk_size)),
) // Zip with the other data
.enumerate() // Get the index for GPU
.for_each(
|(i, (from_amount_gpu_i, (to_amount_gpu_i, amount_gpu_i)))| {
// Process chunks within each GPU
let stream_chunk_size = from_amount_gpu_i.len() / num_streams_per_gpu;
from_amount_gpu_i
.par_chunks(stream_chunk_size)
.zip(to_amount_gpu_i.par_chunks(stream_chunk_size))
.zip(amount_gpu_i.par_chunks(stream_chunk_size))
.for_each(
|((from_amount_chunk, to_amount_chunk), amount_chunk)| {
// Set the server key for the current GPU
set_server_key(sks_vec[i].clone());
// Parallel iteration over the chunks of data
from_amount_chunk
.iter()
.zip(to_amount_chunk.iter().zip(amount_chunk.iter()))
.for_each(|(from_amount, (to_amount, amount))| {
transfer_func(from_amount, to_amount, amount);
});
},
);
},
);
});
});
let params = client_key.computation_parameters();
write_to_json::<u64, _>(
&bench_id,
params,
params.name(),
"erc20-transfer",
&OperatorType::Atomic,
64,
vec![],
);
}
}
#[cfg(feature = "pbs-stats")]
use pbs_stats::print_transfer_pbs_counts;
#[cfg(feature = "gpu")]
use tfhe::core_crypto::gpu::get_number_of_gpus;
#[cfg(not(feature = "gpu"))]
fn main() {
let params = BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
let config = ConfigBuilder::with_custom_parameters(params).build();
let cks = ClientKey::generate(config);
let compressed_sks = CompressedServerKey::new(&cks);
let sks = compressed_sks.decompress();
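    // Install the server key on every rayon worker thread, then on the main thread.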
rayon::broadcast(|_| set_server_key(sks.clone()));
set_server_key(sks);
let mut c = Criterion::default().sample_size(10).configure_from_args();
let bench_name = "hlapi::erc20";
    // FheUint64 PBS counts
    // We only run each operation once: since every input is encrypted,
    // the PBS count is always the same.
#[cfg(feature = "pbs-stats")]
{
print_transfer_pbs_counts(
&cks,
"FheUint64",
"transfer::whitepaper",
transfer_whitepaper::<FheUint64>,
);
print_transfer_pbs_counts(&cks, "FheUint64", "no_cmux", transfer_no_cmux::<FheUint64>);
print_transfer_pbs_counts(
&cks,
"FheUint64",
"transfer::overflow",
transfer_overflow::<FheUint64>,
);
print_transfer_pbs_counts(&cks, "FheUint64", "safe", transfer_safe::<FheUint64>);
}
// FheUint64 latency
{
let mut group = c.benchmark_group(bench_name);
bench_transfer_latency(
&mut group,
&cks,
bench_name,
"FheUint64",
"transfer::whitepaper",
transfer_whitepaper::<FheUint64>,
);
bench_transfer_latency(
&mut group,
&cks,
bench_name,
"FheUint64",
"transfer::no_cmux",
transfer_no_cmux::<FheUint64>,
);
bench_transfer_latency(
&mut group,
&cks,
bench_name,
"FheUint64",
"transfer::overflow",
transfer_overflow::<FheUint64>,
);
bench_transfer_latency(
&mut group,
&cks,
bench_name,
"FheUint64",
"transfer::safe",
transfer_safe::<FheUint64>,
);
group.finish();
}
// FheUint64 Throughput
{
let mut group = c.benchmark_group(bench_name);
bench_transfer_throughput(
&mut group,
&cks,
bench_name,
"FheUint64",
"transfer::whitepaper",
transfer_whitepaper::<FheUint64>,
);
bench_transfer_throughput(
&mut group,
&cks,
bench_name,
"FheUint64",
"transfer::no_cmux",
transfer_no_cmux::<FheUint64>,
);
bench_transfer_throughput(
&mut group,
&cks,
bench_name,
"FheUint64",
"transfer::overflow",
transfer_overflow::<FheUint64>,
);
bench_transfer_throughput(
&mut group,
&cks,
bench_name,
"FheUint64",
"transfer::safe",
transfer_safe::<FheUint64>,
);
group.finish();
}
c.final_summary();
}
#[cfg(feature = "gpu")]
fn main() {
let params = BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
let config = ConfigBuilder::with_custom_parameters(params).build();
let cks = ClientKey::generate(config);
let mut c = Criterion::default().sample_size(10).configure_from_args();
let bench_name = "hlapi::cuda::erc20";
    // FheUint64 PBS counts
    // We only run each operation once: since every input is encrypted,
    // the PBS count is always the same.
#[cfg(feature = "pbs-stats")]
{
print_transfer_pbs_counts(
&cks,
"FheUint64",
"transfer::whitepaper",
transfer_whitepaper::<FheUint64>,
);
print_transfer_pbs_counts(&cks, "FheUint64", "no_cmux", transfer_no_cmux::<FheUint64>);
print_transfer_pbs_counts(
&cks,
"FheUint64",
"transfer::overflow",
transfer_overflow::<FheUint64>,
);
print_transfer_pbs_counts(&cks, "FheUint64", "safe", transfer_safe::<FheUint64>);
}
// FheUint64 latency
{
let mut group = c.benchmark_group(bench_name);
bench_transfer_latency(
&mut group,
&cks,
bench_name,
"FheUint64",
"transfer::whitepaper",
transfer_whitepaper::<FheUint64>,
);
bench_transfer_latency(
&mut group,
&cks,
bench_name,
"FheUint64",
"transfer::no_cmux",
transfer_no_cmux::<FheUint64>,
);
bench_transfer_latency(
&mut group,
&cks,
bench_name,
"FheUint64",
"transfer::overflow",
transfer_overflow::<FheUint64>,
);
bench_transfer_latency(
&mut group,
&cks,
bench_name,
"FheUint64",
"transfer::safe",
transfer_safe::<FheUint64>,
);
group.finish();
}
// FheUint64 Throughput
{
let mut group = c.benchmark_group(bench_name);
cuda_bench_transfer_throughput(
&mut group,
&cks,
bench_name,
"FheUint64",
"transfer::whitepaper",
transfer_whitepaper::<FheUint64>,
);
cuda_bench_transfer_throughput(
&mut group,
&cks,
bench_name,
"FheUint64",
"transfer::no_cmux",
transfer_no_cmux::<FheUint64>,
);
cuda_bench_transfer_throughput(
&mut group,
&cks,
bench_name,
"FheUint64",
"transfer::overflow",
transfer_overflow::<FheUint64>,
);
cuda_bench_transfer_throughput(
&mut group,
&cks,
bench_name,
"FheUint64",
"transfer::safe",
transfer_safe::<FheUint64>,
);
group.finish();
}
c.final_summary();
}

File diff suppressed because it is too large

View File

@@ -0,0 +1,368 @@
use benchmark::params_aliases::*;
use benchmark::utilities::{
get_bench_type, throughput_num_threads, write_to_json, BenchmarkType, OperatorType,
};
use criterion::{black_box, criterion_group, Criterion, Throughput};
use rayon::prelude::*;
use std::cmp::max;
use tfhe::integer::ciphertext::CompressedCiphertextListBuilder;
use tfhe::integer::{ClientKey, RadixCiphertext};
use tfhe::keycache::NamedParam;
use tfhe::{get_pbs_count, reset_pbs_count};
fn cpu_glwe_packing(c: &mut Criterion) {
let bench_name = "integer::packing_compression";
let mut bench_group = c.benchmark_group(bench_name);
bench_group
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(30));
let param = BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
let comp_param = BENCH_COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
let cks = ClientKey::new(param);
let private_compression_key = cks.new_compression_private_key(comp_param);
let (compression_key, decompression_key) =
cks.new_compression_decompression_keys(&private_compression_key);
let log_message_modulus = param.message_modulus.0.ilog2() as usize;
for bit_size in [
2,
8,
16,
32,
64,
128,
256,
comp_param.lwe_per_glwe.0 * log_message_modulus,
] {
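        // Each block carries log2(message_modulus) bits, so the requested bit size
        // must be a whole number of blocks.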
assert_eq!(bit_size % log_message_modulus, 0);
let num_blocks = bit_size / log_message_modulus;
let bench_id_pack;
let bench_id_unpack;
match get_bench_type() {
BenchmarkType::Latency => {
let ct = cks.encrypt_radix(0_u32, num_blocks);
let mut builder = CompressedCiphertextListBuilder::new();
builder.push(ct);
bench_id_pack = format!("{bench_name}::pack_u{bit_size}");
bench_group.bench_function(&bench_id_pack, |b| {
b.iter(|| {
let compressed = builder.build(&compression_key);
_ = black_box(compressed);
})
});
let compressed = builder.build(&compression_key);
bench_id_unpack = format!("{bench_name}::unpack_u{bit_size}");
bench_group.bench_function(&bench_id_unpack, |b| {
b.iter(|| {
let unpacked: RadixCiphertext =
compressed.get(0, &decompression_key).unwrap().unwrap();
_ = black_box(unpacked);
})
});
}
BenchmarkType::Throughput => {
// Execute the operation once to know its cost.
let ct = cks.encrypt_radix(0_u32, num_blocks);
let mut builder = CompressedCiphertextListBuilder::new();
builder.push(ct);
let compressed = builder.build(&compression_key);
reset_pbs_count();
let _: RadixCiphertext = compressed.get(0, &decompression_key).unwrap().unwrap();
let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
let num_block =
(bit_size as f64 / (param.message_modulus.0 as f64).log(2.0)).ceil() as usize;
let elements = throughput_num_threads(num_block, pbs_count);
                // FIXME thread usage could be more efficient here.
                // For example, with bit_size = 2, my laptop only uses around 2/3 of the
                // available threads. Thread usage increases with bit_size = 8 but the
                // machine still isn't fully loaded.
bench_group.throughput(Throughput::Elements(elements));
let builders = (0..elements)
.map(|_| {
let ct = cks.encrypt_radix(0_u32, num_blocks);
let mut builder = CompressedCiphertextListBuilder::new();
builder.push(ct);
builder
})
.collect::<Vec<_>>();
bench_id_pack = format!("{bench_name}::throughput::pack_u{bit_size}");
bench_group.bench_function(&bench_id_pack, |b| {
b.iter(|| {
builders.par_iter().for_each(|builder| {
builder.build(&compression_key);
})
})
});
let compressed = builders
.iter()
.map(|builder| builder.build(&compression_key))
.collect::<Vec<_>>();
bench_id_unpack = format!("{bench_name}::throughput::unpack_u{bit_size}");
bench_group.bench_function(&bench_id_unpack, |b| {
b.iter(|| {
compressed.par_iter().for_each(|comp| {
comp.get::<RadixCiphertext>(0, &decompression_key)
.unwrap()
.unwrap();
})
})
});
}
}
write_to_json::<u64, _>(
&bench_id_pack,
(comp_param, param),
comp_param.name(),
"pack",
&OperatorType::Atomic,
bit_size as u32,
vec![param.message_modulus.0.ilog2(); num_blocks],
);
write_to_json::<u64, _>(
&bench_id_unpack,
(comp_param, param),
comp_param.name(),
"unpack",
&OperatorType::Atomic,
bit_size as u32,
vec![param.message_modulus.0.ilog2(); num_blocks],
);
}
bench_group.finish()
}
#[cfg(feature = "gpu")]
mod cuda {
use super::*;
use benchmark::utilities::cuda_integer_utils::cuda_local_streams;
use std::cmp::max;
use tfhe::core_crypto::gpu::CudaStreams;
use tfhe::integer::gpu::ciphertext::compressed_ciphertext_list::CudaCompressedCiphertextListBuilder;
use tfhe::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext;
use tfhe::integer::gpu::gen_keys_radix_gpu;
fn gpu_glwe_packing(c: &mut Criterion) {
let bench_name = "integer::cuda::packing_compression";
let mut bench_group = c.benchmark_group(bench_name);
bench_group
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(30));
let stream = CudaStreams::new_multi_gpu();
let param = BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
let comp_param =
BENCH_COMP_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
let log_message_modulus = param.message_modulus.0.ilog2() as usize;
let cks = ClientKey::new(param);
let private_compression_key = cks.new_compression_private_key(comp_param);
for bit_size in [
2,
8,
16,
32,
64,
128,
256,
comp_param.lwe_per_glwe.0 * log_message_modulus,
] {
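            // Each block carries log2(message_modulus) bits, so the requested bit size
            // must be a whole number of blocks.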
assert_eq!(bit_size % log_message_modulus, 0);
let num_blocks = bit_size / log_message_modulus;
let bench_id_pack;
let bench_id_unpack;
// Generate and convert compression keys
let (radix_cks, _) = gen_keys_radix_gpu(param, num_blocks, &stream);
let (compressed_compression_key, compressed_decompression_key) =
radix_cks.new_compressed_compression_decompression_keys(&private_compression_key);
let cuda_compression_key = compressed_compression_key.decompress_to_cuda(&stream);
let cuda_decompression_key = compressed_decompression_key.decompress_to_cuda(
radix_cks.parameters().glwe_dimension(),
radix_cks.parameters().polynomial_size(),
radix_cks.parameters().message_modulus(),
radix_cks.parameters().carry_modulus(),
radix_cks.parameters().ciphertext_modulus(),
&stream,
);
match get_bench_type() {
BenchmarkType::Latency => {
// Encrypt
let ct = cks.encrypt_radix(0_u32, num_blocks);
let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &stream);
// Benchmark
let mut builder = CudaCompressedCiphertextListBuilder::new();
builder.push(d_ct, &stream);
bench_id_pack = format!("{bench_name}::pack_u{bit_size}");
bench_group.bench_function(&bench_id_pack, |b| {
b.iter(|| {
let compressed = builder.build(&cuda_compression_key, &stream);
_ = black_box(compressed);
})
});
let compressed = builder.build(&cuda_compression_key, &stream);
bench_id_unpack = format!("{bench_name}::unpack_u{bit_size}");
bench_group.bench_function(&bench_id_unpack, |b| {
b.iter(|| {
let unpacked: CudaUnsignedRadixCiphertext = compressed
.get(0, &cuda_decompression_key, &stream)
.unwrap()
.unwrap();
_ = black_box(unpacked);
})
});
}
BenchmarkType::Throughput => {
// Execute the operation once to know its cost.
let (cpu_compression_key, cpu_decompression_key) =
cks.new_compression_decompression_keys(&private_compression_key);
let ct = cks.encrypt_radix(0_u32, num_blocks);
let mut builder = CompressedCiphertextListBuilder::new();
builder.push(ct);
let compressed = builder.build(&cpu_compression_key);
reset_pbs_count();
                    // Use the CPU operation since pbs_count does not count PBS
                    // executed on the GPU backend.
let _: RadixCiphertext =
compressed.get(0, &cpu_decompression_key).unwrap().unwrap();
let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
let num_block = (bit_size as f64 / (param.message_modulus.0 as f64).log(2.0))
.ceil() as usize;
let elements = throughput_num_threads(num_block, pbs_count);
bench_group.throughput(Throughput::Elements(elements));
// Encrypt
let ct = cks.encrypt_radix(0_u32, num_blocks);
let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &stream);
// Benchmark
let mut builder = CudaCompressedCiphertextListBuilder::new();
builder.push(d_ct, &stream);
let builders = (0..elements)
.map(|_| {
let ct = cks.encrypt_radix(0_u32, num_blocks);
let d_ct =
CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &stream);
let mut builder = CudaCompressedCiphertextListBuilder::new();
builder.push(d_ct, &stream);
builder
})
.collect::<Vec<_>>();
let local_streams = cuda_local_streams(num_block, elements as usize);
bench_id_pack = format!("{bench_name}::throughput::pack_u{bit_size}");
bench_group.bench_function(&bench_id_pack, |b| {
b.iter(|| {
builders.par_iter().zip(local_streams.par_iter()).for_each(
|(builder, local_stream)| {
builder.build(&cuda_compression_key, local_stream);
},
)
})
});
let compressed = builders
.iter()
.map(|builder| builder.build(&cuda_compression_key, &stream))
.collect::<Vec<_>>();
bench_id_unpack = format!("{bench_name}::throughput::unpack_u{bit_size}");
bench_group.bench_function(&bench_id_unpack, |b| {
b.iter(|| {
compressed
.par_iter()
.zip(local_streams.par_iter())
.for_each(|(comp, local_stream)| {
comp.get::<CudaUnsignedRadixCiphertext>(
0,
&cuda_decompression_key,
local_stream,
)
.unwrap()
.unwrap();
})
})
});
}
}
write_to_json::<u64, _>(
&bench_id_pack,
(comp_param, param),
comp_param.name(),
"pack",
&OperatorType::Atomic,
bit_size as u32,
vec![param.message_modulus.0.ilog2(); num_blocks],
);
write_to_json::<u64, _>(
&bench_id_unpack,
(comp_param, param),
comp_param.name(),
"unpack",
&OperatorType::Atomic,
bit_size as u32,
vec![param.message_modulus.0.ilog2(); num_blocks],
);
}
bench_group.finish()
}
criterion_group!(gpu_glwe_packing2, gpu_glwe_packing);
}
criterion_group!(cpu_glwe_packing2, cpu_glwe_packing);
#[cfg(feature = "gpu")]
use cuda::gpu_glwe_packing2;
fn main() {
#[cfg(feature = "gpu")]
gpu_glwe_packing2();
#[cfg(not(feature = "gpu"))]
cpu_glwe_packing2();
Criterion::default().configure_from_args().final_summary();
}

View File

@@ -0,0 +1,85 @@
use benchmark::params::ParamsAndNumBlocksIter;
use benchmark::utilities::{
get_bench_type, throughput_num_threads, write_to_json, BenchmarkType, OperatorType,
};
use criterion::{black_box, Criterion, Throughput};
use rayon::prelude::*;
use std::cmp::max;
use tfhe::integer::keycache::KEY_CACHE;
use tfhe::integer::IntegerKeyKind;
use tfhe::keycache::NamedParam;
use tfhe::{get_pbs_count, reset_pbs_count};
use tfhe_csprng::seeders::Seed;
pub fn unsigned_oprf(c: &mut Criterion) {
let bench_name = "integer::unsigned_oprf";
let mut bench_group = c.benchmark_group(bench_name);
bench_group
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(30));
for (param, num_block, bit_size) in ParamsAndNumBlocksIter::default() {
let param_name = param.name();
let bench_id;
match get_bench_type() {
BenchmarkType::Latency => {
bench_id = format!("{bench_name}::{param_name}::{bit_size}_bits");
bench_group.bench_function(&bench_id, |b| {
let (_, sk) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
b.iter(|| {
_ = black_box(
sk.par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
Seed(0),
bit_size as u64,
num_block as u64,
),
);
})
});
}
BenchmarkType::Throughput => {
let (_, sk) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
// Execute the operation once to know its cost.
reset_pbs_count();
sk.par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
Seed(0),
bit_size as u64,
num_block as u64,
);
let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
let elements = throughput_num_threads(num_block, pbs_count);
bench_group.throughput(Throughput::Elements(elements));
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
(0..elements).into_par_iter().for_each(|_| {
sk.par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
Seed(0),
bit_size as u64,
num_block as u64,
);
})
})
});
}
}
write_to_json::<u64, _>(
&bench_id,
param,
param.name(),
"oprf",
&OperatorType::Atomic,
bit_size as u32,
vec![param.message_modulus().0.ilog2(); num_block],
);
}
bench_group.finish()
}

File diff suppressed because it is too large

View File

@@ -0,0 +1,785 @@
use benchmark::params_aliases::*;
use benchmark::utilities::{
get_bench_type, throughput_num_threads, write_to_json, BenchmarkType, OperatorType,
};
use criterion::{criterion_group, Criterion, Throughput};
use rand::prelude::*;
use rayon::prelude::*;
use std::cmp::max;
use std::fs::{File, OpenOptions};
use std::io::Write;
use std::path::Path;
use tfhe::core_crypto::prelude::LweCiphertextCount;
use tfhe::integer::key_switching_key::KeySwitchingKey;
use tfhe::integer::parameters::IntegerCompactCiphertextListExpansionMode;
use tfhe::integer::{ClientKey, CompactPrivateKey, CompactPublicKey, ServerKey};
use tfhe::keycache::NamedParam;
use tfhe::shortint::parameters::*;
use tfhe::zk::{CompactPkeCrs, ZkComputeLoad};
use tfhe::{get_pbs_count, reset_pbs_count};
fn write_result(file: &mut File, name: &str, value: usize) {
let line = format!("{name},{value}\n");
let error_message = format!("cannot write {name} result into file");
file.write_all(line.as_bytes()).expect(&error_message);
}
fn pke_zk_proof(c: &mut Criterion) {
let bench_name = "zk::pke_zk_proof";
let mut bench_group = c.benchmark_group(bench_name);
bench_group
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(60));
for (param_pke, _param_casting, param_fhe) in [
(
BENCH_PARAM_PKE_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
BENCH_PARAM_KEYSWITCH_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
),
(
BENCH_PARAM_PKE_TO_SMALL_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128_ZKV1,
BENCH_PARAM_KEYSWITCH_PKE_TO_SMALL_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128_ZKV1,
BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
),
] {
let param_name = param_fhe.name();
let param_name = param_name.as_str();
let cks = ClientKey::new(param_fhe);
let sks = ServerKey::new_radix_server_key(&cks);
let compact_private_key = CompactPrivateKey::new(param_pke);
let pk = CompactPublicKey::new(&compact_private_key);
// Kept for consistency
let _casting_key =
KeySwitchingKey::new((&compact_private_key, None), (&cks, &sks), _param_casting);
// We have a use case with 320 bits of metadata
let mut metadata = [0u8; (320 / u8::BITS) as usize];
let mut rng = rand::thread_rng();
metadata.fill_with(|| rng.gen());
let zk_vers = param_pke.zk_scheme;
for bits in [64usize, 640, 1280, 4096] {
assert_eq!(bits % 64, 0);
// Packing, so we take the message and carry modulus to compute our block count
let num_block = 64usize.div_ceil(
(param_pke.message_modulus.0 * param_pke.carry_modulus.0).ilog2() as usize,
);
use rand::Rng;
let mut rng = rand::thread_rng();
let fhe_uint_count = bits / 64;
let crs = CompactPkeCrs::from_shortint_params(
param_pke,
LweCiphertextCount(num_block * fhe_uint_count),
)
.unwrap();
for compute_load in [ZkComputeLoad::Proof, ZkComputeLoad::Verify] {
let zk_load = match compute_load {
ZkComputeLoad::Proof => "compute_load_proof",
ZkComputeLoad::Verify => "compute_load_verify",
};
let bench_id;
match get_bench_type() {
BenchmarkType::Latency => {
bench_id = format!(
"{bench_name}::{param_name}_{bits}_bits_packed_{zk_load}_ZK{zk_vers:?}"
);
bench_group.bench_function(&bench_id, |b| {
let input_msg = rng.gen::<u64>();
let messages = vec![input_msg; fhe_uint_count];
b.iter(|| {
let _ct1 = tfhe::integer::ProvenCompactCiphertextList::builder(&pk)
.extend(messages.iter().copied())
.build_with_proof_packed(&crs, &metadata, compute_load)
.unwrap();
})
});
}
BenchmarkType::Throughput => {
// Execute the operation once to know its cost.
let input_msg = rng.gen::<u64>();
let messages = vec![input_msg; fhe_uint_count];
reset_pbs_count();
let _ = tfhe::integer::ProvenCompactCiphertextList::builder(&pk)
.extend(messages.iter().copied())
.build_with_proof_packed(&crs, &metadata, compute_load);
let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
let elements = throughput_num_threads(num_block, pbs_count);
bench_group.throughput(Throughput::Elements(elements));
bench_id = format!(
"{bench_name}::throughput::{param_name}_{bits}_bits_packed_{zk_load}_ZK{zk_vers:?}"
);
bench_group.bench_function(&bench_id, |b| {
let messages = (0..elements)
.map(|_| {
let input_msg = rng.gen::<u64>();
vec![input_msg; fhe_uint_count]
})
.collect::<Vec<_>>();
b.iter(|| {
messages.par_iter().for_each(|msg| {
tfhe::integer::ProvenCompactCiphertextList::builder(&pk)
.extend(msg.iter().copied())
.build_with_proof_packed(&crs, &metadata, compute_load)
.unwrap();
})
})
});
}
}
let shortint_params: PBSParameters = param_fhe.into();
write_to_json::<u64, _>(
&bench_id,
shortint_params,
param_name,
"pke_zk_proof",
&OperatorType::Atomic,
shortint_params.message_modulus().0 as u32,
vec![shortint_params.message_modulus().0.ilog2(); num_block],
);
}
}
}
bench_group.finish()
}
criterion_group!(zk_proof, pke_zk_proof);
fn cpu_pke_zk_verify(c: &mut Criterion, results_file: &Path) {
let bench_name = "zk::pke_zk_verify";
let mut bench_group = c.benchmark_group(bench_name);
bench_group
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(60));
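    // Recreate the results file (truncating any previous run), then reopen it in
    // append mode so each measured size is written incrementally.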
File::create(results_file).expect("create results file failed");
let mut file = OpenOptions::new()
.append(true)
.open(results_file)
.expect("cannot open results file");
for (param_pke, param_casting, param_fhe) in [
(
BENCH_PARAM_PKE_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
BENCH_PARAM_KEYSWITCH_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
),
(
BENCH_PARAM_PKE_TO_SMALL_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128_ZKV1,
BENCH_PARAM_KEYSWITCH_PKE_TO_SMALL_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128_ZKV1,
BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
),
] {
let param_name = param_fhe.name();
let param_name = param_name.as_str();
let cks = ClientKey::new(param_fhe);
let sks = ServerKey::new_radix_server_key(&cks);
let compact_private_key = CompactPrivateKey::new(param_pke);
let pk = CompactPublicKey::new(&compact_private_key);
let casting_key =
KeySwitchingKey::new((&compact_private_key, None), (&cks, &sks), param_casting);
// We have a use case with 320 bits of metadata
let mut metadata = [0u8; (320 / u8::BITS) as usize];
let mut rng = rand::thread_rng();
metadata.fill_with(|| rng.gen());
let zk_vers = param_pke.zk_scheme;
for bits in [64usize, 640, 1280, 4096] {
assert_eq!(bits % 64, 0);
// Packing, so we take the message and carry modulus to compute our block count
let num_block = 64usize.div_ceil(
(param_pke.message_modulus.0 * param_pke.carry_modulus.0).ilog2() as usize,
);
use rand::Rng;
let mut rng = rand::thread_rng();
let fhe_uint_count = bits / 64;
println!("Generating CRS... ");
let crs = CompactPkeCrs::from_shortint_params(
param_pke,
LweCiphertextCount(num_block * fhe_uint_count),
)
.unwrap();
let shortint_params: PBSParameters = param_fhe.into();
let crs_data = bincode::serialize(&crs).unwrap();
println!("CRS size: {}", crs_data.len());
let test_name = format!("zk::crs_sizes::{param_name}_{bits}_bits_packed_ZK{zk_vers:?}");
write_result(&mut file, &test_name, crs_data.len());
write_to_json::<u64, _>(
&test_name,
shortint_params,
param_name,
"pke_zk_crs",
&OperatorType::Atomic,
0,
vec![],
);
for compute_load in [ZkComputeLoad::Proof, ZkComputeLoad::Verify] {
let zk_load = match compute_load {
ZkComputeLoad::Proof => "compute_load_proof",
ZkComputeLoad::Verify => "compute_load_verify",
};
let bench_id_verify;
let bench_id_verify_and_expand;
match get_bench_type() {
BenchmarkType::Latency => {
bench_id_verify = format!(
"{bench_name}::{param_name}_{bits}_bits_packed_{zk_load}_ZK{zk_vers:?}"
);
bench_id_verify_and_expand = format!(
"{bench_name}_and_expand::{param_name}_{bits}_bits_packed_{zk_load}_ZK{zk_vers:?}"
);
let input_msg = rng.gen::<u64>();
let messages = vec![input_msg; fhe_uint_count];
println!("Generating proven ciphertext ({zk_load})... ");
let ct1 = tfhe::integer::ProvenCompactCiphertextList::builder(&pk)
.extend(messages.iter().copied())
.build_with_proof_packed(&crs, &metadata, compute_load)
.unwrap();
let proven_ciphertext_list_serialized = bincode::serialize(&ct1).unwrap();
println!(
"proven list size: {}",
proven_ciphertext_list_serialized.len()
);
let test_name = format!(
"zk::proven_list_size::{param_name}_{bits}_bits_packed_{zk_load}_ZK{zk_vers:?}"
);
write_result(
&mut file,
&test_name,
proven_ciphertext_list_serialized.len(),
);
write_to_json::<u64, _>(
&test_name,
shortint_params,
param_name,
"pke_zk_proof",
&OperatorType::Atomic,
0,
vec![],
);
let proof_size = ct1.proof_size();
println!("proof size: {}", ct1.proof_size());
let test_name =
format!("zk::proof_sizes::{param_name}_{bits}_bits_packed_{zk_load}_ZK{zk_vers:?}");
write_result(&mut file, &test_name, proof_size);
write_to_json::<u64, _>(
&test_name,
shortint_params,
param_name,
"pke_zk_proof",
&OperatorType::Atomic,
0,
vec![],
);
bench_group.bench_function(&bench_id_verify, |b| {
b.iter(|| {
let _ret = ct1.verify(&crs, &pk, &metadata);
});
});
bench_group.bench_function(&bench_id_verify_and_expand, |b| {
b.iter(|| {
let _ret = ct1
.verify_and_expand(
&crs,
&pk,
&metadata,
IntegerCompactCiphertextListExpansionMode::CastAndUnpackIfNecessary(
casting_key.as_view(),
),
)
.unwrap();
});
});
}
BenchmarkType::Throughput => {
// In throughput mode object sizes are not recorded.
// Execute the operation once to know its cost.
let input_msg = rng.gen::<u64>();
let messages = vec![input_msg; fhe_uint_count];
let ct1 = tfhe::integer::ProvenCompactCiphertextList::builder(&pk)
.extend(messages.iter().copied())
.build_with_proof_packed(&crs, &metadata, compute_load)
.unwrap();
reset_pbs_count();
let _ = ct1.verify_and_expand(
&crs,
&pk,
&metadata,
IntegerCompactCiphertextListExpansionMode::CastAndUnpackIfNecessary(
casting_key.as_view(),
),
);
let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
let elements = throughput_num_threads(num_block, pbs_count);
bench_group.throughput(Throughput::Elements(elements));
bench_id_verify = format!(
"{bench_name}::throughput::{param_name}_{bits}_bits_packed_{zk_load}_ZK{zk_vers:?}"
);
bench_id_verify_and_expand = format!(
"{bench_name}_and_expand::throughput::{param_name}_{bits}_bits_packed_{zk_load}_ZK{zk_vers:?}"
);
println!("Generating proven ciphertexts list ({zk_load})... ");
let cts = (0..elements)
.map(|_| {
let input_msg = rng.gen::<u64>();
let messages = vec![input_msg; fhe_uint_count];
tfhe::integer::ProvenCompactCiphertextList::builder(&pk)
.extend(messages.iter().copied())
.build_with_proof_packed(&crs, &metadata, compute_load)
.unwrap()
})
.collect::<Vec<_>>();
bench_group.bench_function(&bench_id_verify, |b| {
b.iter(|| {
cts.par_iter().for_each(|ct1| {
ct1.verify(&crs, &pk, &metadata);
})
});
});
bench_group.bench_function(&bench_id_verify_and_expand, |b| {
b.iter(|| {
cts.par_iter().for_each(|ct1| {
ct1
.verify_and_expand(
&crs,
&pk,
&metadata,
IntegerCompactCiphertextListExpansionMode::CastAndUnpackIfNecessary(
casting_key.as_view(),
),
)
.unwrap();
})
});
});
}
}
write_to_json::<u64, _>(
&bench_id_verify,
shortint_params,
param_name,
"pke_zk_verify",
&OperatorType::Atomic,
shortint_params.message_modulus().0 as u32,
vec![shortint_params.message_modulus().0.ilog2(); num_block],
);
write_to_json::<u64, _>(
&bench_id_verify_and_expand,
shortint_params,
param_name,
"pke_zk_verify_and_expand",
&OperatorType::Atomic,
shortint_params.message_modulus().0 as u32,
vec![shortint_params.message_modulus().0.ilog2(); num_block],
);
}
}
}
bench_group.finish()
}
#[cfg(all(feature = "gpu", feature = "zk-pok"))]
mod cuda {
use super::*;
use benchmark::utilities::{cuda_local_keys, cuda_local_streams};
use criterion::BatchSize;
use itertools::Itertools;
use tfhe::core_crypto::gpu::{get_number_of_gpus, CudaStreams};
use tfhe::integer::gpu::key_switching_key::CudaKeySwitchingKey;
use tfhe::integer::gpu::zk::CudaProvenCompactCiphertextList;
use tfhe::integer::gpu::CudaServerKey;
use tfhe::integer::CompressedServerKey;
fn gpu_pke_zk_verify(c: &mut Criterion, results_file: &Path) {
let bench_name = "zk::cuda::pke_zk_verify";
let mut bench_group = c.benchmark_group(bench_name);
bench_group
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(60));
let streams = CudaStreams::new_multi_gpu();
File::create(results_file).expect("create results file failed");
let mut file = OpenOptions::new()
.append(true)
.open(results_file)
.expect("cannot open results file");
for (param_pke, param_ksk, param_fhe) in [(
PARAM_PKE_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
PARAM_GPU_MULTI_BIT_GROUP_4_KEYSWITCH_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
)] {
let param_name = param_fhe.name();
let param_name = param_name.as_str();
let cks = ClientKey::new(param_fhe);
let compressed_server_key = CompressedServerKey::new_radix_compressed_server_key(&cks);
let gpu_sks = CudaServerKey::decompress_from_cpu(&compressed_server_key, &streams);
let compact_private_key = CompactPrivateKey::new(param_pke);
let pk = CompactPublicKey::new(&compact_private_key);
let d_ksk = CudaKeySwitchingKey::new(
(&compact_private_key, None),
(&cks, &gpu_sks),
param_ksk,
&streams,
);
// We have a use case with 320 bits of metadata
let mut metadata = [0u8; (320 / u8::BITS) as usize];
let mut rng = rand::thread_rng();
metadata.fill_with(|| rng.gen());
let zk_vers = param_pke.zk_scheme;
for bits in [64usize, 640, 1280, 4096] {
assert_eq!(bits % 64, 0);
// Packing, so we take the message and carry modulus to compute our block count
let num_block = 64usize.div_ceil(
(param_pke.message_modulus.0 * param_pke.carry_modulus.0).ilog2() as usize,
);
use rand::Rng;
let mut rng = rand::thread_rng();
let fhe_uint_count = bits / 64;
println!("Generating CRS... ");
let crs = CompactPkeCrs::from_shortint_params(
param_pke,
LweCiphertextCount(num_block * fhe_uint_count),
)
.unwrap();
let shortint_params: PBSParameters = param_fhe.into();
let crs_data = bincode::serialize(&crs).unwrap();
println!("CRS size: {}", crs_data.len());
let test_name =
format!("zk::crs_sizes::{param_name}_{bits}_bits_packed_ZK{zk_vers:?}");
write_result(&mut file, &test_name, crs_data.len());
write_to_json::<u64, _>(
&test_name,
shortint_params,
param_name,
"pke_zk_crs",
&OperatorType::Atomic,
0,
vec![],
);
for compute_load in [ZkComputeLoad::Proof, ZkComputeLoad::Verify] {
let zk_load = match compute_load {
ZkComputeLoad::Proof => "compute_load_proof",
ZkComputeLoad::Verify => "compute_load_verify",
};
let bench_id_verify;
let bench_id_verify_and_expand;
let bench_id_expand_without_verify;
match get_bench_type() {
BenchmarkType::Latency => {
bench_id_verify = format!(
"{bench_name}::{param_name}_{bits}_bits_packed_{zk_load}_ZK{zk_vers:?}"
);
bench_id_verify_and_expand = format!(
"{bench_name}_and_expand::{param_name}_{bits}_bits_packed_{zk_load}_ZK{zk_vers:?}"
);
bench_id_expand_without_verify = format!(
"{bench_name}_only_expand::{param_name}_{bits}_bits_packed_{zk_load}_ZK{zk_vers:?}"
);
let input_msg = rng.gen::<u64>();
let messages = vec![input_msg; fhe_uint_count];
println!("Generating proven ciphertext ({zk_load})... ");
let ct1 = tfhe::integer::ProvenCompactCiphertextList::builder(&pk)
.extend(messages.iter().copied())
.build_with_proof_packed(&crs, &metadata, compute_load)
.unwrap();
let gpu_ct1 =
CudaProvenCompactCiphertextList::from_proven_compact_ciphertext_list(
&ct1, &streams,
);
let proven_ciphertext_list_serialized =
bincode::serialize(&ct1).unwrap();
println!(
"proven list size: {}",
proven_ciphertext_list_serialized.len()
);
let test_name = format!(
"zk::proven_list_size::{param_name}_{bits}_bits_packed_{zk_load}_ZK{zk_vers:?}"
);
write_result(
&mut file,
&test_name,
proven_ciphertext_list_serialized.len(),
);
write_to_json::<u64, _>(
&test_name,
shortint_params,
param_name,
"pke_zk_proof",
&OperatorType::Atomic,
0,
vec![],
);
let proof_size = ct1.proof_size();
println!("proof size: {}", ct1.proof_size());
let test_name =
format!("zk::proof_sizes::{param_name}_{bits}_bits_packed_{zk_load}_ZK{zk_vers:?}");
write_result(&mut file, &test_name, proof_size);
write_to_json::<u64, _>(
&test_name,
shortint_params,
param_name,
"pke_zk_proof",
&OperatorType::Atomic,
0,
vec![],
);
bench_group.bench_function(&bench_id_verify, |b| {
b.iter(|| {
let _ret = ct1.verify(&crs, &pk, &metadata);
});
});
bench_group.bench_function(&bench_id_expand_without_verify, |b| {
b.iter(|| {
let _ret = gpu_ct1
.expand_without_verification(&d_ksk, &streams)
.unwrap();
});
});
bench_group.bench_function(&bench_id_verify_and_expand, |b| {
b.iter(|| {
let _ret = gpu_ct1
.verify_and_expand(&crs, &pk, &metadata, &d_ksk, &streams)
.unwrap();
});
});
}
BenchmarkType::Throughput => {
let gpu_sks_vec = cuda_local_keys(&cks);
let gpu_count = get_number_of_gpus() as usize;
// Execute the operation once to know its cost.
let input_msg = rng.gen::<u64>();
let messages = vec![input_msg; fhe_uint_count];
let ct1 = tfhe::integer::ProvenCompactCiphertextList::builder(&pk)
.extend(messages.iter().copied())
.build_with_proof_packed(&crs, &metadata, compute_load)
.unwrap();
let gpu_ct1 =
CudaProvenCompactCiphertextList::from_proven_compact_ciphertext_list(
&ct1, &streams,
);
reset_pbs_count();
let _ =
gpu_ct1.verify_and_expand(&crs, &pk, &metadata, &d_ksk, &streams);
let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
let elements = throughput_num_threads(num_block, pbs_count);
bench_group.throughput(Throughput::Elements(elements));
bench_id_verify = format!(
"{bench_name}::throughput::{param_name}_{bits}_bits_packed_{zk_load}_ZK{zk_vers:?}"
);
bench_id_verify_and_expand = format!(
"{bench_name}_and_expand::throughput::{param_name}_{bits}_bits_packed_{zk_load}_ZK{zk_vers:?}"
);
bench_id_expand_without_verify = format!(
"{bench_name}_only_expand::throughput::{param_name}_{bits}_bits_packed_{zk_load}_ZK{zk_vers:?}"
);
println!("Generating proven ciphertexts list ({zk_load})... ");
let cts = (0..elements)
.map(|_| {
let input_msg = rng.gen::<u64>();
let messages = vec![input_msg; fhe_uint_count];
tfhe::integer::ProvenCompactCiphertextList::builder(&pk)
.extend(messages.iter().copied())
.build_with_proof_packed(&crs, &metadata, compute_load)
.unwrap()
})
.collect::<Vec<_>>();
let local_streams = cuda_local_streams(num_block, elements as usize);
let d_ksk_vec = gpu_sks_vec
.par_iter()
.zip(local_streams.par_iter())
.map(|(gpu_sks, local_stream)| {
CudaKeySwitchingKey::new(
(&compact_private_key, None),
(&cks, gpu_sks),
param_ksk,
local_stream,
)
})
.collect::<Vec<_>>();
assert_eq!(d_ksk_vec.len(), gpu_count);
bench_group.bench_function(&bench_id_verify, |b| {
b.iter(|| {
cts.par_iter().for_each(|ct1| {
ct1.verify(&crs, &pk, &metadata);
})
});
});
bench_group.bench_function(&bench_id_expand_without_verify, |b| {
let setup_encrypted_values = || {
let local_streams = cuda_local_streams(num_block, elements as usize);
let gpu_cts = cts.iter().enumerate().map(|(i, ct)| {
CudaProvenCompactCiphertextList::from_proven_compact_ciphertext_list(
ct, &local_streams[i],
)
}).collect_vec();
(gpu_cts, local_streams)
};
b.iter_batched(setup_encrypted_values, |(gpu_cts, local_streams)| {
gpu_cts.par_iter()
.zip(local_streams.par_iter())
.enumerate()
.for_each(|(i, (gpu_ct, local_stream))| {
gpu_ct
.expand_without_verification(&d_ksk_vec[i % gpu_count], local_stream)
.unwrap();
});
}, BatchSize::SmallInput);
});
bench_group.bench_function(&bench_id_verify_and_expand, |b| {
let setup_encrypted_values = || {
let local_streams = cuda_local_streams(num_block, elements as usize);
let gpu_cts = cts.iter().enumerate().map(|(i, ct)| {
CudaProvenCompactCiphertextList::from_proven_compact_ciphertext_list(
ct, &local_streams[i],
)
}).collect_vec();
(gpu_cts, local_streams)
};
b.iter_batched(setup_encrypted_values, |(gpu_cts, local_streams)| {
gpu_cts
.par_iter()
.zip(local_streams.par_iter())
.for_each(|(gpu_ct, local_stream)| {
gpu_ct
.verify_and_expand(
&crs, &pk, &metadata, &d_ksk, local_stream
)
.unwrap();
});
}, BatchSize::SmallInput);
});
}
}
write_to_json::<u64, _>(
&bench_id_verify_and_expand,
shortint_params,
param_name,
"pke_zk_verify_and_expand",
&OperatorType::Atomic,
shortint_params.message_modulus().0 as u32,
vec![shortint_params.message_modulus().0.ilog2(); num_block],
);
}
}
}
bench_group.finish()
}
pub fn gpu_zk_verify() {
let results_file = Path::new("gpu_pke_zk_crs_sizes.csv");
let mut criterion: Criterion<_> = (Criterion::default()).configure_from_args();
gpu_pke_zk_verify(&mut criterion, results_file);
}
}
pub fn zk_verify() {
let results_file = Path::new("pke_zk_crs_sizes.csv");
let mut criterion: Criterion<_> = (Criterion::default()).configure_from_args();
cpu_pke_zk_verify(&mut criterion, results_file);
}
#[cfg(all(feature = "gpu", feature = "zk-pok"))]
use crate::cuda::gpu_zk_verify;
fn main() {
#[cfg(all(feature = "gpu", feature = "zk-pok"))]
gpu_zk_verify();
#[cfg(not(feature = "gpu"))]
zk_verify();
Criterion::default().configure_from_args().final_summary();
}

View File

@@ -0,0 +1,697 @@
use benchmark::params::{
raw_benchmark_parameters, SHORTINT_BENCH_PARAMS_GAUSSIAN, SHORTINT_BENCH_PARAMS_TUNIFORM,
SHORTINT_MULTI_BIT_BENCH_PARAMS,
};
use benchmark::utilities::{write_to_json, OperatorType};
use criterion::{criterion_group, Criterion};
use rand::Rng;
use std::env;
use tfhe::keycache::NamedParam;
use tfhe::shortint::keycache::KEY_CACHE;
use tfhe::shortint::parameters::*;
use tfhe::shortint::{Ciphertext, CompressedServerKey, ServerKey};
fn bench_server_key_unary_function<F>(
c: &mut Criterion,
bench_name: &str,
display_name: &str,
unary_op: F,
) where
F: Fn(&ServerKey, &mut Ciphertext),
{
let mut bench_group = c.benchmark_group(bench_name);
for param in raw_benchmark_parameters().iter() {
let keys = KEY_CACHE.get_from_param(*param);
let (cks, sks) = (keys.client_key(), keys.server_key());
let mut rng = rand::thread_rng();
let modulus = cks.parameters.message_modulus().0;
let clear_text = rng.gen::<u64>() % modulus;
let mut ct = cks.encrypt(clear_text);
let bench_id = format!("{bench_name}::{}", param.name());
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
unary_op(sks, &mut ct);
})
});
write_to_json::<u64, _>(
&bench_id,
*param,
param.name(),
display_name,
&OperatorType::Atomic,
param.message_modulus().0.ilog2(),
vec![param.message_modulus().0.ilog2()],
);
}
bench_group.finish()
}
fn bench_server_key_binary_function<F>(
c: &mut Criterion,
bench_name: &str,
display_name: &str,
binary_op: F,
) where
F: Fn(&ServerKey, &mut Ciphertext, &mut Ciphertext),
{
let mut bench_group = c.benchmark_group(bench_name);
for param in raw_benchmark_parameters().iter() {
let keys = KEY_CACHE.get_from_param(*param);
let (cks, sks) = (keys.client_key(), keys.server_key());
let mut rng = rand::thread_rng();
let modulus = cks.parameters.message_modulus().0;
let clear_0 = rng.gen::<u64>() % modulus;
let clear_1 = rng.gen::<u64>() % modulus;
let mut ct_0 = cks.encrypt(clear_0);
let mut ct_1 = cks.encrypt(clear_1);
let bench_id = format!("{bench_name}::{}", param.name());
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
binary_op(sks, &mut ct_0, &mut ct_1);
})
});
write_to_json::<u64, _>(
&bench_id,
*param,
param.name(),
display_name,
&OperatorType::Atomic,
param.message_modulus().0.ilog2(),
vec![param.message_modulus().0.ilog2()],
);
}
bench_group.finish()
}
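/// Benchmark a shortint server-key operation taking one encrypted operand and one clear scalar.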
fn bench_server_key_binary_scalar_function<F>(
c: &mut Criterion,
bench_name: &str,
display_name: &str,
binary_op: F,
) where
F: Fn(&ServerKey, &mut Ciphertext, u8),
{
let mut bench_group = c.benchmark_group(bench_name);
for param in raw_benchmark_parameters().iter() {
let keys = KEY_CACHE.get_from_param(*param);
let (cks, sks) = (keys.client_key(), keys.server_key());
let mut rng = rand::thread_rng();
let modulus = cks.parameters.message_modulus().0;
let clear_0 = rng.gen::<u64>() % modulus;
let clear_1 = rng.gen::<u64>() % modulus;
let mut ct_0 = cks.encrypt(clear_0);
let bench_id = format!("{bench_name}::{}", param.name());
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
binary_op(sks, &mut ct_0, clear_1 as u8);
})
});
write_to_json::<u64, _>(
&bench_id,
*param,
param.name(),
display_name,
&OperatorType::Atomic,
param.message_modulus().0.ilog2(),
vec![param.message_modulus().0.ilog2()],
);
}
bench_group.finish()
}
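/// Benchmark a shortint server-key scalar division-like operation; the clear divisor is drawn
/// non-zero to avoid division by zero.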
fn bench_server_key_binary_scalar_division_function<F>(
c: &mut Criterion,
bench_name: &str,
display_name: &str,
binary_op: F,
) where
F: Fn(&ServerKey, &mut Ciphertext, u8),
{
let mut bench_group = c.benchmark_group(bench_name);
for param in raw_benchmark_parameters().iter() {
let keys = KEY_CACHE.get_from_param(*param);
let (cks, sks) = (keys.client_key(), keys.server_key());
let mut rng = rand::thread_rng();
let modulus = cks.parameters.message_modulus().0;
assert_ne!(modulus, 1);
let clear_0 = rng.gen::<u64>() % modulus;
let mut clear_1 = rng.gen::<u64>() % modulus;
while clear_1 == 0 {
clear_1 = rng.gen::<u64>() % modulus;
}
let mut ct_0 = cks.encrypt(clear_0);
let bench_id = format!("{bench_name}::{}", param.name());
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
binary_op(sks, &mut ct_0, clear_1 as u8);
})
});
write_to_json::<u64, _>(
&bench_id,
*param,
param.name(),
display_name,
&OperatorType::Atomic,
param.message_modulus().0.ilog2(),
vec![param.message_modulus().0.ilog2()],
);
}
bench_group.finish()
}
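// Benchmark the extraction of the carry buffer from a single ciphertext.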
fn carry_extract_bench(c: &mut Criterion) {
let mut bench_group = c.benchmark_group("carry_extract");
for param in raw_benchmark_parameters().iter() {
let keys = KEY_CACHE.get_from_param(*param);
let (cks, sks) = (keys.client_key(), keys.server_key());
let mut rng = rand::thread_rng();
let modulus = cks.parameters.message_modulus().0;
let clear_0 = rng.gen::<u64>() % modulus;
let ct_0 = cks.encrypt(clear_0);
let bench_id = format!("shortint::carry_extract::{}", param.name());
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let _ = sks.carry_extract(&ct_0);
})
});
write_to_json::<u64, _>(
&bench_id,
*param,
param.name(),
"carry_extract",
&OperatorType::Atomic,
param.message_modulus().0.ilog2(),
vec![param.message_modulus().0.ilog2()],
);
}
bench_group.finish()
}
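// Benchmark a programmable bootstrap via `apply_lookup_table` with an identity lookup table.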
fn programmable_bootstrapping_bench(c: &mut Criterion) {
let mut bench_group = c.benchmark_group("programmable_bootstrap");
for param in raw_benchmark_parameters().iter() {
let keys = KEY_CACHE.get_from_param(*param);
let (cks, sks) = (keys.client_key(), keys.server_key());
let mut rng = rand::thread_rng();
let modulus = cks.parameters.message_modulus().0;
let acc = sks.generate_lookup_table(|x| x);
let clear_0 = rng.gen::<u64>() % modulus;
let ctxt = cks.encrypt(clear_0);
let bench_id = format!("shortint::programmable_bootstrap::{}", param.name());
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let _ = sks.apply_lookup_table(&ctxt, &acc);
})
});
write_to_json::<u64, _>(
&bench_id,
*param,
param.name(),
"pbs",
&OperatorType::Atomic,
param.message_modulus().0.ilog2(),
vec![param.message_modulus().0.ilog2()],
);
}
bench_group.finish();
}
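// Benchmark decompression of a compressed server key; the compressed key is cloned in the
// untimed per-iteration setup so only `decompress` is measured.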
fn server_key_from_compressed_key(c: &mut Criterion) {
let mut bench_group = c.benchmark_group("uncompress_key");
bench_group
.sample_size(10)
.measurement_time(std::time::Duration::from_secs(60));
let mut params = SHORTINT_BENCH_PARAMS_TUNIFORM
.iter()
.chain(SHORTINT_BENCH_PARAMS_GAUSSIAN.iter())
.map(|p| (*p).into())
.collect::<Vec<PBSParameters>>();
let multi_bit_params = SHORTINT_MULTI_BIT_BENCH_PARAMS
.iter()
.map(|p| (*p).into())
.collect::<Vec<PBSParameters>>();
params.extend(&multi_bit_params);
for param in params.iter() {
let keys = KEY_CACHE.get_from_param(*param);
let sks_compressed = CompressedServerKey::new(keys.client_key());
let bench_id = format!("shortint::uncompress_key::{}", param.name());
bench_group.bench_function(&bench_id, |b| {
let clone_compressed_key = || sks_compressed.clone();
b.iter_batched(
clone_compressed_key,
|sks_cloned| {
let _ = sks_cloned.decompress();
},
criterion::BatchSize::PerIteration,
)
});
write_to_json::<u64, _>(
&bench_id,
*param,
param.name(),
"uncompress_key",
&OperatorType::Atomic,
param.message_modulus().0.ilog2(),
vec![param.message_modulus().0.ilog2()],
);
}
bench_group.finish();
}
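// The macros below generate one Criterion benchmark function per server-key method, delegating
// to the generic bench helpers defined above.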
macro_rules! define_server_key_unary_bench_fn (
(method_name:$server_key_method:ident, display_name:$name:ident) => {
fn $server_key_method(c: &mut Criterion) {
bench_server_key_unary_function(
c,
concat!("shortint::", stringify!($server_key_method)),
stringify!($name),
|server_key, lhs| {
let _ = server_key.$server_key_method(lhs);},
)
}
}
);
macro_rules! define_server_key_bench_fn (
(method_name:$server_key_method:ident, display_name:$name:ident) => {
fn $server_key_method(c: &mut Criterion) {
bench_server_key_binary_function(
c,
concat!("shortint::", stringify!($server_key_method)),
stringify!($name),
|server_key, lhs, rhs| {
let _ = server_key.$server_key_method(lhs, rhs);},
)
}
}
);
macro_rules! define_server_key_scalar_bench_fn (
(method_name:$server_key_method:ident, display_name:$name:ident) => {
fn $server_key_method(c: &mut Criterion) {
bench_server_key_binary_scalar_function(
c,
concat!("shortint::", stringify!($server_key_method)),
stringify!($name),
|server_key, lhs, rhs| {
let _ = server_key.$server_key_method(lhs, rhs);},
)
}
}
);
macro_rules! define_server_key_scalar_div_bench_fn (
(method_name:$server_key_method:ident, display_name:$name:ident) => {
fn $server_key_method(c: &mut Criterion) {
bench_server_key_binary_scalar_division_function(
c,
concat!("shortint::", stringify!($server_key_method)),
stringify!($name),
|server_key, lhs, rhs| {
let _ = server_key.$server_key_method(lhs, rhs);},
)
}
}
);
macro_rules! define_custom_bench_fn (
(function_name:$function:ident) => {
fn $function(c: &mut Criterion) {
::paste::paste! {
[<$function _bench>](
c,
)
}
}
}
);
define_server_key_unary_bench_fn!(
method_name: unchecked_neg,
display_name: negation
);
define_server_key_bench_fn!(
method_name: unchecked_add,
display_name: add
);
define_server_key_bench_fn!(
method_name: unchecked_sub,
display_name: sub
);
define_server_key_bench_fn!(
method_name: unchecked_mul_lsb,
display_name: mul
);
define_server_key_bench_fn!(
method_name: unchecked_mul_msb,
display_name: mul
);
define_server_key_bench_fn!(
method_name: unchecked_div,
display_name: div
);
define_server_key_bench_fn!(
method_name: smart_bitand,
display_name: bitand
);
define_server_key_bench_fn!(
method_name: smart_bitor,
display_name: bitor
);
define_server_key_bench_fn!(
method_name: smart_bitxor,
display_name: bitxor
);
define_server_key_bench_fn!(
method_name: smart_add,
display_name: add
);
define_server_key_bench_fn!(
method_name: smart_sub,
display_name: sub
);
define_server_key_bench_fn!(
method_name: smart_mul_lsb,
display_name: mul
);
define_server_key_bench_fn!(
method_name: bitand,
display_name: bitand
);
define_server_key_bench_fn!(
method_name: bitor,
display_name: bitor
);
define_server_key_bench_fn!(
method_name: bitxor,
display_name: bitxor
);
define_server_key_bench_fn!(
method_name: add,
display_name: add
);
define_server_key_bench_fn!(
method_name: sub,
display_name: sub
);
define_server_key_bench_fn!(
method_name: mul,
display_name: mul
);
define_server_key_bench_fn!(
method_name: div,
display_name: div
);
define_server_key_bench_fn!(
method_name: greater,
display_name: greater_than
);
define_server_key_bench_fn!(
method_name: greater_or_equal,
display_name: greater_or_equal
);
define_server_key_bench_fn!(
method_name: less,
display_name: less_than
);
define_server_key_bench_fn!(
method_name: less_or_equal,
display_name: less_or_equal
);
define_server_key_bench_fn!(
method_name: equal,
display_name: equal
);
define_server_key_bench_fn!(
method_name: not_equal,
display_name: not_equal
);
define_server_key_unary_bench_fn!(
method_name: neg,
display_name: negation
);
define_server_key_bench_fn!(
method_name: unchecked_greater,
display_name: greater_than
);
define_server_key_bench_fn!(
method_name: unchecked_less,
display_name: less_than
);
define_server_key_bench_fn!(
method_name: unchecked_equal,
display_name: equal
);
define_server_key_scalar_bench_fn!(
method_name: unchecked_scalar_add,
display_name: add
);
define_server_key_scalar_bench_fn!(
method_name: unchecked_scalar_sub,
display_name: sub
);
define_server_key_scalar_bench_fn!(
method_name: unchecked_scalar_mul,
display_name: mul
);
define_server_key_scalar_bench_fn!(
method_name: unchecked_scalar_left_shift,
display_name: left_shift
);
define_server_key_scalar_bench_fn!(
method_name: unchecked_scalar_right_shift,
display_name: right_shift
);
define_server_key_scalar_div_bench_fn!(
method_name: unchecked_scalar_div,
display_name: div
);
define_server_key_scalar_div_bench_fn!(
method_name: unchecked_scalar_mod,
display_name: modulo
);
define_server_key_scalar_bench_fn!(
method_name: scalar_add,
display_name: add
);
define_server_key_scalar_bench_fn!(
method_name: scalar_sub,
display_name: sub
);
define_server_key_scalar_bench_fn!(
method_name: scalar_mul,
display_name: mul
);
define_server_key_scalar_bench_fn!(
method_name: scalar_left_shift,
display_name: left_shift
);
define_server_key_scalar_bench_fn!(
method_name: scalar_right_shift,
display_name: right_shift
);
define_server_key_scalar_div_bench_fn!(
method_name: scalar_div,
display_name: div
);
define_server_key_scalar_div_bench_fn!(
method_name: scalar_mod,
display_name: modulo
);
define_server_key_scalar_bench_fn!(
method_name: scalar_greater,
display_name: greater_than
);
define_server_key_scalar_bench_fn!(
method_name: scalar_greater_or_equal,
display_name: greater_or_equal
);
define_server_key_scalar_bench_fn!(
method_name: scalar_less,
display_name: less_than
);
define_server_key_scalar_bench_fn!(
method_name: scalar_less_or_equal,
display_name: less_or_equal
);
define_server_key_scalar_div_bench_fn!(
method_name: scalar_equal,
display_name: equal
);
define_server_key_scalar_div_bench_fn!(
method_name: scalar_not_equal,
display_name: not_equal
);
define_custom_bench_fn!(function_name: carry_extract);
define_custom_bench_fn!(
function_name: programmable_bootstrapping
);
criterion_group!(
smart_ops,
smart_bitand,
smart_bitor,
smart_bitxor,
smart_add,
smart_sub,
smart_mul_lsb
);
criterion_group!(
unchecked_ops,
unchecked_neg,
unchecked_add,
unchecked_sub,
unchecked_mul_lsb,
unchecked_mul_msb,
unchecked_div,
unchecked_greater,
unchecked_less,
unchecked_equal,
carry_extract,
programmable_bootstrapping
);
criterion_group!(
unchecked_scalar_ops,
unchecked_scalar_add,
unchecked_scalar_mul,
unchecked_scalar_sub,
unchecked_scalar_div,
unchecked_scalar_mod,
unchecked_scalar_left_shift,
unchecked_scalar_right_shift
);
criterion_group!(
default_ops,
neg,
bitand,
bitor,
bitxor,
add,
sub,
div,
mul,
greater,
greater_or_equal,
less,
less_or_equal,
equal,
not_equal
);
criterion_group!(
default_scalar_ops,
scalar_add,
scalar_sub,
scalar_div,
scalar_mul,
scalar_mod,
scalar_left_shift,
scalar_right_shift,
scalar_greater,
scalar_greater_or_equal,
scalar_less,
scalar_less_or_equal,
scalar_equal,
scalar_not_equal
);
criterion_group!(misc, server_key_from_compressed_key);
mod casting;
criterion_group!(
casting,
casting::pack_cast_64,
casting::pack_cast,
casting::cast
);
fn main() {
fn default_bench() {
casting();
default_ops();
default_scalar_ops();
misc();
}
match env::var("__TFHE_RS_BENCH_OP_FLAVOR") {
Ok(val) => {
match val.to_lowercase().as_str() {
"default" => default_bench(),
"smart" => smart_ops(),
"unchecked" => {
unchecked_ops();
unchecked_scalar_ops();
}
_ => panic!("unknown benchmark operations flavor"),
};
}
Err(_) => default_bench(),
};
Criterion::default().configure_from_args().final_summary();
}


@@ -0,0 +1,136 @@
use benchmark::params_aliases::*;
use benchmark::utilities::{write_to_json, OperatorType};
use criterion::Criterion;
use rayon::prelude::*;
use tfhe::keycache::NamedParam;
use tfhe::shortint::prelude::*;
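// Pack 64 ciphertexts encrypted under 1_1 parameters into 32 two-bit values and cast each of
// them to the 2_2 key in parallel.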
pub fn pack_cast_64(c: &mut Criterion) {
let bench_name = "shortint::pack_cast_64";
let mut bench_group = c.benchmark_group(bench_name);
let (client_key_1, server_key_1): (ClientKey, ServerKey) =
gen_keys(BENCH_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
let (client_key_2, server_key_2): (ClientKey, ServerKey) =
gen_keys(BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128);
let ks_param = BENCH_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128;
let ks_param_name = ks_param.name();
let ksk = KeySwitchingKey::new(
(&client_key_1, Some(&server_key_1)),
(&client_key_2, &server_key_2),
ks_param,
);
let vec_ct = vec![client_key_1.encrypt(1); 64];
let bench_id = format!("{bench_name}_{ks_param_name}");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let _ = (0..32)
.into_par_iter()
.map(|i| {
let byte_idx = 7 - i / 4;
let pair_idx = i % 4;
let b0 = &vec_ct[8 * byte_idx + 2 * pair_idx];
let b1 = &vec_ct[8 * byte_idx + 2 * pair_idx + 1];
ksk.cast(
&server_key_1.unchecked_add(b0, &server_key_1.unchecked_scalar_mul(b1, 2)),
)
})
.collect::<Vec<_>>();
});
});
write_to_json::<u64, _>(
&bench_id,
ks_param,
ks_param_name,
"pack_cast_64",
&OperatorType::Atomic,
0,
vec![],
);
}
pub fn pack_cast(c: &mut Criterion) {
let bench_name = "shortint::pack_cast";
let mut bench_group = c.benchmark_group(bench_name);
let (client_key_1, server_key_1): (ClientKey, ServerKey) =
gen_keys(BENCH_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
let (client_key_2, server_key_2): (ClientKey, ServerKey) =
gen_keys(BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128);
let ks_param = BENCH_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128;
let ks_param_name = "BENCH_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128";
let ksk = KeySwitchingKey::new(
(&client_key_1, Some(&server_key_1)),
(&client_key_2, &server_key_2),
ks_param,
);
let ct_1 = client_key_1.encrypt(1);
let ct_2 = client_key_1.encrypt(1);
let bench_id = format!("{bench_name}_{ks_param_name}");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let _ = ksk.cast(
&server_key_1.unchecked_add(&ct_1, &server_key_1.unchecked_scalar_mul(&ct_2, 2)),
);
});
});
write_to_json::<u64, _>(
&bench_id,
ks_param,
ks_param_name,
"pack_cast",
&OperatorType::Atomic,
0,
vec![],
);
}
pub fn cast(c: &mut Criterion) {
let bench_name = "shortint::cast";
let mut bench_group = c.benchmark_group(bench_name);
let (client_key_1, server_key_1): (ClientKey, ServerKey) =
gen_keys(BENCH_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
let (client_key_2, server_key_2): (ClientKey, ServerKey) =
gen_keys(BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128);
let ks_param = BENCH_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128;
let ks_param_name = "BENCH_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128";
let ksk = KeySwitchingKey::new(
(&client_key_1, Some(&server_key_1)),
(&client_key_2, &server_key_2),
ks_param,
);
let ct = client_key_1.encrypt(1);
let bench_id = format!("{bench_name}_{ks_param_name}");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let _ = ksk.cast(&ct);
});
});
write_to_json::<u64, _>(
&bench_id,
ks_param,
ks_param_name,
"cast",
&OperatorType::Atomic,
0,
vec![],
);
}


@@ -0,0 +1,82 @@
use benchmark::params_aliases::*;
use criterion::{black_box, criterion_group, Criterion};
use rayon::iter::{IntoParallelIterator, ParallelIterator};
use tfhe::shortint::prelude::*;
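// Benchmark GLWE packing compression on shortint ciphertexts: packing a list of 256 ciphertexts,
// unpacking it fully, partially and element by element, and a full pack/unpack round trip.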
fn glwe_packing(c: &mut Criterion) {
let param = BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
let comp_param = BENCH_COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
let number_to_pack = 256;
let bench_name = "shortint_packing_compression";
let mut bench_group = c.benchmark_group(bench_name);
// Generate the client key and the compression/decompression keys:
let cks = ClientKey::new(param);
let private_compression_key = cks.new_compression_private_key(comp_param);
let (compression_key, decompression_key) =
cks.new_compression_decompression_keys(&private_compression_key);
let ct: Vec<_> = (0..number_to_pack).map(|_| cks.encrypt(0)).collect();
bench_group.bench_function("pack".to_owned(), |b| {
b.iter(|| {
let packed = compression_key.compress_ciphertexts_into_list(&ct);
_ = black_box(packed);
})
});
let packed = compression_key.compress_ciphertexts_into_list(&ct);
bench_group.bench_function("unpack_all".to_owned(), |b| {
b.iter(|| {
(0..number_to_pack).into_par_iter().for_each(|i| {
let unpacked = decompression_key.unpack(&packed, i);
_ = black_box(unpacked);
});
})
});
bench_group.bench_function("unpack_one_lwe".to_owned(), |b| {
b.iter(|| {
let unpacked = decompression_key.unpack(&packed, 0);
_ = black_box(unpacked);
})
});
bench_group.bench_function("unpack_64b".to_owned(), |b| {
b.iter(|| {
(0..32).into_par_iter().for_each(|i| {
let unpacked = decompression_key.unpack(&packed, i);
_ = black_box(unpacked);
});
})
});
bench_group.bench_function("pack_unpack".to_owned(), |b| {
b.iter(|| {
let packed = compression_key.compress_ciphertexts_into_list(&ct);
(0..number_to_pack).into_par_iter().for_each(|i| {
let unpacked = decompression_key.unpack(&packed, i);
_ = black_box(unpacked);
});
})
});
}
criterion_group!(glwe_packing2, glwe_packing);
fn main() {
glwe_packing2();
Criterion::default().configure_from_args().final_summary();
}


@@ -0,0 +1,29 @@
use benchmark::params_aliases::*;
use criterion::{black_box, criterion_group, Criterion};
use tfhe::keycache::NamedParam;
use tfhe::shortint::keycache::KEY_CACHE;
use tfhe_csprng::seeders::Seed;
fn oprf(c: &mut Criterion) {
let bench_name = "shortint-oprf";
let mut bench_group = c.benchmark_group(bench_name);
let param = BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS;
let keys = KEY_CACHE.get_from_param(param);
let sks = keys.server_key();
bench_group.bench_function(format!("2-bits-oprf::{}", param.name()), |b| {
b.iter(|| {
_ = black_box(sks.generate_oblivious_pseudo_random(Seed(0), 2));
})
});
}
criterion_group!(oprf2, oprf);
fn main() {
oprf2();
Criterion::default().configure_from_args().final_summary();
}


@@ -0,0 +1,88 @@
use benchmark::utilities::{write_to_json, OperatorType};
use std::fs::{File, OpenOptions};
use std::io::Write;
use std::path::Path;
use tfhe::boolean::parameters::{DEFAULT_PARAMETERS, PARAMETERS_ERROR_PROB_2_POW_MINUS_165};
use tfhe::boolean::{client_key, server_key};
fn write_result(file: &mut File, name: &str, value: usize) {
let line = format!("{name},{value}\n");
let error_message = format!("cannot write {name} result into file");
file.write_all(line.as_bytes()).expect(&error_message);
}
fn client_server_key_sizes(results_file: &Path) {
let boolean_params_vec = [
(DEFAULT_PARAMETERS, "DEFAULT_PARAMETERS"),
(PARAMETERS_ERROR_PROB_2_POW_MINUS_165, "TFHE_LIB_PARAMETERS"),
];
File::create(results_file).expect("create results file failed");
let mut file = OpenOptions::new()
.append(true)
.open(results_file)
.expect("cannot open results file");
let operator = OperatorType::Atomic;
println!("Generating boolean (ClientKey, ServerKey)");
for (i, (params, params_name)) in boolean_params_vec.iter().enumerate() {
println!(
"Generating [{} / {}] : {}",
i + 1,
boolean_params_vec.len(),
params_name.to_lowercase()
);
let cks = client_key::ClientKey::new(params);
let sks = server_key::ServerKey::new(&cks);
let ksk_size = sks.key_switching_key_size_bytes();
let test_name = format!("boolean_key_sizes_{params_name}_ksk");
write_result(&mut file, &test_name, ksk_size);
write_to_json::<u32, _>(
&test_name,
*params,
*params_name,
"KSK",
&operator,
0,
vec![],
);
println!(
"Element in KSK: {}, size in bytes: {}",
sks.key_switching_key_size_elements(),
ksk_size,
);
let bsk_size = sks.bootstrapping_key_size_bytes();
let test_name = format!("boolean_key_sizes_{params_name}_bsk");
write_result(&mut file, &test_name, bsk_size);
write_to_json::<u32, _>(
&test_name,
*params,
*params_name,
"BSK",
&operator,
0,
vec![],
);
println!(
"Element in BSK: {}, size in bytes: {}",
sks.bootstrapping_key_size_elements(),
bsk_size,
);
}
}
fn main() {
let work_dir = std::env::current_dir().unwrap();
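// Run from the `tfhe` directory so that the results file is written in the same location as the
// other key size benchmarks.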
let mut new_work_dir = work_dir;
new_work_dir.push("tfhe");
std::env::set_current_dir(new_work_dir).unwrap();
let results_file = Path::new("boolean_key_sizes.csv");
client_server_key_sizes(results_file)
}


@@ -0,0 +1,145 @@
use benchmark::params_aliases::*;
use benchmark::utilities::{write_to_json, OperatorType};
use rand::Rng;
use std::fs::{File, OpenOptions};
use std::io::Write;
use std::path::Path;
use tfhe::integer::U256;
use tfhe::keycache::NamedParam;
use tfhe::shortint::PBSParameters;
use tfhe::{generate_keys, CompactCiphertextList, CompactPublicKey, ConfigBuilder};
fn write_result(file: &mut File, name: &str, value: usize) {
let line = format!("{name},{value}\n");
let error_message = format!("cannot write {name} result into file");
file.write_all(line.as_bytes()).expect(&error_message);
}
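// Measure the serialized sizes of a compact public key and of compact ciphertext lists of
// NB_CTXT elements, for 32-bit and 256-bit inputs.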
pub fn cpk_and_cctl_sizes(results_file: &Path) {
const NB_CTXT: usize = 5;
let mut rng = rand::thread_rng();
File::create(results_file).expect("create results file failed");
let mut file = OpenOptions::new()
.create(true)
.truncate(true)
.write(true)
.open(results_file)
.expect("cannot open results file");
let operator = OperatorType::Atomic;
{
let params = BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
let config = ConfigBuilder::default()
.use_custom_parameters(params)
.use_dedicated_compact_public_key_parameters((
BENCH_PARAM_PKE_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
BENCH_PARAM_KEYSWITCH_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
))
.build();
let (client_key, _) = generate_keys(config);
let test_name = format!("hlapi_sizes_{}_cpk", params.name());
let params: PBSParameters = params.into();
println!("Sizes for: {} and 32 bits", params.name());
let public_key = CompactPublicKey::new(&client_key);
let cpk_size = bincode::serialize(&public_key).unwrap().len();
println!("PK size: {cpk_size} bytes");
write_result(&mut file, &test_name, cpk_size);
write_to_json::<u64, _>(
&test_name,
params,
params.name(),
"CPK",
&operator,
0,
vec![],
);
let test_name = format!("hlapi_sizes_{}_cctl_{NB_CTXT}_len_32_bits", params.name());
let vec_inputs: Vec<_> = (0..NB_CTXT).map(|_| rng.gen::<u32>()).collect();
let encrypted_inputs = CompactCiphertextList::builder(&public_key)
.extend(vec_inputs.iter().copied())
.build();
let cctl_size = bincode::serialize(&encrypted_inputs).unwrap().len();
println!("Compact CT list for {NB_CTXT} CTs: {cctl_size} bytes");
write_result(&mut file, &test_name, cctl_size);
write_to_json::<u64, _>(
&test_name,
params,
params.name(),
"CCTL",
&operator,
0,
vec![],
);
}
// 256 bits
{
let params = BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
let config = ConfigBuilder::default()
.use_custom_parameters(params)
.use_dedicated_compact_public_key_parameters((
BENCH_PARAM_PKE_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
BENCH_PARAM_KEYSWITCH_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
))
.build();
let (client_key, _) = generate_keys(config);
let params: PBSParameters = params.into();
println!("Sizes for: {} and 256 bits", params.name());
let public_key = CompactPublicKey::new(&client_key);
println!(
"PK size: {} bytes",
bincode::serialize(&public_key).unwrap().len()
);
let test_name = format!("hlapi_sizes_{}_cctl_{NB_CTXT}_len_256_bits", params.name());
let vec_inputs: Vec<_> = (0..NB_CTXT).map(|_| U256::from(rng.gen::<u32>())).collect();
let encrypted_inputs = CompactCiphertextList::builder(&public_key)
.extend(vec_inputs.iter().copied())
.build();
let cctl_size = bincode::serialize(&encrypted_inputs).unwrap().len();
println!("Compact CT list for {NB_CTXT} CTs: {cctl_size} bytes");
write_result(&mut file, &test_name, cctl_size);
write_to_json::<u64, _>(
&test_name,
params,
params.name(),
"CCTL",
&operator,
0,
vec![],
);
}
}
fn main() {
let work_dir = std::env::current_dir().unwrap();
println!("work_dir: {}", std::env::current_dir().unwrap().display());
// Change workdir so that the location of the keycache matches the one for tests
let mut new_work_dir = work_dir;
new_work_dir.push("tfhe");
std::env::set_current_dir(new_work_dir).unwrap();
let results_file = Path::new("hlapi_cpk_and_cctl_sizes.csv");
cpk_and_cctl_sizes(results_file)
}


@@ -0,0 +1,291 @@
use benchmark::params_aliases::*;
use benchmark::utilities::{write_to_json, CryptoParametersRecord, OperatorType};
use std::fs::{File, OpenOptions};
use std::io::Write;
use std::path::Path;
use tfhe::keycache::NamedParam;
use tfhe::shortint::atomic_pattern::compressed::CompressedAtomicPatternServerKey;
use tfhe::shortint::keycache::KEY_CACHE;
use tfhe::shortint::server_key::{StandardServerKey, StandardServerKeyView};
use tfhe::shortint::{
ClassicPBSParameters, ClientKey, CompactPrivateKey, CompressedCompactPublicKey,
CompressedKeySwitchingKey, CompressedServerKey, PBSParameters,
};
fn write_result(file: &mut File, name: &str, value: usize) {
let line = format!("{name},{value}\n");
let error_message = format!("cannot write {name} result into file");
file.write_all(line.as_bytes()).expect(&error_message);
}
fn client_server_key_sizes(results_file: &Path) {
let shortint_params_vec: Vec<PBSParameters> = vec![
BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128.into(),
BENCH_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128.into(),
BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128.into(),
BENCH_PARAM_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M128.into(),
BENCH_PARAM_MESSAGE_4_CARRY_4_KS_PBS_GAUSSIAN_2M128.into(),
BENCH_PARAM_MULTI_BIT_GROUP_2_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128.into(),
BENCH_PARAM_MULTI_BIT_GROUP_2_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128.into(),
BENCH_PARAM_MULTI_BIT_GROUP_2_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M128.into(),
BENCH_PARAM_MULTI_BIT_GROUP_3_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128.into(),
BENCH_PARAM_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128.into(),
BENCH_PARAM_MULTI_BIT_GROUP_3_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M128.into(),
BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128.into(),
BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128.into(),
BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128.into(),
BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M128.into(),
];
File::create(results_file).expect("create results file failed");
let mut file = OpenOptions::new()
.append(true)
.open(results_file)
.expect("cannot open results file");
let operator = OperatorType::Atomic;
println!("Generating shortint (ClientKey, ServerKey)");
for (i, params) in shortint_params_vec.iter().copied().enumerate() {
println!(
"Generating [{} / {}] : {}",
i + 1,
shortint_params_vec.len(),
params.name().to_lowercase()
);
let keys = KEY_CACHE.get_from_param(params);
let cks = keys.client_key();
let sks = StandardServerKeyView::try_from(keys.server_key().as_view()).unwrap();
let ksk_size = sks.key_switching_key_size_bytes();
let test_name = format!("shortint_key_sizes_{}_ksk", params.name());
write_result(&mut file, &test_name, ksk_size);
write_to_json::<u64, _>(
&test_name,
params,
params.name(),
"KSK",
&operator,
0,
vec![],
);
println!(
"Element in KSK: {}, size in bytes: {}",
sks.key_switching_key_size_elements(),
ksk_size,
);
let bsk_size = sks.bootstrapping_key_size_bytes();
let test_name = format!("shortint_key_sizes_{}_bsk", params.name());
write_result(&mut file, &test_name, bsk_size);
write_to_json::<u64, _>(
&test_name,
params,
params.name(),
"BSK",
&operator,
0,
vec![],
);
println!(
"Element in BSK: {}, size in bytes: {}",
sks.bootstrapping_key_size_elements(),
bsk_size,
);
let sks_compressed = CompressedServerKey::new(cks);
let bsk_compressed_size = sks_compressed.bootstrapping_key_size_bytes();
let test_name = format!("shortint_key_sizes_{}_bsk_compressed", params.name());
write_result(&mut file, &test_name, bsk_compressed_size);
write_to_json::<u64, _>(
&test_name,
params,
params.name(),
"BSK",
&operator,
0,
vec![],
);
println!(
"Element in BSK compressed: {}, size in bytes: {}",
sks_compressed.bootstrapping_key_size_elements(),
bsk_compressed_size,
);
// Clear keys as we go to avoid filling the RAM
KEY_CACHE.clear_in_memory_cache()
}
}
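// Serialize the given key material with bincode, record its size in the results file and emit
// the corresponding JSON entry.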
fn measure_serialized_size<T: serde::Serialize, P: Into<CryptoParametersRecord<u64>> + Clone>(
to_serialize: &T,
param: P,
param_name: &str,
test_name_suffix: &str,
display_name: &str,
file: &mut File,
) {
let serialized = bincode::serialize(to_serialize).unwrap();
let size = serialized.len();
let test_name = format!("shortint_key_sizes_{param_name}_{test_name_suffix}");
write_result(file, &test_name, size);
write_to_json::<u64, _>(
&test_name,
param.clone(),
param_name,
display_name,
&OperatorType::Atomic,
0,
vec![],
);
println!("{test_name_suffix} {param_name} -> size: {size} bytes",);
}
fn tuniform_key_set_sizes(results_file: &Path) {
File::create(results_file).expect("create results file failed");
let mut file = OpenOptions::new()
.append(true)
.open(results_file)
.expect("cannot open results file");
println!("Measuring shortint key sizes:");
let param_fhe = BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
let param_fhe_name = param_fhe.name();
let cks = ClientKey::new(param_fhe);
let compressed_sks = CompressedServerKey::new(&cks);
let sks = StandardServerKey::try_from(compressed_sks.decompress()).unwrap();
let std_compressed_ap_key = match &compressed_sks.compressed_ap_server_key {
CompressedAtomicPatternServerKey::Standard(
compressed_standard_atomic_pattern_server_key,
) => compressed_standard_atomic_pattern_server_key,
CompressedAtomicPatternServerKey::KeySwitch32(_) => {
panic!("KS32 is unsupported to measure key sizes at the moment")
}
};
measure_serialized_size(
&sks.atomic_pattern.key_switching_key,
<ClassicPBSParameters as Into<PBSParameters>>::into(param_fhe),
&param_fhe_name,
"ksk",
"KSK",
&mut file,
);
measure_serialized_size(
std_compressed_ap_key.key_switching_key(),
<ClassicPBSParameters as Into<PBSParameters>>::into(param_fhe),
&param_fhe_name,
"ksk_compressed",
"KSK",
&mut file,
);
measure_serialized_size(
&sks.atomic_pattern.bootstrapping_key,
<ClassicPBSParameters as Into<PBSParameters>>::into(param_fhe),
&param_fhe_name,
"bsk",
"BSK",
&mut file,
);
measure_serialized_size(
&std_compressed_ap_key.bootstrapping_key(),
<ClassicPBSParameters as Into<PBSParameters>>::into(param_fhe),
&param_fhe_name,
"bsk_compressed",
"BSK",
&mut file,
);
let param_pke = BENCH_PARAM_PKE_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
let param_pke_name = param_pke.name();
let compact_private_key = CompactPrivateKey::new(param_pke);
let compressed_pk = CompressedCompactPublicKey::new(&compact_private_key);
let pk = compressed_pk.decompress();
measure_serialized_size(&pk, param_pke, &param_pke_name, "cpk", "CPK", &mut file);
measure_serialized_size(
&compressed_pk,
param_pke,
&param_pke_name,
"cpk_compressed",
"CPK",
&mut file,
);
let param_compression = BENCH_COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
let param_compression_name = param_compression.name();
let params_tuple = (
param_compression,
BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
);
let private_compression_key = cks.new_compression_private_key(param_compression);
let (compression_key, decompression_key) =
cks.new_compression_decompression_keys(&private_compression_key);
measure_serialized_size(
&compression_key,
params_tuple,
&param_compression_name,
"compression_key",
"CompressionKey",
&mut file,
);
measure_serialized_size(
&decompression_key,
params_tuple,
&param_compression_name,
"decompression_key",
"CompressionKey",
&mut file,
);
let param_casting = BENCH_PARAM_KEYSWITCH_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
let param_casting_name = param_casting.name();
let compressed_casting_key = CompressedKeySwitchingKey::new(
(&compact_private_key, None),
(&cks, &compressed_sks),
param_casting,
);
let casting_key = compressed_casting_key.decompress();
measure_serialized_size(
&casting_key.into_raw_parts().0,
param_casting,
&param_casting_name,
"casting_key",
"CastKey",
&mut file,
);
measure_serialized_size(
&compressed_casting_key.into_raw_parts().0,
param_casting,
&param_casting_name,
"casting_key_compressed",
"CastKey",
&mut file,
);
}
fn main() {
let work_dir = std::env::current_dir().unwrap();
println!("work_dir: {}", std::env::current_dir().unwrap().display());
// Change workdir so that the location of the keycache matches the one for tests
let mut new_work_dir = work_dir;
new_work_dir.push("tfhe");
std::env::set_current_dir(new_work_dir).unwrap();
let results_file = Path::new("shortint_key_sizes.csv");
client_server_key_sizes(results_file);
tuniform_key_set_sizes(results_file);
}


@@ -0,0 +1,87 @@
use benchmark::utilities::{write_to_json, OperatorType};
use clap::Parser;
use std::collections::HashMap;
use std::fs;
use std::fs::{File, OpenOptions};
use std::io::Write;
use std::path::Path;
use tfhe::keycache::NamedParam;
use tfhe::shortint::keycache::get_shortint_parameter_set_from_name;
use tfhe::shortint::{ClassicPBSParameters, PBSParameters};
const BENCHMARK_NAME_PREFIX: &str = "wasm::";
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
raw_results_file: String,
}
fn params_from_name(name: &str) -> ClassicPBSParameters {
match get_shortint_parameter_set_from_name(name.to_uppercase().as_str())
.pbs_parameters()
.unwrap()
{
PBSParameters::PBS(p) => p,
PBSParameters::MultiBitPBS(_) => {
panic!("Tried to get a MultiBitPBS, expected ClassicPBSParameters")
}
}
}
fn write_result(file: &mut File, name: &str, value: usize) {
let line = format!("{name},{value}\n");
let error_message = format!("cannot write {name} result into file");
file.write_all(line.as_bytes()).expect(&error_message);
}
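// Parse raw wasm benchmark results (a JSON map of benchmark name to value) and re-emit them in
// the shared CSV/JSON format; timing values are converted from milliseconds to nanoseconds.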
pub fn parse_wasm_benchmarks(results_file: &Path, raw_results_file: &Path) {
File::create(results_file).expect("create results file failed");
let mut file = OpenOptions::new()
.append(true)
.open(results_file)
.expect("cannot open parsed results file");
let operator = OperatorType::Atomic;
let raw_results = fs::read_to_string(raw_results_file).expect("cannot open raw results file");
let results_as_json: HashMap<String, f32> = serde_json::from_str(&raw_results).unwrap();
for (full_name, val) in results_as_json.iter() {
let prefixed_full_name = format!("{BENCHMARK_NAME_PREFIX}{full_name}");
let name_parts = full_name.split("_mean_").collect::<Vec<_>>();
let bench_name = name_parts[0];
let params: PBSParameters = params_from_name(name_parts[1]).into();
println!("{name_parts:?}");
if bench_name.contains("_size") {
write_result(&mut file, &prefixed_full_name, *val as usize);
} else {
let value_in_ns = (val * 1_000_000_f32) as usize;
write_result(&mut file, &prefixed_full_name, value_in_ns);
}
write_to_json::<u64, _>(
&prefixed_full_name,
params,
params.name(),
bench_name,
&operator,
0,
vec![],
);
}
}
fn main() {
let args = Args::parse();
let work_dir = std::env::current_dir().unwrap();
let mut new_work_dir = work_dir;
new_work_dir.push("tfhe");
std::env::set_current_dir(new_work_dir).unwrap();
let results_file = Path::new("wasm_pk_gen.csv");
let raw_results = Path::new(&args.raw_results_file);
parse_wasm_benchmarks(results_file, raw_results);
}


@@ -0,0 +1,3 @@
pub mod params;
pub mod params_aliases;
pub mod utilities;


@@ -0,0 +1,449 @@
#[cfg(feature = "boolean")]
pub mod boolean_params {
use crate::utilities::CryptoParametersRecord;
use tfhe::boolean::parameters::{
DEFAULT_PARAMETERS, DEFAULT_PARAMETERS_KS_PBS, PARAMETERS_ERROR_PROB_2_POW_MINUS_165,
};
pub fn benchmark_32bits_parameters() -> Vec<(String, CryptoParametersRecord<u32>)> {
[
("BOOLEAN_DEFAULT_PARAMS", DEFAULT_PARAMETERS),
(
"BOOLEAN_TFHE_LIB_PARAMS",
PARAMETERS_ERROR_PROB_2_POW_MINUS_165,
),
("BOOLEAN_DEFAULT_PARAMS_KS_PBS", DEFAULT_PARAMETERS_KS_PBS),
]
.iter()
.map(|(name, params)| (name.to_string(), params.to_owned().into()))
.collect()
}
}
#[cfg(feature = "boolean")]
pub use boolean_params::*;
#[cfg(feature = "shortint")]
pub mod shortint_params {
use crate::params_aliases::*;
use crate::utilities::CryptoParametersRecord;
use std::collections::HashMap;
use std::env;
use std::sync::OnceLock;
use tfhe::core_crypto::prelude::{DynamicDistribution, LweBskGroupingFactor};
use tfhe::keycache::NamedParam;
use tfhe::shortint::{
CarryModulus, ClassicPBSParameters, MessageModulus, MultiBitPBSParameters, PBSParameters,
};
pub const SHORTINT_BENCH_PARAMS_TUNIFORM: [ClassicPBSParameters; 4] = [
BENCH_PARAM_MESSAGE_1_CARRY_1_KS_PBS_TUNIFORM_2M128,
BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
BENCH_PARAM_MESSAGE_3_CARRY_3_KS_PBS_TUNIFORM_2M128,
BENCH_PARAM_MESSAGE_4_CARRY_4_KS_PBS_TUNIFORM_2M128,
];
pub const SHORTINT_BENCH_PARAMS_GAUSSIAN: [ClassicPBSParameters; 4] = [
BENCH_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
BENCH_PARAM_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M128,
BENCH_PARAM_MESSAGE_4_CARRY_4_KS_PBS_GAUSSIAN_2M128,
];
#[cfg(feature = "gpu")]
pub const SHORTINT_MULTI_BIT_BENCH_PARAMS: [MultiBitPBSParameters; 6] = [
BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_1_CARRY_1_KS_PBS_TUNIFORM_2M128,
BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_3_CARRY_3_KS_PBS_TUNIFORM_2M128,
BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M128,
];
#[cfg(not(feature = "gpu"))]
pub const SHORTINT_MULTI_BIT_BENCH_PARAMS: [MultiBitPBSParameters; 6] = [
BENCH_PARAM_MULTI_BIT_GROUP_2_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
BENCH_PARAM_MULTI_BIT_GROUP_2_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
BENCH_PARAM_MULTI_BIT_GROUP_2_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M128,
BENCH_PARAM_MULTI_BIT_GROUP_3_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
BENCH_PARAM_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
BENCH_PARAM_MULTI_BIT_GROUP_3_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M128,
];
pub fn benchmark_parameters() -> Vec<(String, CryptoParametersRecord<u64>)> {
match get_parameters_set() {
ParametersSet::Default => SHORTINT_BENCH_PARAMS_TUNIFORM
.iter()
.chain(SHORTINT_BENCH_PARAMS_GAUSSIAN.iter())
.map(|params| {
(
params.name(),
<ClassicPBSParameters as Into<PBSParameters>>::into(*params)
.to_owned()
.into(),
)
})
.collect(),
ParametersSet::All => {
filter_parameters(
&BENCH_ALL_CLASSIC_PBS_PARAMETERS,
DesiredNoiseDistribution::Both,
DesiredBackend::Cpu, // No parameter set in this vector is GPU-specific
)
.into_iter()
.map(|(params, name)| {
(
name.to_string(),
<ClassicPBSParameters as Into<PBSParameters>>::into(*params)
.to_owned()
.into(),
)
})
.collect()
}
}
}
pub fn multi_bit_benchmark_parameters() -> Vec<(String, CryptoParametersRecord<u64>)> {
match get_parameters_set() {
ParametersSet::Default => SHORTINT_MULTI_BIT_BENCH_PARAMS
.iter()
.map(|params| {
(
params.name(),
<MultiBitPBSParameters as Into<PBSParameters>>::into(*params)
.to_owned()
.into(),
)
})
.collect(),
ParametersSet::All => {
let desired_backend = if cfg!(feature = "gpu") {
DesiredBackend::Gpu
} else {
DesiredBackend::Cpu
};
filter_parameters(
&BENCH_ALL_MULTI_BIT_PBS_PARAMETERS,
DesiredNoiseDistribution::Both,
desired_backend,
)
.into_iter()
.map(|(params, name)| {
(
name.to_string(),
<MultiBitPBSParameters as Into<PBSParameters>>::into(*params)
.to_owned()
.into(),
)
})
.collect()
}
}
}
pub fn multi_bit_benchmark_parameters_with_grouping(
) -> Vec<(String, CryptoParametersRecord<u64>, LweBskGroupingFactor)> {
match get_parameters_set() {
ParametersSet::Default => SHORTINT_MULTI_BIT_BENCH_PARAMS
.iter()
.map(|params| {
(
params.name(),
<MultiBitPBSParameters as Into<PBSParameters>>::into(*params)
.to_owned()
.into(),
params.grouping_factor,
)
})
.collect(),
ParametersSet::All => {
let desired_backend = if cfg!(feature = "gpu") {
DesiredBackend::Gpu
} else {
DesiredBackend::Cpu
};
filter_parameters(
&BENCH_ALL_MULTI_BIT_PBS_PARAMETERS,
DesiredNoiseDistribution::Both,
desired_backend,
)
.into_iter()
.map(|(params, name)| {
(
name.to_string(),
<MultiBitPBSParameters as Into<PBSParameters>>::into(*params)
.to_owned()
.into(),
params.grouping_factor,
)
})
.collect()
}
}
}
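// Select multi-bit or classic PBS parameter sets depending on the __TFHE_RS_PARAM_TYPE
// environment variable.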
pub fn raw_benchmark_parameters() -> Vec<PBSParameters> {
let is_multi_bit = match env::var("__TFHE_RS_PARAM_TYPE") {
Ok(val) => val.to_lowercase() == "multi_bit",
Err(_) => false,
};
if is_multi_bit {
SHORTINT_MULTI_BIT_BENCH_PARAMS
.iter()
.map(|p| (*p).into())
.collect()
} else {
SHORTINT_BENCH_PARAMS_TUNIFORM
.iter()
.chain(SHORTINT_BENCH_PARAMS_GAUSSIAN.iter())
.map(|p| (*p).into())
.collect()
}
}
pub fn benchmark_compression_parameters() -> Vec<(String, CryptoParametersRecord<u64>)> {
vec![(
BENCH_COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128.name(),
(
BENCH_COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
)
.into(),
)]
}
// This array has been built from performance benchmarks measuring latency over a
// matrix of 4 parameter sets, 3 grouping factors and a wide range of thread counts.
// The u64 values are the optimal number of threads to use for a given triplet
// representing one or more parameter sets.
const MULTI_BIT_THREADS_ARRAY: [((MessageModulus, CarryModulus, LweBskGroupingFactor), u64);
12] = [
(
(MessageModulus(2), CarryModulus(2), LweBskGroupingFactor(2)),
5,
),
(
(MessageModulus(4), CarryModulus(4), LweBskGroupingFactor(2)),
5,
),
(
(MessageModulus(8), CarryModulus(8), LweBskGroupingFactor(2)),
5,
),
(
(
MessageModulus(16),
CarryModulus(16),
LweBskGroupingFactor(2),
),
5,
),
(
(MessageModulus(2), CarryModulus(2), LweBskGroupingFactor(3)),
7,
),
(
(MessageModulus(4), CarryModulus(4), LweBskGroupingFactor(3)),
9,
),
(
(MessageModulus(8), CarryModulus(8), LweBskGroupingFactor(3)),
10,
),
(
(
MessageModulus(16),
CarryModulus(16),
LweBskGroupingFactor(3),
),
10,
),
(
(MessageModulus(2), CarryModulus(2), LweBskGroupingFactor(4)),
11,
),
(
(MessageModulus(4), CarryModulus(4), LweBskGroupingFactor(4)),
13,
),
(
(MessageModulus(8), CarryModulus(8), LweBskGroupingFactor(4)),
11,
),
(
(
MessageModulus(16),
CarryModulus(16),
LweBskGroupingFactor(4),
),
11,
),
];
/// Return the number of threads to use for parameter sets performing multithreaded programmable
/// bootstrapping.
///
/// Message modulus and carry modulus must be equal.
/// Only grouping factors 2, 3 and 4 are supported.
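/// For example, per the table above, `multi_bit_num_threads(4, 4, 3)` returns `Some(9)`, while
/// mismatched moduli such as `multi_bit_num_threads(2, 4, 3)` return `None`.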
pub fn multi_bit_num_threads(
message_modulus: u64,
carry_modulus: u64,
grouping_factor: usize,
) -> Option<u64> {
// TODO Implement an interpolation mechanism for X_Y parameter sets
if message_modulus != carry_modulus || ![2, 3, 4].contains(&(grouping_factor as i32)) {
return None;
}
let thread_map: HashMap<(MessageModulus, CarryModulus, LweBskGroupingFactor), u64> =
HashMap::from_iter(MULTI_BIT_THREADS_ARRAY);
thread_map
.get(&(
MessageModulus(message_modulus),
CarryModulus(carry_modulus),
LweBskGroupingFactor(grouping_factor),
))
.copied()
}
pub static PARAMETERS_SET: OnceLock<ParametersSet> = OnceLock::new();
pub enum ParametersSet {
Default,
All,
}
impl ParametersSet {
pub fn from_env() -> Result<Self, String> {
let raw_value = env::var("__TFHE_RS_PARAMS_SET").unwrap_or("default".to_string());
match raw_value.to_lowercase().as_str() {
"default" => Ok(ParametersSet::Default),
"all" => Ok(ParametersSet::All),
_ => Err(format!("parameters set '{raw_value}' is not supported")),
}
}
}
pub fn get_parameters_set() -> &'static ParametersSet {
PARAMETERS_SET.get_or_init(|| ParametersSet::from_env().unwrap())
}
#[derive(Clone, Copy, Debug)]
pub enum DesiredNoiseDistribution {
Gaussian,
TUniform,
Both,
}
#[derive(Clone, Copy, Debug)]
pub enum DesiredBackend {
Cpu,
Gpu,
}
impl DesiredBackend {
fn matches_parameter_name_backend(&self, param_name: &str) -> bool {
matches!(
(self, param_name.to_lowercase().contains("gpu")),
(DesiredBackend::Cpu, false) | (DesiredBackend::Gpu, true)
)
}
}
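/// Keep only the parameter sets matching the requested noise distribution and backend; the
/// backend is inferred from the parameter name.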
pub fn filter_parameters<'a, P: Copy + Into<PBSParameters>>(
params: &[(&'a P, &'a str)],
desired_noise_distribution: DesiredNoiseDistribution,
desired_backend: DesiredBackend,
) -> Vec<(&'a P, &'a str)> {
params
.iter()
.filter_map(|(p, name)| {
let temp_param: PBSParameters = (**p).into();
match (
temp_param.lwe_noise_distribution(),
desired_noise_distribution,
) {
// If it's one of the pairs, we continue the process.
(DynamicDistribution::Gaussian(_), DesiredNoiseDistribution::Gaussian)
| (DynamicDistribution::TUniform(_), DesiredNoiseDistribution::TUniform)
| (_, DesiredNoiseDistribution::Both) => (),
_ => return None,
}
if !desired_backend.matches_parameter_name_backend(name) {
return None;
};
Some((*p, *name))
})
.collect()
}
}
#[cfg(feature = "shortint")]
pub use shortint_params::*;
#[cfg(feature = "integer")]
mod integer_params {
use crate::params_aliases::*;
use crate::utilities::EnvConfig;
use itertools::iproduct;
use std::vec::IntoIter;
use tfhe::shortint::PBSParameters;
/// An iterator that yields successive combinations of parameters and the number of blocks
/// needed to reach a given ciphertext bit size in radix decomposition.
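/// For example, with a message modulus of 4 (2 bits per block), a 64-bit target yields 32 blocks.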
pub struct ParamsAndNumBlocksIter {
params_and_bit_sizes: itertools::Product<IntoIter<PBSParameters>, IntoIter<usize>>,
}
impl Default for ParamsAndNumBlocksIter {
fn default() -> Self {
let env_config = EnvConfig::new();
if env_config.is_multi_bit {
#[cfg(feature = "gpu")]
let params = vec![
BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128
.into(),
];
#[cfg(not(feature = "gpu"))]
let params = vec![
BENCH_PARAM_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128.into(),
];
let params_and_bit_sizes = iproduct!(params, env_config.bit_sizes());
Self {
params_and_bit_sizes,
}
} else {
// FIXME Only one parameter set is tested since we only want to benchmark the quickest
// operations.
let params = vec![BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128.into()];
let params_and_bit_sizes = iproduct!(params, env_config.bit_sizes());
Self {
params_and_bit_sizes,
}
}
}
}
impl Iterator for ParamsAndNumBlocksIter {
type Item = (PBSParameters, usize, usize);
fn next(&mut self) -> Option<Self::Item> {
let (param, bit_size) = self.params_and_bit_sizes.next()?;
let num_block =
(bit_size as f64 / (param.message_modulus().0 as f64).log(2.0)).ceil() as usize;
Some((param, num_block, bit_size))
}
}
}
#[cfg(feature = "integer")]
pub use integer_params::*;


@@ -0,0 +1,142 @@
#[cfg(any(feature = "shortint", feature = "integer"))]
pub mod shortint_params_aliases {
use tfhe::shortint::parameters::current_params::*;
use tfhe::shortint::parameters::{
ClassicPBSParameters, CompactPublicKeyEncryptionParameters, CompressionParameters,
MultiBitPBSParameters, NoiseSquashingParameters, ShortintKeySwitchingParameters,
};
// KS PBS Gaussian
pub const BENCH_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128: ClassicPBSParameters =
V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128;
pub const BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128: ClassicPBSParameters =
V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128;
pub const BENCH_PARAM_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M128: ClassicPBSParameters =
V1_2_PARAM_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M128;
pub const BENCH_PARAM_MESSAGE_4_CARRY_4_KS_PBS_GAUSSIAN_2M128: ClassicPBSParameters =
V1_2_PARAM_MESSAGE_4_CARRY_4_KS_PBS_GAUSSIAN_2M128;
// KS PBS TUniform
pub const BENCH_PARAM_MESSAGE_1_CARRY_1_KS_PBS_TUNIFORM_2M128: ClassicPBSParameters =
V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_TUNIFORM_2M128;
pub const BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128: ClassicPBSParameters =
V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
pub const BENCH_PARAM_MESSAGE_3_CARRY_3_KS_PBS_TUNIFORM_2M128: ClassicPBSParameters =
V1_2_PARAM_MESSAGE_3_CARRY_3_KS_PBS_TUNIFORM_2M128;
pub const BENCH_PARAM_MESSAGE_4_CARRY_4_KS_PBS_TUNIFORM_2M128: ClassicPBSParameters =
V1_2_PARAM_MESSAGE_4_CARRY_4_KS_PBS_TUNIFORM_2M128;
pub const BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS: ClassicPBSParameters =
V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
pub const BENCH_ALL_CLASSIC_PBS_PARAMETERS: [(&ClassicPBSParameters, &str); 140] =
VEC_ALL_CLASSIC_PBS_PARAMETERS;
// MultiBit
// CPU Gaussian
pub const BENCH_PARAM_MULTI_BIT_GROUP_2_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128:
MultiBitPBSParameters =
V1_2_PARAM_MULTI_BIT_GROUP_2_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128;
pub const BENCH_PARAM_MULTI_BIT_GROUP_2_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128:
MultiBitPBSParameters =
V1_2_PARAM_MULTI_BIT_GROUP_2_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128;
pub const BENCH_PARAM_MULTI_BIT_GROUP_2_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M128:
MultiBitPBSParameters =
V1_2_PARAM_MULTI_BIT_GROUP_2_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M128;
pub const BENCH_PARAM_MULTI_BIT_GROUP_3_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128:
MultiBitPBSParameters =
V1_2_PARAM_MULTI_BIT_GROUP_3_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128;
pub const BENCH_PARAM_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128:
MultiBitPBSParameters =
V1_2_PARAM_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128;
pub const BENCH_PARAM_MULTI_BIT_GROUP_3_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M128:
MultiBitPBSParameters =
V1_2_PARAM_MULTI_BIT_GROUP_3_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M128;
// GPU Gaussian
pub const BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128:
MultiBitPBSParameters =
V1_2_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128;
pub const BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128:
MultiBitPBSParameters =
V1_2_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128;
pub const BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M128:
MultiBitPBSParameters =
V1_2_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M128;
// GPU TUniform
pub const BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_1_CARRY_1_KS_PBS_TUNIFORM_2M128:
MultiBitPBSParameters =
V1_2_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_1_CARRY_1_KS_PBS_TUNIFORM_2M128;
pub const BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128:
MultiBitPBSParameters =
V1_2_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
pub const BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_3_CARRY_3_KS_PBS_TUNIFORM_2M128:
MultiBitPBSParameters =
V1_2_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_3_CARRY_3_KS_PBS_TUNIFORM_2M128;
pub const BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_4_CARRY_4_KS_PBS_TUNIFORM_2M128:
MultiBitPBSParameters =
V1_2_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_4_CARRY_4_KS_PBS_TUNIFORM_2M128;
pub const BENCH_ALL_MULTI_BIT_PBS_PARAMETERS: [(&MultiBitPBSParameters, &str); 240] =
VEC_ALL_MULTI_BIT_PBS_PARAMETERS;
// PKE
pub const BENCH_PARAM_PKE_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128:
CompactPublicKeyEncryptionParameters =
V1_2_PARAM_PKE_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
pub const BENCH_PARAM_PKE_TO_BIG_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128_ZKV1:
CompactPublicKeyEncryptionParameters =
V1_2_PARAM_PKE_TO_BIG_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128_ZKV1;
// KS
pub const BENCH_PARAM_KEYSWITCH_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128:
ShortintKeySwitchingParameters =
V1_2_PARAM_KEYSWITCH_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
pub const BENCH_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128:
ShortintKeySwitchingParameters =
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128;
pub const BENCH_PARAM_KEYSWITCH_PKE_TO_SMALL_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128:
ShortintKeySwitchingParameters =
V1_2_PARAM_KEYSWITCH_PKE_TO_SMALL_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
pub const BENCH_PARAM_KEYSWITCH_PKE_TO_BIG_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128:
ShortintKeySwitchingParameters =
V1_2_PARAM_KEYSWITCH_PKE_TO_BIG_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
// ZKV1
pub const BENCH_PARAM_PKE_TO_SMALL_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128_ZKV1:
CompactPublicKeyEncryptionParameters =
V1_2_PARAM_PKE_TO_SMALL_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128_ZKV1;
pub const BENCH_PARAM_KEYSWITCH_PKE_TO_BIG_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128_ZKV1:
ShortintKeySwitchingParameters =
V1_2_PARAM_KEYSWITCH_PKE_TO_BIG_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128_ZKV1;
pub const BENCH_PARAM_KEYSWITCH_PKE_TO_SMALL_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128_ZKV1:
ShortintKeySwitchingParameters =
V1_2_PARAM_KEYSWITCH_PKE_TO_SMALL_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128_ZKV1;
// ZKV2
pub const BENCH_PARAM_PKE_TO_SMALL_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128_ZKV2:
CompactPublicKeyEncryptionParameters =
V1_2_PARAM_PKE_TO_SMALL_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128_ZKV2;
pub const BENCH_PARAM_KEYSWITCH_PKE_TO_BIG_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128_ZKV2:
ShortintKeySwitchingParameters =
V1_2_PARAM_KEYSWITCH_PKE_TO_BIG_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128_ZKV2;
pub const BENCH_PARAM_KEYSWITCH_PKE_TO_SMALL_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128_ZKV2:
ShortintKeySwitchingParameters =
V1_2_PARAM_KEYSWITCH_PKE_TO_SMALL_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128_ZKV2;
// Compression
pub const BENCH_COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128: CompressionParameters =
V1_2_COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
pub const BENCH_COMP_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128:
CompressionParameters =
V1_2_COMP_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
// Noise Squashing
pub const BENCH_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128:
NoiseSquashingParameters =
V1_2_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
}
#[cfg(any(feature = "shortint", feature = "integer"))]
pub use shortint_params_aliases::*;


@@ -0,0 +1,650 @@
use serde::Serialize;
use std::path::PathBuf;
use std::sync::OnceLock;
use std::{env, fs};
#[cfg(feature = "gpu")]
use tfhe::core_crypto::gpu::get_number_of_gpus;
use tfhe::core_crypto::prelude::*;
#[cfg(feature = "boolean")]
pub mod boolean_utils {
use super::*;
use tfhe::boolean::parameters::BooleanParameters;
impl From<BooleanParameters> for CryptoParametersRecord<u32> {
fn from(params: BooleanParameters) -> Self {
CryptoParametersRecord {
lwe_dimension: Some(params.lwe_dimension),
glwe_dimension: Some(params.glwe_dimension),
polynomial_size: Some(params.polynomial_size),
lwe_noise_distribution: Some(params.lwe_noise_distribution),
glwe_noise_distribution: Some(params.glwe_noise_distribution),
pbs_base_log: Some(params.pbs_base_log),
pbs_level: Some(params.pbs_level),
ks_base_log: Some(params.ks_base_log),
ks_level: Some(params.ks_level),
ciphertext_modulus: Some(CiphertextModulus::<u32>::new_native()),
..Default::default()
}
}
}
}
#[allow(unused_imports)]
#[cfg(feature = "boolean")]
pub use boolean_utils::*;
#[cfg(feature = "shortint")]
pub mod shortint_utils {
use super::*;
use tfhe::shortint::parameters::compact_public_key_only::CompactPublicKeyEncryptionParameters;
use tfhe::shortint::parameters::list_compression::CompressionParameters;
use tfhe::shortint::parameters::ShortintKeySwitchingParameters;
use tfhe::shortint::{
AtomicPatternParameters, ClassicPBSParameters, MultiBitPBSParameters, PBSParameters,
ShortintParameterSet,
};
impl From<PBSParameters> for CryptoParametersRecord<u64> {
fn from(params: PBSParameters) -> Self {
AtomicPatternParameters::from(params).into()
}
}
impl From<AtomicPatternParameters> for CryptoParametersRecord<u64> {
fn from(params: AtomicPatternParameters) -> Self {
CryptoParametersRecord {
lwe_dimension: Some(params.lwe_dimension()),
glwe_dimension: Some(params.glwe_dimension()),
polynomial_size: Some(params.polynomial_size()),
lwe_noise_distribution: Some(params.lwe_noise_distribution()),
glwe_noise_distribution: Some(params.glwe_noise_distribution()),
pbs_base_log: Some(params.pbs_base_log()),
pbs_level: Some(params.pbs_level()),
ks_base_log: Some(params.ks_base_log()),
ks_level: Some(params.ks_level()),
message_modulus: Some(params.message_modulus().0),
carry_modulus: Some(params.carry_modulus().0),
ciphertext_modulus: Some(
params
.ciphertext_modulus()
.try_to()
.expect("failed to convert ciphertext modulus"),
),
..Default::default()
}
}
}
impl From<ShortintKeySwitchingParameters> for CryptoParametersRecord<u64> {
fn from(params: ShortintKeySwitchingParameters) -> Self {
CryptoParametersRecord {
ks_base_log: Some(params.ks_base_log),
ks_level: Some(params.ks_level),
..Default::default()
}
}
}
impl From<CompactPublicKeyEncryptionParameters> for CryptoParametersRecord<u64> {
fn from(params: CompactPublicKeyEncryptionParameters) -> Self {
CryptoParametersRecord {
message_modulus: Some(params.message_modulus.0),
carry_modulus: Some(params.carry_modulus.0),
ciphertext_modulus: Some(params.ciphertext_modulus),
..Default::default()
}
}
}
impl From<(CompressionParameters, ClassicPBSParameters)> for CryptoParametersRecord<u64> {
fn from((comp_params, pbs_params): (CompressionParameters, ClassicPBSParameters)) -> Self {
(comp_params, PBSParameters::PBS(pbs_params)).into()
}
}
impl From<(CompressionParameters, MultiBitPBSParameters)> for CryptoParametersRecord<u64> {
fn from(
(comp_params, multi_bit_pbs_params): (CompressionParameters, MultiBitPBSParameters),
) -> Self {
(
comp_params,
PBSParameters::MultiBitPBS(multi_bit_pbs_params),
)
.into()
}
}
impl From<(CompressionParameters, PBSParameters)> for CryptoParametersRecord<u64> {
fn from((comp_params, pbs_params): (CompressionParameters, PBSParameters)) -> Self {
let pbs_params = ShortintParameterSet::new_pbs_param_set(pbs_params);
let lwe_dimension = pbs_params.encryption_lwe_dimension();
CryptoParametersRecord {
lwe_dimension: Some(lwe_dimension),
br_level: Some(comp_params.br_level),
br_base_log: Some(comp_params.br_base_log),
packing_ks_level: Some(comp_params.packing_ks_level),
packing_ks_base_log: Some(comp_params.packing_ks_base_log),
packing_ks_polynomial_size: Some(comp_params.packing_ks_polynomial_size),
packing_ks_glwe_dimension: Some(comp_params.packing_ks_glwe_dimension),
lwe_per_glwe: Some(comp_params.lwe_per_glwe),
storage_log_modulus: Some(comp_params.storage_log_modulus),
lwe_noise_distribution: Some(pbs_params.encryption_noise_distribution()),
packing_ks_key_noise_distribution: Some(
comp_params.packing_ks_key_noise_distribution,
),
ciphertext_modulus: Some(pbs_params.ciphertext_modulus()),
..Default::default()
}
}
}
}
#[allow(unused_imports)]
#[cfg(feature = "shortint")]
pub use shortint_utils::*;
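// Illustrative sketch (not part of the diff): the impls above let a benchmark build a
// record from a shortint parameter set; the constant name below is an assumption, any
// ClassicPBSParameters value works the same way:
//
//     use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS;
//     use tfhe::shortint::PBSParameters;
//     let record: CryptoParametersRecord<u64> =
//         PBSParameters::PBS(PARAM_MESSAGE_2_CARRY_2_KS_PBS).into();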
#[derive(Clone, Copy, Default, Serialize)]
pub struct CryptoParametersRecord<Scalar: UnsignedInteger> {
pub lwe_dimension: Option<LweDimension>,
pub glwe_dimension: Option<GlweDimension>,
pub packing_ks_glwe_dimension: Option<GlweDimension>,
pub polynomial_size: Option<PolynomialSize>,
pub packing_ks_polynomial_size: Option<PolynomialSize>,
#[serde(serialize_with = "CryptoParametersRecord::serialize_distribution")]
pub lwe_noise_distribution: Option<DynamicDistribution<Scalar>>,
#[serde(serialize_with = "CryptoParametersRecord::serialize_distribution")]
pub glwe_noise_distribution: Option<DynamicDistribution<Scalar>>,
#[serde(serialize_with = "CryptoParametersRecord::serialize_distribution")]
pub packing_ks_key_noise_distribution: Option<DynamicDistribution<Scalar>>,
pub pbs_base_log: Option<DecompositionBaseLog>,
pub pbs_level: Option<DecompositionLevelCount>,
pub ks_base_log: Option<DecompositionBaseLog>,
pub ks_level: Option<DecompositionLevelCount>,
pub pfks_level: Option<DecompositionLevelCount>,
pub pfks_base_log: Option<DecompositionBaseLog>,
pub pfks_std_dev: Option<StandardDev>,
pub cbs_level: Option<DecompositionLevelCount>,
pub cbs_base_log: Option<DecompositionBaseLog>,
pub br_level: Option<DecompositionLevelCount>,
pub br_base_log: Option<DecompositionBaseLog>,
pub packing_ks_level: Option<DecompositionLevelCount>,
pub packing_ks_base_log: Option<DecompositionBaseLog>,
pub message_modulus: Option<u64>,
pub carry_modulus: Option<u64>,
pub ciphertext_modulus: Option<CiphertextModulus<Scalar>>,
pub lwe_per_glwe: Option<LweCiphertextCount>,
pub storage_log_modulus: Option<CiphertextModulusLog>,
}
impl<Scalar: UnsignedInteger> CryptoParametersRecord<Scalar> {
pub fn noise_distribution_as_string(noise_distribution: DynamicDistribution<Scalar>) -> String {
match noise_distribution {
DynamicDistribution::Gaussian(g) => format!("Gaussian({}, {})", g.std, g.mean),
DynamicDistribution::TUniform(t) => format!("TUniform({})", t.bound_log2()),
}
}
pub fn serialize_distribution<S>(
noise_distribution: &Option<DynamicDistribution<Scalar>>,
serializer: S,
) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
match noise_distribution {
Some(d) => serializer.serialize_some(&Self::noise_distribution_as_string(*d)),
None => serializer.serialize_none(),
}
}
}
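// Note (illustrative, values are hypothetical): `serialize_distribution` renders noise
// distributions as human-readable strings in the JSON output, e.g. "TUniform(44)" for a
// t-uniform distribution, "Gaussian(3.2e-16, 0)" (std, mean) for a Gaussian one, and
// `null` when the field is absent.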
#[derive(Serialize)]
enum PolynomialMultiplication {
Fft,
// Ntt,
}
#[derive(Serialize)]
enum IntegerRepresentation {
Radix,
// Crt,
// Hybrid,
}
#[derive(Serialize)]
enum ExecutionType {
Sequential,
Parallel,
}
#[derive(Serialize)]
enum KeySetType {
Single,
// Multi,
}
#[derive(Serialize)]
enum OperandType {
CipherText,
PlainText,
}
#[derive(Clone, Serialize)]
pub enum OperatorType {
Atomic,
// AtomicPattern,
}
#[derive(Serialize)]
struct BenchmarkParametersRecord<Scalar: UnsignedInteger> {
display_name: String,
crypto_parameters_alias: String,
crypto_parameters: CryptoParametersRecord<Scalar>,
message_modulus: Option<u64>,
carry_modulus: Option<u64>,
ciphertext_modulus: usize,
bit_size: u32,
polynomial_multiplication: PolynomialMultiplication,
precision: u32,
error_probability: f64,
integer_representation: IntegerRepresentation,
decomposition_basis: Vec<u32>,
pbs_algorithm: Option<String>,
execution_type: ExecutionType,
key_set_type: KeySetType,
operand_type: OperandType,
operator_type: OperatorType,
}
/// Writes benchmark parameters to disk in JSON format.
pub fn write_to_json<
Scalar: UnsignedInteger + Serialize,
T: Into<CryptoParametersRecord<Scalar>>,
>(
bench_id: &str,
params: T,
params_alias: impl Into<String>,
display_name: impl Into<String>,
operator_type: &OperatorType,
bit_size: u32,
decomposition_basis: Vec<u32>,
) {
let params = params.into();
let execution_type = match bench_id.contains("parallelized") {
true => ExecutionType::Parallel,
false => ExecutionType::Sequential,
};
let operand_type = match bench_id.contains("scalar") {
true => OperandType::PlainText,
false => OperandType::CipherText,
};
let record = BenchmarkParametersRecord {
display_name: display_name.into(),
crypto_parameters_alias: params_alias.into(),
crypto_parameters: params.to_owned(),
message_modulus: params.message_modulus,
carry_modulus: params.carry_modulus,
ciphertext_modulus: 64,
bit_size,
polynomial_multiplication: PolynomialMultiplication::Fft,
precision: (params.message_modulus.unwrap_or(2) as u32).ilog2(),
error_probability: 2f64.powf(-41.0),
integer_representation: IntegerRepresentation::Radix,
decomposition_basis,
pbs_algorithm: None, // To be added in a future version
execution_type,
key_set_type: KeySetType::Single,
operand_type,
operator_type: operator_type.to_owned(),
};
let mut params_directory = ["benchmarks_parameters", bench_id]
.iter()
.collect::<PathBuf>();
fs::create_dir_all(&params_directory).unwrap();
params_directory.push("parameters.json");
fs::write(params_directory, serde_json::to_string(&record).unwrap()).unwrap();
}
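// Usage sketch (illustrative only, identifiers are hypothetical): a benchmark records
// its parameters right after measuring, e.g.
//
//     write_to_json::<u64, _>(
//         &bench_id,               // Criterion benchmark id, e.g. "integer::add"
//         params,                  // anything convertible into CryptoParametersRecord<u64>
//         "BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128",
//         "add",
//         &OperatorType::Atomic,
//         64,                      // operand bit size
//         vec![2],                 // decomposition basis
//     );
//
// which produces benchmarks_parameters/<bench_id>/parameters.json.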
const FAST_BENCH_BIT_SIZES: [usize; 1] = [64];
const BENCH_BIT_SIZES: [usize; 8] = [4, 8, 16, 32, 40, 64, 128, 256];
const MULTI_BIT_CPU_SIZES: [usize; 6] = [4, 8, 16, 32, 40, 64];
/// User configuration controlling how benchmarks must be run.
#[derive(Default)]
pub struct EnvConfig {
pub is_multi_bit: bool,
pub is_fast_bench: bool,
}
impl EnvConfig {
pub fn new() -> Self {
let is_multi_bit = match env::var("__TFHE_RS_PARAM_TYPE") {
Ok(val) => val.to_lowercase() == "multi_bit",
Err(_) => false,
};
let is_fast_bench = match env::var("__TFHE_RS_FAST_BENCH") {
Ok(val) => val.to_lowercase() == "true",
Err(_) => false,
};
EnvConfig {
is_multi_bit,
is_fast_bench,
}
}
/// Get precision values to benchmark.
pub fn bit_sizes(&self) -> Vec<usize> {
if self.is_fast_bench {
FAST_BENCH_BIT_SIZES.to_vec()
} else if self.is_multi_bit {
if cfg!(feature = "gpu") {
BENCH_BIT_SIZES.to_vec()
} else {
MULTI_BIT_CPU_SIZES.to_vec()
}
} else {
BENCH_BIT_SIZES.to_vec()
}
}
}
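// Sketch of the expected environment (derived from the code above): benchmarks read
// __TFHE_RS_PARAM_TYPE and __TFHE_RS_FAST_BENCH, e.g.
//
//     __TFHE_RS_PARAM_TYPE=multi_bit __TFHE_RS_FAST_BENCH=TRUE cargo bench ...
//
// would select multi-bit parameters and restrict the run to FAST_BENCH_BIT_SIZES
// (the fast-bench flag takes precedence in `bit_sizes`).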
pub static BENCH_TYPE: OnceLock<BenchmarkType> = OnceLock::new();
pub enum BenchmarkType {
Latency,
Throughput,
}
impl BenchmarkType {
pub fn from_env() -> Result<Self, String> {
let raw_value = env::var("__TFHE_RS_BENCH_TYPE").unwrap_or("latency".to_string());
match raw_value.to_lowercase().as_str() {
"latency" => Ok(BenchmarkType::Latency),
"throughput" => Ok(BenchmarkType::Throughput),
_ => Err(format!("benchmark type '{raw_value}' is not supported")),
}
}
}
pub fn get_bench_type() -> &'static BenchmarkType {
BENCH_TYPE.get_or_init(|| BenchmarkType::from_env().unwrap())
}
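// Sketch (derived from the code above): the measurement mode defaults to latency and is
// switched with the __TFHE_RS_BENCH_TYPE environment variable, e.g.
//
//     __TFHE_RS_BENCH_TYPE=throughput cargo bench ...
//
// Any other value makes get_bench_type() panic when it unwraps the error.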
/// Number of streaming multiprocessors (SM) available on Nvidia H100 GPU
#[cfg(feature = "gpu")]
const H100_PCIE_SM_COUNT: u32 = 114;
/// Generate the number of threads to use to saturate the current machine for throughput measurements.
pub fn throughput_num_threads(num_block: usize, op_pbs_count: u64) -> u64 {
let ref_block_count = 32; // Represents a 64-bit ciphertext with the 2_2 parameter set
let block_multiplicator = (ref_block_count as f64 / num_block as f64).ceil().min(1.0);
// Some operations with a high serial workload (e.g. division) would yield an operation
// loading value so low that the resulting number of elements wouldn't be meaningful.
let minimum_loading = if num_block < 64 { 0.2 } else { 0.01 };
#[cfg(feature = "gpu")]
{
let total_num_sm = H100_PCIE_SM_COUNT * get_number_of_gpus();
let operation_loading = ((total_num_sm as u64 / op_pbs_count) as f64).max(minimum_loading);
let elements = (total_num_sm as f64 * block_multiplicator * operation_loading) as u64;
elements.min(1500) // This threshold is useful for operations with both a small
// number of blocks and a low PBS count.
}
#[cfg(not(feature = "gpu"))]
{
let num_threads = rayon::current_num_threads() as f64;
let operation_loading = (num_threads / (op_pbs_count as f64)).max(minimum_loading);
// Add 20% to the maximum number of threads available.
((num_threads + (num_threads * 0.2)) * block_multiplicator.min(1.0) * operation_loading)
as u64
}
}
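// Worked example on the CPU path (numbers are illustrative): with num_block = 32,
// op_pbs_count = 32 and a machine where rayon reports 64 threads:
//     block_multiplicator = ceil(32 / 32).min(1.0) = 1.0
//     minimum_loading     = 0.2                      (num_block < 64)
//     operation_loading   = (64 / 32).max(0.2) = 2.0
//     threads             = (64 + 64 * 0.2) * 1.0 * 2.0 = 153 (truncated)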
#[cfg(feature = "gpu")]
mod cuda_utils {
use tfhe::core_crypto::entities::{
LweBootstrapKeyOwned, LweKeyswitchKeyOwned, LweMultiBitBootstrapKeyOwned,
LwePackingKeyswitchKeyOwned,
};
use tfhe::core_crypto::gpu::lwe_bootstrap_key::CudaLweBootstrapKey;
use tfhe::core_crypto::gpu::lwe_keyswitch_key::CudaLweKeyswitchKey;
use tfhe::core_crypto::gpu::lwe_multi_bit_bootstrap_key::CudaLweMultiBitBootstrapKey;
use tfhe::core_crypto::gpu::lwe_packing_keyswitch_key::CudaLwePackingKeyswitchKey;
use tfhe::core_crypto::gpu::vec::CudaVec;
use tfhe::core_crypto::gpu::{get_number_of_gpus, CudaStreams};
use tfhe::core_crypto::prelude::{Numeric, UnsignedInteger};
use tfhe::shortint::server_key::ModulusSwitchNoiseReductionKey;
use tfhe::{set_server_key, ClientKey, CompressedServerKey, GpuIndex};
pub const GPU_MAX_SUPPORTED_POLYNOMIAL_SIZE: usize = 16384;
/// Get a vector of CUDA streams that can be directly used for throughput benchmarks in
/// the core_crypto layer.
pub fn cuda_local_streams_core() -> Vec<CudaStreams> {
(0..get_number_of_gpus())
.map(|i| CudaStreams::new_single_gpu(GpuIndex::new(i)))
.collect::<Vec<_>>()
}
/// Computing keys in their CPU flavor.
pub struct CpuKeys<T: UnsignedInteger> {
ksk: Option<LweKeyswitchKeyOwned<T>>,
pksk: Option<LwePackingKeyswitchKeyOwned<T>>,
bsk: Option<LweBootstrapKeyOwned<T>>,
multi_bit_bsk: Option<LweMultiBitBootstrapKeyOwned<T>>,
}
impl<T: UnsignedInteger> CpuKeys<T> {
pub fn builder() -> CpuKeysBuilder<T> {
CpuKeysBuilder::new()
}
}
pub struct CpuKeysBuilder<T: UnsignedInteger> {
ksk: Option<LweKeyswitchKeyOwned<T>>,
pksk: Option<LwePackingKeyswitchKeyOwned<T>>,
bsk: Option<LweBootstrapKeyOwned<T>>,
multi_bit_bsk: Option<LweMultiBitBootstrapKeyOwned<T>>,
}
impl<T: UnsignedInteger> CpuKeysBuilder<T> {
pub fn new() -> CpuKeysBuilder<T> {
Self {
ksk: None,
pksk: None,
bsk: None,
multi_bit_bsk: None,
}
}
pub fn keyswitch_key(mut self, ksk: LweKeyswitchKeyOwned<T>) -> CpuKeysBuilder<T> {
self.ksk = Some(ksk);
self
}
pub fn packing_keyswitch_key(
mut self,
pksk: LwePackingKeyswitchKeyOwned<T>,
) -> CpuKeysBuilder<T> {
self.pksk = Some(pksk);
self
}
pub fn bootstrap_key(mut self, bsk: LweBootstrapKeyOwned<T>) -> CpuKeysBuilder<T> {
self.bsk = Some(bsk);
self
}
pub fn multi_bit_bootstrap_key(
mut self,
mb_bsk: LweMultiBitBootstrapKeyOwned<T>,
) -> CpuKeysBuilder<T> {
self.multi_bit_bsk = Some(mb_bsk);
self
}
pub fn build(self) -> CpuKeys<T> {
CpuKeys {
ksk: self.ksk,
pksk: self.pksk,
bsk: self.bsk,
multi_bit_bsk: self.multi_bit_bsk,
}
}
}
impl<T: UnsignedInteger> Default for CpuKeysBuilder<T> {
fn default() -> Self {
Self::new()
}
}
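// Usage sketch (illustrative, `ksk` and `bsk` are keys generated elsewhere): the
// builder lets core_crypto benchmarks bundle only the keys they actually need before
// uploading them to the GPUs:
//
//     let cpu_keys = CpuKeys::builder()
//         .keyswitch_key(ksk)
//         .bootstrap_key(bsk)
//         .build();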
/// Computing keys in their Cuda flavor.
#[allow(dead_code)]
pub struct CudaLocalKeys<T: UnsignedInteger> {
pub ksk: Option<CudaLweKeyswitchKey<T>>,
pub pksk: Option<CudaLwePackingKeyswitchKey<T>>,
pub bsk: Option<CudaLweBootstrapKey>,
pub multi_bit_bsk: Option<CudaLweMultiBitBootstrapKey>,
}
#[allow(dead_code)]
impl<T: UnsignedInteger> CudaLocalKeys<T> {
pub fn from_cpu_keys(
cpu_keys: &CpuKeys<T>,
ms_noise_reduction_key: Option<&ModulusSwitchNoiseReductionKey<u64>>,
stream: &CudaStreams,
) -> Self {
Self {
ksk: cpu_keys
.ksk
.as_ref()
.map(|ksk| CudaLweKeyswitchKey::from_lwe_keyswitch_key(ksk, stream)),
pksk: cpu_keys.pksk.as_ref().map(|pksk| {
CudaLwePackingKeyswitchKey::from_lwe_packing_keyswitch_key(pksk, stream)
}),
bsk: cpu_keys.bsk.as_ref().map(|bsk| {
CudaLweBootstrapKey::from_lwe_bootstrap_key(bsk, ms_noise_reduction_key, stream)
}),
multi_bit_bsk: cpu_keys.multi_bit_bsk.as_ref().map(|mb_bsk| {
CudaLweMultiBitBootstrapKey::from_lwe_multi_bit_bootstrap_key(mb_bsk, stream)
}),
}
}
}
/// Instantiate Cuda computing keys on each available GPU.
pub fn cuda_local_keys_core<T: UnsignedInteger>(
cpu_keys: &CpuKeys<T>,
ms_noise_reduction_key: Option<&ModulusSwitchNoiseReductionKey<u64>>,
) -> Vec<CudaLocalKeys<T>> {
let gpu_count = get_number_of_gpus() as usize;
let mut gpu_keys_vec = Vec::with_capacity(gpu_count);
for i in 0..gpu_count {
let stream = CudaStreams::new_single_gpu(GpuIndex::new(i as u32));
gpu_keys_vec.push(CudaLocalKeys::from_cpu_keys(
cpu_keys,
ms_noise_reduction_key,
&stream,
));
}
gpu_keys_vec
}
pub struct CudaIndexes<T: Numeric> {
pub d_input: CudaVec<T>,
pub d_output: CudaVec<T>,
pub d_lut: CudaVec<T>,
}
impl<T: Numeric> CudaIndexes<T> {
pub fn new(indexes: &[T], stream: &CudaStreams, stream_index: u32) -> Self {
let length = indexes.len();
let mut d_input = unsafe { CudaVec::<T>::new_async(length, stream, stream_index) };
let mut d_output = unsafe { CudaVec::<T>::new_async(length, stream, stream_index) };
let mut d_lut = unsafe { CudaVec::<T>::new_async(length, stream, stream_index) };
unsafe {
d_input.copy_from_cpu_async(indexes.as_ref(), stream, stream_index);
d_output.copy_from_cpu_async(indexes.as_ref(), stream, stream_index);
d_lut.copy_from_cpu_async(indexes.as_ref(), stream, stream_index);
}
stream.synchronize();
Self {
d_input,
d_output,
d_lut,
}
}
}
#[cfg(feature = "integer")]
pub mod cuda_integer_utils {
use tfhe::core_crypto::gpu::{get_number_of_gpus, CudaStreams};
use tfhe::integer::gpu::CudaServerKey;
use tfhe::integer::ClientKey;
use tfhe::GpuIndex;
/// Get the number of streams usable for CUDA throughput benchmarks.
fn cuda_num_streams(num_block: usize) -> u64 {
let num_streams_per_gpu: u32 = match num_block {
2 => 64,
4 => 32,
8 => 16,
16 => 8,
32 => 4,
64 => 2,
128 => 1,
_ => 8,
};
(num_streams_per_gpu * get_number_of_gpus()) as u64
}
/// Get a vector of CUDA streams that can be directly used for throughput benchmarks.
pub fn cuda_local_streams(
num_block: usize,
throughput_elements: usize,
) -> Vec<CudaStreams> {
(0..cuda_num_streams(num_block))
.map(|i| {
CudaStreams::new_single_gpu(GpuIndex::new(
(i % get_number_of_gpus() as u64) as u32,
))
})
.cycle()
.take(throughput_elements)
.collect::<Vec<_>>()
}
/// Instantiate a Cuda server key on each available GPU.
pub fn cuda_local_keys(cks: &ClientKey) -> Vec<CudaServerKey> {
let gpu_count = get_number_of_gpus() as usize;
let mut gpu_sks_vec = Vec::with_capacity(gpu_count);
for i in 0..gpu_count {
let stream = CudaStreams::new_single_gpu(GpuIndex::new(i as u32));
gpu_sks_vec.push(CudaServerKey::new(cks, &stream));
}
gpu_sks_vec
}
}
#[allow(dead_code)]
pub fn configure_gpu(client_key: &ClientKey) {
let compressed_sks = CompressedServerKey::new(client_key);
let sks = compressed_sks.decompress_to_gpu();
rayon::broadcast(|_| set_server_key(sks.clone()));
set_server_key(sks);
}
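// Usage sketch (illustrative): a high-level API GPU benchmark would call this once
// after key generation so every rayon worker thread holds the GPU server key:
//
//     let client_key = ClientKey::generate(config); // `config` built elsewhere
//     configure_gpu(&client_key);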
#[allow(unused_imports)]
#[cfg(feature = "integer")]
pub use cuda_integer_utils::*;
}
#[cfg(feature = "gpu")]
pub use cuda_utils::*;