chore(bench): print bench id before running the benchmark

This works around a Criterion limitation: long benchmark IDs are automatically truncated in its output. By printing the ID with a `println!` call, we ensure the complete name is displayed before the benchmark runs, which eases manual parsing and debugging.
2026-01-07 22:04:10 -05:00 · 2025-11-10 17:46:46 +01:00
parent 6d2de330a4
commit ef07963767
23 changed files with 200 additions and 16 deletions
--- a/tfhe-benchmark/benches/boolean/bench.rs
+++ b/tfhe-benchmark/benches/boolean/bench.rs
@@ -51,30 +51,37 @@ fn benches(c: &mut Criterion, params: BooleanParameters, parameter_name: &str) {
    let ct3 = cks.encrypt(true);

    let id = format!("AND::{parameter_name}");
+    println!("{id}");
    bench_group.bench_function(&id, |b| b.iter(|| black_box(sks.and(&ct1, &ct2))));
    write_to_json_boolean(&id, params, parameter_name, "and");

    let id = format!("NAND::{parameter_name}");
+    println!("{id}");
    bench_group.bench_function(&id, |b| b.iter(|| black_box(sks.nand(&ct1, &ct2))));
    write_to_json_boolean(&id, params, parameter_name, "nand");

    let id = format!("OR::{parameter_name}");
+    println!("{id}");
    bench_group.bench_function(&id, |b| b.iter(|| black_box(sks.or(&ct1, &ct2))));
    write_to_json_boolean(&id, params, parameter_name, "or");

    let id = format!("XOR::{parameter_name}");
+    println!("{id}");
    bench_group.bench_function(&id, |b| b.iter(|| black_box(sks.xor(&ct1, &ct2))));
    write_to_json_boolean(&id, params, parameter_name, "xor");

    let id = format!("XNOR::{parameter_name}");
+    println!("{id}");
    bench_group.bench_function(&id, |b| b.iter(|| black_box(sks.xnor(&ct1, &ct2))));
    write_to_json_boolean(&id, params, parameter_name, "xnor");

    let id = format!("NOT::{parameter_name}");
+    println!("{id}");
    bench_group.bench_function(&id, |b| b.iter(|| black_box(sks.not(&ct1))));
    write_to_json_boolean(&id, params, parameter_name, "not");

    let id = format!("MUX::{parameter_name}");
+    println!("{id}");
    bench_group.bench_function(&id, |b| b.iter(|| black_box(sks.mux(&ct1, &ct2, &ct3))));
    write_to_json_boolean(&id, params, parameter_name, "mux");
 }
--- a/tfhe-benchmark/benches/core_crypto/ks_bench.rs
+++ b/tfhe-benchmark/benches/core_crypto/ks_bench.rs
@@ -73,6 +73,7 @@ fn keyswitch<Scalar: UnsignedTorus + CastInto<usize> + Serialize>(
                );

                bench_id = format!("{bench_name}::{name}");
+                println!("{bench_id}");
                {
                    bench_group.bench_function(&bench_id, |b| {
                        b.iter(|| {
@@ -84,6 +85,7 @@ fn keyswitch<Scalar: UnsignedTorus + CastInto<usize> + Serialize>(
            }
            BenchmarkType::Throughput => {
                bench_id = format!("{bench_name}::throughput::{name}");
+                println!("{bench_id}");
                let blocks: usize = 1;
                let elements = throughput_num_threads(blocks, 1); // FIXME This number of element do not staturate the target machine
                bench_group.throughput(Throughput::Elements(elements));
@@ -231,6 +233,7 @@ fn packing_keyswitch<Scalar, F>(
                );

                bench_id = format!("{bench_name}::{name}");
+                println!("{bench_id}");
                {
                    bench_group.bench_function(&bench_id, |b| {
                        b.iter(|| {
@@ -242,6 +245,7 @@ fn packing_keyswitch<Scalar, F>(
            }
            BenchmarkType::Throughput => {
                bench_id = format!("{bench_name}::throughput::{name}");
+                println!("{bench_id}");
                let blocks: usize = 1;
                let elements = throughput_num_threads(blocks, 1);
                bench_group.throughput(Throughput::Elements(elements));
@@ -414,6 +418,7 @@ mod cuda {
                    let cuda_indexes = CudaIndexes::new(&h_indexes, &streams, 0);

                    bench_id = format!("{bench_name}::{name}");
+                    println!("{bench_id}");
                    {
                        bench_group.bench_function(&bench_id, |b| {
                            b.iter(|| {
@@ -435,6 +440,7 @@ mod cuda {
                    let gpu_count = get_number_of_gpus() as usize;

                    bench_id = format!("{bench_name}::throughput::{name}");
+                    println!("{bench_id}");
                    let blocks: usize = 1;
                    let elements = throughput_num_threads(blocks, 1);
                    let elements_per_stream = elements as usize / gpu_count;
@@ -644,6 +650,7 @@ mod cuda {
                    streams.synchronize();

                    bench_id = format!("{bench_name}::{name}");
+                    println!("{bench_id}");
                    {
                        bench_group.bench_function(&bench_id, |b| {
                            b.iter(|| {
@@ -663,6 +670,7 @@ mod cuda {
                    let gpu_count = get_number_of_gpus() as usize;

                    bench_id = format!("{bench_name}::throughput::{name}");
+                    println!("{bench_id}");

                    let mem_size = get_packing_keyswitch_list_64_size_on_gpu(
                        &CudaStreams::new_single_gpu(GpuIndex::new(0)),
--- a/tfhe-benchmark/benches/core_crypto/ks_pbs_bench.rs
+++ b/tfhe-benchmark/benches/core_crypto/ks_pbs_bench.rs
@@ -111,6 +111,7 @@ fn ks_pbs<Scalar: UnsignedTorus + CastInto<usize> + Serialize>(
                );

                bench_id = format!("{bench_name}::{name}");
+                println!("{bench_id}");
                {
                    bench_group.bench_function(&bench_id, |b| {
                        b.iter(|| {
@@ -134,6 +135,7 @@ fn ks_pbs<Scalar: UnsignedTorus + CastInto<usize> + Serialize>(
            }
            BenchmarkType::Throughput => {
                bench_id = format!("{bench_name}::throughput::{name}");
+                println!("{bench_id}");
                let blocks: usize = 1;
                let elements = throughput_num_threads(blocks, 1);
                println!("Number of elements: {elements}"); // DEBUG
@@ -370,6 +372,7 @@ fn multi_bit_ks_pbs<
                );

                bench_id = format!("{bench_name}::{name}::parallelized");
+                println!("{bench_id}");
                bench_group.bench_function(&bench_id, |b| {
                    b.iter(|| {
                        keyswitch_lwe_ciphertext(
@@ -391,6 +394,7 @@ fn multi_bit_ks_pbs<
            }
            BenchmarkType::Throughput => {
                bench_id = format!("{bench_name}::throughput::{name}");
+                println!("{bench_id}");
                let blocks: usize = 1;
                let elements = throughput_num_threads(blocks, 1);
                println!("Number of elements: {elements}"); // DEBUG
@@ -621,6 +625,7 @@ mod cuda {
                    let cuda_indexes = CudaIndexes::new(&h_indexes, &streams, 0);

                    bench_id = format!("{bench_name}::{name}");
+                    println!("{bench_id}");
                    {
                        bench_group.bench_function(&bench_id, |b| {
                            b.iter(|| {
@@ -652,6 +657,7 @@ mod cuda {
                    let gpu_count = get_number_of_gpus() as usize;

                    bench_id = format!("{bench_name}::throughput::{name}");
+                    println!("{bench_id}");
                    let blocks: usize = 1;
                    let elements = throughput_num_threads(blocks, 1);
                    let elements_per_stream = elements as usize / gpu_count;
@@ -929,6 +935,7 @@ mod cuda {
                    let cuda_indexes = CudaIndexes::new(&h_indexes, &streams, 0);

                    bench_id = format!("{bench_name}::{name}");
+                    println!("{bench_id}");
                    bench_group.bench_function(&bench_id, |b| {
                        b.iter(|| {
                            cuda_keyswitch_lwe_ciphertext(
@@ -958,6 +965,7 @@ mod cuda {
                    let gpu_count = get_number_of_gpus() as usize;

                    bench_id = format!("{bench_name}::throughput::{name}");
+                    println!("{bench_id}");
                    let blocks: usize = 1;
                    let elements = throughput_num_threads(blocks, 1);
                    let elements_per_stream = elements as usize / gpu_count;
--- a/tfhe-benchmark/benches/core_crypto/modulus_switch_noise_reduction.rs
+++ b/tfhe-benchmark/benches/core_crypto/modulus_switch_noise_reduction.rs
@@ -62,6 +62,7 @@ fn modulus_switch_noise_reduction(c: &mut Criterion) {
            .measurement_time(std::time::Duration::from_secs(5));

        let bench_name = format!("modulus_switch_noise_reduction_{count}");
+        println!("{bench_name}");

        bench_group.bench_function(&bench_name, |b| {
            b.iter(|| {
--- a/tfhe-benchmark/benches/core_crypto/pbs128_bench.rs
+++ b/tfhe-benchmark/benches/core_crypto/pbs128_bench.rs
@@ -296,6 +296,7 @@ mod cuda {
                    CudaLweCiphertextList::from_lwe_ciphertext(&out_pbs_ct, &streams);

                bench_id = format!("{bench_name}::{params_name}");
+                println!("{bench_id}");
                {
                    bench_group.bench_function(&bench_id, |b| {
                        b.iter(|| {
@@ -317,6 +318,7 @@ mod cuda {
                let gpu_count = get_number_of_gpus() as usize;

                bench_id = format!("{bench_name}::throughput::{params_name}");
+                println!("{bench_id}");
                let blocks: usize = 1;
                let elements = throughput_num_threads(blocks, 1);
                let elements_per_stream = elements as usize / gpu_count;
@@ -541,6 +543,7 @@ mod cuda {
                let cuda_indexes = CudaIndexes::new(&h_indexes, &streams, 0);

                bench_id = format!("{bench_name}::{params_name}");
+                println!("{bench_id}");
                {
                    bench_group.bench_function(&bench_id, |b| {
                        b.iter(|| {
@@ -564,6 +567,7 @@ mod cuda {
                let gpu_count = get_number_of_gpus() as usize;

                bench_id = format!("{bench_name}::throughput::{params_name}");
+                println!("{bench_id}");
                let blocks: usize = 1;
                let elements = throughput_num_threads(blocks, 1);
                let elements_per_stream = elements as usize / gpu_count;
--- a/tfhe-benchmark/benches/core_crypto/pbs_bench.rs
+++ b/tfhe-benchmark/benches/core_crypto/pbs_bench.rs
@@ -97,6 +97,7 @@ fn mem_optimized_pbs<Scalar: UnsignedTorus + CastInto<usize> + Serialize>(
                );

                bench_id = format!("{bench_name}::{name}");
+                println!("{bench_id}");

                bench_group.bench_function(&bench_id, |b| {
                    b.iter(|| {
@@ -114,6 +115,7 @@ fn mem_optimized_pbs<Scalar: UnsignedTorus + CastInto<usize> + Serialize>(
            }
            BenchmarkType::Throughput => {
                bench_id = format!("{bench_name}::throughput::{name}");
+                println!("{bench_id}");
                let blocks: usize = 1;
                let elements = throughput_num_threads(blocks, 1);
                bench_group.throughput(Throughput::Elements(elements));
@@ -326,6 +328,7 @@ fn mem_optimized_batched_pbs<Scalar: UnsignedTorus + CastInto<usize> + Serialize
        );

                bench_id = format!("{bench_name}::{name}");
+                println!("{bench_id}");
                bench_group.bench_function(&bench_id, |b| {
                    b.iter(|| {
                        batch_programmable_bootstrap_lwe_ciphertext_mem_optimized(
@@ -342,6 +345,7 @@ fn mem_optimized_batched_pbs<Scalar: UnsignedTorus + CastInto<usize> + Serialize
            }
            BenchmarkType::Throughput => {
                bench_id = format!("{bench_name}::throughput::{name}");
+                println!("{bench_id}");
                let blocks: usize = 1;
                let elements = throughput_num_threads(blocks, 1);
                bench_group.throughput(Throughput::Elements(elements));
@@ -552,6 +556,7 @@ fn multi_bit_pbs<
                );

                bench_id = format!("{bench_name}::{name}::parallelized");
+                println!("{bench_id}");
                bench_group.bench_function(&bench_id, |b| {
                    b.iter(|| {
                        multi_bit_programmable_bootstrap_lwe_ciphertext(
@@ -568,6 +573,7 @@ fn multi_bit_pbs<
            }
            BenchmarkType::Throughput => {
                bench_id = format!("{bench_name}::throughput::{name}");
+                println!("{bench_id}");
                let blocks: usize = 1;
                let elements = throughput_num_threads(blocks, 1);
                bench_group.throughput(Throughput::Elements(elements));
@@ -779,6 +785,7 @@ fn mem_optimized_pbs_ntt(c: &mut Criterion) {
                buffers.resize(stack_size);

                bench_id = format!("{bench_name}::{name}");
+                println!("{bench_id}");
                bench_group.bench_function(&bench_id, |b| {
                    b.iter(|| {
                        programmable_bootstrap_ntt64_lwe_ciphertext_mem_optimized(
@@ -795,6 +802,7 @@ fn mem_optimized_pbs_ntt(c: &mut Criterion) {
            }
            BenchmarkType::Throughput => {
                bench_id = format!("{bench_name}::throughput::{name}");
+                println!("{bench_id}");
                let blocks: usize = 1;
                let elements = throughput_num_threads(blocks, 1);
                bench_group.throughput(Throughput::Elements(elements));
@@ -1020,6 +1028,7 @@ mod cuda {
                    let cuda_indexes = CudaIndexes::new(&h_indexes, &streams, 0);

                    bench_id = format!("{bench_name}::{name}");
+                    println!("{bench_id}");
                    {
                        bench_group.bench_function(&bench_id, |b| {
                            b.iter(|| {
@@ -1043,6 +1052,7 @@ mod cuda {
                    let gpu_count = get_number_of_gpus() as usize;

                    bench_id = format!("{bench_name}::throughput::{name}");
+                    println!("{bench_id}");
                    let blocks: usize = 1;
                    let elements = throughput_num_threads(blocks, 1);
                    let elements_per_stream = elements as usize / gpu_count;
@@ -1280,6 +1290,7 @@ mod cuda {
                    let cuda_indexes = CudaIndexes::new(&h_indexes, &streams, 0);

                    bench_id = format!("{bench_name}::{name}");
+                    println!("{bench_id}");
                    bench_group.bench_function(&bench_id, |b| {
                        b.iter(|| {
                            cuda_multi_bit_programmable_bootstrap_lwe_ciphertext(
@@ -1301,6 +1312,7 @@ mod cuda {
                    let gpu_count = get_number_of_gpus() as usize;

                    bench_id = format!("{bench_name}::throughput::{name}");
+                    println!("{bench_id}");
                    let blocks: usize = 1;
                    let elements = throughput_num_threads(blocks, 1);
                    let elements_per_stream = elements as usize / gpu_count;
--- a/tfhe-benchmark/benches/high_level_api/bench.rs
+++ b/tfhe-benchmark/benches/high_level_api/bench.rs
@@ -70,6 +70,7 @@ fn bench_fhe_type<FheType>(
    let mut bench_id;

    bench_id = format!("{bench_prefix}::add::{param_name}::{bit_size}_bits");
+    println!("{bench_id}");
    bench_group.bench_function(&bench_id, |b| {
        b.iter(|| {
            let res = &lhs + &rhs;
@@ -80,6 +81,7 @@ fn bench_fhe_type<FheType>(
    write_record(bench_id, "add");

    bench_id = format!("{bench_prefix}::overflowing_add::{param_name}::{bit_size}_bits");
+    println!("{bench_id}");
    bench_group.bench_function(&bench_id, |b| {
        b.iter(|| {
            let (res, flag) = lhs.overflowing_add(&rhs);
@@ -90,6 +92,7 @@ fn bench_fhe_type<FheType>(
    write_record(bench_id, "overflowing_add");

    bench_id = format!("{bench_prefix}::overflowing_sub::{param_name}::{bit_size}_bits");
+    println!("{bench_id}");
    bench_group.bench_function(&bench_id, |b| {
        b.iter(|| {
            let (res, flag) = lhs.overflowing_sub(&rhs);
@@ -100,6 +103,7 @@ fn bench_fhe_type<FheType>(
    write_record(bench_id, "overflowing_sub");

    bench_id = format!("{bench_prefix}::sub::{param_name}::{bit_size}_bits");
+    println!("{bench_id}");
    bench_group.bench_function(&bench_id, |b| {
        b.iter(|| {
            let res = &lhs - &rhs;
@@ -110,6 +114,7 @@ fn bench_fhe_type<FheType>(
    write_record(bench_id, "sub");

    bench_id = format!("{bench_prefix}::mul::{param_name}::{bit_size}_bits");
+    println!("{bench_id}");
    bench_group.bench_function(&bench_id, |b| {
        b.iter(|| {
            let res = &lhs * &rhs;
@@ -120,6 +125,7 @@ fn bench_fhe_type<FheType>(
    write_record(bench_id, "mul");

    bench_id = format!("{bench_prefix}::bitand::{param_name}::{bit_size}_bits");
+    println!("{bench_id}");
    bench_group.bench_function(&bench_id, |b| {
        b.iter(|| {
            let res = &lhs & &rhs;
@@ -130,6 +136,7 @@ fn bench_fhe_type<FheType>(
    write_record(bench_id, "bitand");

    bench_id = format!("{bench_prefix}::bitor::{param_name}::{bit_size}_bits");
+    println!("{bench_id}");
    bench_group.bench_function(&bench_id, |b| {
        b.iter(|| {
            let res = &lhs | &rhs;
@@ -140,6 +147,7 @@ fn bench_fhe_type<FheType>(
    write_record(bench_id, "bitor");

    bench_id = format!("{bench_prefix}::bitxor::{param_name}::{bit_size}_bits");
+    println!("{bench_id}");
    bench_group.bench_function(&bench_id, |b| {
        b.iter(|| {
            let res = &lhs ^ &rhs;
@@ -150,6 +158,7 @@ fn bench_fhe_type<FheType>(
    write_record(bench_id, "bitxor");

    bench_id = format!("{bench_prefix}::left_shift::{param_name}::{bit_size}_bits");
+    println!("{bench_id}");
    bench_group.bench_function(&bench_id, |b| {
        b.iter(|| {
            let res = &lhs << &rhs;
@@ -160,6 +169,7 @@ fn bench_fhe_type<FheType>(
    write_record(bench_id, "left_shift");

    bench_id = format!("{bench_prefix}::right_shift::{param_name}::{bit_size}_bits");
+    println!("{bench_id}");
    bench_group.bench_function(&bench_id, |b| {
        b.iter(|| {
            let res = &lhs >> &rhs;
@@ -170,6 +180,7 @@ fn bench_fhe_type<FheType>(
    write_record(bench_id, "right_shift");

    bench_id = format!("{bench_prefix}::left_rotate::{param_name}::{bit_size}_bits");
+    println!("{bench_id}");
    bench_group.bench_function(&bench_id, |b| {
        b.iter(|| {
            let res = (&lhs).rotate_left(&rhs);
@@ -180,6 +191,7 @@ fn bench_fhe_type<FheType>(
    write_record(bench_id, "left_rotate");

    bench_id = format!("{bench_prefix}::right_rotate::{param_name}::{bit_size}_bits");
+    println!("{bench_id}");
    bench_group.bench_function(&bench_id, |b| {
        b.iter(|| {
            let res = (&lhs).rotate_right(&rhs);
@@ -190,6 +202,7 @@ fn bench_fhe_type<FheType>(
    write_record(bench_id, "right_rotate");

    bench_id = format!("{bench_prefix}::min::{param_name}::{bit_size}_bits");
+    println!("{bench_id}");
    bench_group.bench_function(&bench_id, |b| {
        b.iter(|| {
            let res = lhs.min(&rhs);
@@ -200,6 +213,7 @@ fn bench_fhe_type<FheType>(
    write_record(bench_id, "min");

    bench_id = format!("{bench_prefix}::max::{param_name}::{bit_size}_bits");
+    println!("{bench_id}");
    bench_group.bench_function(&bench_id, |b| {
        b.iter(|| {
            let res = lhs.max(&rhs);
@@ -294,12 +308,14 @@ where
    let mut kv_store = KVStore::new();
    let mut rng = rand::thread_rng();

-    let format_id_bench = |op_name: &str| -> String {
-        format!(
+    let format_and_print_bench_id = |op_name: &str| -> String {
+        let bench_id = format!(
            "KVStore::<{}, {}>::{op_name}/{num_elements}",
            TypeDisplayer::<Key>::default(),
            TypeDisplayer::<Value>::default(),
-        )
+        );
+        println!("{bench_id}");
+        bench_id
    };

    match BenchmarkType::from_env().unwrap() {
@@ -318,19 +334,19 @@ where
            let value = rng.gen::<u128>();
            let value_to_add = Value::encrypt(value, cks);

-            c.bench_function(&format_id_bench("Get"), |b| {
+            c.bench_function(&format_and_print_bench_id("Get"), |b| {
                b.iter(|| {
                    let _ = kv_store.get(&encrypted_key);
                })
            });

-            c.bench_function(&format_id_bench("Update"), |b| {
+            c.bench_function(&format_and_print_bench_id("Update"), |b| {
                b.iter(|| {
                    let _ = kv_store.update(&encrypted_key, &value_to_add);
                })
            });

-            c.bench_function(&format_id_bench("Map"), |b| {
+            c.bench_function(&format_and_print_bench_id("Map"), |b| {
                b.iter(|| {
                    kv_store.map(&encrypted_key, |v| v);
                })
@@ -367,7 +383,7 @@ where
            let mut group = c.benchmark_group("KVStore Throughput");
            group.throughput(Throughput::Elements(kv_stores.len() as u64));

-            group.bench_function(format_id_bench("Map"), |b| {
+            group.bench_function(format_and_print_bench_id("Map"), |b| {
                b.iter(|| {
                    kv_stores.par_iter_mut().for_each(|kv_store| {
                        kv_store.map(&encrypted_key, |v| v);
@@ -375,7 +391,7 @@ where
                })
            });

-            group.bench_function(format_id_bench("Update"), |b| {
+            group.bench_function(format_and_print_bench_id("Update"), |b| {
                b.iter(|| {
                    kv_stores.par_iter_mut().for_each(|kv_store| {
                        kv_store.update(&encrypted_key, &value_to_add);
@@ -383,7 +399,7 @@ where
                })
            });

-            group.bench_function(format_id_bench("Get"), |b| {
+            group.bench_function(format_and_print_bench_id("Get"), |b| {
                b.iter(|| {
                    kv_stores.par_iter_mut().for_each(|kv_store| {
                        kv_store.get(&encrypted_key);
--- a/tfhe-benchmark/benches/high_level_api/dex.rs
+++ b/tfhe-benchmark/benches/high_level_api/dex.rs
@@ -483,6 +483,7 @@ fn bench_swap_request_latency<FheType, F1, F2>(
    let params_name = params.name();

    let bench_id = format!("{bench_name}::{fn_name}::{type_name}");
+    println!("{bench_id}");
    c.bench_function(&bench_id, |b| {
        let mut rng = thread_rng();

@@ -556,6 +557,7 @@ fn bench_swap_request_throughput<FheType, F1, F2>(
        let bench_id = format!(
            "{bench_name}::throughput::{fn_name}::{params_name}::{type_name}::{num_elems}_elems"
        );
+        println!("{bench_id}");
        group.bench_with_input(&bench_id, &num_elems, |b, &num_elems| {
            let from_balances_0 = (0..num_elems)
                .map(|_| FheType::encrypt(rng.gen::<u64>(), client_key))
@@ -679,6 +681,7 @@ fn cuda_bench_swap_request_throughput<FheType, F1, F2>(
        let bench_id = format!(
            "{bench_name}::throughput::{fn_name}::{params_name}::{type_name}::{num_elems}_elems"
        );
+        println!("{bench_id}");
        group.bench_with_input(&bench_id, &num_elems, |b, &num_elems| {
            let from_balances_0 = (0..num_elems)
                .map(|_| FheType::encrypt(rng.gen::<u64>(), client_key))
@@ -881,6 +884,7 @@ fn bench_swap_claim_latency<FheType, F1, F2>(
    let params_name = params.name();

    let bench_id = format!("{bench_name}::{fn_name}::{params_name}::{type_name}");
+    println!("{bench_id}");
    c.bench_function(&bench_id, |b| {
        let mut rng = thread_rng();

@@ -960,6 +964,7 @@ fn bench_swap_claim_throughput<FheType, F1, F2>(
        let bench_id = format!(
            "{bench_name}::throughput::{fn_name}::{params_name}::{type_name}::{num_elems}_elems"
        );
+        println!("{bench_id}");
        group.bench_with_input(&bench_id, &num_elems, |b, &num_elems| {
            let pending_0_in = (0..num_elems)
                .map(|_| FheType::encrypt(rng.gen::<u64>(), client_key))
@@ -1101,6 +1106,7 @@ fn cuda_bench_swap_claim_throughput<FheType, F1, F2>(
        let bench_id = format!(
            "{bench_name}::throughput::{fn_name}::{params_name}::{type_name}::{num_elems}_elems"
        );
+        println!("{bench_id}");
        group.bench_with_input(&bench_id, &num_elems, |b, &num_elems| {
            let pending_0_in = (0..num_elems)
                .map(|_| FheType::encrypt(rng.gen::<u64>(), client_key))
--- a/tfhe-benchmark/benches/high_level_api/erc20.rs
+++ b/tfhe-benchmark/benches/high_level_api/erc20.rs
@@ -356,6 +356,7 @@ fn bench_transfer_latency<FheType, F>(
    let params_name = params.name();

    let bench_id = format!("{bench_name}::{fn_name}::{params_name}::{type_name}");
+    println!("{bench_id}");
    c.bench_function(&bench_id, |b| {
        let mut rng = thread_rng();

@@ -409,6 +410,7 @@ fn bench_transfer_latency_simd<FheType, F>(
    let params_name = params.name();

    let bench_id = format!("{bench_name}::{fn_name}::{params_name}::{type_name}");
+    println!("{bench_id}");
    c.bench_function(&bench_id, |b| {
        let mut rng = thread_rng();

@@ -466,6 +468,7 @@ fn bench_transfer_throughput<FheType, F>(
        let bench_id = format!(
            "{bench_name}::throughput::{fn_name}::{params_name}::{type_name}::{num_elems}_elems"
        );
+        println!("{bench_id}");
        group.bench_with_input(&bench_id, &num_elems, |b, &num_elems| {
            let from_amounts = (0..num_elems)
                .map(|_| FheType::encrypt(rng.gen::<u64>(), client_key))
@@ -529,6 +532,7 @@ fn cuda_bench_transfer_throughput<FheType, F>(
    let bench_id = format!(
        "{bench_name}::throughput::{fn_name}::{params_name}::{type_name}::{num_elems}_elems"
    );
+    println!("{bench_id}");
    group.bench_with_input(&bench_id, &num_elems, |b, &num_elems| {
        let from_amounts = (0..num_elems)
            .map(|_| FheType::encrypt(rng.gen::<u64>(), client_key))
@@ -610,6 +614,7 @@ fn hpu_bench_transfer_throughput<FheType, F>(
        let bench_id = format!(
            "{bench_name}::throughput::{fn_name}::{params_name}::{type_name}::{num_elems}_elems"
        );
+        println!("{bench_id}");
        group.bench_with_input(&bench_id, &num_elems, |b, &num_elems| {
            let from_amounts = (0..num_elems)
                .map(|_| FheType::encrypt(rng.gen::<u64>(), client_key))
@@ -683,6 +688,7 @@ fn hpu_bench_transfer_throughput_simd<FheType, F>(
        group.throughput(Throughput::Elements(real_num_elems));
        let bench_id =
            format!("{bench_name}::throughput::{fn_name}::{params_name}::{type_name}::{real_num_elems}_elems");
+        println!("{bench_id}");
        group.bench_with_input(&bench_id, &num_elems, |b, &num_elems| {
            let from_amounts = (0..num_elems)
                .map(|_| {
--- a/tfhe-benchmark/benches/high_level_api/noise_squash.rs
+++ b/tfhe-benchmark/benches/high_level_api/noise_squash.rs
@@ -91,6 +91,7 @@ fn bench_sns_only_fhe_type<FheType>(
    match get_bench_type() {
        BenchmarkType::Latency => {
            bench_id = format!("{bench_id_prefix}::{bench_id_suffix}");
+            println!("{bench_id}");

            #[cfg(feature = "gpu")]
            configure_gpu(&client_key);
@@ -105,6 +106,7 @@ fn bench_sns_only_fhe_type<FheType>(
        }
        BenchmarkType::Throughput => {
            bench_id = format!("{bench_id_prefix}::throughput::{bench_id_suffix}");
+            println!("{bench_id}");
            let params = client_key.computation_parameters();
            let num_blocks = num_bits
                .div_ceil((params.message_modulus().0 * params.carry_modulus().0).ilog2() as usize);
@@ -233,6 +235,7 @@ fn bench_decomp_sns_comp_fhe_type<FheType>(
    match get_bench_type() {
        BenchmarkType::Latency => {
            bench_id = format!("{bench_id_prefix}::{bench_id_suffix}");
+            println!("{bench_id}");

            #[cfg(feature = "gpu")]
            configure_gpu(&client_key);
@@ -255,6 +258,7 @@ fn bench_decomp_sns_comp_fhe_type<FheType>(
        }
        BenchmarkType::Throughput => {
            bench_id = format!("{bench_id_prefix}::throughput::{bench_id_suffix}");
+            println!("{bench_id}");
            let params = client_key.computation_parameters();
            let num_blocks = num_bits
                .div_ceil((params.message_modulus().0 * params.carry_modulus().0).ilog2() as usize);
--- a/tfhe-benchmark/benches/integer/aes.rs
+++ b/tfhe-benchmark/benches/integer/aes.rs
@@ -44,6 +44,7 @@ pub mod cuda {
            const NUM_AES_INPUTS: usize = 1;
            const SBOX_PARALLELISM: usize = 16;
            let bench_id = format!("{param_name}::{NUM_AES_INPUTS}_input_encryption");
+            println!("{bench_id}");

            let round_keys = sks.key_expansion(&d_key, &streams);

@@ -73,7 +74,7 @@ pub mod cuda {

        {
            let bench_id = format!("{param_name}::key_expansion");
-
+            println!("{bench_id}");
            bench_group.bench_function(&bench_id, |b| {
                b.iter(|| {
                    black_box(sks.key_expansion(&d_key, &streams));
@@ -109,6 +110,7 @@ pub mod cuda {

            let round_keys = sks.key_expansion(&d_key, &streams);

+            println!("{bench_id}");
            bench_group.bench_function(&bench_id, |b| {
                b.iter(|| {
                    black_box(sks.aes_encrypt(
--- a/tfhe-benchmark/benches/integer/aes256.rs
+++ b/tfhe-benchmark/benches/integer/aes256.rs
@@ -48,6 +48,7 @@ pub mod cuda {
            const NUM_AES_INPUTS: usize = 1;
            const SBOX_PARALLELISM: usize = 16;
            let bench_id = format!("{param_name}::{NUM_AES_INPUTS}_input_encryption");
+            println!("{bench_id}");

            let round_keys = sks.key_expansion_256(&d_key, &streams);

@@ -77,7 +78,7 @@ pub mod cuda {

        {
            let bench_id = format!("{param_name}::key_expansion");
-
+            println!("{bench_id}");
            bench_group.bench_function(&bench_id, |b| {
                b.iter(|| {
                    black_box(sks.key_expansion_256(&d_key, &streams));
@@ -114,6 +115,7 @@ pub mod cuda {

            let round_keys = sks.key_expansion_256(&d_key, &streams);

+            println!("{bench_id}");
            bench_group.bench_function(&bench_id, |b| {
                b.iter(|| {
                    black_box(sks.aes_256_encrypt(
--- a/tfhe-benchmark/benches/integer/bench.rs
+++ b/tfhe-benchmark/benches/integer/bench.rs
@@ -55,6 +55,7 @@ fn bench_server_key_binary_function_dirty_inputs<F>(
        let keys = LazyCell::new(move || KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix));

        let bench_id = format!("{bench_name}::{param_name}::{bit_size}_bits");
+        println!("{bench_id}");
        bench_group.bench_function(&bench_id, |b| {
            let (cks, sks) = (&keys.0, &keys.1);

@@ -138,6 +139,7 @@ fn bench_server_key_binary_function_clean_inputs<F>(
                });

                bench_id = format!("{bench_name}::{param_name}::{bit_size}_bits");
+                println!("{bench_id}");
                bench_group.bench_function(&bench_id, |b| {
                    let (sks, ct_0, ct_1) = (&bench_data.0, &bench_data.1, &bench_data.2);
                    b.iter(|| {
@@ -159,6 +161,7 @@ fn bench_server_key_binary_function_clean_inputs<F>(
                let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default

                bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
+                println!("{bench_id}");
                bench_group
                    .sample_size(10)
                    .measurement_time(std::time::Duration::from_secs(30));
@@ -229,6 +232,7 @@ fn bench_server_key_unary_function_dirty_inputs<F>(
        let keys = LazyCell::new(move || KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix));

        let bench_id = format!("{bench_name}::{param_name}::{bit_size}_bits");
+        println!("{bench_id}");
        bench_group.bench_function(&bench_id, |b| {
            let (cks, sks) = (&keys.0, &keys.1);

@@ -307,6 +311,7 @@ fn bench_server_key_unary_function_clean_inputs<F>(
                });

                bench_id = format!("{bench_name}::{param_name}::{bit_size}_bits");
+                println!("{bench_id}");
                bench_group.bench_function(&bench_id, |b| {
                    let (sks, ct_0) = (&bench_data.0, &bench_data.1);

@@ -327,6 +332,7 @@ fn bench_server_key_unary_function_clean_inputs<F>(
                let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default

                bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
+                println!("{bench_id}");
                bench_group
                    .sample_size(10)
                    .measurement_time(std::time::Duration::from_secs(30));
@@ -389,6 +395,7 @@ fn bench_server_key_binary_scalar_function_dirty_inputs<F, G>(
        let keys = LazyCell::new(move || KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix));

        let bench_id = format!("{bench_name}::{param_name}::{bit_size}_bits");
+        println!("{bench_id}");
        bench_group.bench_function(&bench_id, |b| {
            let (cks, sks) = (&keys.0, &keys.1);

@@ -476,6 +483,7 @@ fn bench_server_key_binary_scalar_function_clean_inputs<F, G>(
                });

                bench_id = format!("{bench_name}::{param_name}::{bit_size}_bits_scalar_{bit_size}");
+                println!("{bench_id}");
                bench_group.bench_function(&bench_id, |b| {
                    let (sks, ct_0, clear_1) = (&bench_data.0, &bench_data.1, bench_data.2);

@@ -497,6 +505,7 @@ fn bench_server_key_binary_scalar_function_clean_inputs<F, G>(
                let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default

                bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
+                println!("{bench_id}");
                bench_group
                    .sample_size(10)
                    .measurement_time(std::time::Duration::from_secs(30));
@@ -605,6 +614,7 @@ fn if_then_else_parallelized(c: &mut Criterion) {
                });

                bench_id = format!("{bench_name}::{param_name}::{bit_size}_bits");
+                println!("{bench_id}");
                bench_group.bench_function(&bench_id, |b| {
                    let (sks, condition, true_ct, false_ct) =
                        (&bench_data.0, &bench_data.1, &bench_data.2, &bench_data.3);
@@ -629,6 +639,7 @@ fn if_then_else_parallelized(c: &mut Criterion) {
                let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default

                bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
+                println!("{bench_id}");
                bench_group
                    .sample_size(10)
                    .measurement_time(std::time::Duration::from_secs(30));
@@ -713,6 +724,7 @@ fn flip_parallelized(c: &mut Criterion) {
                });

                bench_id = format!("{bench_name}::{param_name}::{bit_size}_bits");
+                println!("{bench_id}");
                bench_group.bench_function(&bench_id, |b| {
                    let (sks, condition, true_ct, false_ct) =
                        (&bench_data.0, &bench_data.1, &bench_data.2, &bench_data.3);
@@ -737,6 +749,7 @@ fn flip_parallelized(c: &mut Criterion) {
                let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default

                bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
+                println!("{bench_id}");
                bench_group
                    .sample_size(10)
                    .measurement_time(std::time::Duration::from_secs(30));
@@ -826,6 +839,7 @@ fn ciphertexts_sum_parallelized(c: &mut Criterion) {
                    });

                    bench_id = format!("{bench_name}_{len}_ctxts::{param_name}::{bit_size}_bits");
+                    println!("{bench_id}");
                    bench_group.bench_function(&bench_id, |b| {
                        let (sks, ctxts) = (&bench_data.0, &bench_data.1);

@@ -855,6 +869,7 @@ fn ciphertexts_sum_parallelized(c: &mut Criterion) {
                    bench_id = format!(
                        "{bench_name}_{len}_ctxts::throughput::{param_name}::{bit_size}_bits"
                    );
+                    println!("{bench_id}");
                    bench_group
                        .sample_size(10)
                        .measurement_time(std::time::Duration::from_secs(30));
@@ -1531,6 +1546,7 @@ mod cuda {
                BenchmarkType::Latency => {
                    let streams = CudaStreams::new_multi_gpu();
                    bench_id = format!("{bench_name}::{param_name}::{bit_size}_bits");
+                    println!("{bench_id}");

                    bench_group.bench_function(&bench_id, |b| {
                        let (cks, _cpu_sks) =
@@ -1565,6 +1581,7 @@ mod cuda {
                    let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default

                    bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
+                    println!("{bench_id}");
                    bench_group
                        .sample_size(10)
                        .measurement_time(std::time::Duration::from_secs(30));
@@ -1649,6 +1666,7 @@ mod cuda {
                BenchmarkType::Latency => {
                    let streams = CudaStreams::new_multi_gpu();
                    bench_id = format!("{bench_name}::{param_name}::{bit_size}_bits");
+                    println!("{bench_id}");

                    bench_group.bench_function(&bench_id, |b| {
                        let (cks, _cpu_sks) =
@@ -1692,6 +1710,7 @@ mod cuda {
                    let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default

                    bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
+                    println!("{bench_id}");
                    bench_group
                        .sample_size(10)
                        .measurement_time(std::time::Duration::from_secs(30));
@@ -1796,6 +1815,7 @@ mod cuda {
                        .measurement_time(std::time::Duration::from_secs(30));
                    bench_id =
                        format!("{bench_name}::{param_name}::{bit_size}_bits_scalar_{bit_size}"); // FIXME it makes no sense to duplicate `bit_size`
+                    println!("{bench_id}");
                    bench_group.bench_function(&bench_id, |b| {
                        let (cks, _cpu_sks) =
                            KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
@@ -1841,6 +1861,7 @@ mod cuda {
                    bench_id = format!(
                        "{bench_name}::throughput::{param_name}::{bit_size}_bits_scalar_{bit_size}"
                    );
+                    println!("{bench_id}");
                    let elements = throughput_num_threads(num_block, pbs_count);
                    bench_group.throughput(Throughput::Elements(elements));
                    bench_group.bench_function(&bench_id, |b| {
@@ -1922,6 +1943,7 @@ mod cuda {
                    let stream = CudaStreams::new_multi_gpu();

                    bench_id = format!("{bench_name}::{param_name}::{bit_size}_bits");
+                    println!("{bench_id}");

                    bench_group.bench_function(&bench_id, |b| {
                        let (cks, _cpu_sks) =
@@ -1972,6 +1994,7 @@ mod cuda {
                    let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default

                    bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
+                    println!("{bench_id}");
                    bench_group
                        .sample_size(10)
                        .measurement_time(std::time::Duration::from_secs(30));
@@ -2890,6 +2913,7 @@ mod cuda {
                    target_num_blocks * param.message_modulus().0.ilog2() as usize;
                let bench_id =
                    format!("{bench_name}::{param_name}::{bit_size}_to_{target_bit_size}");
+                println!("{bench_id}");
                bench_group.bench_function(&bench_id, |b| {
                    let (cks, _sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
                    let gpu_sks = CudaServerKey::new(&cks, &stream);
@@ -2994,6 +3018,7 @@ mod hpu {
            match get_bench_type() {
                BenchmarkType::Latency => {
                    bench_id = format!("{bench_name}::{param_name}::{bit_size}_bits");
+                    println!("{bench_id}");
                    bench_group.bench_function(&bench_id, |b| {
                        let (cks, _sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
                        let hpu_device_mutex = KEY_CACHE.get_hpu_device(param);
@@ -3046,6 +3071,7 @@ mod hpu {
                }
                BenchmarkType::Throughput => {
                    bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
+                    println!("{bench_id}");
                    bench_group
                        .sample_size(10)
                        .measurement_time(std::time::Duration::from_secs(120));
@@ -3675,6 +3701,7 @@ fn bench_server_key_cast_function<F>(
        for target_num_blocks in all_num_blocks.iter().copied() {
            let target_bit_size = target_num_blocks * param.message_modulus().0.ilog2() as usize;
            let bench_id = format!("{bench_name}::{param_name}::{bit_size}_to_{target_bit_size}");
+            println!("{bench_id}");
            bench_group.bench_function(&bench_id, |b| {
                let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);

--- a/tfhe-benchmark/benches/integer/glwe_packing_compression.rs
+++ b/tfhe-benchmark/benches/integer/glwe_packing_compression.rs
@@ -54,6 +54,7 @@ fn cpu_glwe_packing(c: &mut Criterion) {
                builder.push(ct);

                bench_id_pack = format!("{bench_name}::pack_u{bit_size}");
+                println!("{bench_id_pack}");
                bench_group.bench_function(&bench_id_pack, |b| {
                    b.iter(|| {
                        let compressed = builder.build(&compression_key);
@@ -65,6 +66,7 @@ fn cpu_glwe_packing(c: &mut Criterion) {
                let compressed = builder.build(&compression_key);

                bench_id_unpack = format!("{bench_name}::unpack_u{bit_size}");
+                println!("{bench_id_unpack}");
                bench_group.bench_function(&bench_id_unpack, |b| {
                    b.iter(|| {
                        let unpacked: RadixCiphertext =
@@ -105,6 +107,7 @@ fn cpu_glwe_packing(c: &mut Criterion) {
                    .collect::<Vec<_>>();

                bench_id_pack = format!("{bench_name}::throughput::pack_u{bit_size}");
+                println!("{bench_id_pack}");
                bench_group.bench_function(&bench_id_pack, |b| {
                    b.iter(|| {
                        builders.par_iter().for_each(|builder| {
@@ -119,6 +122,7 @@ fn cpu_glwe_packing(c: &mut Criterion) {
                    .collect::<Vec<_>>();

                bench_id_unpack = format!("{bench_name}::throughput::unpack_u{bit_size}");
+                println!("{bench_id_unpack}");
                bench_group.bench_function(&bench_id_unpack, |b| {
                    b.iter(|| {
                        compressed.par_iter().for_each(|comp| {
@@ -225,6 +229,7 @@ mod cuda {
                builder.push(d_ct, &stream);

                bench_id_pack = format!("{bench_name}::pack_u{bit_size}");
+                println!("{bench_id_pack}");
                bench_group.bench_function(&bench_id_pack, |b| {
                    b.iter(|| {
                        let compressed = builder.build(&cuda_compression_key, &stream);
@@ -250,6 +255,7 @@ mod cuda {
                let local_streams = cuda_local_streams(num_block, elements as usize);

                bench_id_pack = format!("{bench_name}::throughput::pack_u{bit_size}");
+                println!("{bench_id_pack}");
                let cuda_compression_key_vec = (0..get_number_of_gpus())
                    .into_par_iter()
                    .map(|i| {
@@ -353,6 +359,7 @@ mod cuda {
                let compressed = builder.build(&cuda_compression_key, &stream);

                bench_id_unpack = format!("{bench_name}::unpack_u{bit_size}");
+                println!("{bench_id_unpack}");
                bench_group.bench_function(&bench_id_unpack, |b| {
                    b.iter(|| {
                        let unpacked: CudaUnsignedRadixCiphertext = compressed
@@ -381,6 +388,7 @@ mod cuda {
                let local_streams = cuda_local_streams(num_block, elements as usize);

                bench_id_unpack = format!("{bench_name}::throughput::unpack_u{bit_size}");
+                println!("{bench_id_unpack}");
                let builders = (0..elements)
                    .into_par_iter()
                    .map(|i| {
--- a/tfhe-benchmark/benches/integer/glwe_packing_compression_128b.rs
+++ b/tfhe-benchmark/benches/integer/glwe_packing_compression_128b.rs
@@ -89,6 +89,7 @@ mod cuda {
                builder.push(d_ns_ct, &stream);

                bench_id_pack = format!("{bench_name}::pack_u{bit_size}");
+                println!("{bench_id_pack}");
                bench_group.bench_function(&bench_id_pack, |b| {
                    b.iter(|| {
                        let compressed =
@@ -157,6 +158,7 @@ mod cuda {
                    .collect::<Vec<_>>();

                bench_id_pack = format!("{bench_name}::throughput::pack_u{bit_size}");
+                println!("{bench_id_pack}");
                bench_group.bench_function(&bench_id_pack, |b| {
                    b.iter(|| {
                        builders.par_iter().for_each(
@@ -233,6 +235,7 @@ mod cuda {
                let compressed = builder.build(&cuda_noise_squashing_compression_key, &stream);

                bench_id_unpack = format!("{bench_name}::unpack_u{bit_size}");
+                println!("{bench_id_unpack}");
                bench_group.bench_function(&bench_id_unpack, |b| {
                    b.iter(|| {
                        let unpacked: CudaSquashedNoiseRadixCiphertext =
@@ -310,6 +313,7 @@ mod cuda {
                    .collect::<Vec<_>>();

                bench_id_unpack = format!("{bench_name}::throughput::unpack_u{bit_size}");
+                println!("{bench_id_unpack}");
                bench_group.bench_function(&bench_id_unpack, |b| {
                    b.iter(|| {
                        compressed.par_iter().enumerate().for_each(|(i, comp)| {
--- a/tfhe-benchmark/benches/integer/oprf.rs
+++ b/tfhe-benchmark/benches/integer/oprf.rs
@@ -31,6 +31,7 @@ pub fn unsigned_oprf(c: &mut Criterion) {
                bench_id_oprf_bounded =
                    format!("{bench_name}_bounded::{param_name}::{bit_size}_bits");

+                println!("{bench_id_oprf}");
                bench_group.bench_function(&bench_id_oprf, |b| {
                    let (_, sk) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);

@@ -42,6 +43,7 @@ pub fn unsigned_oprf(c: &mut Criterion) {
                    })
                });

+                println!("{bench_id_oprf_bounded}");
                bench_group.bench_function(&bench_id_oprf_bounded, |b| {
                    let (_, sk) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);

@@ -75,6 +77,7 @@ pub fn unsigned_oprf(c: &mut Criterion) {
                let elements = throughput_num_threads(num_block, pbs_count);
                bench_group.throughput(Throughput::Elements(elements));

+                println!("{bench_id_oprf}");
                bench_group.bench_function(&bench_id_oprf, |b| {
                    b.iter(|| {
                        (0..elements).into_par_iter().for_each(|_| {
@@ -86,6 +89,7 @@ pub fn unsigned_oprf(c: &mut Criterion) {
                    })
                });

+                println!("{bench_id_oprf_bounded}");
                bench_group.bench_function(&bench_id_oprf_bounded, |b| {
                    b.iter(|| {
                        (0..elements).into_par_iter().for_each(|_| {
@@ -152,6 +156,7 @@ pub mod cuda {
                    bench_id_oprf_bounded =
                        format!("{bench_name}_bounded::{param_name}::{bit_size}_bits");

+                    println!("{bench_id_oprf}");
                    bench_group.bench_function(&bench_id_oprf, |b| {
                        let (cks, _cpu_sks) =
                            KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
@@ -168,6 +173,7 @@ pub mod cuda {
                        })
                    });

+                    println!("{bench_id_oprf_bounded}");
                    bench_group.bench_function(&bench_id_oprf_bounded, |b| {
                        let (cks, _cpu_sks) =
                            KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
@@ -206,6 +212,7 @@ pub mod cuda {
                    let elements = throughput_num_threads(num_block, pbs_count);
                    bench_group.throughput(Throughput::Elements(elements));

+                    println!("{bench_id_oprf}");
                    bench_group.bench_function(&bench_id_oprf, |b| {
                        b.iter(|| {
                            (0..elements).into_par_iter().for_each(|i| {
@@ -221,6 +228,7 @@ pub mod cuda {
                        })
                    });

+                    println!("{bench_id_oprf_bounded}");
                    bench_group.bench_function(&bench_id_oprf_bounded, |b| {
                        b.iter(|| {
                            (0..elements).into_par_iter().for_each(|i| {
--- a/tfhe-benchmark/benches/integer/rerand.rs
+++ b/tfhe-benchmark/benches/integer/rerand.rs
@@ -69,6 +69,7 @@ fn execute_cpu_re_randomize(c: &mut Criterion, bit_size: usize) {
            let mut d_re_randomized = decompressed.clone();

            bench_id = format!("{bench_name}::latency_u{bit_size}");
+            println!("{bench_id}");
            bench_group.bench_function(&bench_id, |b| {
                b.iter_batched(
                    || {
@@ -116,6 +117,7 @@ fn execute_cpu_re_randomize(c: &mut Criterion, bit_size: usize) {
                .collect();

            bench_id = format!("{bench_name}::throughput_u{bit_size}");
+            println!("{bench_id}");
            bench_group.bench_function(&bench_id, |b| {
                b.iter_batched(
                    || {
@@ -257,6 +259,7 @@ mod cuda {
                let mut d_re_randomized = d_decompressed.duplicate(&streams);

                bench_id = format!("{bench_name}::latency_u{bit_size}");
+                println!("{bench_id}");
                bench_group.bench_function(&bench_id, |b| {
                    b.iter_batched(
                        || {
@@ -331,6 +334,7 @@ mod cuda {
                    .collect();

                bench_id = format!("{bench_name}::throughput_u{bit_size}");
+                println!("{bench_id}");
                bench_group.bench_function(&bench_id, |b| {
                    b.iter_batched(
                        || {
--- a/tfhe-benchmark/benches/integer/signed_bench.rs
+++ b/tfhe-benchmark/benches/integer/signed_bench.rs
@@ -55,6 +55,7 @@ fn bench_server_key_signed_binary_function_clean_inputs<F>(
                });

                bench_id = format!("{bench_name}::{param_name}::{bit_size}_bits");
+                println!("{bench_id}");
                bench_group.bench_function(&bench_id, |b| {
                    let (sks, ct_0, ct_1) = (&bench_data.0, &bench_data.1, &bench_data.2);

@@ -153,6 +154,7 @@ fn bench_server_key_signed_shift_function_clean_inputs<F>(
                });

                bench_id = format!("{bench_name}::{param_name}::{bit_size}_bits");
+                println!("{bench_id}");
                bench_group.bench_function(&bench_id, |b| {
                    let (sks, ct_0, ct_1) = (&bench_data.0, &bench_data.1, &bench_data.2);

@@ -174,6 +176,7 @@ fn bench_server_key_signed_shift_function_clean_inputs<F>(
                let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default

                bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
+                println!("{bench_id}");
                let elements = throughput_num_threads(num_block, pbs_count);
                bench_group.throughput(Throughput::Elements(elements));
                bench_group.bench_function(&bench_id, |b| {
@@ -251,6 +254,7 @@ fn bench_server_key_unary_function_clean_inputs<F>(
                });

                bench_id = format!("{bench_name}::{param_name}::{bit_size}_bits");
+                println!("{bench_id}");
                bench_group.bench_function(&bench_id, |b| {
                    let (sks, ct_0) = (&bench_data.0, &bench_data.1);
                    b.iter(|| {
@@ -269,6 +273,7 @@ fn bench_server_key_unary_function_clean_inputs<F>(
                let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default

                bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
+                println!("{bench_id}");
                let elements = throughput_num_threads(num_block, pbs_count);
                bench_group.throughput(Throughput::Elements(elements));
                bench_group.bench_function(&bench_id, |b| {
@@ -333,6 +338,7 @@ fn signed_if_then_else_parallelized(c: &mut Criterion) {
                });

                bench_id = format!("{bench_name}::{param_name}::{bit_size}_bits");
+                println!("{bench_id}");
                bench_group.bench_function(&bench_id, |b| {
                    let (sks, condition, true_ct, false_ct) =
                        (&bench_data.0, &bench_data.1, &bench_data.2, &bench_data.3);
@@ -353,6 +359,7 @@ fn signed_if_then_else_parallelized(c: &mut Criterion) {
                let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default

                bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
+                println!("{bench_id}");
                let elements = throughput_num_threads(num_block, pbs_count);
                bench_group.throughput(Throughput::Elements(elements));
                bench_group.bench_function(&bench_id, |b| {
@@ -858,6 +865,7 @@ fn bench_server_key_binary_scalar_function_clean_inputs<F, G>(
        match get_bench_type() {
            BenchmarkType::Latency => {
                bench_id = format!("{bench_name}::{param_name}::{bit_size}_bits_scalar_{bit_size}");
+                println!("{bench_id}");
                bench_group.bench_function(&bench_id, |b| {
                    let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);

@@ -894,6 +902,7 @@ fn bench_server_key_binary_scalar_function_clean_inputs<F, G>(
                let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default

                bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
+                println!("{bench_id}");
                let elements = throughput_num_threads(num_block, pbs_count);
                bench_group.throughput(Throughput::Elements(elements));
                bench_group.bench_function(&bench_id, |b| {
@@ -1046,6 +1055,7 @@ fn signed_flip_parallelized(c: &mut Criterion) {
                });

                bench_id = format!("{bench_name}::{param_name}::{bit_size}_bits");
+                println!("{bench_id}");
                bench_group.bench_function(&bench_id, |b| {
                    let (sks, condition, true_ct, false_ct) =
                        (&bench_data.0, &bench_data.1, &bench_data.2, &bench_data.3);
@@ -1070,6 +1080,7 @@ fn signed_flip_parallelized(c: &mut Criterion) {
                let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default

                bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
+                println!("{bench_id}");
                bench_group
                    .sample_size(10)
                    .measurement_time(std::time::Duration::from_secs(30));
@@ -1448,6 +1459,7 @@ fn bench_server_key_signed_cast_function<F>(
        for target_num_blocks in all_num_blocks.iter().copied() {
            let target_bit_size = target_num_blocks * param.message_modulus().0.ilog2() as usize;
            let bench_id = format!("{bench_name}::{param_name}::{bit_size}_to_{target_bit_size}");
+            println!("{bench_id}");
            bench_group.bench_function(&bench_id, |b| {
                let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);

@@ -1543,6 +1555,7 @@ mod cuda {
                    let stream = CudaStreams::new_multi_gpu();

                    bench_id = format!("{bench_name}::{param_name}::{bit_size}_bits");
+                    println!("{bench_id}");

                    bench_group.bench_function(&bench_id, |b| {
                        let (cks, _cpu_sks) =
@@ -1594,6 +1607,7 @@ mod cuda {
                    let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default

                    bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
+                    println!("{bench_id}");
                    let elements = throughput_num_threads(num_block, pbs_count);
                    bench_group.throughput(Throughput::Elements(elements));
                    bench_group.bench_function(&bench_id, |b| {
@@ -1714,6 +1728,7 @@ mod cuda {
                    let stream = CudaStreams::new_multi_gpu();

                    bench_id = format!("{bench_name}::{param_name}::{bit_size}_bits");
+                    println!("{bench_id}");

                    bench_group.bench_function(&bench_id, |b| {
                        let (cks, _cpu_sks) =
@@ -1752,6 +1767,7 @@ mod cuda {
                    let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default

                    bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
+                    println!("{bench_id}");
                    let elements = throughput_num_threads(num_block, pbs_count);
                    bench_group.throughput(Throughput::Elements(elements));
                    bench_group.bench_function(&bench_id, |b| {
@@ -1858,6 +1874,7 @@ mod cuda {

                    bench_id =
                        format!("{bench_name}::{param_name}::{bit_size}_bits_scalar_{bit_size}");
+                    println!("{bench_id}");
                    bench_group.bench_function(&bench_id, |b| {
                        let (cks, _cpu_sks) =
                            KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
@@ -1904,6 +1921,7 @@ mod cuda {
                    bench_id = format!(
                        "{bench_name}::throughput::{param_name}::{bit_size}_bits_scalar_{bit_size}"
                    );
+                    println!("{bench_id}");
                    let elements = throughput_num_threads(num_block, pbs_count);
                    bench_group.throughput(Throughput::Elements(elements));
                    bench_group.bench_function(&bench_id, |b| {
@@ -2020,6 +2038,7 @@ mod cuda {
                    let streams = CudaStreams::new_multi_gpu();

                    bench_id = format!("{bench_name}::{param_name}::{bit_size}_bits");
+                    println!("{bench_id}");

                    bench_group.bench_function(&bench_id, |b| {
                        let (cks, _cpu_sks) =
@@ -2071,6 +2090,7 @@ mod cuda {
                    let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default

                    bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
+                    println!("{bench_id}");
                    let elements = throughput_num_threads(num_block, pbs_count);
                    bench_group.throughput(Throughput::Elements(elements));
                    bench_group.bench_function(&bench_id, |b| {
@@ -2185,6 +2205,7 @@ mod cuda {
                    let stream = CudaStreams::new_multi_gpu();

                    bench_id = format!("{bench_name}::{param_name}::{bit_size}_bits");
+                    println!("{bench_id}");

                    bench_group.bench_function(&bench_id, |b| {
                        let (cks, _cpu_sks) =
@@ -2235,6 +2256,7 @@ mod cuda {
                    let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default

                    bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
+                    println!("{bench_id}");
                    let elements = throughput_num_threads(num_block, pbs_count);
                    bench_group.throughput(Throughput::Elements(elements));
                    bench_group.bench_function(&bench_id, |b| {
@@ -3054,6 +3076,7 @@ mod cuda {
                    target_num_blocks * param.message_modulus().0.ilog2() as usize;
                let bench_id =
                    format!("{bench_name}::{param_name}::{bit_size}_to_{target_bit_size}");
+                println!("{bench_id}");
                bench_group.bench_function(&bench_id, |b| {
                    let (cks, _sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
                    let gpu_sks = CudaServerKey::new(&cks, &stream);
--- a/tfhe-benchmark/benches/integer/zk_pke.rs
+++ b/tfhe-benchmark/benches/integer/zk_pke.rs
@@ -128,6 +128,7 @@ fn cpu_pke_zk_proof(c: &mut Criterion) {
                            bench_id = format!(
                                "{bench_name}::{param_name}_{bits}_bits_packed_{crs_size}_bits_crs_{zk_load}_ZK{zk_vers:?}"
                            );
+                            println!("{bench_id}");
                            bench_group.bench_function(&bench_id, |b| {
                                let input_msg = rng.gen::<u64>();
                                let messages = vec![input_msg; fhe_uint_count];
@@ -149,6 +150,7 @@ fn cpu_pke_zk_proof(c: &mut Criterion) {
                            bench_id = format!(
                                "{bench_name}::throughput::{param_name}_{bits}_bits_packed_{crs_size}_bits_crs_{zk_load}_ZK{zk_vers:?}"
                            );
+                            println!("{bench_id}");
                            bench_group.bench_function(&bench_id, |b| {
                                let messages = (0..elements)
                                    .map(|_| {
@@ -340,12 +342,14 @@ fn cpu_pke_zk_verify(c: &mut Criterion, results_file: &Path) {
                                vec![],
                            );

+                            println!("{bench_id_verify}");
                            bench_group.bench_function(&bench_id_verify, |b| {
                                b.iter(|| {
                                    let _ret = ct1.verify(&crs, &pk, &metadata);
                                });
                            });

+                            println!("{bench_id_verify_and_expand}");
                            bench_group.bench_function(&bench_id_verify_and_expand, |b| {
                            b.iter(|| {
                                let _ret = ct1
@@ -386,6 +390,7 @@ fn cpu_pke_zk_verify(c: &mut Criterion, results_file: &Path) {
                                })
                                .collect::<Vec<_>>();

+                            println!("{bench_id_verify}");
                            bench_group.bench_function(&bench_id_verify, |b| {
                                b.iter(|| {
                                    cts.par_iter().for_each(|ct1| {
@@ -394,6 +399,7 @@ fn cpu_pke_zk_verify(c: &mut Criterion, results_file: &Path) {
                                });
                            });

+                            println!("{bench_id_verify_and_expand}");
                            bench_group.bench_function(&bench_id_verify_and_expand, |b| {
                            b.iter(|| {
                                cts.par_iter().for_each(|ct1| {
@@ -622,12 +628,14 @@ mod cuda {
                                    vec![],
                                );

+                                println!("{bench_id_verify}");
                                bench_group.bench_function(&bench_id_verify, |b| {
                                    b.iter(|| {
                                        let _ret = ct1.verify(&crs, &pk, &metadata);
                                    });
                                });

+                                println!("{bench_id_expand_without_verify}");
                                bench_group.bench_function(&bench_id_expand_without_verify, |b| {
                                    b.iter(|| {
                                        let _ret = gpu_ct1
@@ -636,6 +644,7 @@ mod cuda {
                                    });
                                });

+                                println!("{bench_id_verify_and_expand}");
                                bench_group.bench_function(&bench_id_verify_and_expand, |b| {
                                    b.iter(|| {
                                        let _ret = gpu_ct1
@@ -688,6 +697,7 @@ mod cuda {
                                    })
                                    .collect::<Vec<_>>();

+                                println!("{bench_id_verify}");
                                bench_group.bench_function(&bench_id_verify, |b| {
                                    b.iter(|| {
                                        cts.par_iter().for_each(|ct1| {
@@ -696,6 +706,7 @@ mod cuda {
                                    });
                                });

+                                println!("{bench_id_expand_without_verify}");
                                bench_group.bench_function(&bench_id_expand_without_verify, |b| {
                                    let setup_encrypted_values = || {
                                        let gpu_cts = cts.iter().enumerate().map(|(i, ct)| {
@@ -725,6 +736,7 @@ mod cuda {
                                                   }, BatchSize::SmallInput);
                                });

+                                println!("{bench_id_verify_and_expand}");
                                bench_group.bench_function(&bench_id_verify_and_expand, |b| {
                                    let setup_encrypted_values = || {
                                        let gpu_cts = cts.iter().enumerate().map(|(i, ct)| {
--- a/tfhe-benchmark/benches/shortint/bench.rs
+++ b/tfhe-benchmark/benches/shortint/bench.rs
@@ -34,6 +34,7 @@ fn bench_server_key_unary_function<F>(
        let mut ct = cks.encrypt(clear_text);

        let bench_id = format!("{bench_name}::{}", param.name());
+        println!("{bench_id}");
        bench_group.bench_function(&bench_id, |b| {
            b.iter(|| {
                unary_op(sks, &mut ct);
@@ -79,6 +80,7 @@ fn bench_server_key_binary_function<F>(
        let mut ct_1 = cks.encrypt(clear_1);

        let bench_id = format!("{bench_name}::{}", param.name());
+        println!("{bench_id}");
        bench_group.bench_function(&bench_id, |b| {
            b.iter(|| {
                binary_op(sks, &mut ct_0, &mut ct_1);
@@ -123,6 +125,7 @@ fn bench_server_key_binary_scalar_function<F>(
        let mut ct_0 = cks.encrypt(clear_0);

        let bench_id = format!("{bench_name}::{}", param.name());
+        println!("{bench_id}");
        bench_group.bench_function(&bench_id, |b| {
            b.iter(|| {
                binary_op(sks, &mut ct_0, clear_1 as u8);
@@ -171,6 +174,7 @@ fn bench_server_key_binary_scalar_division_function<F>(
        let mut ct_0 = cks.encrypt(clear_0);

        let bench_id = format!("{bench_name}::{}", param.name());
+        println!("{bench_id}");
        bench_group.bench_function(&bench_id, |b| {
            b.iter(|| {
                binary_op(sks, &mut ct_0, clear_1 as u8);
@@ -207,6 +211,7 @@ fn carry_extract_bench(c: &mut Criterion) {
        let ct_0 = cks.encrypt(clear_0);

        let bench_id = format!("shortint::carry_extract::{}", param.name());
+        println!("{bench_id}");
        bench_group.bench_function(&bench_id, |b| {
            b.iter(|| {
                let _ = sks.carry_extract(&ct_0);
@@ -245,6 +250,7 @@ fn programmable_bootstrapping_bench(c: &mut Criterion) {
        let ctxt = cks.encrypt(clear_0);

        let bench_id = format!("shortint::programmable_bootstrap::{}", param.name());
+        println!("{bench_id}");

        bench_group.bench_function(&bench_id, |b| {
            b.iter(|| {
@@ -288,6 +294,7 @@ fn server_key_from_compressed_key(c: &mut Criterion) {
        let sks_compressed = CompressedServerKey::new(keys.client_key());

        let bench_id = format!("shortint::uncompress_key::{}", param.name());
+        println!("{bench_id}");

        bench_group.bench_function(&bench_id, |b| {
            let clone_compressed_key = || sks_compressed.clone();
--- a/tfhe-benchmark/benches/shortint/casting.rs
+++ b/tfhe-benchmark/benches/shortint/casting.rs
@@ -26,6 +26,7 @@ pub fn pack_cast_64(c: &mut Criterion) {
    let vec_ct = vec![client_key_1.encrypt(1); 64];

    let bench_id = format!("{bench_name}_{ks_param_name}");
+    println!("{bench_id}");
    bench_group.bench_function(&bench_id, |b| {
        b.iter(|| {
            let _ = (0..32)
@@ -78,6 +79,7 @@ pub fn pack_cast(c: &mut Criterion) {
    let ct_2 = client_key_1.encrypt(1);

    let bench_id = format!("{bench_name}_{ks_param_name}");
+    println!("{bench_id}");
    bench_group.bench_function(&bench_id, |b| {
        b.iter(|| {
            let _ = ksk.cast(
@@ -118,6 +120,7 @@ pub fn cast(c: &mut Criterion) {
    let ct = client_key_1.encrypt(1);

    let bench_id = format!("{bench_name}_{ks_param_name}");
+    println!("{bench_id}");
    bench_group.bench_function(&bench_id, |b| {
        b.iter(|| {
            let _ = ksk.cast(&ct);
--- a/tfhe-benchmark/benches/shortint/glwe_packing_compression.rs
+++ b/tfhe-benchmark/benches/shortint/glwe_packing_compression.rs
@@ -24,6 +24,8 @@ fn glwe_packing(c: &mut Criterion) {

    let ct: Vec<_> = (0..number_to_pack).map(|_| cks.encrypt(0)).collect();

+    let bench_id = format!("{bench_name}::pack");
+    println!("{bench_id}");
-    bench_group.bench_function("pack".to_owned(), |b| {
+    bench_group.bench_function(bench_id, |b| {
        b.iter(|| {
            let packed = compression_key.compress_ciphertexts_into_list(&ct);
@@ -32,8 +34,10 @@ fn glwe_packing(c: &mut Criterion) {
        })
    });

+    let bench_id = format!("{bench_name}::unpack_all");
+    println!("{bench_id}");
    let packed = compression_key.compress_ciphertexts_into_list(&ct);
-    bench_group.bench_function("unpack_all".to_owned(), |b| {
+    bench_group.bench_function(bench_id, |b| {
        b.iter(|| {
            (0..number_to_pack).into_par_iter().for_each(|i| {
                let unpacked = decompression_key.unpack(&packed, i);
@@ -43,7 +47,9 @@ fn glwe_packing(c: &mut Criterion) {
        })
    });

-    bench_group.bench_function("unpack_one_lwe".to_owned(), |b| {
+    let bench_id = format!("{bench_name}::unpack_one_lwe");
+    println!("{bench_id}");
+    bench_group.bench_function(bench_id, |b| {
        b.iter(|| {
            let unpacked = decompression_key.unpack(&packed, 0);

@@ -51,7 +57,9 @@ fn glwe_packing(c: &mut Criterion) {
        })
    });

-    bench_group.bench_function("unpack_64b".to_owned(), |b| {
+    let bench_id = format!("{bench_name}::unpack_64b");
+    println!("{bench_id}");
+    bench_group.bench_function(bench_id, |b| {
        b.iter(|| {
            (0..32).into_par_iter().for_each(|i| {
                let unpacked = decompression_key.unpack(&packed, i);
@@ -61,7 +69,9 @@ fn glwe_packing(c: &mut Criterion) {
        })
    });

-    bench_group.bench_function("pack_unpack".to_owned(), |b| {
+    let bench_id = format!("{bench_name}::pack_unpack");
+    println!("{bench_id}");
+    bench_group.bench_function(bench_id, |b| {
        b.iter(|| {
            let packed = compression_key.compress_ciphertexts_into_list(&ct);

--- a/tfhe-benchmark/benches/shortint/oprf.rs
+++ b/tfhe-benchmark/benches/shortint/oprf.rs
@@ -14,7 +14,9 @@ fn oprf(c: &mut Criterion) {
    let keys = KEY_CACHE.get_from_param(param);
    let sks = keys.server_key();

-    bench_group.bench_function(format!("2-bits-oprf::{}", param.name()), |b| {
+    let bench_id = format!("2-bits-oprf::{}", param.name());
+    println!("{bench_id}");
+    bench_group.bench_function(bench_id, |b| {
        b.iter(|| {
            _ = black_box(sks.generate_oblivious_pseudo_random(Seed(0), 2));
        })