fix(gpu): fix 128-bit compression benchmark

2026-01-09 22:57:59 -05:00 · 2025-10-10 13:41:46 -03:00
parent 7b797b8af9
commit 70773e442c
4 changed files with 785 additions and 377 deletions
--- a/7
+++ b/7
@@ -1390,6 +1390,13 @@ bench_integer_compression_gpu: install_rs_check_toolchain
 	--bench integer-glwe_packing_compression \
 	--features=integer,internal-keycache,gpu,pbs-stats -p tfhe-benchmark --
 # Runs the `glwe_packing_compression_128b-integer-bench` bench target of the
 # tfhe-benchmark package with the `gpu` feature enabled.
 .PHONY: bench_integer_compression_128b_gpu
 bench_integer_compression_128b_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench	glwe_packing_compression_128b-integer-bench \
 	--features=integer,internal-keycache,gpu,pbs-stats -p tfhe-benchmark --
 .PHONY: bench_integer_zk_gpu
 bench_integer_zk_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
--- a/tfhe-benchmark/Cargo.toml
+++ b/tfhe-benchmark/Cargo.toml
@@ -96,6 +96,12 @@ path = "benches/integer/glwe_packing_compression.rs"
 harness = false
 required-features = ["integer", "pbs-stats", "internal-keycache"]
 # Bench target for the 128-bit GLWE packing compression benchmark
 # (benches/integer/glwe_packing_compression_128b.rs).
 # NOTE(review): that source file gates its `cuda` module on the `gpu` feature, which is not
 # listed in `required-features` below — confirm the target still builds without `gpu`.
 [[bench]]
 name = "glwe_packing_compression_128b-integer-bench"
 path = "benches/integer/glwe_packing_compression_128b.rs"
 harness = false
 required-features = ["integer", "pbs-stats", "internal-keycache"]
 [[bench]]
 name = "integer"
 path = "benches/integer/bench.rs"
--- a/tfhe-benchmark/benches/integer/glwe_packing_compression.rs
+++ b/tfhe-benchmark/benches/integer/glwe_packing_compression.rs
@@ -159,20 +159,30 @@ fn cpu_glwe_packing(c: &mut Criterion) {
 mod cuda {
    use super::*;
    use benchmark::utilities::cuda_integer_utils::cuda_local_streams;
-    use itertools::Itertools;
+    use tfhe::core_crypto::gpu::{get_number_of_gpus, CudaStreams};
-    use std::cmp::max;
+    use tfhe::integer::compression_keys::CompressionPrivateKeys;
    use tfhe::core_crypto::gpu::CudaStreams;
    use tfhe::integer::ciphertext::NoiseSquashingCompressionPrivateKey;
    use tfhe::integer::gpu::ciphertext::compressed_ciphertext_list::CudaCompressedCiphertextListBuilder;
-    use tfhe::integer::gpu::ciphertext::squashed_noise::CudaSquashedNoiseRadixCiphertext;
+    use tfhe::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext;
    use tfhe::integer::gpu::ciphertext::{
        CudaCompressedSquashedNoiseCiphertextList, CudaUnsignedRadixCiphertext,
    };
    use tfhe::integer::gpu::gen_keys_radix_gpu;
-    use tfhe::integer::gpu::list_compression::server_keys::CudaNoiseSquashingCompressionKey;
+    use tfhe::shortint::parameters::CompressionParameters;
-    use tfhe::integer::noise_squashing::NoiseSquashingPrivateKey;
+    use tfhe::shortint::PBSParameters;
-    fn gpu_glwe_packing(c: &mut Criterion) {
+    #[derive(Clone)]
    /// Inputs shared by `execute_gpu_glwe_packing` and `execute_gpu_glwe_unpacking`.
    ///
    /// The callers build one value and clone it per bit size, overwriting `bit_size` each time.
    struct BenchConfig {
        /// PBS parameters; passed to `gen_keys_radix_gpu` and used to derive the block count.
        param: PBSParameters,
        /// Compression parameters; recorded alongside the results by `write_to_json`.
        comp_param: CompressionParameters,
        /// Bit width of the benchmarked integer. The benchmark functions assert that it is a
        /// multiple of `log2(message_modulus)`.
        bit_size: usize,
        /// Client key used to encrypt the input ciphertexts.
        cks: ClientKey,
        /// Private compression key from which the compression/decompression keys are derived.
        private_compression_key: CompressionPrivateKeys,
    }
    /// Returns how many elements each GPU processes in throughput benchmarks.
    ///
    /// The bit size is currently ignored: the same count is used for every integer width.
    fn get_num_elements_per_gpu(_bit_size: usize) -> usize {
        // Empirical value: about 200 elements per GPU appears sufficient to saturate H100s.
        // It may need to be retuned in the future.
        const ELEMENTS_PER_GPU: usize = 200;
        ELEMENTS_PER_GPU
    }
    /// Benchmarks GPU compression ("pack") of one radix ciphertext of `config.bit_size` bits
    /// into a compressed ciphertext list.
    ///
    /// `get_bench_type()` selects between a latency run (a single `build` on a multi-GPU
    /// stream set) and a throughput run (many independent builders processed in parallel).
    /// The benchmark id and parameters are then recorded through `write_to_json`.
    ///
    /// Panics if `config.bit_size` is not a multiple of `log2(message_modulus)`.
    fn execute_gpu_glwe_packing(c: &mut Criterion, config: BenchConfig) {
        let bench_name = "integer::cuda::packing_compression";
        let mut bench_group = c.benchmark_group(bench_name);
        bench_group
@@ -181,6 +191,279 @@ mod cuda {
        let stream = CudaStreams::new_multi_gpu();
        let BenchConfig {
            param,
            comp_param,
            bit_size,
            cks,
            private_compression_key,
        } = config;
        // The integer must split into a whole number of message blocks.
        let log_message_modulus = param.message_modulus().0.ilog2() as usize;
        assert_eq!(bit_size % log_message_modulus, 0);
        let num_blocks = bit_size / log_message_modulus;
        // Assigned exactly once by whichever match arm runs, then reported below.
        let bench_id_pack;
        match get_bench_type() {
            BenchmarkType::Latency => {
                // Generate and convert compression keys
                // NOTE(review): the keys come from `radix_cks` while the input is encrypted
                // with `cks`; confirm whether both share a secret key (irrelevant for timing).
                let (radix_cks, _) = gen_keys_radix_gpu(param, num_blocks, &stream);
                let (compressed_compression_key, _) = radix_cks
                    .new_compressed_compression_decompression_keys(&private_compression_key);
                let cuda_compression_key = compressed_compression_key.decompress_to_cuda(&stream);
                // Encrypt
                let ct = cks.encrypt_radix(0_u32, num_blocks);
                let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &stream);
                // Benchmark
                // The builder is filled once outside the timed closure; only `build` is measured.
                let mut builder = CudaCompressedCiphertextListBuilder::new();
                builder.push(d_ct, &stream);
                bench_id_pack = format!("{bench_name}::pack_u{bit_size}");
                bench_group.bench_function(&bench_id_pack, |b| {
                    b.iter(|| {
                        let compressed = builder.build(&cuda_compression_key, &stream);
                        _ = black_box(compressed);
                    })
                });
            }
            BenchmarkType::Throughput => {
                // Generate and convert compression keys
                let (radix_cks, _) = gen_keys_radix_gpu(param, num_blocks, &stream);
                let (compressed_compression_key, _) = radix_cks
                    .new_compressed_compression_decompression_keys(&private_compression_key);
                // Total element count scales with the number of GPUs.
                let elements_per_gpu = get_num_elements_per_gpu(bit_size) as u64;
                let elements = elements_per_gpu * get_number_of_gpus() as u64;
                // NOTE(review): this recomputes the block count with a floating-point log;
                // given the assert above it is expected to equal `num_blocks` — confirm and
                // consider reusing `num_blocks`.
                let num_block =
                    (bit_size as f64 / (param.message_modulus().0 as f64).log(2.0)).ceil() as usize;
                bench_group.throughput(Throughput::Elements(elements));
                // Encrypt
                let local_streams = cuda_local_streams(num_block, elements as usize);
                bench_id_pack = format!("{bench_name}::throughput::pack_u{bit_size}");
                // One decompressed compression key per GPU, each created on the local stream
                // with the same index.
                let cuda_compression_key_vec = (0..get_number_of_gpus())
                    .into_par_iter()
                    .map(|i| {
                        let local_stream = &local_streams[i as usize];
                        compressed_compression_key.decompress_to_cuda(local_stream)
                    })
                    .collect::<Vec<_>>();
                // Benchmark
                // Pre-fill one single-ciphertext builder per element so only `build` is timed.
                let builders = (0..elements)
                    .into_par_iter()
                    .map(|i| {
                        let ct = cks.encrypt_radix(0_u32, num_blocks);
                        let local_stream = &local_streams[i as usize % local_streams.len()];
                        let d_ct =
                            CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, local_stream);
                        let mut builder = CudaCompressedCiphertextListBuilder::new();
                        builder.push(d_ct, local_stream);
                        builder
                    })
                    .collect::<Vec<_>>();
                bench_group.bench_function(&bench_id_pack, |b| {
                    b.iter(|| {
                        builders.par_iter().enumerate().for_each(|(i, builder)| {
                            // Streams are picked round-robin; the key index is `i` modulo the
                            // number of GPUs.
                            let local_stream = &local_streams[i % local_streams.len()];
                            let cuda_compression_key =
                                &cuda_compression_key_vec[i % get_number_of_gpus() as usize];
                            let _ = builder.build(cuda_compression_key, local_stream);
                        })
                    })
                });
            }
        }
        // Record the benchmark id together with the parameter sets used.
        write_to_json::<u64, _>(
            &bench_id_pack,
            (comp_param, param.into()),
            comp_param.name(),
            "pack",
            &OperatorType::Atomic,
            bit_size as u32,
            vec![param.message_modulus().0.ilog2(); num_blocks],
        );
        bench_group.finish()
    }
    fn execute_gpu_glwe_unpacking(c: &mut Criterion, config: BenchConfig) {
        let bench_name = "integer::cuda::packing_compression";
        let mut bench_group = c.benchmark_group(bench_name);
        bench_group
            .sample_size(15)
            .measurement_time(std::time::Duration::from_secs(30));
        let stream = CudaStreams::new_multi_gpu();
        let BenchConfig {
            param,
            comp_param,
            bit_size,
            cks,
            private_compression_key,
        } = config;
        let log_message_modulus = param.message_modulus().0.ilog2() as usize;
        assert_eq!(bit_size % log_message_modulus, 0);
        let num_blocks = bit_size / log_message_modulus;
        let bench_id_unpack;
        match get_bench_type() {
            BenchmarkType::Latency => {
                // Generate and convert compression keys
                let (radix_cks, _) = gen_keys_radix_gpu(param, num_blocks, &stream);
                let (compressed_compression_key, compressed_decompression_key) = radix_cks
                    .new_compressed_compression_decompression_keys(&private_compression_key);
                let cuda_compression_key = compressed_compression_key.decompress_to_cuda(&stream);
                let cuda_decompression_key = compressed_decompression_key.decompress_to_cuda(
                    radix_cks.parameters().glwe_dimension(),
                    radix_cks.parameters().polynomial_size(),
                    radix_cks.parameters().message_modulus(),
                    radix_cks.parameters().carry_modulus(),
                    radix_cks.parameters().ciphertext_modulus(),
                    &stream,
                );
                // Encrypt
                let ct = cks.encrypt_radix(0_u32, num_blocks);
                let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &stream);
                // Benchmark
                let mut builder = CudaCompressedCiphertextListBuilder::new();
                builder.push(d_ct, &stream);
                let compressed = builder.build(&cuda_compression_key, &stream);
                bench_id_unpack = format!("{bench_name}::unpack_u{bit_size}");
                bench_group.bench_function(&bench_id_unpack, |b| {
                    b.iter(|| {
                        let unpacked: CudaUnsignedRadixCiphertext = compressed
                            .get(0, &cuda_decompression_key, &stream)
                            .unwrap()
                            .unwrap();
                        _ = black_box(unpacked);
                    })
                });
            }
            BenchmarkType::Throughput => {
                // Generate and convert compression keys
                let (radix_cks, _) = gen_keys_radix_gpu(param, num_blocks, &stream);
                let (compressed_compression_key, compressed_decompression_key) = radix_cks
                    .new_compressed_compression_decompression_keys(&private_compression_key);
                let elements_per_gpu = get_num_elements_per_gpu(bit_size) as u64;
                let elements = elements_per_gpu * get_number_of_gpus() as u64;
                let num_block =
                    (bit_size as f64 / (param.message_modulus().0 as f64).log(2.0)).ceil() as usize;
                bench_group.throughput(Throughput::Elements(elements));
                // Encrypt
                let local_streams = cuda_local_streams(num_block, elements as usize);
                bench_id_unpack = format!("{bench_name}::throughput::unpack_u{bit_size}");
                let builders = (0..elements)
                    .into_par_iter()
                    .map(|i| {
                        let ct = cks.encrypt_radix(0_u32, num_blocks);
                        let local_stream = &local_streams[i as usize % local_streams.len()];
                        let d_ct =
                            CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, local_stream);
                        let mut builder = CudaCompressedCiphertextListBuilder::new();
                        builder.push(d_ct, local_stream);
                        builder
                    })
                    .collect::<Vec<_>>();
                let cuda_compression_key_vec = (0..get_number_of_gpus())
                    .into_par_iter()
                    .map(|i| {
                        let local_stream = &local_streams[i as usize];
                        compressed_compression_key.decompress_to_cuda(local_stream)
                    })
                    .collect::<Vec<_>>();
                let cuda_decompression_key_vec = (0..get_number_of_gpus())
                    .into_par_iter()
                    .map(|i| {
                        let local_stream = &local_streams[i as usize];
                        compressed_decompression_key.decompress_to_cuda(
                            radix_cks.parameters().glwe_dimension(),
                            radix_cks.parameters().polynomial_size(),
                            radix_cks.parameters().message_modulus(),
                            radix_cks.parameters().carry_modulus(),
                            radix_cks.parameters().ciphertext_modulus(),
                            local_stream,
                        )
                    })
                    .collect::<Vec<_>>();
                let compressed = builders
                    .par_iter()
                    .enumerate()
                    .map(|(i, builder)| {
                        let local_stream = &local_streams[i % local_streams.len()];
                        let cuda_compression_key =
                            &cuda_compression_key_vec[i % get_number_of_gpus() as usize];
                        builder.build(cuda_compression_key, local_stream)
                    })
                    .collect::<Vec<_>>();
                bench_group.bench_function(&bench_id_unpack, |b| {
                    b.iter(|| {
                        compressed.par_iter().enumerate().for_each(|(i, comp)| {
                            let local_stream = &local_streams[i % local_streams.len()];
                            let cuda_decompression_key =
                                &cuda_decompression_key_vec[i % get_number_of_gpus() as usize];
                            let _ = comp
                                .get::<CudaUnsignedRadixCiphertext>(
                                    0,
                                    cuda_decompression_key,
                                    local_stream,
                                )
                                .unwrap()
                                .unwrap();
                        })
                    })
                });
            }
        }
        write_to_json::<u64, _>(
            &bench_id_unpack,
            (comp_param, param.into()),
            comp_param.name(),
            "unpack",
            &OperatorType::Atomic,
            bit_size as u32,
            vec![param.message_modulus().0.ilog2(); num_blocks],
        );
        bench_group.finish()
    }
    fn gpu_glwe_packing(c: &mut Criterion) {
        let param = BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
        let comp_param =
            BENCH_COMP_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
@@ -190,6 +473,13 @@ mod cuda {
        let cks = ClientKey::new(param);
        let private_compression_key = cks.new_compression_private_key(comp_param);
        let mut config = BenchConfig {
            param: tfhe::shortint::PBSParameters::MultiBitPBS(param),
            comp_param,
            cks,
            private_compression_key,
            bit_size: 0,
        };
        for bit_size in [
            2,
            8,
@@ -200,218 +490,28 @@ mod cuda {
            256,
            comp_param.lwe_per_glwe().0 * log_message_modulus,
        ] {
-            assert_eq!(bit_size % log_message_modulus, 0);
+            config.bit_size = bit_size;
-            let num_blocks = bit_size / log_message_modulus;
+            execute_gpu_glwe_packing(c, config.clone());
            let bench_id_pack;
            let bench_id_unpack;
            // Generate and convert compression keys
            let (radix_cks, _) = gen_keys_radix_gpu(param, num_blocks, &stream);
            let (compressed_compression_key, compressed_decompression_key) =
                radix_cks.new_compressed_compression_decompression_keys(&private_compression_key);
            match get_bench_type() {
                BenchmarkType::Latency => {
                    let cuda_compression_key =
                        compressed_compression_key.decompress_to_cuda(&stream);
                    let cuda_decompression_key = compressed_decompression_key.decompress_to_cuda(
                        radix_cks.parameters().glwe_dimension(),
                        radix_cks.parameters().polynomial_size(),
                        radix_cks.parameters().message_modulus(),
                        radix_cks.parameters().carry_modulus(),
                        radix_cks.parameters().ciphertext_modulus(),
                        &stream,
                    );
                    // Encrypt
                    let ct = cks.encrypt_radix(0_u32, num_blocks);
                    let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &stream);
                    // Benchmark
                    let mut builder = CudaCompressedCiphertextListBuilder::new();
                    builder.push(d_ct, &stream);
                    bench_id_pack = format!("{bench_name}::pack_u{bit_size}");
                    bench_group.bench_function(&bench_id_pack, |b| {
                        b.iter(|| {
                            let compressed = builder.build(&cuda_compression_key, &stream);
                            _ = black_box(compressed);
                        })
                    });
                    let compressed = builder.build(&cuda_compression_key, &stream);
                    bench_id_unpack = format!("{bench_name}::unpack_u{bit_size}");
                    bench_group.bench_function(&bench_id_unpack, |b| {
                        b.iter(|| {
                            let unpacked: CudaUnsignedRadixCiphertext = compressed
                                .get(0, &cuda_decompression_key, &stream)
                                .unwrap()
                                .unwrap();
                            _ = black_box(unpacked);
                        })
                    });
                }
                BenchmarkType::Throughput => {
                    // Execute the operation once to know its cost.
                    let (cpu_compression_key, cpu_decompression_key) =
                        cks.new_compression_decompression_keys(&private_compression_key);
                    let ct = cks.encrypt_radix(0_u32, num_blocks);
                    let mut builder = CompressedCiphertextListBuilder::new();
                    builder.push(ct);
                    let compressed = builder.build(&cpu_compression_key);
                    reset_pbs_count();
                    // Use CPU operation as pbs_count do not count PBS on GPU backend.
                    let _: RadixCiphertext =
                        compressed.get(0, &cpu_decompression_key).unwrap().unwrap();
                    let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
                    let num_block = (bit_size as f64 / (param.message_modulus.0 as f64).log(2.0))
                        .ceil() as usize;
                    let elements = throughput_num_threads(num_block, pbs_count);
                    bench_group.throughput(Throughput::Elements(elements));
                    // Encrypt
                    let local_streams = cuda_local_streams(num_block, elements as usize);
                    let cuda_compression_key_vec = local_streams
                        .iter()
                        .map(|local_stream| {
                            compressed_compression_key.decompress_to_cuda(local_stream)
                        })
                        .collect_vec();
                    let cuda_decompression_key_vec = local_streams
                        .iter()
                        .map(|local_stream| {
                            compressed_decompression_key.decompress_to_cuda(
                                radix_cks.parameters().glwe_dimension(),
                                radix_cks.parameters().polynomial_size(),
                                radix_cks.parameters().message_modulus(),
                                radix_cks.parameters().carry_modulus(),
                                radix_cks.parameters().ciphertext_modulus(),
                                local_stream,
                            )
                        })
                        .collect_vec();
                    // Benchmark
                    let builders = (0..elements)
                        .map(|i| {
                            let ct = cks.encrypt_radix(0_u32, num_blocks);
                            let local_stream = &local_streams[i as usize % local_streams.len()];
                            let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(
                                &ct,
                                local_stream,
                            );
                            let mut builder = CudaCompressedCiphertextListBuilder::new();
                            builder.push(d_ct, local_stream);
                            builder
                        })
                        .collect::<Vec<_>>();
                    bench_id_pack = format!("{bench_name}::throughput::pack_u{bit_size}");
                    bench_group.bench_function(&bench_id_pack, |b| {
                        b.iter(|| {
                            builders.par_iter().enumerate().for_each(|(i, builder)| {
                                let local_stream = &local_streams[i % local_streams.len()];
                                let cuda_compression_key =
                                    &cuda_compression_key_vec[i % local_streams.len()];
                                builder.build(cuda_compression_key, local_stream);
                            })
                        })
                    });
                    let compressed = builders
                        .iter()
                        .enumerate()
                        .map(|(i, builder)| {
                            let local_stream = &local_streams[i % local_streams.len()];
                            let cuda_compression_key =
                                &cuda_compression_key_vec[i % local_streams.len()];
                            builder.build(cuda_compression_key, local_stream)
                        })
                        .collect::<Vec<_>>();
                    bench_id_unpack = format!("{bench_name}::throughput::unpack_u{bit_size}");
                    bench_group.bench_function(&bench_id_unpack, |b| {
                        b.iter(|| {
                            compressed.par_iter().enumerate().for_each(|(i, comp)| {
                                let local_stream = &local_streams[i % local_streams.len()];
                                let cuda_decompression_key =
                                    &cuda_decompression_key_vec[i % local_streams.len()];
                                comp.get::<CudaUnsignedRadixCiphertext>(
                                    0,
                                    cuda_decompression_key,
                                    local_stream,
                                )
                                .unwrap()
                                .unwrap();
                            })
                        })
                    });
                }
            }
            write_to_json::<u64, _>(
                &bench_id_pack,
                (comp_param, param.into()),
                comp_param.name(),
                "pack",
                &OperatorType::Atomic,
                bit_size as u32,
                vec![param.message_modulus.0.ilog2(); num_blocks],
            );
            write_to_json::<u64, _>(
                &bench_id_unpack,
                (comp_param, param.into()),
                comp_param.name(),
                "unpack",
                &OperatorType::Atomic,
                bit_size as u32,
                vec![param.message_modulus.0.ilog2(); num_blocks],
            );
        }
        bench_group.finish()
    }
-    fn gpu_glwe_packing_128(c: &mut Criterion) {
+    fn gpu_glwe_unpacking(c: &mut Criterion) {
-        let bench_name = "integer::cuda::128b_packing_compression";
+        let param = BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
-        let mut bench_group = c.benchmark_group(bench_name);
+        let comp_param =
-        bench_group
+            BENCH_COMP_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
            .sample_size(15)
            .measurement_time(std::time::Duration::from_secs(30));
        let stream = CudaStreams::new_multi_gpu();
        let param = BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
        let noise_squashing_compression_parameters =
            BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
        let noise_squashing_parameters =
            BENCH_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
        let log_message_modulus = param.message_modulus.0.ilog2() as usize;
-        let noise_squashing_compression_private_key =
+        let cks = ClientKey::new(param);
-            NoiseSquashingCompressionPrivateKey::new(noise_squashing_compression_parameters);
+        let private_compression_key = cks.new_compression_private_key(comp_param);
        let noise_squashing_private_key = NoiseSquashingPrivateKey::new(noise_squashing_parameters);
        let noise_squashing_compression_key = noise_squashing_private_key
            .new_noise_squashing_compression_key(&noise_squashing_compression_private_key);
        let cuda_noise_squashing_compression_key =
            CudaNoiseSquashingCompressionKey::from_noise_squashing_compression_key(
                &noise_squashing_compression_key,
                &stream,
            );
        let mut config = BenchConfig {
            param: PBSParameters::MultiBitPBS(param),
            comp_param,
            bit_size: 0,
            cks,
            private_compression_key,
        };
        for bit_size in [
            2,
            8,
@@ -419,180 +519,31 @@ mod cuda {
            32,
            64,
            128,
-            // we don't need 256 here since
+            256,
-            // noise_squashing_compression_parameters.lwe_per_glwe.0 * log_message_modulus == 256
+            comp_param.lwe_per_glwe().0 * log_message_modulus,
            // with current parameters 256,
            noise_squashing_compression_parameters.lwe_per_glwe.0 * log_message_modulus,
        ] {
-            assert_eq!(bit_size % log_message_modulus, 0);
+            config.bit_size = bit_size;
-            let num_blocks = bit_size / log_message_modulus;
+            execute_gpu_glwe_unpacking(c, config.clone());
            let bench_id_pack;
            let bench_id_unpack;
            // Generate and convert compression keys
            let cks = ClientKey::new(param);
            let (_, cuda_sks) = gen_keys_radix_gpu(param, num_blocks, &stream);
            let compressed_noise_squashing_compression_key =
                cks.new_compressed_noise_squashing_key(&noise_squashing_private_key);
            match get_bench_type() {
                BenchmarkType::Latency => {
                    let cuda_noise_squashing_key =
                        compressed_noise_squashing_compression_key.decompress_to_cuda(&stream);
                    // Encrypt
                    let ct = cks.encrypt_radix(0_u32, num_blocks);
                    let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &stream);
                    let d_ns_ct = cuda_noise_squashing_key
                        .squash_radix_ciphertext_noise(&cuda_sks, &d_ct.ciphertext, &stream)
                        .unwrap();
                    // Benchmark
                    let mut builder = CudaCompressedSquashedNoiseCiphertextList::builder();
                    builder.push(d_ns_ct, &stream);
                    bench_id_pack = format!("{bench_name}::pack_u{bit_size}");
                    bench_group.bench_function(&bench_id_pack, |b| {
                        b.iter(|| {
                            let compressed =
                                builder.build(&cuda_noise_squashing_compression_key, &stream);
                            _ = black_box(compressed);
                        })
                    });
                    let compressed = builder.build(&cuda_noise_squashing_compression_key, &stream);
                    bench_id_unpack = format!("{bench_name}::unpack_u{bit_size}");
                    bench_group.bench_function(&bench_id_unpack, |b| {
                        b.iter(|| {
                            let unpacked: CudaSquashedNoiseRadixCiphertext =
                                compressed.get(0, &stream).unwrap().unwrap();
                            _ = black_box(unpacked);
                        })
                    });
                }
                BenchmarkType::Throughput => {
                    let num_block = (bit_size as f64 / (param.message_modulus.0 as f64).log(2.0))
                        .ceil() as usize;
                    let elements = 100;
                    bench_group.throughput(Throughput::Elements(elements));
                    // Encrypt
                    let local_streams = cuda_local_streams(num_block, elements as usize);
                    let cuda_compression_key_vec = local_streams
                        .iter()
                        .map(|local_stream| {
                            compressed_noise_squashing_compression_key
                                .decompress_to_cuda(local_stream)
                        })
                        .collect_vec();
                    let cuda_noise_squashing_compression_key =
                        CudaNoiseSquashingCompressionKey::from_noise_squashing_compression_key(
                            &noise_squashing_compression_key,
                            &stream,
                        );
                    // Benchmark
                    let builders = (0..elements)
                        .map(|i| {
                            let ct = cks.encrypt_radix(0_u32, num_blocks);
                            let local_stream = &local_streams[i as usize % local_streams.len()];
                            let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(
                                &ct,
                                local_stream,
                            );
                            let cuda_noise_squashing_key =
                                &cuda_compression_key_vec[(i as usize) % local_streams.len()];
                            let d_ns_ct = cuda_noise_squashing_key
                                .squash_radix_ciphertext_noise(&cuda_sks, &d_ct.ciphertext, &stream)
                                .unwrap();
                            let mut builder = CudaCompressedSquashedNoiseCiphertextList::builder();
                            builder.push(d_ns_ct, local_stream);
                            builder
                        })
                        .collect::<Vec<_>>();
                    bench_id_pack = format!("{bench_name}::throughput::pack_u{bit_size}");
                    bench_group.bench_function(&bench_id_pack, |b| {
                        b.iter(|| {
                            builders.par_iter().enumerate().for_each(|(i, builder)| {
                                let local_stream = &local_streams[i % local_streams.len()];
                                builder.build(&cuda_noise_squashing_compression_key, local_stream);
                            })
                        })
                    });
                    let compressed = builders
                        .iter()
                        .enumerate()
                        .map(|(i, builder)| {
                            let local_stream = &local_streams[i % local_streams.len()];
                            builder.build(&cuda_noise_squashing_compression_key, local_stream)
                        })
                        .collect::<Vec<_>>();
                    bench_id_unpack = format!("{bench_name}::throughput::unpack_u{bit_size}");
                    bench_group.bench_function(&bench_id_unpack, |b| {
                        b.iter(|| {
                            compressed.par_iter().enumerate().for_each(|(i, comp)| {
                                let local_stream = &local_streams[i % local_streams.len()];
                                comp.get::<CudaSquashedNoiseRadixCiphertext>(0, local_stream)
                                    .unwrap()
                                    .unwrap();
                            })
                        })
                    });
                }
            }
            write_to_json::<u64, _>(
                &bench_id_pack,
                (noise_squashing_compression_parameters, param.into()),
                noise_squashing_compression_parameters.name(),
                "pack",
                &OperatorType::Atomic,
                bit_size as u32,
                vec![param.message_modulus.0.ilog2(); num_blocks],
            );
            write_to_json::<u64, _>(
                &bench_id_unpack,
                (noise_squashing_compression_parameters, param.into()),
                noise_squashing_compression_parameters.name(),
                "unpack",
                &OperatorType::Atomic,
                bit_size as u32,
                vec![param.message_modulus.0.ilog2(); num_blocks],
            );
        }
        bench_group.finish()
    }
    criterion_group!(gpu_glwe_packing2, gpu_glwe_packing);
-    criterion_group!(gpu_glwe_packing_128_2, gpu_glwe_packing_128);
+    criterion_group!(gpu_glwe_unpacking2, gpu_glwe_unpacking);
 }
 criterion_group!(cpu_glwe_packing2, cpu_glwe_packing);
 #[cfg(feature = "gpu")]
-use cuda::{gpu_glwe_packing2, gpu_glwe_packing_128_2};
+use cuda::gpu_glwe_packing2;
 #[cfg(feature = "gpu")]
 use cuda::gpu_glwe_unpacking2;
 fn main() {
    #[cfg(feature = "gpu")]
-    gpu_glwe_packing2();
+    {
-    #[cfg(feature = "gpu")]
+        gpu_glwe_packing2();
-    gpu_glwe_packing_128_2();
+        gpu_glwe_unpacking2();
    }
    #[cfg(not(feature = "gpu"))]
    cpu_glwe_packing2();
--- a/tfhe-benchmark/benches/integer/glwe_packing_compression_128b.rs
+++ b/tfhe-benchmark/benches/integer/glwe_packing_compression_128b.rs
@@ -0,0 +1,444 @@
 #[cfg(feature = "gpu")]
 mod cuda {
    use benchmark::params_aliases::*;
    use benchmark::utilities::cuda_integer_utils::cuda_local_streams;
    use benchmark::utilities::{
        cuda_local_keys, get_bench_type, write_to_json, BenchmarkType, OperatorType,
    };
    use criterion::{black_box, criterion_group, Criterion, Throughput};
    use rayon::prelude::*;
    use tfhe::core_crypto::gpu::{get_number_of_gpus, CudaStreams};
    use tfhe::integer::ciphertext::{
        NoiseSquashingCompressionKey, NoiseSquashingCompressionPrivateKey,
    };
    use tfhe::integer::gpu::ciphertext::squashed_noise::CudaSquashedNoiseRadixCiphertext;
    use tfhe::integer::gpu::ciphertext::{
        CudaCompressedSquashedNoiseCiphertextList, CudaUnsignedRadixCiphertext,
    };
    use tfhe::integer::gpu::gen_keys_radix_gpu;
    use tfhe::integer::gpu::list_compression::server_keys::CudaNoiseSquashingCompressionKey;
    use tfhe::integer::gpu::noise_squashing::keys::CudaNoiseSquashingKey;
    use tfhe::integer::noise_squashing::{CompressedNoiseSquashingKey, NoiseSquashingPrivateKey};
    use tfhe::integer::ClientKey;
    use tfhe::keycache::NamedParam;
    use tfhe::shortint::parameters::NoiseSquashingCompressionParameters;
    use tfhe::shortint::PBSParameters;
    /// Inputs shared by the packing and unpacking benchmark drivers in this module.
    ///
    /// One instance is built per Criterion entry point and cloned for every bit size.
    #[derive(Clone)]
    struct BenchConfig {
        /// PBS parameter set; its message modulus determines how many radix blocks
        /// are needed for `bit_size` bits.
        param: PBSParameters,
        /// Compression parameters, used for naming and for the JSON report.
        noise_squashing_compression_parameters: NoiseSquashingCompressionParameters,
        /// CPU-side compression key, converted to a `CudaNoiseSquashingCompressionKey`
        /// before benchmarking.
        noise_squashing_compression_key: NoiseSquashingCompressionKey,
        /// NOTE(review): despite the name, this holds a `CompressedNoiseSquashingKey`
        /// (not a compression key); it is decompressed onto the GPU with
        /// `decompress_to_cuda` and used to squash the noise of the inputs.
        compressed_noise_squashing_compression_key: CompressedNoiseSquashingKey,
        /// Bit width of the benchmarked integer; the drivers assert that it is a
        /// multiple of `log2(message_modulus)`.
        bit_size: usize,
        /// Client key used to encrypt the benchmark inputs.
        cks: ClientKey,
    }
    /// Returns how many ciphertexts a throughput run should process.
    ///
    /// The bit size is accepted for future tuning but is currently ignored: the same
    /// count is returned for every input.
    fn get_num_elements_per_gpu(_bit_size: usize) -> usize {
        // Empirical value: 200 elements per GPU seems enough to saturate H100s.
        // It may need to be adjusted in the future.
        const ELEMENTS_PER_GPU: usize = 200;
        ELEMENTS_PER_GPU
    }
    fn execute_gpu_glwe_packing_128(c: &mut Criterion, config: BenchConfig) {
        let bench_name = "integer::cuda::128b_packing_compression";
        let mut bench_group = c.benchmark_group(bench_name);
        bench_group
            .sample_size(15)
            .measurement_time(std::time::Duration::from_secs(30));
        let stream = CudaStreams::new_multi_gpu();
        let BenchConfig {
            param,
            noise_squashing_compression_parameters,
            noise_squashing_compression_key,
            compressed_noise_squashing_compression_key,
            bit_size,
            cks,
        } = config;
        let log_message_modulus = param.message_modulus().0.ilog2() as usize;
        assert_eq!(bit_size % log_message_modulus, 0);
        let num_blocks = bit_size / log_message_modulus;
        let bench_id_pack;
        match get_bench_type() {
            BenchmarkType::Latency => {
                let (_, cuda_sks) = gen_keys_radix_gpu(param, num_blocks, &stream);
                let cuda_noise_squashing_key =
                    compressed_noise_squashing_compression_key.decompress_to_cuda(&stream);
                // Encrypt
                let ct = cks.encrypt_radix(0_u32, num_blocks);
                let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &stream);
                let d_ns_ct = cuda_noise_squashing_key
                    .squash_radix_ciphertext_noise(&cuda_sks, &d_ct.ciphertext, &stream)
                    .unwrap();
                let cuda_noise_squashing_compression_key =
                    CudaNoiseSquashingCompressionKey::from_noise_squashing_compression_key(
                        &noise_squashing_compression_key,
                        &stream,
                    );
                // Benchmark
                let mut builder = CudaCompressedSquashedNoiseCiphertextList::builder();
                builder.push(d_ns_ct, &stream);
                bench_id_pack = format!("{bench_name}::pack_u{bit_size}");
                bench_group.bench_function(&bench_id_pack, |b| {
                    b.iter(|| {
                        let compressed =
                            builder.build(&cuda_noise_squashing_compression_key, &stream);
                        _ = black_box(compressed);
                    })
                });
            }
            BenchmarkType::Throughput => {
                let cuda_sks = cuda_local_keys(&cks);
                let num_block =
                    (bit_size as f64 / (param.message_modulus().0 as f64).log(2.0)).ceil() as usize;
                let elements = get_num_elements_per_gpu(bit_size) as u64;
                bench_group.throughput(Throughput::Elements(elements));
                // Encrypt
                let local_streams = cuda_local_streams(num_block, elements as usize);
                let num_gpus = get_number_of_gpus() as usize;
                let cuda_compression_key_vec: Vec<CudaNoiseSquashingKey> = (0..num_gpus)
                    .into_par_iter()
                    .map(|i| {
                        let local_stream = &local_streams[i % local_streams.len()];
                        compressed_noise_squashing_compression_key.decompress_to_cuda(local_stream)
                    })
                    .collect();
                let cuda_noise_squashing_compression_key_vec: Vec<
                    CudaNoiseSquashingCompressionKey,
                > = (0..num_gpus)
                    .into_par_iter()
                    .map(|i| {
                        let local_stream = &local_streams[i % local_streams.len()];
                        CudaNoiseSquashingCompressionKey::from_noise_squashing_compression_key(
                            &noise_squashing_compression_key,
                            local_stream,
                        )
                    })
                    .collect();
                // Benchmark
                let builders = (0..elements)
                    .into_par_iter()
                    .map(|i| {
                        let ct = cks.encrypt_radix(0_u32, num_blocks);
                        let local_stream = &local_streams[i as usize % local_streams.len()];
                        let d_ct =
                            CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, local_stream);
                        let cuda_noise_squashing_key =
                            &cuda_compression_key_vec[(i as usize) % num_gpus];
                        let cuda_noise_squashing_compression_key =
                            &cuda_noise_squashing_compression_key_vec[(i as usize) % num_gpus];
                        let d_ns_ct = cuda_noise_squashing_key
                            .squash_radix_ciphertext_noise(
                                &cuda_sks[(i as usize) % num_gpus],
                                &d_ct.ciphertext,
                                local_stream,
                            )
                            .unwrap();
                        let mut builder = CudaCompressedSquashedNoiseCiphertextList::builder();
                        builder.push(d_ns_ct, local_stream);
                        (builder, cuda_noise_squashing_compression_key, local_stream)
                    })
                    .collect::<Vec<_>>();
                bench_id_pack = format!("{bench_name}::throughput::pack_u{bit_size}");
                bench_group.bench_function(&bench_id_pack, |b| {
                    b.iter(|| {
                        builders.par_iter().for_each(
                            |(builder, cuda_noise_squashing_compression_key, local_stream)| {
                                builder.build(cuda_noise_squashing_compression_key, local_stream);
                            },
                        )
                    })
                });
            }
        }
        write_to_json::<u64, _>(
            &bench_id_pack,
            (noise_squashing_compression_parameters, param.into()),
            noise_squashing_compression_parameters.name(),
            "pack",
            &OperatorType::Atomic,
            bit_size as u32,
            vec![param.message_modulus().0.ilog2(); num_blocks],
        );
        bench_group.finish()
    }
    /// Benchmarks GPU unpacking (decompression) of one squashed-noise radix ciphertext
    /// of `config.bit_size` bits and records the run with `write_to_json`.
    ///
    /// The compressed lists are produced during setup; only the `get(0, ..)` call that
    /// extracts the ciphertext again is timed.
    ///
    /// * `BenchmarkType::Latency`: times a single extraction on the streams returned by
    ///   `CudaStreams::new_multi_gpu()`.
    /// * `BenchmarkType::Throughput`: times the extraction from every prepared list,
    ///   issued in parallel through rayon, each on its own local stream.
    ///
    /// # Panics
    ///
    /// Panics if `config.bit_size` is not a multiple of `log2(message_modulus)`, if
    /// squashing the noise of an input fails, or if an extraction returns an error or
    /// no ciphertext.
    fn execute_gpu_glwe_unpacking_128(c: &mut Criterion, config: BenchConfig) {
        let bench_name = "integer::cuda::128b_packing_compression";
        let mut bench_group = c.benchmark_group(bench_name);
        bench_group
            .sample_size(15)
            .measurement_time(std::time::Duration::from_secs(30));
        let stream = CudaStreams::new_multi_gpu();
        let BenchConfig {
            param,
            noise_squashing_compression_parameters,
            noise_squashing_compression_key,
            compressed_noise_squashing_compression_key,
            bit_size,
            cks,
        } = config;
        // Exact integer block count; every branch below relies on this single value.
        let log_message_modulus = param.message_modulus().0.ilog2() as usize;
        assert_eq!(bit_size % log_message_modulus, 0);
        let num_blocks = bit_size / log_message_modulus;
        let bench_id_unpack;
        match get_bench_type() {
            BenchmarkType::Latency => {
                // NOTE(review): `gen_keys_radix_gpu` is not given `cks`, so this server key
                // is unrelated to the client key that encrypts the input below. Presumably
                // harmless for timing purposes -- confirm that this is intended.
                let (_, cuda_sks) = gen_keys_radix_gpu(param, num_blocks, &stream);
                let cuda_noise_squashing_key =
                    compressed_noise_squashing_compression_key.decompress_to_cuda(&stream);
                // Encrypt, move to the GPU and squash the noise (setup, not timed).
                let ct = cks.encrypt_radix(0_u32, num_blocks);
                let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &stream);
                let d_ns_ct = cuda_noise_squashing_key
                    .squash_radix_ciphertext_noise(&cuda_sks, &d_ct.ciphertext, &stream)
                    .unwrap();
                let cuda_noise_squashing_compression_key =
                    CudaNoiseSquashingCompressionKey::from_noise_squashing_compression_key(
                        &noise_squashing_compression_key,
                        &stream,
                    );
                // Pack once up front; only the extraction is measured.
                let mut builder = CudaCompressedSquashedNoiseCiphertextList::builder();
                builder.push(d_ns_ct, &stream);
                let compressed = builder.build(&cuda_noise_squashing_compression_key, &stream);
                // Benchmark
                bench_id_unpack = format!("{bench_name}::unpack_u{bit_size}");
                bench_group.bench_function(&bench_id_unpack, |b| {
                    b.iter(|| {
                        let unpacked: CudaSquashedNoiseRadixCiphertext =
                            compressed.get(0, &stream).unwrap().unwrap();
                        _ = black_box(unpacked);
                    })
                });
            }
            BenchmarkType::Throughput => {
                let cuda_sks = cuda_local_keys(&cks);
                let elements = get_num_elements_per_gpu(bit_size);
                bench_group.throughput(Throughput::Elements(elements as u64));
                // Reuse the integer `num_blocks` instead of recomputing the count through
                // floating-point logarithms, whose rounding could disagree with it.
                let local_streams = cuda_local_streams(num_blocks, elements);
                let num_gpus = get_number_of_gpus() as usize;
                // One noise squashing key and one compression key per GPU.
                let cuda_noise_squashing_key_vec: Vec<CudaNoiseSquashingKey> = (0..num_gpus)
                    .into_par_iter()
                    .map(|i| {
                        let local_stream = &local_streams[i % local_streams.len()];
                        compressed_noise_squashing_compression_key.decompress_to_cuda(local_stream)
                    })
                    .collect();
                let cuda_noise_squashing_compression_key_vec: Vec<
                    CudaNoiseSquashingCompressionKey,
                > = (0..num_gpus)
                    .into_par_iter()
                    .map(|i| {
                        let local_stream = &local_streams[i % local_streams.len()];
                        CudaNoiseSquashingCompressionKey::from_noise_squashing_compression_key(
                            &noise_squashing_compression_key,
                            local_stream,
                        )
                    })
                    .collect();
                // Setup (not timed): encrypt, squash and pack each element in one pass.
                // Element `i` is packed on `local_streams[i % local_streams.len()]`, the
                // same stream the timed extraction uses for it below.
                let compressed = (0..elements)
                    .into_par_iter()
                    .map(|i| {
                        let gpu_index = i % num_gpus;
                        let ct = cks.encrypt_radix(0_u32, num_blocks);
                        let local_stream = &local_streams[i % local_streams.len()];
                        let d_ct =
                            CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, local_stream);
                        let d_ns_ct = cuda_noise_squashing_key_vec[gpu_index]
                            .squash_radix_ciphertext_noise(
                                &cuda_sks[gpu_index],
                                &d_ct.ciphertext,
                                local_stream,
                            )
                            .unwrap();
                        let mut builder = CudaCompressedSquashedNoiseCiphertextList::builder();
                        builder.push(d_ns_ct, local_stream);
                        builder.build(
                            &cuda_noise_squashing_compression_key_vec[gpu_index],
                            local_stream,
                        )
                    })
                    .collect::<Vec<_>>();
                // Benchmark
                bench_id_unpack = format!("{bench_name}::throughput::unpack_u{bit_size}");
                bench_group.bench_function(&bench_id_unpack, |b| {
                    b.iter(|| {
                        compressed.par_iter().enumerate().for_each(|(i, comp)| {
                            let local_stream = &local_streams[i % local_streams.len()];
                            let unpacked = comp
                                .get::<CudaSquashedNoiseRadixCiphertext>(0, local_stream)
                                .unwrap()
                                .unwrap();
                            // Same treatment as the latency branch.
                            _ = black_box(unpacked);
                        })
                    })
                });
            }
        }
        write_to_json::<u64, _>(
            &bench_id_unpack,
            (noise_squashing_compression_parameters, param.into()),
            noise_squashing_compression_parameters.name(),
            "unpack",
            &OperatorType::Atomic,
            bit_size as u32,
            vec![param.message_modulus().0.ilog2(); num_blocks],
        );
        bench_group.finish()
    }
    /// Criterion entry point for the 128-bit packing benchmark.
    ///
    /// Generates the key material once, then runs `execute_gpu_glwe_packing_128` for a
    /// fixed list of bit sizes plus the size given by `lwe_per_glwe` blocks.
    fn gpu_glwe_packing_128(c: &mut Criterion) {
        let param = BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
        let noise_squashing_compression_parameters =
            BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
        let noise_squashing_parameters =
            BENCH_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;

        // Bit sizes to benchmark; the last entry is `lwe_per_glwe` blocks worth of bits.
        let bits_per_block = param.message_modulus.0.ilog2() as usize;
        let lwe_per_glwe_bits = noise_squashing_compression_parameters.lwe_per_glwe.0 * bits_per_block;
        let bit_sizes = [2, 8, 16, 32, 64, 128, 256, lwe_per_glwe_bits];

        // Key material shared by every bit size.
        let cks = ClientKey::new(param);
        let compression_private_key =
            NoiseSquashingCompressionPrivateKey::new(noise_squashing_compression_parameters);
        let squashing_private_key = NoiseSquashingPrivateKey::new(noise_squashing_parameters);
        let noise_squashing_compression_key =
            squashing_private_key.new_noise_squashing_compression_key(&compression_private_key);
        let compressed_noise_squashing_compression_key =
            cks.new_compressed_noise_squashing_key(&squashing_private_key);

        // Template configuration; `bit_size` is overridden for each run.
        let base_config = BenchConfig {
            param: PBSParameters::PBS(param),
            noise_squashing_compression_parameters,
            noise_squashing_compression_key,
            compressed_noise_squashing_compression_key,
            bit_size: 0,
            cks,
        };
        for bit_size in bit_sizes {
            let run_config = BenchConfig {
                bit_size,
                ..base_config.clone()
            };
            execute_gpu_glwe_packing_128(c, run_config);
        }
    }
    /// Criterion entry point for the 128-bit unpacking benchmark.
    ///
    /// Generates the key material once, then runs `execute_gpu_glwe_unpacking_128` for
    /// a fixed list of bit sizes plus the size given by `lwe_per_glwe` blocks.
    fn gpu_glwe_unpacking_128(c: &mut Criterion) {
        let param = BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
        let noise_squashing_compression_parameters =
            BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
        let noise_squashing_parameters =
            BENCH_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;

        // Bit sizes to benchmark; the last entry is `lwe_per_glwe` blocks worth of bits.
        let bits_per_block = param.message_modulus.0.ilog2() as usize;
        let lwe_per_glwe_bits = noise_squashing_compression_parameters.lwe_per_glwe.0 * bits_per_block;
        let bit_sizes = [2, 8, 16, 32, 64, 128, 256, lwe_per_glwe_bits];

        // Key material shared by every bit size.
        let cks = ClientKey::new(param);
        let compression_private_key =
            NoiseSquashingCompressionPrivateKey::new(noise_squashing_compression_parameters);
        let squashing_private_key = NoiseSquashingPrivateKey::new(noise_squashing_parameters);
        let noise_squashing_compression_key =
            squashing_private_key.new_noise_squashing_compression_key(&compression_private_key);
        let compressed_noise_squashing_compression_key =
            cks.new_compressed_noise_squashing_key(&squashing_private_key);

        // Template configuration; `bit_size` is overridden for each run.
        let base_config = BenchConfig {
            param: PBSParameters::PBS(param),
            noise_squashing_compression_parameters,
            noise_squashing_compression_key,
            compressed_noise_squashing_compression_key,
            bit_size: 0,
            cks,
        };
        for bit_size in bit_sizes {
            let run_config = BenchConfig {
                bit_size,
                ..base_config.clone()
            };
            execute_gpu_glwe_unpacking_128(c, run_config);
        }
    }
    // Criterion groups wrapping the two entry points above; `main` imports and calls
    // the generated `gpu_glwe_packing_128_2` and `gpu_glwe_unpacking_128_2` functions.
    criterion_group!(gpu_glwe_packing_128_2, gpu_glwe_packing_128);
    criterion_group!(gpu_glwe_unpacking_128_2, gpu_glwe_unpacking_128);
 }
 use criterion::Criterion;
 #[cfg(feature = "gpu")]
 use cuda::gpu_glwe_packing_128_2;
 #[cfg(feature = "gpu")]
 use cuda::gpu_glwe_unpacking_128_2;
 /// Benchmark entry point: runs the GPU packing and unpacking groups when the `gpu`
 /// feature is enabled, then asks Criterion to print its final summary.
 fn main() {
    // Both groups only exist in GPU builds, so they share a single cfg-gated block.
    #[cfg(feature = "gpu")]
    {
        gpu_glwe_packing_128_2();
        gpu_glwe_unpacking_128_2();
    }
    Criterion::default().configure_from_args().final_summary();
 }