// Mirror of https://github.com/zama-ai/tfhe-rs.git (synced 2026-01-08 06:13:58 -05:00).
// Original file: 982 lines, 42 KiB, Rust.
#[cfg(feature = "boolean")]
|
|
use benchmark::params::benchmark_32bits_parameters;
|
|
use benchmark::params::{
|
|
benchmark_compression_parameters, benchmark_parameters, multi_bit_benchmark_parameters,
|
|
};
|
|
use benchmark::utilities::{
|
|
get_bench_type, get_param_type, throughput_num_threads, write_to_json, BenchmarkType,
|
|
CryptoParametersRecord, OperatorType, ParamType,
|
|
};
|
|
use criterion::{black_box, Criterion, Throughput};
|
|
use itertools::Itertools;
|
|
use rayon::prelude::*;
|
|
use serde::Serialize;
|
|
use tfhe::core_crypto::prelude::*;
|
|
|
|
// TODO Refactor KS, PBS and KS-PBS benchmarks into a single generic function.
/// Benchmarks CPU LWE keyswitching (big key -> small key) for each parameter set.
///
/// For every `(name, params)` pair this generates fresh keys, then runs either:
/// * `Latency`: repeated `keyswitch_lwe_ciphertext` calls on a single ciphertext, or
/// * `Throughput`: many independent keyswitches executed in parallel via rayon.
///
/// Each result is registered with criterion and appended to the JSON results
/// via `write_to_json`.
fn keyswitch<Scalar: UnsignedTorus + CastInto<usize> + Serialize>(
    criterion: &mut Criterion,
    parameters: &[(String, CryptoParametersRecord<Scalar>)],
) {
    let bench_name = "core_crypto::keyswitch";
    let mut bench_group = criterion.benchmark_group(bench_name);

    // Create the PRNG
    let mut seeder = new_seeder();
    let seeder = seeder.as_mut();
    let mut encryption_generator =
        EncryptionRandomGenerator::<DefaultRandomGenerator>::new(seeder.seed(), seeder);
    let mut secret_generator = SecretRandomGenerator::<DefaultRandomGenerator>::new(seeder.seed());

    for (name, params) in parameters.iter() {
        // These fields are required for a keyswitch benchmark; `unwrap` panics
        // loudly if a parameter record is incomplete.
        let lwe_dimension = params.lwe_dimension.unwrap();
        let glwe_dimension = params.glwe_dimension.unwrap();
        let polynomial_size = params.polynomial_size.unwrap();
        let ks_decomp_base_log = params.ks_base_log.unwrap();
        let ks_decomp_level_count = params.ks_level.unwrap();

        // Destination (small) LWE secret key.
        let lwe_sk =
            allocate_and_generate_new_binary_lwe_secret_key(lwe_dimension, &mut secret_generator);

        // Source key: a GLWE key flattened into a big LWE key.
        let glwe_sk = allocate_and_generate_new_binary_glwe_secret_key(
            glwe_dimension,
            polynomial_size,
            &mut secret_generator,
        );
        let big_lwe_sk = glwe_sk.into_lwe_secret_key();
        let ksk_big_to_small = allocate_and_generate_new_lwe_keyswitch_key(
            &big_lwe_sk,
            &lwe_sk,
            ks_decomp_base_log,
            ks_decomp_level_count,
            params.lwe_noise_distribution.unwrap(),
            params.ciphertext_modulus.unwrap(),
            &mut encryption_generator,
        );

        let bench_id;

        match get_bench_type() {
            BenchmarkType::Latency => {
                let ct = allocate_and_encrypt_new_lwe_ciphertext(
                    &big_lwe_sk,
                    Plaintext(Scalar::ONE),
                    params.lwe_noise_distribution.unwrap(),
                    params.ciphertext_modulus.unwrap(),
                    &mut encryption_generator,
                );

                // Output buffer sized for the small key.
                let mut output_ct = LweCiphertext::new(
                    Scalar::ZERO,
                    lwe_sk.lwe_dimension().to_lwe_size(),
                    params.ciphertext_modulus.unwrap(),
                );

                bench_id = format!("{bench_name}::{name}");
                {
                    bench_group.bench_function(&bench_id, |b| {
                        b.iter(|| {
                            keyswitch_lwe_ciphertext(&ksk_big_to_small, &ct, &mut output_ct);
                            // Keep the result observable so the call is not optimized away.
                            black_box(&mut output_ct);
                        })
                    });
                }
            }
            BenchmarkType::Throughput => {
                bench_id = format!("{bench_name}::throughput::{name}");
                let blocks: usize = 1;
                let elements = throughput_num_threads(blocks, 1); // FIXME This number of elements does not saturate the target machine
                bench_group.throughput(Throughput::Elements(elements));
                bench_group.bench_function(&bench_id, |b| {
                    // Fresh inputs/outputs are rebuilt per batch so every
                    // measured run keyswitches independent ciphertexts.
                    let setup_encrypted_values = || {
                        let input_cts = (0..elements)
                            .map(|_| {
                                allocate_and_encrypt_new_lwe_ciphertext(
                                    &big_lwe_sk,
                                    Plaintext(Scalar::ONE),
                                    params.lwe_noise_distribution.unwrap(),
                                    params.ciphertext_modulus.unwrap(),
                                    &mut encryption_generator,
                                )
                            })
                            .collect::<Vec<_>>();

                        let output_cts = (0..elements)
                            .map(|_| {
                                LweCiphertext::new(
                                    Scalar::ZERO,
                                    lwe_sk.lwe_dimension().to_lwe_size(),
                                    params.ciphertext_modulus.unwrap(),
                                )
                            })
                            .collect::<Vec<_>>();

                        (input_cts, output_cts)
                    };

                    b.iter_batched(
                        setup_encrypted_values,
                        |(input_cts, mut output_cts)| {
                            // One rayon task per (input, output) pair.
                            input_cts
                                .par_iter()
                                .zip(output_cts.par_iter_mut())
                                .for_each(|(input_ct, output_ct)| {
                                    keyswitch_lwe_ciphertext(
                                        &ksk_big_to_small,
                                        input_ct,
                                        output_ct,
                                    );
                                })
                        },
                        criterion::BatchSize::SmallInput,
                    )
                });
            }
        };

        // Message modulus defaults to 2 (i.e. 1 bit) when unspecified.
        let bit_size = (params.message_modulus.unwrap_or(2) as u32).ilog2();
        write_to_json(
            &bench_id,
            *params,
            name,
            "ks",
            &OperatorType::Atomic,
            bit_size,
            vec![bit_size],
        );
    }
}
|
|
|
|
/// Benchmarks CPU LWE-list-to-GLWE packing keyswitch.
///
/// `ks_op` is the packing-keyswitch implementation under test; the same
/// harness serves both the sequential and the parallel variants (see
/// `packing_ks_group`), which is why the group name is a parameter.
fn packing_keyswitch<Scalar, F>(
    criterion: &mut Criterion,
    bench_name: &str,
    parameters: &[(String, CryptoParametersRecord<Scalar>)],
    ks_op: F,
) where
    Scalar: UnsignedTorus + CastInto<usize> + Serialize,
    F: Fn(
            &LwePackingKeyswitchKey<Vec<Scalar>>,
            &LweCiphertextList<Vec<Scalar>>,
            &mut GlweCiphertext<Vec<Scalar>>,
        ) + Sync
        + Send,
{
    let bench_name = format!("core_crypto::{bench_name}");
    let mut bench_group = criterion.benchmark_group(&bench_name);

    // Create the PRNG
    let mut seeder = new_seeder();
    let seeder = seeder.as_mut();
    let mut encryption_generator =
        EncryptionRandomGenerator::<DefaultRandomGenerator>::new(seeder.seed(), seeder);
    let mut secret_generator = SecretRandomGenerator::<DefaultRandomGenerator>::new(seeder.seed());

    for (name, params) in parameters.iter() {
        // Packing-specific parameters; `unwrap` panics if the record is incomplete.
        let lwe_dimension = params.lwe_dimension.unwrap();
        let packing_glwe_dimension = params.packing_ks_glwe_dimension.unwrap();
        let packing_polynomial_size = params.packing_ks_polynomial_size.unwrap();
        let packing_ks_decomp_base_log = params.packing_ks_base_log.unwrap();
        let packing_ks_decomp_level_count = params.packing_ks_level.unwrap();
        let ciphertext_modulus = params.ciphertext_modulus.unwrap();
        // Number of LWE ciphertexts packed into one GLWE.
        let count = params.lwe_per_glwe.unwrap();

        let lwe_sk =
            allocate_and_generate_new_binary_lwe_secret_key(lwe_dimension, &mut secret_generator);

        let glwe_sk = allocate_and_generate_new_binary_glwe_secret_key(
            packing_glwe_dimension,
            packing_polynomial_size,
            &mut secret_generator,
        );

        let pksk = allocate_and_generate_new_lwe_packing_keyswitch_key(
            &lwe_sk,
            &glwe_sk,
            packing_ks_decomp_base_log,
            packing_ks_decomp_level_count,
            params.packing_ks_key_noise_distribution.unwrap(),
            ciphertext_modulus,
            &mut encryption_generator,
        );

        let bench_id;

        match get_bench_type() {
            BenchmarkType::Latency => {
                // A single list of `count` LWE ciphertexts to pack.
                let mut input_lwe_list = LweCiphertextList::new(
                    Scalar::ZERO,
                    lwe_sk.lwe_dimension().to_lwe_size(),
                    count,
                    ciphertext_modulus,
                );

                let plaintext_list = PlaintextList::new(
                    Scalar::ZERO,
                    PlaintextCount(input_lwe_list.lwe_ciphertext_count().0),
                );

                encrypt_lwe_ciphertext_list(
                    &lwe_sk,
                    &mut input_lwe_list,
                    &plaintext_list,
                    params.lwe_noise_distribution.unwrap(),
                    &mut encryption_generator,
                );

                let mut output_glwe = GlweCiphertext::new(
                    Scalar::ZERO,
                    glwe_sk.glwe_dimension().to_glwe_size(),
                    glwe_sk.polynomial_size(),
                    ciphertext_modulus,
                );

                bench_id = format!("{bench_name}::{name}");
                {
                    bench_group.bench_function(&bench_id, |b| {
                        b.iter(|| {
                            ks_op(&pksk, &input_lwe_list, &mut output_glwe);
                            // Keep the result observable so the call is not optimized away.
                            black_box(&mut output_glwe);
                        })
                    });
                }
            }
            BenchmarkType::Throughput => {
                bench_id = format!("{bench_name}::throughput::{name}");
                let blocks: usize = 1;
                let elements = throughput_num_threads(blocks, 1);
                bench_group.throughput(Throughput::Elements(elements));
                bench_group.bench_function(&bench_id, |b| {
                    // Fresh input lists / output GLWEs per batch; each element
                    // is an independent packing keyswitch.
                    let setup_encrypted_values = || {
                        let input_lwe_lists = (0..elements)
                            .map(|_| {
                                let mut input_lwe_list = LweCiphertextList::new(
                                    Scalar::ZERO,
                                    lwe_sk.lwe_dimension().to_lwe_size(),
                                    count,
                                    ciphertext_modulus,
                                );

                                let plaintext_list = PlaintextList::new(
                                    Scalar::ZERO,
                                    PlaintextCount(input_lwe_list.lwe_ciphertext_count().0),
                                );

                                encrypt_lwe_ciphertext_list(
                                    &lwe_sk,
                                    &mut input_lwe_list,
                                    &plaintext_list,
                                    params.lwe_noise_distribution.unwrap(),
                                    &mut encryption_generator,
                                );

                                input_lwe_list
                            })
                            .collect::<Vec<_>>();

                        let output_glwes = (0..elements)
                            .map(|_| {
                                GlweCiphertext::new(
                                    Scalar::ZERO,
                                    glwe_sk.glwe_dimension().to_glwe_size(),
                                    glwe_sk.polynomial_size(),
                                    ciphertext_modulus,
                                )
                            })
                            .collect::<Vec<_>>();

                        (input_lwe_lists, output_glwes)
                    };

                    b.iter_batched(
                        setup_encrypted_values,
                        |(input_lwe_lists, mut output_glwes)| {
                            // One rayon task per (input list, output GLWE) pair.
                            input_lwe_lists
                                .par_iter()
                                .zip(output_glwes.par_iter_mut())
                                .for_each(|(input_lwe_list, output_glwe)| {
                                    ks_op(&pksk, input_lwe_list, output_glwe);
                                })
                        },
                        criterion::BatchSize::SmallInput,
                    )
                });
            }
        };

        // Message modulus defaults to 2 (i.e. 1 bit) when unspecified.
        let bit_size = (params.message_modulus.unwrap_or(2) as u32).ilog2();
        write_to_json(
            &bench_id,
            *params,
            name,
            "packing_ks",
            &OperatorType::Atomic,
            bit_size,
            vec![bit_size],
        );
    }
}
|
|
|
|
#[cfg(feature = "gpu")]
|
|
mod cuda {
|
|
use benchmark::params::{benchmark_parameters, multi_bit_benchmark_parameters};
|
|
use benchmark::utilities::{
|
|
cuda_local_keys_core, cuda_local_streams_core, get_bench_type, throughput_num_threads,
|
|
write_to_json, BenchmarkType, CpuKeys, CpuKeysBuilder, CryptoParametersRecord, CudaIndexes,
|
|
CudaLocalKeys, OperatorType,
|
|
};
|
|
use criterion::{black_box, Criterion, Throughput};
|
|
use itertools::Itertools;
|
|
use rayon::prelude::*;
|
|
use serde::Serialize;
|
|
use tfhe::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
|
|
use tfhe::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
|
|
use tfhe::core_crypto::gpu::vec::GpuIndex;
|
|
use tfhe::core_crypto::gpu::{
|
|
check_valid_cuda_malloc, cuda_keyswitch_lwe_ciphertext,
|
|
cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext_64, get_number_of_gpus,
|
|
get_packing_keyswitch_list_64_size_on_gpu, CudaStreams,
|
|
};
|
|
|
|
use tfhe::core_crypto::prelude::*;
|
|
|
|
/// Benchmarks CUDA LWE keyswitching.
///
/// `Scalar` is the scalar type of the input ciphertexts; `KeyswitchScalar`
/// is the scalar type of the keyswitched output (32- or 64-bit, encoded in
/// the bench id). Latency mode measures a single classical keyswitch with
/// trivial indices; throughput mode sweeps all four combinations of
/// {classical, gemm} x {trivial, complex} index layouts across all GPUs.
fn cuda_keyswitch_classical_and_gemm<
    Scalar: UnsignedTorus + CastInto<usize> + CastFrom<u64> + Serialize,
    KeyswitchScalar: UnsignedTorus + CastFrom<Scalar>,
>(
    criterion: &mut Criterion,
    parameters: &[(String, CryptoParametersRecord<Scalar>)],
) {
    let bench_name = "core_crypto::cuda::keyswitch";
    let mut bench_group = criterion.benchmark_group(bench_name);

    // Create the PRNG
    let mut seeder = new_seeder();
    let seeder = seeder.as_mut();
    let mut encryption_generator =
        EncryptionRandomGenerator::<DefaultRandomGenerator>::new(seeder.seed(), seeder);
    let mut secret_generator =
        SecretRandomGenerator::<DefaultRandomGenerator>::new(seeder.seed());

    for (name, params) in parameters.iter() {
        let lwe_dimension = params.lwe_dimension.unwrap();
        let glwe_dimension = params.glwe_dimension.unwrap();
        let polynomial_size = params.polynomial_size.unwrap();
        let ks_decomp_base_log = params.ks_base_log.unwrap();
        let ks_decomp_level_count = params.ks_level.unwrap();

        // Re-express the Scalar-typed noise distribution in the output scalar
        // type. For TUniform on a 32-bit output the log2 bound is shifted down
        // by 32 (narrower torus); other output widths are rejected.
        let lwe_noise_distribution_ksk = match params.lwe_noise_distribution.unwrap() {
            DynamicDistribution::Gaussian(gaussian_lwe_noise_distribution) => {
                DynamicDistribution::<KeyswitchScalar>::new_gaussian(
                    gaussian_lwe_noise_distribution.standard_dev(),
                )
            }
            DynamicDistribution::TUniform(uniform_lwe_noise_distribution) => {
                DynamicDistribution::<KeyswitchScalar>::new_t_uniform(
                    match KeyswitchScalar::BITS {
                        32 => uniform_lwe_noise_distribution.bound_log2() - 32,
                        64 => uniform_lwe_noise_distribution.bound_log2(),
                        _ => panic!("Unsupported Keyswitch scalar input dtype"),
                    },
                )
            }
        };

        // Keys in the output scalar type: destination LWE key, plus a GLWE key
        // whose flattened form is the keyswitch source key.
        let lwe_sk: LweSecretKeyOwned<KeyswitchScalar> =
            allocate_and_generate_new_binary_lwe_secret_key(
                lwe_dimension,
                &mut secret_generator,
            );

        let glwe_sk: GlweSecretKeyOwned<KeyswitchScalar> =
            allocate_and_generate_new_binary_glwe_secret_key(
                glwe_dimension,
                polynomial_size,
                &mut secret_generator,
            );
        let big_lwe_sk = glwe_sk.into_lwe_secret_key();

        let ksk_big_to_small = allocate_and_generate_new_lwe_keyswitch_key(
            &big_lwe_sk,
            &lwe_sk,
            ks_decomp_base_log,
            ks_decomp_level_count,
            lwe_noise_distribution_ksk,
            CiphertextModulus::new_native(),
            &mut encryption_generator,
        );

        // Separate Scalar-typed big key used only to encrypt the inputs.
        let glwe_sk_64: GlweSecretKeyOwned<Scalar> =
            allocate_and_generate_new_binary_glwe_secret_key(
                glwe_dimension,
                polynomial_size,
                &mut secret_generator,
            );

        let big_lwe_sk_64 = glwe_sk_64.into_lwe_secret_key();

        let ciphertext_modulus_out = CiphertextModulus::<KeyswitchScalar>::new_native();

        let cpu_keys: CpuKeys<Scalar, KeyswitchScalar> = CpuKeysBuilder::new()
            .keyswitch_key(ksk_big_to_small)
            .build();

        let mut bench_id;

        match get_bench_type() {
            BenchmarkType::Latency => {
                let streams = CudaStreams::new_multi_gpu();
                let gpu_keys = CudaLocalKeys::from_cpu_keys(&cpu_keys, None, &streams);

                let ct = allocate_and_encrypt_new_lwe_ciphertext(
                    &big_lwe_sk_64,
                    Plaintext(Scalar::ONE),
                    params.lwe_noise_distribution.unwrap(),
                    CiphertextModulus::new_native(),
                    &mut encryption_generator,
                );
                let mut ct_gpu = CudaLweCiphertextList::from_lwe_ciphertext(&ct, &streams);

                let output_ct = LweCiphertext::new(
                    KeyswitchScalar::ZERO,
                    lwe_sk.lwe_dimension().to_lwe_size(),
                    CiphertextModulus::new_native(),
                );
                let mut output_ct_gpu =
                    CudaLweCiphertextList::from_lwe_ciphertext(&output_ct, &streams);

                // Single ciphertext: index 0 on both input and output sides.
                let h_indexes = [Scalar::ZERO];
                let cuda_indexes = CudaIndexes::new(&h_indexes, &streams, 0);

                bench_id = format!(
                    "{bench_name}::latency::{:?}b::{name}",
                    KeyswitchScalar::BITS
                );
                {
                    bench_group.bench_function(&bench_id, |b| {
                        b.iter(|| {
                            // Trivial indices (true), classical path (gemm = false).
                            cuda_keyswitch_lwe_ciphertext(
                                gpu_keys.ksk.as_ref().unwrap(),
                                &ct_gpu,
                                &mut output_ct_gpu,
                                &cuda_indexes.d_input,
                                &cuda_indexes.d_output,
                                true,
                                &streams,
                                false,
                            );

                            // NOTE(review): black-boxes the input list, not the
                            // output — confirm this is the intended target.
                            black_box(&mut ct_gpu);
                        })
                    });
                }

                let bit_size = (params.message_modulus.unwrap_or(2) as u32).ilog2();
                write_to_json(
                    &bench_id,
                    *params,
                    name,
                    "ks",
                    &OperatorType::Atomic,
                    bit_size,
                    vec![bit_size],
                );
            }
            BenchmarkType::Throughput => {
                let gpu_keys_vec = cuda_local_keys_core(&cpu_keys, None);
                let gpu_count = get_number_of_gpus() as usize;

                // Sweep both keyswitch implementations and both index layouts.
                for uses_gemm_ks in [false, true] {
                    for uses_trivial_indices in [false, true] {
                        let indices_str = if uses_trivial_indices {
                            "trivial"
                        } else {
                            "complex"
                        };
                        let gemm_str = if uses_gemm_ks { "gemm" } else { "classical" };
                        bench_id = format!(
                            "{bench_name}::throughput::{:?}b::{gemm_str}::{indices_str}_indices::{name}",
                            KeyswitchScalar::BITS
                        );

                        // 256 ciphertexts per GPU, evenly split across streams.
                        let blocks: usize = 256;
                        let elements = gpu_count * blocks;
                        let elements_per_stream = elements / gpu_count;
                        bench_group.throughput(Throughput::Elements(elements as u64));
                        bench_group.sample_size(50);
                        bench_group.bench_function(&bench_id, |b| {
                            let setup_encrypted_values = || {
                                let local_streams = cuda_local_streams_core();

                                let plaintext_list = PlaintextList::new(
                                    Scalar::ZERO,
                                    PlaintextCount(elements_per_stream),
                                );

                                let input_cts = (0..gpu_count)
                                    .map(|i| {
                                        let mut input_ct_list = LweCiphertextList::new(
                                            Scalar::ZERO,
                                            big_lwe_sk.lwe_dimension().to_lwe_size(),
                                            LweCiphertextCount(elements_per_stream),
                                            params.ciphertext_modulus.unwrap(),
                                        );
                                        encrypt_lwe_ciphertext_list(
                                            &big_lwe_sk_64,
                                            &mut input_ct_list,
                                            &plaintext_list,
                                            params.lwe_noise_distribution.unwrap(),
                                            &mut encryption_generator,
                                        );
                                        // Rebuild the list from its raw container
                                        // before uploading it to this GPU's stream.
                                        let input_ks_list = LweCiphertextList::from_container(
                                            input_ct_list.into_container(),
                                            big_lwe_sk.lwe_dimension().to_lwe_size(),
                                            params.ciphertext_modulus.unwrap(),
                                        );
                                        CudaLweCiphertextList::from_lwe_ciphertext_list(
                                            &input_ks_list,
                                            &local_streams[i],
                                        )
                                    })
                                    .collect::<Vec<_>>();

                                let output_cts = (0..gpu_count)
                                    .map(|i| {
                                        let output_ct_list = LweCiphertextList::new(
                                            KeyswitchScalar::ZERO,
                                            lwe_sk.lwe_dimension().to_lwe_size(),
                                            LweCiphertextCount(elements_per_stream),
                                            ciphertext_modulus_out,
                                        );
                                        CudaLweCiphertextList::from_lwe_ciphertext_list(
                                            &output_ct_list,
                                            &local_streams[i],
                                        )
                                    })
                                    .collect::<Vec<_>>();

                                // Trivial = identity index mapping; complex =
                                // reversed order (non-identity gathers).
                                let indexes_range: Vec<u64> = if uses_trivial_indices {
                                    (0..(elements / gpu_count) as u64).collect()
                                } else {
                                    (0..(elements / gpu_count) as u64).rev().collect()
                                };
                                let h_indexes = indexes_range
                                    .iter()
                                    .map(|v| CastFrom::cast_from(*v))
                                    .collect::<Vec<_>>();

                                let cuda_indexes_vec = (0..gpu_count)
                                    .map(|i| CudaIndexes::new(&h_indexes, &local_streams[i], 0))
                                    .collect::<Vec<_>>();
                                // Make sure all uploads completed before timing.
                                local_streams.iter().for_each(|stream| stream.synchronize());

                                (input_cts, output_cts, cuda_indexes_vec, local_streams)
                            };

                            b.iter_batched(
                                setup_encrypted_values,
                                |(
                                    input_cts,
                                    mut output_cts,
                                    cuda_indexes_vec,
                                    local_streams,
                                )| {
                                    // One rayon task per GPU / local stream.
                                    (0..gpu_count)
                                        .into_par_iter()
                                        .zip(input_cts.par_iter())
                                        .zip(output_cts.par_iter_mut())
                                        .zip(local_streams.par_iter())
                                        .for_each(
                                            |(((i, input_ct), output_ct), local_stream)| {
                                                cuda_keyswitch_lwe_ciphertext(
                                                    gpu_keys_vec[i].ksk.as_ref().unwrap(),
                                                    input_ct,
                                                    output_ct,
                                                    &cuda_indexes_vec[i].d_input,
                                                    &cuda_indexes_vec[i].d_output,
                                                    uses_trivial_indices,
                                                    local_stream,
                                                    uses_gemm_ks,
                                                );
                                            },
                                        )
                                },
                                criterion::BatchSize::SmallInput,
                            )
                        });

                        let bit_size = (params.message_modulus.unwrap_or(2) as u32).ilog2();
                        write_to_json(
                            &bench_id,
                            *params,
                            name,
                            "ks",
                            &OperatorType::Atomic,
                            bit_size,
                            vec![bit_size],
                        );
                    }
                }
            }
        };
    }
}
|
|
|
|
/// Benchmarks CUDA LWE-list-to-GLWE packing keyswitch.
///
/// Before each benchmark the required device scratch size is queried and the
/// parameter set is skipped (`continue`) if the allocation would not fit on
/// the GPU(s).
fn cuda_packing_keyswitch<
    Scalar: UnsignedTorus + CastInto<usize> + CastFrom<u64> + Serialize + CastInto<u32>,
>(
    criterion: &mut Criterion,
    parameters: &[(String, CryptoParametersRecord<Scalar>)],
) {
    let bench_name = "core_crypto::cuda::packing_keyswitch";
    let mut bench_group = criterion.benchmark_group(bench_name);

    // Create the PRNG
    let mut seeder = new_seeder();
    let seeder = seeder.as_mut();
    let mut encryption_generator =
        EncryptionRandomGenerator::<DefaultRandomGenerator>::new(seeder.seed(), seeder);
    let mut secret_generator =
        SecretRandomGenerator::<DefaultRandomGenerator>::new(seeder.seed());

    for (name, params) in parameters.iter() {
        let lwe_dimension = params.lwe_dimension.unwrap();
        let glwe_dimension = params.glwe_dimension.unwrap();
        let polynomial_size = params.polynomial_size.unwrap();
        let ks_decomp_base_log = params.ks_base_log.unwrap();
        let ks_decomp_level_count = params.ks_level.unwrap();
        let glwe_noise_distribution = params.glwe_noise_distribution.unwrap();
        let ciphertext_modulus = params.ciphertext_modulus.unwrap();

        let lwe_sk = allocate_and_generate_new_binary_lwe_secret_key(
            lwe_dimension,
            &mut secret_generator,
        );

        let glwe_sk = allocate_and_generate_new_binary_glwe_secret_key(
            glwe_dimension,
            polynomial_size,
            &mut secret_generator,
        );

        let pksk = allocate_and_generate_new_lwe_packing_keyswitch_key(
            &lwe_sk,
            &glwe_sk,
            ks_decomp_base_log,
            ks_decomp_level_count,
            glwe_noise_distribution,
            ciphertext_modulus,
            &mut encryption_generator,
        );

        let cpu_keys: CpuKeys<Scalar, Scalar> =
            CpuKeysBuilder::new().packing_keyswitch_key(pksk).build();

        let bench_id;
        match get_bench_type() {
            BenchmarkType::Latency => {
                let streams = CudaStreams::new_multi_gpu();

                // Query required device memory before allocating anything.
                let mem_size = get_packing_keyswitch_list_64_size_on_gpu(
                    &streams,
                    lwe_sk.lwe_dimension(),
                    glwe_sk.glwe_dimension(),
                    glwe_sk.polynomial_size(),
                    LweCiphertextCount(glwe_sk.polynomial_size().0),
                );

                let skip_bench = !check_valid_cuda_malloc(mem_size, GpuIndex::new(0));

                // Skip parameter sets that don't fit on GPU 0 (no JSON entry).
                if skip_bench {
                    continue;
                }

                let gpu_keys = CudaLocalKeys::from_cpu_keys(&cpu_keys, None, &streams);

                // One GLWE's worth of inputs: polynomial_size LWE ciphertexts.
                let mut input_ct_list = LweCiphertextList::new(
                    Scalar::ZERO,
                    lwe_sk.lwe_dimension().to_lwe_size(),
                    LweCiphertextCount(glwe_sk.polynomial_size().0),
                    ciphertext_modulus,
                );

                let plaintext_list = PlaintextList::new(
                    Scalar::ZERO,
                    PlaintextCount(input_ct_list.lwe_ciphertext_count().0),
                );

                encrypt_lwe_ciphertext_list(
                    &lwe_sk,
                    &mut input_ct_list,
                    &plaintext_list,
                    params.lwe_noise_distribution.unwrap(),
                    &mut encryption_generator,
                );

                let mut d_input_lwe_list =
                    CudaLweCiphertextList::from_lwe_ciphertext_list(&input_ct_list, &streams);

                let mut d_output_glwe = CudaGlweCiphertextList::new(
                    glwe_sk.glwe_dimension(),
                    glwe_sk.polynomial_size(),
                    GlweCiphertextCount(1),
                    ciphertext_modulus,
                    &streams,
                );

                // Ensure device uploads completed before timing.
                streams.synchronize();

                bench_id = format!("{bench_name}::{name}");
                {
                    bench_group.bench_function(&bench_id, |b| {
                        b.iter(|| {
                            cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext_64(
                                gpu_keys.pksk.as_ref().unwrap(),
                                &d_input_lwe_list,
                                &mut d_output_glwe,
                                &streams,
                            );
                            // NOTE(review): black-boxes the input list, not the
                            // output GLWE — confirm this is the intended target.
                            black_box(&mut d_input_lwe_list);
                        })
                    });
                }
            }
            BenchmarkType::Throughput => {
                let gpu_keys_vec = cuda_local_keys_core(&cpu_keys, None);
                let gpu_count = get_number_of_gpus() as usize;

                bench_id = format!("{bench_name}::throughput::{name}");

                let mem_size = get_packing_keyswitch_list_64_size_on_gpu(
                    &CudaStreams::new_single_gpu(GpuIndex::new(0)),
                    lwe_sk.lwe_dimension(),
                    glwe_sk.glwe_dimension(),
                    glwe_sk.polynomial_size(),
                    LweCiphertextCount(glwe_sk.polynomial_size().0),
                );

                // Every GPU must be able to hold the scratch buffer.
                let mut skip_test = false;
                for gpu_index in 0..gpu_count {
                    if !check_valid_cuda_malloc(mem_size, GpuIndex::new(gpu_index as u32)) {
                        skip_test = true;
                    }
                }

                if skip_test {
                    continue;
                }

                let blocks: usize = 1;
                let elements = throughput_num_threads(blocks, 1);
                // Per-stream batch is capped at one GLWE's worth of inputs.
                let elements_per_stream =
                    std::cmp::min(elements as usize / gpu_count, glwe_sk.polynomial_size().0);
                bench_group.throughput(Throughput::Elements(elements));
                bench_group.sample_size(50);
                bench_group.bench_function(&bench_id, |b| {
                    let setup_encrypted_values = || {
                        let local_streams = cuda_local_streams_core();

                        let plaintext_list = PlaintextList::new(
                            Scalar::ZERO,
                            PlaintextCount(elements_per_stream),
                        );

                        let input_lwe_lists = (0..gpu_count)
                            .map(|i| {
                                let mut input_ct_list = LweCiphertextList::new(
                                    Scalar::ZERO,
                                    lwe_sk.lwe_dimension().to_lwe_size(),
                                    LweCiphertextCount(elements_per_stream),
                                    ciphertext_modulus,
                                );
                                encrypt_lwe_ciphertext_list(
                                    &lwe_sk,
                                    &mut input_ct_list,
                                    &plaintext_list,
                                    params.lwe_noise_distribution.unwrap(),
                                    &mut encryption_generator,
                                );

                                CudaLweCiphertextList::from_lwe_ciphertext_list(
                                    &input_ct_list,
                                    &local_streams[i],
                                )
                            })
                            .collect::<Vec<_>>();

                        let output_glwe_list = (0..gpu_count)
                            .map(|i| {
                                CudaGlweCiphertextList::new(
                                    glwe_sk.glwe_dimension(),
                                    glwe_sk.polynomial_size(),
                                    GlweCiphertextCount(1),
                                    ciphertext_modulus,
                                    &local_streams[i],
                                )
                            })
                            .collect::<Vec<_>>();

                        // Make sure all uploads completed before timing.
                        local_streams.iter().for_each(|stream| stream.synchronize());

                        (input_lwe_lists, output_glwe_list, local_streams)
                    };

                    b.iter_batched(
                        setup_encrypted_values,
                        |(input_lwe_lists, mut output_glwe_lists, local_streams)| {
                            // One rayon task per GPU / local stream.
                            (0..gpu_count)
                                .into_par_iter()
                                .zip(input_lwe_lists.par_iter())
                                .zip(output_glwe_lists.par_iter_mut())
                                .zip(local_streams.par_iter())
                                .for_each(
                                    |(
                                        ((i, input_lwe_list), output_glwe_list),
                                        local_stream,
                                    )| {
                                        cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext_64(
                                            gpu_keys_vec[i].pksk.as_ref().unwrap(),
                                            input_lwe_list,
                                            output_glwe_list,
                                            local_stream,
                                        );
                                    },
                                )
                        },
                        criterion::BatchSize::SmallInput,
                    )
                });
            }
        };

        // Message modulus defaults to 2 (i.e. 1 bit) when unspecified.
        let bit_size = (params.message_modulus.unwrap_or(2) as u32).ilog2();
        write_to_json(
            &bench_id,
            *params,
            name,
            "packing_ks",
            &OperatorType::Atomic,
            bit_size,
            vec![bit_size],
        );
    }
}
|
|
|
|
pub fn cuda_ks_group() {
|
|
let mut criterion: Criterion<_> = (Criterion::default().sample_size(15))
|
|
.measurement_time(std::time::Duration::from_secs(60))
|
|
.configure_from_args();
|
|
cuda_keyswitch_classical_and_gemm::<u64, u32>(&mut criterion, &benchmark_parameters());
|
|
cuda_keyswitch_classical_and_gemm::<u64, u64>(&mut criterion, &benchmark_parameters());
|
|
cuda_packing_keyswitch(&mut criterion, &benchmark_parameters());
|
|
}
|
|
|
|
pub fn cuda_ks_group_documentation() {
|
|
let mut criterion: Criterion<_> = (Criterion::default().sample_size(15))
|
|
.measurement_time(std::time::Duration::from_secs(60))
|
|
.configure_from_args();
|
|
cuda_keyswitch_classical_and_gemm::<u64, u32>(&mut criterion, &benchmark_parameters());
|
|
cuda_keyswitch_classical_and_gemm::<u64, u64>(&mut criterion, &benchmark_parameters());
|
|
}
|
|
|
|
pub fn cuda_multi_bit_ks_group() {
|
|
let mut criterion: Criterion<_> =
|
|
(Criterion::default().sample_size(2000)).configure_from_args();
|
|
let multi_bit_parameters = multi_bit_benchmark_parameters()
|
|
.into_iter()
|
|
.map(|(string, params, _)| (string, params))
|
|
.collect_vec();
|
|
cuda_keyswitch_classical_and_gemm::<u64, u32>(&mut criterion, &multi_bit_parameters);
|
|
cuda_keyswitch_classical_and_gemm::<u64, u64>(&mut criterion, &multi_bit_parameters);
|
|
cuda_packing_keyswitch(&mut criterion, &multi_bit_parameters);
|
|
}
|
|
|
|
pub fn cuda_multi_bit_ks_group_documentation() {
|
|
let mut criterion: Criterion<_> =
|
|
(Criterion::default().sample_size(2000)).configure_from_args();
|
|
let multi_bit_parameters = multi_bit_benchmark_parameters()
|
|
.into_iter()
|
|
.map(|(string, params, _)| (string, params))
|
|
.collect_vec();
|
|
cuda_keyswitch_classical_and_gemm::<u64, u32>(&mut criterion, &multi_bit_parameters);
|
|
cuda_keyswitch_classical_and_gemm::<u64, u64>(&mut criterion, &multi_bit_parameters);
|
|
}
|
|
}
|
|
|
|
#[cfg(feature = "gpu")]
|
|
use cuda::{
|
|
cuda_ks_group, cuda_ks_group_documentation, cuda_multi_bit_ks_group,
|
|
cuda_multi_bit_ks_group_documentation,
|
|
};
|
|
|
|
pub fn ks_group() {
|
|
let mut criterion: Criterion<_> = (Criterion::default()
|
|
.sample_size(15)
|
|
.measurement_time(std::time::Duration::from_secs(60)))
|
|
.configure_from_args();
|
|
keyswitch(&mut criterion, &benchmark_parameters());
|
|
#[cfg(feature = "boolean")]
|
|
keyswitch(&mut criterion, &benchmark_32bits_parameters());
|
|
}
|
|
|
|
pub fn multi_bit_ks_group() {
|
|
let multi_bit_parameters = multi_bit_benchmark_parameters()
|
|
.into_iter()
|
|
.map(|(string, params, _)| (string, params))
|
|
.collect_vec();
|
|
|
|
let mut criterion: Criterion<_> = (Criterion::default()
|
|
.sample_size(15)
|
|
.measurement_time(std::time::Duration::from_secs(60)))
|
|
.configure_from_args();
|
|
keyswitch(&mut criterion, &multi_bit_parameters);
|
|
}
|
|
|
|
pub fn packing_ks_group() {
|
|
let mut criterion: Criterion<_> = (Criterion::default()
|
|
.sample_size(10)
|
|
.measurement_time(std::time::Duration::from_secs(30)))
|
|
.configure_from_args();
|
|
packing_keyswitch(
|
|
&mut criterion,
|
|
"packing_keyswitch",
|
|
&benchmark_compression_parameters(),
|
|
keyswitch_lwe_ciphertext_list_and_pack_in_glwe_ciphertext,
|
|
);
|
|
packing_keyswitch(
|
|
&mut criterion,
|
|
"par_packing_keyswitch",
|
|
&benchmark_compression_parameters(),
|
|
par_keyswitch_lwe_ciphertext_list_and_pack_in_glwe_ciphertext,
|
|
);
|
|
}
|
|
|
|
#[cfg(feature = "gpu")]
/// Dispatches to the CUDA benchmark group matching the selected parameter type.
fn go_through_gpu_bench_groups() {
    match get_param_type() {
        ParamType::Classical => cuda_ks_group(),
        ParamType::ClassicalDocumentation => cuda_ks_group_documentation(),
        ParamType::MultiBit => cuda_multi_bit_ks_group(),
        ParamType::MultiBitDocumentation => cuda_multi_bit_ks_group_documentation(),
    }
}
|
|
|
|
#[cfg(not(feature = "gpu"))]
|
|
fn go_through_cpu_bench_groups() {
|
|
match get_param_type() {
|
|
ParamType::Classical => {
|
|
ks_group();
|
|
packing_ks_group()
|
|
}
|
|
ParamType::ClassicalDocumentation => ks_group(),
|
|
ParamType::MultiBit | ParamType::MultiBitDocumentation => multi_bit_ks_group(),
|
|
}
|
|
}
|
|
|
|
/// Entry point: runs the GPU benchmark groups when built with the `gpu`
/// feature, the CPU groups otherwise, then emits criterion's final summary.
fn main() {
    #[cfg(feature = "gpu")]
    go_through_gpu_bench_groups();
    #[cfg(not(feature = "gpu"))]
    go_through_cpu_bench_groups();

    Criterion::default().configure_from_args().final_summary();
}
|