// Mirror of https://github.com/zama-ai/tfhe-rs.git (synced 2026-01-08 06:13:58 -05:00).
// Original file: 982 lines, 42 KiB, Rust.
#[cfg(feature = "boolean")]
|
|
use benchmark::params::benchmark_32bits_parameters;
|
|
use benchmark::params::{
|
|
benchmark_compression_parameters, benchmark_parameters, multi_bit_benchmark_parameters,
|
|
};
|
|
use benchmark::utilities::{
|
|
get_bench_type, get_param_type, throughput_num_threads, write_to_json, BenchmarkType,
|
|
CryptoParametersRecord, OperatorType, ParamType,
|
|
};
|
|
use criterion::{black_box, Criterion, Throughput};
|
|
use itertools::Itertools;
|
|
use rayon::prelude::*;
|
|
use serde::Serialize;
|
|
use tfhe::core_crypto::prelude::*;
|
|
|
|
// TODO Refactor KS, PBS and KS-PBS benchmarks into a single generic function.
/// Benchmarks CPU LWE keyswitching (big key -> small key) for each parameter set.
///
/// For every `(name, params)` pair this generates fresh keys, then runs either:
/// * `Latency`: repeated `keyswitch_lwe_ciphertext` calls on a single ciphertext, or
/// * `Throughput`: many independent keyswitches executed in parallel via rayon.
///
/// Each result is registered with criterion and appended to the JSON results
/// via `write_to_json`.
fn keyswitch<Scalar: UnsignedTorus + CastInto<usize> + Serialize>(
    criterion: &mut Criterion,
    parameters: &[(String, CryptoParametersRecord<Scalar>)],
) {
    let bench_name = "core_crypto::keyswitch";
    let mut bench_group = criterion.benchmark_group(bench_name);

    // Create the PRNG
    let mut seeder = new_seeder();
    let seeder = seeder.as_mut();
    let mut encryption_generator =
        EncryptionRandomGenerator::<DefaultRandomGenerator>::new(seeder.seed(), seeder);
    let mut secret_generator = SecretRandomGenerator::<DefaultRandomGenerator>::new(seeder.seed());

    for (name, params) in parameters.iter() {
        // These fields are required for a keyswitch benchmark; `unwrap` panics
        // loudly if a parameter record is incomplete.
        let lwe_dimension = params.lwe_dimension.unwrap();
        let glwe_dimension = params.glwe_dimension.unwrap();
        let polynomial_size = params.polynomial_size.unwrap();
        let ks_decomp_base_log = params.ks_base_log.unwrap();
        let ks_decomp_level_count = params.ks_level.unwrap();

        // Destination (small) LWE secret key.
        let lwe_sk =
            allocate_and_generate_new_binary_lwe_secret_key(lwe_dimension, &mut secret_generator);

        // Source key: a GLWE key flattened into a big LWE key.
        let glwe_sk = allocate_and_generate_new_binary_glwe_secret_key(
            glwe_dimension,
            polynomial_size,
            &mut secret_generator,
        );
        let big_lwe_sk = glwe_sk.into_lwe_secret_key();
        let ksk_big_to_small = allocate_and_generate_new_lwe_keyswitch_key(
            &big_lwe_sk,
            &lwe_sk,
            ks_decomp_base_log,
            ks_decomp_level_count,
            params.lwe_noise_distribution.unwrap(),
            params.ciphertext_modulus.unwrap(),
            &mut encryption_generator,
        );

        let bench_id;

        match get_bench_type() {
            BenchmarkType::Latency => {
                let ct = allocate_and_encrypt_new_lwe_ciphertext(
                    &big_lwe_sk,
                    Plaintext(Scalar::ONE),
                    params.lwe_noise_distribution.unwrap(),
                    params.ciphertext_modulus.unwrap(),
                    &mut encryption_generator,
                );

                // Output buffer sized for the small key.
                let mut output_ct = LweCiphertext::new(
                    Scalar::ZERO,
                    lwe_sk.lwe_dimension().to_lwe_size(),
                    params.ciphertext_modulus.unwrap(),
                );

                bench_id = format!("{bench_name}::{name}");
                {
                    bench_group.bench_function(&bench_id, |b| {
                        b.iter(|| {
                            keyswitch_lwe_ciphertext(&ksk_big_to_small, &ct, &mut output_ct);
                            // Keep the result observable so the call is not optimized away.
                            black_box(&mut output_ct);
                        })
                    });
                }
            }
            BenchmarkType::Throughput => {
                bench_id = format!("{bench_name}::throughput::{name}");
                let blocks: usize = 1;
                let elements = throughput_num_threads(blocks, 1); // FIXME This number of elements does not saturate the target machine
                bench_group.throughput(Throughput::Elements(elements));
                bench_group.bench_function(&bench_id, |b| {
                    // Fresh inputs/outputs are rebuilt per batch so every
                    // measured run keyswitches independent ciphertexts.
                    let setup_encrypted_values = || {
                        let input_cts = (0..elements)
                            .map(|_| {
                                allocate_and_encrypt_new_lwe_ciphertext(
                                    &big_lwe_sk,
                                    Plaintext(Scalar::ONE),
                                    params.lwe_noise_distribution.unwrap(),
                                    params.ciphertext_modulus.unwrap(),
                                    &mut encryption_generator,
                                )
                            })
                            .collect::<Vec<_>>();

                        let output_cts = (0..elements)
                            .map(|_| {
                                LweCiphertext::new(
                                    Scalar::ZERO,
                                    lwe_sk.lwe_dimension().to_lwe_size(),
                                    params.ciphertext_modulus.unwrap(),
                                )
                            })
                            .collect::<Vec<_>>();

                        (input_cts, output_cts)
                    };

                    b.iter_batched(
                        setup_encrypted_values,
                        |(input_cts, mut output_cts)| {
                            // One rayon task per (input, output) pair.
                            input_cts
                                .par_iter()
                                .zip(output_cts.par_iter_mut())
                                .for_each(|(input_ct, output_ct)| {
                                    keyswitch_lwe_ciphertext(
                                        &ksk_big_to_small,
                                        input_ct,
                                        output_ct,
                                    );
                                })
                        },
                        criterion::BatchSize::SmallInput,
                    )
                });
            }
        };

        // Message modulus defaults to 2 (i.e. 1 bit) when unspecified.
        let bit_size = (params.message_modulus.unwrap_or(2) as u32).ilog2();
        write_to_json(
            &bench_id,
            *params,
            name,
            "ks",
            &OperatorType::Atomic,
            bit_size,
            vec![bit_size],
        );
    }
}
|
|
|
|
/// Benchmarks CPU LWE-list-to-GLWE packing keyswitch.
///
/// `ks_op` is the packing-keyswitch implementation under test; the same
/// harness serves both the sequential and the parallel variants (see
/// `packing_ks_group`), which is why the group name is a parameter.
fn packing_keyswitch<Scalar, F>(
    criterion: &mut Criterion,
    bench_name: &str,
    parameters: &[(String, CryptoParametersRecord<Scalar>)],
    ks_op: F,
) where
    Scalar: UnsignedTorus + CastInto<usize> + Serialize,
    F: Fn(
            &LwePackingKeyswitchKey<Vec<Scalar>>,
            &LweCiphertextList<Vec<Scalar>>,
            &mut GlweCiphertext<Vec<Scalar>>,
        ) + Sync
        + Send,
{
    let bench_name = format!("core_crypto::{bench_name}");
    let mut bench_group = criterion.benchmark_group(&bench_name);

    // Create the PRNG
    let mut seeder = new_seeder();
    let seeder = seeder.as_mut();
    let mut encryption_generator =
        EncryptionRandomGenerator::<DefaultRandomGenerator>::new(seeder.seed(), seeder);
    let mut secret_generator = SecretRandomGenerator::<DefaultRandomGenerator>::new(seeder.seed());

    for (name, params) in parameters.iter() {
        // Packing-specific parameters; `unwrap` panics if the record is incomplete.
        let lwe_dimension = params.lwe_dimension.unwrap();
        let packing_glwe_dimension = params.packing_ks_glwe_dimension.unwrap();
        let packing_polynomial_size = params.packing_ks_polynomial_size.unwrap();
        let packing_ks_decomp_base_log = params.packing_ks_base_log.unwrap();
        let packing_ks_decomp_level_count = params.packing_ks_level.unwrap();
        let ciphertext_modulus = params.ciphertext_modulus.unwrap();
        // Number of LWE ciphertexts packed into one GLWE.
        let count = params.lwe_per_glwe.unwrap();

        let lwe_sk =
            allocate_and_generate_new_binary_lwe_secret_key(lwe_dimension, &mut secret_generator);

        let glwe_sk = allocate_and_generate_new_binary_glwe_secret_key(
            packing_glwe_dimension,
            packing_polynomial_size,
            &mut secret_generator,
        );

        let pksk = allocate_and_generate_new_lwe_packing_keyswitch_key(
            &lwe_sk,
            &glwe_sk,
            packing_ks_decomp_base_log,
            packing_ks_decomp_level_count,
            params.packing_ks_key_noise_distribution.unwrap(),
            ciphertext_modulus,
            &mut encryption_generator,
        );

        let bench_id;

        match get_bench_type() {
            BenchmarkType::Latency => {
                // A single list of `count` LWE ciphertexts to pack.
                let mut input_lwe_list = LweCiphertextList::new(
                    Scalar::ZERO,
                    lwe_sk.lwe_dimension().to_lwe_size(),
                    count,
                    ciphertext_modulus,
                );

                let plaintext_list = PlaintextList::new(
                    Scalar::ZERO,
                    PlaintextCount(input_lwe_list.lwe_ciphertext_count().0),
                );

                encrypt_lwe_ciphertext_list(
                    &lwe_sk,
                    &mut input_lwe_list,
                    &plaintext_list,
                    params.lwe_noise_distribution.unwrap(),
                    &mut encryption_generator,
                );

                let mut output_glwe = GlweCiphertext::new(
                    Scalar::ZERO,
                    glwe_sk.glwe_dimension().to_glwe_size(),
                    glwe_sk.polynomial_size(),
                    ciphertext_modulus,
                );

                bench_id = format!("{bench_name}::{name}");
                {
                    bench_group.bench_function(&bench_id, |b| {
                        b.iter(|| {
                            ks_op(&pksk, &input_lwe_list, &mut output_glwe);
                            // Keep the result observable so the call is not optimized away.
                            black_box(&mut output_glwe);
                        })
                    });
                }
            }
            BenchmarkType::Throughput => {
                bench_id = format!("{bench_name}::throughput::{name}");
                let blocks: usize = 1;
                let elements = throughput_num_threads(blocks, 1);
                bench_group.throughput(Throughput::Elements(elements));
                bench_group.bench_function(&bench_id, |b| {
                    // Fresh input lists / output GLWEs per batch; each element
                    // is an independent packing keyswitch.
                    let setup_encrypted_values = || {
                        let input_lwe_lists = (0..elements)
                            .map(|_| {
                                let mut input_lwe_list = LweCiphertextList::new(
                                    Scalar::ZERO,
                                    lwe_sk.lwe_dimension().to_lwe_size(),
                                    count,
                                    ciphertext_modulus,
                                );

                                let plaintext_list = PlaintextList::new(
                                    Scalar::ZERO,
                                    PlaintextCount(input_lwe_list.lwe_ciphertext_count().0),
                                );

                                encrypt_lwe_ciphertext_list(
                                    &lwe_sk,
                                    &mut input_lwe_list,
                                    &plaintext_list,
                                    params.lwe_noise_distribution.unwrap(),
                                    &mut encryption_generator,
                                );

                                input_lwe_list
                            })
                            .collect::<Vec<_>>();

                        let output_glwes = (0..elements)
                            .map(|_| {
                                GlweCiphertext::new(
                                    Scalar::ZERO,
                                    glwe_sk.glwe_dimension().to_glwe_size(),
                                    glwe_sk.polynomial_size(),
                                    ciphertext_modulus,
                                )
                            })
                            .collect::<Vec<_>>();

                        (input_lwe_lists, output_glwes)
                    };

                    b.iter_batched(
                        setup_encrypted_values,
                        |(input_lwe_lists, mut output_glwes)| {
                            // One rayon task per (input list, output GLWE) pair.
                            input_lwe_lists
                                .par_iter()
                                .zip(output_glwes.par_iter_mut())
                                .for_each(|(input_lwe_list, output_glwe)| {
                                    ks_op(&pksk, input_lwe_list, output_glwe);
                                })
                        },
                        criterion::BatchSize::SmallInput,
                    )
                });
            }
        };

        // Message modulus defaults to 2 (i.e. 1 bit) when unspecified.
        let bit_size = (params.message_modulus.unwrap_or(2) as u32).ilog2();
        write_to_json(
            &bench_id,
            *params,
            name,
            "packing_ks",
            &OperatorType::Atomic,
            bit_size,
            vec![bit_size],
        );
    }
}
|
|
|
|
#[cfg(feature = "gpu")]
|
|
mod cuda {
|
|
use benchmark::params::{benchmark_parameters, multi_bit_benchmark_parameters};
|
|
use benchmark::utilities::{
|
|
cuda_local_keys_core, cuda_local_streams_core, get_bench_type, throughput_num_threads,
|
|
write_to_json, BenchmarkType, CpuKeys, CpuKeysBuilder, CryptoParametersRecord, CudaIndexes,
|
|
CudaLocalKeys, OperatorType,
|
|
};
|
|
use criterion::{black_box, Criterion, Throughput};
|
|
use itertools::Itertools;
|
|
use rayon::prelude::*;
|
|
use serde::Serialize;
|
|
use tfhe::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
|
|
use tfhe::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
|
|
use tfhe::core_crypto::gpu::vec::GpuIndex;
|
|
use tfhe::core_crypto::gpu::{
|
|
check_valid_cuda_malloc, cuda_keyswitch_lwe_ciphertext,
|
|
cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext_64, get_number_of_gpus,
|
|
get_packing_keyswitch_list_64_size_on_gpu, CudaStreams,
|
|
};
|
|
|
|
use tfhe::core_crypto::prelude::*;
|
|
|
|
/// Benchmarks CUDA LWE keyswitching.
///
/// `Scalar` is the scalar type of the input ciphertexts; `KeyswitchScalar`
/// is the scalar type of the keyswitched output (32- or 64-bit, encoded in
/// the bench id). Latency mode measures a single classical keyswitch with
/// trivial indices; throughput mode sweeps all four combinations of
/// {classical, gemm} x {trivial, complex} index layouts across all GPUs.
fn cuda_keyswitch_classical_and_gemm<
    Scalar: UnsignedTorus + CastInto<usize> + CastFrom<u64> + Serialize,
    KeyswitchScalar: UnsignedTorus + CastFrom<Scalar>,
>(
    criterion: &mut Criterion,
    parameters: &[(String, CryptoParametersRecord<Scalar>)],
) {
    let bench_name = "core_crypto::cuda::keyswitch";
    let mut bench_group = criterion.benchmark_group(bench_name);

    // Create the PRNG
    let mut seeder = new_seeder();
    let seeder = seeder.as_mut();
    let mut encryption_generator =
        EncryptionRandomGenerator::<DefaultRandomGenerator>::new(seeder.seed(), seeder);
    let mut secret_generator =
        SecretRandomGenerator::<DefaultRandomGenerator>::new(seeder.seed());

    for (name, params) in parameters.iter() {
        let lwe_dimension = params.lwe_dimension.unwrap();
        let glwe_dimension = params.glwe_dimension.unwrap();
        let polynomial_size = params.polynomial_size.unwrap();
        let ks_decomp_base_log = params.ks_base_log.unwrap();
        let ks_decomp_level_count = params.ks_level.unwrap();

        // Re-express the Scalar-typed noise distribution in the output scalar
        // type. For TUniform on a 32-bit output the log2 bound is shifted down
        // by 32 (narrower torus); other output widths are rejected.
        let lwe_noise_distribution_ksk = match params.lwe_noise_distribution.unwrap() {
            DynamicDistribution::Gaussian(gaussian_lwe_noise_distribution) => {
                DynamicDistribution::<KeyswitchScalar>::new_gaussian(
                    gaussian_lwe_noise_distribution.standard_dev(),
                )
            }
            DynamicDistribution::TUniform(uniform_lwe_noise_distribution) => {
                DynamicDistribution::<KeyswitchScalar>::new_t_uniform(
                    match KeyswitchScalar::BITS {
                        32 => uniform_lwe_noise_distribution.bound_log2() - 32,
                        64 => uniform_lwe_noise_distribution.bound_log2(),
                        _ => panic!("Unsupported Keyswitch scalar input dtype"),
                    },
                )
            }
        };

        // Keys in the output scalar type: destination LWE key, plus a GLWE key
        // whose flattened form is the keyswitch source key.
        let lwe_sk: LweSecretKeyOwned<KeyswitchScalar> =
            allocate_and_generate_new_binary_lwe_secret_key(
                lwe_dimension,
                &mut secret_generator,
            );

        let glwe_sk: GlweSecretKeyOwned<KeyswitchScalar> =
            allocate_and_generate_new_binary_glwe_secret_key(
                glwe_dimension,
                polynomial_size,
                &mut secret_generator,
            );
        let big_lwe_sk = glwe_sk.into_lwe_secret_key();

        let ksk_big_to_small = allocate_and_generate_new_lwe_keyswitch_key(
            &big_lwe_sk,
            &lwe_sk,
            ks_decomp_base_log,
            ks_decomp_level_count,
            lwe_noise_distribution_ksk,
            CiphertextModulus::new_native(),
            &mut encryption_generator,
        );

        // Separate Scalar-typed big key used only to encrypt the inputs.
        let glwe_sk_64: GlweSecretKeyOwned<Scalar> =
            allocate_and_generate_new_binary_glwe_secret_key(
                glwe_dimension,
                polynomial_size,
                &mut secret_generator,
            );

        let big_lwe_sk_64 = glwe_sk_64.into_lwe_secret_key();

        let ciphertext_modulus_out = CiphertextModulus::<KeyswitchScalar>::new_native();

        let cpu_keys: CpuKeys<Scalar, KeyswitchScalar> = CpuKeysBuilder::new()
            .keyswitch_key(ksk_big_to_small)
            .build();

        let mut bench_id;

        match get_bench_type() {
            BenchmarkType::Latency => {
                let streams = CudaStreams::new_multi_gpu();
                let gpu_keys = CudaLocalKeys::from_cpu_keys(&cpu_keys, None, &streams);

                let ct = allocate_and_encrypt_new_lwe_ciphertext(
                    &big_lwe_sk_64,
                    Plaintext(Scalar::ONE),
                    params.lwe_noise_distribution.unwrap(),
                    CiphertextModulus::new_native(),
                    &mut encryption_generator,
                );
                let mut ct_gpu = CudaLweCiphertextList::from_lwe_ciphertext(&ct, &streams);

                let output_ct = LweCiphertext::new(
                    KeyswitchScalar::ZERO,
                    lwe_sk.lwe_dimension().to_lwe_size(),
                    CiphertextModulus::new_native(),
                );
                let mut output_ct_gpu =
                    CudaLweCiphertextList::from_lwe_ciphertext(&output_ct, &streams);

                // Single ciphertext: index 0 on both input and output sides.
                let h_indexes = [Scalar::ZERO];
                let cuda_indexes = CudaIndexes::new(&h_indexes, &streams, 0);

                bench_id = format!(
                    "{bench_name}::latency::{:?}b::{name}",
                    KeyswitchScalar::BITS
                );
                {
                    bench_group.bench_function(&bench_id, |b| {
                        b.iter(|| {
                            // Trivial indices (true), classical path (gemm = false).
                            cuda_keyswitch_lwe_ciphertext(
                                gpu_keys.ksk.as_ref().unwrap(),
                                &ct_gpu,
                                &mut output_ct_gpu,
                                &cuda_indexes.d_input,
                                &cuda_indexes.d_output,
                                true,
                                &streams,
                                false,
                            );

                            // NOTE(review): black-boxes the input list, not the
                            // output — confirm this is the intended target.
                            black_box(&mut ct_gpu);
                        })
                    });
                }

                let bit_size = (params.message_modulus.unwrap_or(2) as u32).ilog2();
                write_to_json(
                    &bench_id,
                    *params,
                    name,
                    "ks",
                    &OperatorType::Atomic,
                    bit_size,
                    vec![bit_size],
                );
            }
            BenchmarkType::Throughput => {
                let gpu_keys_vec = cuda_local_keys_core(&cpu_keys, None);
                let gpu_count = get_number_of_gpus() as usize;

                // Sweep both keyswitch implementations and both index layouts.
                for uses_gemm_ks in [false, true] {
                    for uses_trivial_indices in [false, true] {
                        let indices_str = if uses_trivial_indices {
                            "trivial"
                        } else {
                            "complex"
                        };
                        let gemm_str = if uses_gemm_ks { "gemm" } else { "classical" };
                        bench_id = format!(
                            "{bench_name}::throughput::{:?}b::{gemm_str}::{indices_str}_indices::{name}",
                            KeyswitchScalar::BITS
                        );

                        // 256 ciphertexts per GPU, evenly split across streams.
                        let blocks: usize = 256;
                        let elements = gpu_count * blocks;
                        let elements_per_stream = elements / gpu_count;
                        bench_group.throughput(Throughput::Elements(elements as u64));
                        bench_group.sample_size(50);
                        bench_group.bench_function(&bench_id, |b| {
                            let setup_encrypted_values = || {
                                let local_streams = cuda_local_streams_core();

                                let plaintext_list = PlaintextList::new(
                                    Scalar::ZERO,
                                    PlaintextCount(elements_per_stream),
                                );

                                let input_cts = (0..gpu_count)
                                    .map(|i| {
                                        let mut input_ct_list = LweCiphertextList::new(
                                            Scalar::ZERO,
                                            big_lwe_sk.lwe_dimension().to_lwe_size(),
                                            LweCiphertextCount(elements_per_stream),
                                            params.ciphertext_modulus.unwrap(),
                                        );
                                        encrypt_lwe_ciphertext_list(
                                            &big_lwe_sk_64,
                                            &mut input_ct_list,
                                            &plaintext_list,
                                            params.lwe_noise_distribution.unwrap(),
                                            &mut encryption_generator,
                                        );
                                        // Rebuild the list from its raw container
                                        // before uploading it to this GPU's stream.
                                        let input_ks_list = LweCiphertextList::from_container(
                                            input_ct_list.into_container(),
                                            big_lwe_sk.lwe_dimension().to_lwe_size(),
                                            params.ciphertext_modulus.unwrap(),
                                        );
                                        CudaLweCiphertextList::from_lwe_ciphertext_list(
                                            &input_ks_list,
                                            &local_streams[i],
                                        )
                                    })
                                    .collect::<Vec<_>>();

                                let output_cts = (0..gpu_count)
                                    .map(|i| {
                                        let output_ct_list = LweCiphertextList::new(
                                            KeyswitchScalar::ZERO,
                                            lwe_sk.lwe_dimension().to_lwe_size(),
                                            LweCiphertextCount(elements_per_stream),
                                            ciphertext_modulus_out,
                                        );
                                        CudaLweCiphertextList::from_lwe_ciphertext_list(
                                            &output_ct_list,
                                            &local_streams[i],
                                        )
                                    })
                                    .collect::<Vec<_>>();

                                // Trivial = identity index mapping; complex =
                                // reversed order (non-identity gathers).
                                let indexes_range: Vec<u64> = if uses_trivial_indices {
                                    (0..(elements / gpu_count) as u64).collect()
                                } else {
                                    (0..(elements / gpu_count) as u64).rev().collect()
                                };
                                let h_indexes = indexes_range
                                    .iter()
                                    .map(|v| CastFrom::cast_from(*v))
                                    .collect::<Vec<_>>();

                                let cuda_indexes_vec = (0..gpu_count)
                                    .map(|i| CudaIndexes::new(&h_indexes, &local_streams[i], 0))
                                    .collect::<Vec<_>>();
                                // Make sure all uploads completed before timing.
                                local_streams.iter().for_each(|stream| stream.synchronize());

                                (input_cts, output_cts, cuda_indexes_vec, local_streams)
                            };

                            b.iter_batched(
                                setup_encrypted_values,
                                |(
                                    input_cts,
                                    mut output_cts,
                                    cuda_indexes_vec,
                                    local_streams,
                                )| {
                                    // One rayon task per GPU / local stream.
                                    (0..gpu_count)
                                        .into_par_iter()
                                        .zip(input_cts.par_iter())
                                        .zip(output_cts.par_iter_mut())
                                        .zip(local_streams.par_iter())
                                        .for_each(
                                            |(((i, input_ct), output_ct), local_stream)| {
                                                cuda_keyswitch_lwe_ciphertext(
                                                    gpu_keys_vec[i].ksk.as_ref().unwrap(),
                                                    input_ct,
                                                    output_ct,
                                                    &cuda_indexes_vec[i].d_input,
                                                    &cuda_indexes_vec[i].d_output,
                                                    uses_trivial_indices,
                                                    local_stream,
                                                    uses_gemm_ks,
                                                );
                                            },
                                        )
                                },
                                criterion::BatchSize::SmallInput,
                            )
                        });

                        let bit_size = (params.message_modulus.unwrap_or(2) as u32).ilog2();
                        write_to_json(
                            &bench_id,
                            *params,
                            name,
                            "ks",
                            &OperatorType::Atomic,
                            bit_size,
                            vec![bit_size],
                        );
                    }
                }
            }
        };
    }
}
|
|
|
|
/// Benchmarks CUDA LWE-list-to-GLWE packing keyswitch.
///
/// Before each benchmark the required device scratch size is queried and the
/// parameter set is skipped (`continue`) if the allocation would not fit on
/// the GPU(s).
fn cuda_packing_keyswitch<
    Scalar: UnsignedTorus + CastInto<usize> + CastFrom<u64> + Serialize + CastInto<u32>,
>(
    criterion: &mut Criterion,
    parameters: &[(String, CryptoParametersRecord<Scalar>)],
) {
    let bench_name = "core_crypto::cuda::packing_keyswitch";
    let mut bench_group = criterion.benchmark_group(bench_name);

    // Create the PRNG
    let mut seeder = new_seeder();
    let seeder = seeder.as_mut();
    let mut encryption_generator =
        EncryptionRandomGenerator::<DefaultRandomGenerator>::new(seeder.seed(), seeder);
    let mut secret_generator =
        SecretRandomGenerator::<DefaultRandomGenerator>::new(seeder.seed());

    for (name, params) in parameters.iter() {
        let lwe_dimension = params.lwe_dimension.unwrap();
        let glwe_dimension = params.glwe_dimension.unwrap();
        let polynomial_size = params.polynomial_size.unwrap();
        let ks_decomp_base_log = params.ks_base_log.unwrap();
        let ks_decomp_level_count = params.ks_level.unwrap();
        let glwe_noise_distribution = params.glwe_noise_distribution.unwrap();
        let ciphertext_modulus = params.ciphertext_modulus.unwrap();

        let lwe_sk = allocate_and_generate_new_binary_lwe_secret_key(
            lwe_dimension,
            &mut secret_generator,
        );

        let glwe_sk = allocate_and_generate_new_binary_glwe_secret_key(
            glwe_dimension,
            polynomial_size,
            &mut secret_generator,
        );

        let pksk = allocate_and_generate_new_lwe_packing_keyswitch_key(
            &lwe_sk,
            &glwe_sk,
            ks_decomp_base_log,
            ks_decomp_level_count,
            glwe_noise_distribution,
            ciphertext_modulus,
            &mut encryption_generator,
        );

        let cpu_keys: CpuKeys<Scalar, Scalar> =
            CpuKeysBuilder::new().packing_keyswitch_key(pksk).build();

        let bench_id;
        match get_bench_type() {
            BenchmarkType::Latency => {
                let streams = CudaStreams::new_multi_gpu();

                // Query required device memory before allocating anything.
                let mem_size = get_packing_keyswitch_list_64_size_on_gpu(
                    &streams,
                    lwe_sk.lwe_dimension(),
                    glwe_sk.glwe_dimension(),
                    glwe_sk.polynomial_size(),
                    LweCiphertextCount(glwe_sk.polynomial_size().0),
                );

                let skip_bench = !check_valid_cuda_malloc(mem_size, GpuIndex::new(0));

                // Skip parameter sets that don't fit on GPU 0 (no JSON entry).
                if skip_bench {
                    continue;
                }

                let gpu_keys = CudaLocalKeys::from_cpu_keys(&cpu_keys, None, &streams);

                // One GLWE's worth of inputs: polynomial_size LWE ciphertexts.
                let mut input_ct_list = LweCiphertextList::new(
                    Scalar::ZERO,
                    lwe_sk.lwe_dimension().to_lwe_size(),
                    LweCiphertextCount(glwe_sk.polynomial_size().0),
                    ciphertext_modulus,
                );

                let plaintext_list = PlaintextList::new(
                    Scalar::ZERO,
                    PlaintextCount(input_ct_list.lwe_ciphertext_count().0),
                );

                encrypt_lwe_ciphertext_list(
                    &lwe_sk,
                    &mut input_ct_list,
                    &plaintext_list,
                    params.lwe_noise_distribution.unwrap(),
                    &mut encryption_generator,
                );

                let mut d_input_lwe_list =
                    CudaLweCiphertextList::from_lwe_ciphertext_list(&input_ct_list, &streams);

                let mut d_output_glwe = CudaGlweCiphertextList::new(
                    glwe_sk.glwe_dimension(),
                    glwe_sk.polynomial_size(),
                    GlweCiphertextCount(1),
                    ciphertext_modulus,
                    &streams,
                );

                // Ensure device uploads completed before timing.
                streams.synchronize();

                bench_id = format!("{bench_name}::{name}");
                {
                    bench_group.bench_function(&bench_id, |b| {
                        b.iter(|| {
                            cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext_64(
                                gpu_keys.pksk.as_ref().unwrap(),
                                &d_input_lwe_list,
                                &mut d_output_glwe,
                                &streams,
                            );
                            // NOTE(review): black-boxes the input list, not the
                            // output GLWE — confirm this is the intended target.
                            black_box(&mut d_input_lwe_list);
                        })
                    });
                }
            }
            BenchmarkType::Throughput => {
                let gpu_keys_vec = cuda_local_keys_core(&cpu_keys, None);
                let gpu_count = get_number_of_gpus() as usize;

                bench_id = format!("{bench_name}::throughput::{name}");

                let mem_size = get_packing_keyswitch_list_64_size_on_gpu(
                    &CudaStreams::new_single_gpu(GpuIndex::new(0)),
                    lwe_sk.lwe_dimension(),
                    glwe_sk.glwe_dimension(),
                    glwe_sk.polynomial_size(),
                    LweCiphertextCount(glwe_sk.polynomial_size().0),
                );

                // Every GPU must be able to hold the scratch buffer.
                let mut skip_test = false;
                for gpu_index in 0..gpu_count {
                    if !check_valid_cuda_malloc(mem_size, GpuIndex::new(gpu_index as u32)) {
                        skip_test = true;
                    }
                }

                if skip_test {
                    continue;
                }

                let blocks: usize = 1;
                let elements = throughput_num_threads(blocks, 1);
                // Per-stream batch is capped at one GLWE's worth of inputs.
                let elements_per_stream =
                    std::cmp::min(elements as usize / gpu_count, glwe_sk.polynomial_size().0);
                bench_group.throughput(Throughput::Elements(elements));
                bench_group.sample_size(50);
                bench_group.bench_function(&bench_id, |b| {
                    let setup_encrypted_values = || {
                        let local_streams = cuda_local_streams_core();

                        let plaintext_list = PlaintextList::new(
                            Scalar::ZERO,
                            PlaintextCount(elements_per_stream),
                        );

                        let input_lwe_lists = (0..gpu_count)
                            .map(|i| {
                                let mut input_ct_list = LweCiphertextList::new(
                                    Scalar::ZERO,
                                    lwe_sk.lwe_dimension().to_lwe_size(),
                                    LweCiphertextCount(elements_per_stream),
                                    ciphertext_modulus,
                                );
                                encrypt_lwe_ciphertext_list(
                                    &lwe_sk,
                                    &mut input_ct_list,
                                    &plaintext_list,
                                    params.lwe_noise_distribution.unwrap(),
                                    &mut encryption_generator,
                                );

                                CudaLweCiphertextList::from_lwe_ciphertext_list(
                                    &input_ct_list,
                                    &local_streams[i],
                                )
                            })
                            .collect::<Vec<_>>();

                        let output_glwe_list = (0..gpu_count)
                            .map(|i| {
                                CudaGlweCiphertextList::new(
                                    glwe_sk.glwe_dimension(),
                                    glwe_sk.polynomial_size(),
                                    GlweCiphertextCount(1),
                                    ciphertext_modulus,
                                    &local_streams[i],
                                )
                            })
                            .collect::<Vec<_>>();

                        // Make sure all uploads completed before timing.
                        local_streams.iter().for_each(|stream| stream.synchronize());

                        (input_lwe_lists, output_glwe_list, local_streams)
                    };

                    b.iter_batched(
                        setup_encrypted_values,
                        |(input_lwe_lists, mut output_glwe_lists, local_streams)| {
                            // One rayon task per GPU / local stream.
                            (0..gpu_count)
                                .into_par_iter()
                                .zip(input_lwe_lists.par_iter())
                                .zip(output_glwe_lists.par_iter_mut())
                                .zip(local_streams.par_iter())
                                .for_each(
                                    |(
                                        ((i, input_lwe_list), output_glwe_list),
                                        local_stream,
                                    )| {
                                        cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext_64(
                                            gpu_keys_vec[i].pksk.as_ref().unwrap(),
                                            input_lwe_list,
                                            output_glwe_list,
                                            local_stream,
                                        );
                                    },
                                )
                        },
                        criterion::BatchSize::SmallInput,
                    )
                });
            }
        };

        // Message modulus defaults to 2 (i.e. 1 bit) when unspecified.
        let bit_size = (params.message_modulus.unwrap_or(2) as u32).ilog2();
        write_to_json(
            &bench_id,
            *params,
            name,
            "packing_ks",
            &OperatorType::Atomic,
            bit_size,
            vec![bit_size],
        );
    }
}
|
|
|
|
pub fn cuda_ks_group() {
|
|
let mut criterion: Criterion<_> = (Criterion::default().sample_size(15))
|
|
.measurement_time(std::time::Duration::from_secs(60))
|
|
.configure_from_args();
|
|
cuda_keyswitch_classical_and_gemm::<u64, u32>(&mut criterion, &benchmark_parameters());
|
|
cuda_keyswitch_classical_and_gemm::<u64, u64>(&mut criterion, &benchmark_parameters());
|
|
cuda_packing_keyswitch(&mut criterion, &benchmark_parameters());
|
|
}
|
|
|
|
pub fn cuda_ks_group_documentation() {
|
|
let mut criterion: Criterion<_> = (Criterion::default().sample_size(15))
|
|
.measurement_time(std::time::Duration::from_secs(60))
|
|
.configure_from_args();
|
|
cuda_keyswitch_classical_and_gemm::<u64, u32>(&mut criterion, &benchmark_parameters());
|
|
cuda_keyswitch_classical_and_gemm::<u64, u64>(&mut criterion, &benchmark_parameters());
|
|
}
|
|
|
|
pub fn cuda_multi_bit_ks_group() {
|
|
let mut criterion: Criterion<_> =
|
|
(Criterion::default().sample_size(2000)).configure_from_args();
|
|
let multi_bit_parameters = multi_bit_benchmark_parameters()
|
|
.into_iter()
|
|
.map(|(string, params, _)| (string, params))
|
|
.collect_vec();
|
|
cuda_keyswitch_classical_and_gemm::<u64, u32>(&mut criterion, &multi_bit_parameters);
|
|
cuda_keyswitch_classical_and_gemm::<u64, u64>(&mut criterion, &multi_bit_parameters);
|
|
cuda_packing_keyswitch(&mut criterion, &multi_bit_parameters);
|
|
}
|
|
|
|
pub fn cuda_multi_bit_ks_group_documentation() {
|
|
let mut criterion: Criterion<_> =
|
|
(Criterion::default().sample_size(2000)).configure_from_args();
|
|
let multi_bit_parameters = multi_bit_benchmark_parameters()
|
|
.into_iter()
|
|
.map(|(string, params, _)| (string, params))
|
|
.collect_vec();
|
|
cuda_keyswitch_classical_and_gemm::<u64, u32>(&mut criterion, &multi_bit_parameters);
|
|
cuda_keyswitch_classical_and_gemm::<u64, u64>(&mut criterion, &multi_bit_parameters);
|
|
}
|
|
}
|
|
|
|
#[cfg(feature = "gpu")]
|
|
use cuda::{
|
|
cuda_ks_group, cuda_ks_group_documentation, cuda_multi_bit_ks_group,
|
|
cuda_multi_bit_ks_group_documentation,
|
|
};
|
|
|
|
pub fn ks_group() {
|
|
let mut criterion: Criterion<_> = (Criterion::default()
|
|
.sample_size(15)
|
|
.measurement_time(std::time::Duration::from_secs(60)))
|
|
.configure_from_args();
|
|
keyswitch(&mut criterion, &benchmark_parameters());
|
|
#[cfg(feature = "boolean")]
|
|
keyswitch(&mut criterion, &benchmark_32bits_parameters());
|
|
}
|
|
|
|
pub fn multi_bit_ks_group() {
|
|
let multi_bit_parameters = multi_bit_benchmark_parameters()
|
|
.into_iter()
|
|
.map(|(string, params, _)| (string, params))
|
|
.collect_vec();
|
|
|
|
let mut criterion: Criterion<_> = (Criterion::default()
|
|
.sample_size(15)
|
|
.measurement_time(std::time::Duration::from_secs(60)))
|
|
.configure_from_args();
|
|
keyswitch(&mut criterion, &multi_bit_parameters);
|
|
}
|
|
|
|
pub fn packing_ks_group() {
|
|
let mut criterion: Criterion<_> = (Criterion::default()
|
|
.sample_size(10)
|
|
.measurement_time(std::time::Duration::from_secs(30)))
|
|
.configure_from_args();
|
|
packing_keyswitch(
|
|
&mut criterion,
|
|
"packing_keyswitch",
|
|
&benchmark_compression_parameters(),
|
|
keyswitch_lwe_ciphertext_list_and_pack_in_glwe_ciphertext,
|
|
);
|
|
packing_keyswitch(
|
|
&mut criterion,
|
|
"par_packing_keyswitch",
|
|
&benchmark_compression_parameters(),
|
|
par_keyswitch_lwe_ciphertext_list_and_pack_in_glwe_ciphertext,
|
|
);
|
|
}
|
|
|
|
#[cfg(feature = "gpu")]
/// Dispatches to the CUDA benchmark group matching the selected parameter type.
fn go_through_gpu_bench_groups() {
    match get_param_type() {
        ParamType::Classical => cuda_ks_group(),
        ParamType::ClassicalDocumentation => cuda_ks_group_documentation(),
        ParamType::MultiBit => cuda_multi_bit_ks_group(),
        ParamType::MultiBitDocumentation => cuda_multi_bit_ks_group_documentation(),
    }
}
|
|
|
|
#[cfg(not(feature = "gpu"))]
|
|
fn go_through_cpu_bench_groups() {
|
|
match get_param_type() {
|
|
ParamType::Classical => {
|
|
ks_group();
|
|
packing_ks_group()
|
|
}
|
|
ParamType::ClassicalDocumentation => ks_group(),
|
|
ParamType::MultiBit | ParamType::MultiBitDocumentation => multi_bit_ks_group(),
|
|
}
|
|
}
|
|
|
|
/// Entry point: runs the GPU benchmark groups when built with the `gpu`
/// feature, the CPU groups otherwise, then emits criterion's final summary.
fn main() {
    #[cfg(feature = "gpu")]
    go_through_gpu_bench_groups();
    #[cfg(not(feature = "gpu"))]
    go_through_cpu_bench_groups();

    Criterion::default().configure_from_args().final_summary();
}
|