mirror of
https://github.com/zama-ai/tfhe-rs.git
synced 2026-01-09 14:47:56 -05:00
chore(bench): add unbounded oprf integer benchmarks
Also move Cuda OPRF benchmark into the same file as CPU implementation
This commit is contained in:
@@ -1386,15 +1386,13 @@ define_server_key_bench_default_fn!(
|
||||
mod cuda {
|
||||
use super::*;
|
||||
use benchmark::utilities::cuda_integer_utils::{cuda_local_keys, cuda_local_streams};
|
||||
use criterion::{black_box, criterion_group};
|
||||
use criterion::criterion_group;
|
||||
use std::cmp::max;
|
||||
use tfhe::core_crypto::gpu::{get_number_of_gpus, CudaStreams};
|
||||
use tfhe::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
|
||||
use tfhe::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext;
|
||||
use tfhe::integer::gpu::server_key::CudaServerKey;
|
||||
use tfhe::integer::{RadixCiphertext, ServerKey};
|
||||
use tfhe::GpuIndex;
|
||||
use tfhe_csprng::seeders::Seed;
|
||||
|
||||
fn bench_cuda_server_key_unary_function_clean_inputs<F, G>(
|
||||
c: &mut Criterion,
|
||||
@@ -1942,90 +1940,6 @@ mod cuda {
|
||||
bench_group.finish()
|
||||
}
|
||||
|
||||
pub fn cuda_unsigned_oprf(c: &mut Criterion) {
|
||||
let bench_name = "integer::cuda::unsigned_oprf";
|
||||
|
||||
let mut bench_group = c.benchmark_group(bench_name);
|
||||
bench_group
|
||||
.sample_size(15)
|
||||
.measurement_time(std::time::Duration::from_secs(30));
|
||||
|
||||
for (param, num_block, bit_size) in ParamsAndNumBlocksIter::default() {
|
||||
let param_name = param.name();
|
||||
|
||||
let bench_id;
|
||||
|
||||
match get_bench_type() {
|
||||
BenchmarkType::Latency => {
|
||||
let streams = CudaStreams::new_multi_gpu();
|
||||
|
||||
bench_id = format!("{bench_name}::{param_name}::{bit_size}_bits");
|
||||
bench_group.bench_function(&bench_id, |b| {
|
||||
let (cks, _cpu_sks) =
|
||||
KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
|
||||
let gpu_sks = CudaServerKey::new(&cks, &streams);
|
||||
|
||||
b.iter(|| {
|
||||
_ = black_box(
|
||||
gpu_sks
|
||||
.par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
|
||||
Seed(0),
|
||||
bit_size as u64,
|
||||
num_block as u64,
|
||||
&streams,
|
||||
),
|
||||
);
|
||||
})
|
||||
});
|
||||
}
|
||||
BenchmarkType::Throughput => {
|
||||
let (cks, cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
|
||||
let gpu_sks_vec = cuda_local_keys(&cks);
|
||||
|
||||
// Execute the operation once to know its cost.
|
||||
reset_pbs_count();
|
||||
cpu_sks.par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
|
||||
Seed(0),
|
||||
bit_size as u64,
|
||||
num_block as u64,
|
||||
);
|
||||
let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
|
||||
|
||||
bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
|
||||
let elements = throughput_num_threads(num_block, pbs_count);
|
||||
bench_group.throughput(Throughput::Elements(elements));
|
||||
bench_group.bench_function(&bench_id, |b| {
|
||||
b.iter(|| {
|
||||
(0..elements).into_par_iter().for_each(|i| {
|
||||
let gpu_index: u32 = i as u32 % get_number_of_gpus();
|
||||
let stream = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
|
||||
gpu_sks_vec[gpu_index as usize]
|
||||
.par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
|
||||
Seed(0),
|
||||
bit_size as u64,
|
||||
num_block as u64,
|
||||
&stream,
|
||||
);
|
||||
})
|
||||
})
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
write_to_json::<u64, _>(
|
||||
&bench_id,
|
||||
param,
|
||||
param.name(),
|
||||
"oprf",
|
||||
&OperatorType::Atomic,
|
||||
bit_size as u32,
|
||||
vec![param.message_modulus().0.ilog2(); num_block],
|
||||
);
|
||||
}
|
||||
|
||||
bench_group.finish()
|
||||
}
|
||||
|
||||
macro_rules! define_cuda_server_key_bench_clean_input_unary_fn (
|
||||
(method_name: $server_key_method:ident, method_name_cpu: $server_key_method_cpu:ident, display_name: $name:ident) => {
|
||||
::paste::paste!{
|
||||
@@ -2772,7 +2686,7 @@ mod cuda {
|
||||
cuda_trailing_zeros,
|
||||
cuda_trailing_ones,
|
||||
cuda_ilog2,
|
||||
cuda_unsigned_oprf,
|
||||
oprf::cuda::cuda_unsigned_oprf,
|
||||
);
|
||||
|
||||
criterion_group!(
|
||||
@@ -2800,7 +2714,7 @@ mod cuda {
|
||||
cuda_scalar_mul,
|
||||
cuda_scalar_div,
|
||||
cuda_scalar_rem,
|
||||
cuda_unsigned_oprf,
|
||||
oprf::cuda::cuda_unsigned_oprf,
|
||||
);
|
||||
|
||||
criterion_group!(
|
||||
|
||||
@@ -22,12 +22,27 @@ pub fn unsigned_oprf(c: &mut Criterion) {
|
||||
for (param, num_block, bit_size) in ParamsAndNumBlocksIter::default() {
|
||||
let param_name = param.name();
|
||||
|
||||
let bench_id;
|
||||
let bench_id_oprf;
|
||||
let bench_id_oprf_bounded;
|
||||
|
||||
match get_bench_type() {
|
||||
BenchmarkType::Latency => {
|
||||
bench_id = format!("{bench_name}::{param_name}::{bit_size}_bits");
|
||||
bench_group.bench_function(&bench_id, |b| {
|
||||
bench_id_oprf = format!("{bench_name}::{param_name}::{bit_size}_bits");
|
||||
bench_id_oprf_bounded =
|
||||
format!("{bench_name}_bounded::{param_name}::{bit_size}_bits");
|
||||
|
||||
bench_group.bench_function(&bench_id_oprf, |b| {
|
||||
let (_, sk) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
|
||||
|
||||
b.iter(|| {
|
||||
_ = black_box(sk.par_generate_oblivious_pseudo_random_unsigned_integer(
|
||||
Seed(0),
|
||||
num_block as u64,
|
||||
));
|
||||
})
|
||||
});
|
||||
|
||||
bench_group.bench_function(&bench_id_oprf_bounded, |b| {
|
||||
let (_, sk) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
|
||||
|
||||
b.iter(|| {
|
||||
@@ -53,10 +68,25 @@ pub fn unsigned_oprf(c: &mut Criterion) {
|
||||
);
|
||||
let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
|
||||
|
||||
bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
|
||||
bench_id_oprf = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
|
||||
bench_id_oprf_bounded =
|
||||
format!("{bench_name}_bounded::throughput::{param_name}::{bit_size}_bits");
|
||||
|
||||
let elements = throughput_num_threads(num_block, pbs_count);
|
||||
bench_group.throughput(Throughput::Elements(elements));
|
||||
bench_group.bench_function(&bench_id, |b| {
|
||||
|
||||
bench_group.bench_function(&bench_id_oprf, |b| {
|
||||
b.iter(|| {
|
||||
(0..elements).into_par_iter().for_each(|_| {
|
||||
sk.par_generate_oblivious_pseudo_random_unsigned_integer(
|
||||
Seed(0),
|
||||
num_block as u64,
|
||||
);
|
||||
})
|
||||
})
|
||||
});
|
||||
|
||||
bench_group.bench_function(&bench_id_oprf_bounded, |b| {
|
||||
b.iter(|| {
|
||||
(0..elements).into_par_iter().for_each(|_| {
|
||||
sk.par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
|
||||
@@ -70,16 +100,161 @@ pub fn unsigned_oprf(c: &mut Criterion) {
|
||||
}
|
||||
}
|
||||
|
||||
write_to_json::<u64, _>(
|
||||
&bench_id,
|
||||
param,
|
||||
param.name(),
|
||||
"oprf",
|
||||
&OperatorType::Atomic,
|
||||
bit_size as u32,
|
||||
vec![param.message_modulus().0.ilog2(); num_block],
|
||||
);
|
||||
for (bench_id, display_name) in [
|
||||
(bench_id_oprf, "oprf"),
|
||||
(bench_id_oprf_bounded, "oprf_bounded"),
|
||||
] {
|
||||
write_to_json::<u64, _>(
|
||||
&bench_id,
|
||||
param,
|
||||
param.name(),
|
||||
display_name,
|
||||
&OperatorType::Atomic,
|
||||
bit_size as u32,
|
||||
vec![param.message_modulus().0.ilog2(); num_block],
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
bench_group.finish()
|
||||
}
|
||||
|
||||
#[cfg(feature = "gpu")]
|
||||
pub mod cuda {
|
||||
use super::*;
|
||||
use benchmark::utilities::cuda_integer_utils::cuda_local_keys;
|
||||
use criterion::black_box;
|
||||
use std::cmp::max;
|
||||
use tfhe::core_crypto::gpu::{get_number_of_gpus, CudaStreams};
|
||||
use tfhe::integer::gpu::server_key::CudaServerKey;
|
||||
use tfhe::GpuIndex;
|
||||
use tfhe_csprng::seeders::Seed;
|
||||
|
||||
pub fn cuda_unsigned_oprf(c: &mut Criterion) {
|
||||
let bench_name = "integer::cuda::unsigned_oprf";
|
||||
|
||||
let mut bench_group = c.benchmark_group(bench_name);
|
||||
bench_group
|
||||
.sample_size(15)
|
||||
.measurement_time(std::time::Duration::from_secs(30));
|
||||
|
||||
for (param, num_block, bit_size) in ParamsAndNumBlocksIter::default() {
|
||||
let param_name = param.name();
|
||||
|
||||
let bench_id_oprf;
|
||||
let bench_id_oprf_bounded;
|
||||
|
||||
match get_bench_type() {
|
||||
BenchmarkType::Latency => {
|
||||
let streams = CudaStreams::new_multi_gpu();
|
||||
|
||||
bench_id_oprf = format!("{bench_name}::{param_name}::{bit_size}_bits");
|
||||
bench_id_oprf_bounded =
|
||||
format!("{bench_name}_bounded::{param_name}::{bit_size}_bits");
|
||||
|
||||
bench_group.bench_function(&bench_id_oprf, |b| {
|
||||
let (cks, _cpu_sks) =
|
||||
KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
|
||||
let gpu_sks = CudaServerKey::new(&cks, &streams);
|
||||
|
||||
b.iter(|| {
|
||||
_ = black_box(
|
||||
gpu_sks.par_generate_oblivious_pseudo_random_unsigned_integer(
|
||||
Seed(0),
|
||||
num_block as u64,
|
||||
&streams,
|
||||
),
|
||||
);
|
||||
})
|
||||
});
|
||||
|
||||
bench_group.bench_function(&bench_id_oprf_bounded, |b| {
|
||||
let (cks, _cpu_sks) =
|
||||
KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
|
||||
let gpu_sks = CudaServerKey::new(&cks, &streams);
|
||||
|
||||
b.iter(|| {
|
||||
_ = black_box(
|
||||
gpu_sks
|
||||
.par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
|
||||
Seed(0),
|
||||
bit_size as u64,
|
||||
num_block as u64,
|
||||
&streams,
|
||||
),
|
||||
);
|
||||
})
|
||||
});
|
||||
}
|
||||
BenchmarkType::Throughput => {
|
||||
let (cks, cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
|
||||
let gpu_sks_vec = cuda_local_keys(&cks);
|
||||
|
||||
// Execute the operation once to know its cost.
|
||||
reset_pbs_count();
|
||||
cpu_sks.par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
|
||||
Seed(0),
|
||||
bit_size as u64,
|
||||
num_block as u64,
|
||||
);
|
||||
let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
|
||||
|
||||
bench_id_oprf =
|
||||
format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
|
||||
bench_id_oprf_bounded =
|
||||
format!("{bench_name}_bounded::throughput::{param_name}::{bit_size}_bits");
|
||||
let elements = throughput_num_threads(num_block, pbs_count);
|
||||
bench_group.throughput(Throughput::Elements(elements));
|
||||
|
||||
bench_group.bench_function(&bench_id_oprf, |b| {
|
||||
b.iter(|| {
|
||||
(0..elements).into_par_iter().for_each(|i| {
|
||||
let gpu_index: u32 = i as u32 % get_number_of_gpus();
|
||||
let stream = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
|
||||
gpu_sks_vec[gpu_index as usize]
|
||||
.par_generate_oblivious_pseudo_random_unsigned_integer(
|
||||
Seed(0),
|
||||
num_block as u64,
|
||||
&stream,
|
||||
);
|
||||
})
|
||||
})
|
||||
});
|
||||
|
||||
bench_group.bench_function(&bench_id_oprf_bounded, |b| {
|
||||
b.iter(|| {
|
||||
(0..elements).into_par_iter().for_each(|i| {
|
||||
let gpu_index: u32 = i as u32 % get_number_of_gpus();
|
||||
let stream = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
|
||||
gpu_sks_vec[gpu_index as usize]
|
||||
.par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
|
||||
Seed(0),
|
||||
bit_size as u64,
|
||||
num_block as u64,
|
||||
&stream,
|
||||
);
|
||||
})
|
||||
})
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
for (bench_id, display_name) in [
|
||||
(bench_id_oprf, "oprf"),
|
||||
(bench_id_oprf_bounded, "oprf_bounded"),
|
||||
] {
|
||||
write_to_json::<u64, _>(
|
||||
&bench_id,
|
||||
param,
|
||||
param.name(),
|
||||
display_name,
|
||||
&OperatorType::Atomic,
|
||||
bit_size as u32,
|
||||
vec![param.message_modulus().0.ilog2(); num_block],
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
bench_group.finish()
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user