chore(bench): add unbounded oprf integer benchmarks

Also move the CUDA OPRF benchmark into the same file as the CPU implementation.
Author:     David Testé
Date:       2025-08-21 14:48:44 +02:00
Committer:  David Testé
Parent:     53da030831
Commit:     4b6942a0f8

2 changed files with 192 additions and 103 deletions
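
For context, the new "unbounded" CPU benchmark exercises par_generate_oblivious_pseudo_random_unsigned_integer, which fills every block of the radix output with randomness, while the pre-existing "bounded" variant takes an extra bit_size argument capping the number of random bits. The following minimal Criterion sketch calls both outside the repository's KEY_CACHE/ParamsAndNumBlocksIter machinery; the key-generation helper, parameter constant, and block count are illustrative assumptions, and only the two method names and their signatures are taken from this diff.

use criterion::{black_box, criterion_group, criterion_main, Criterion};
use tfhe::integer::gen_keys_radix; // assumed standard tfhe-rs key-gen helper
use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS; // illustrative parameter choice
use tfhe_csprng::seeders::Seed;

fn oprf_benches(c: &mut Criterion) {
    // 16 blocks of 2-bit messages, i.e. a 32-bit radix integer (illustrative sizes).
    let num_blocks: u64 = 16;
    let (_cks, sk) = gen_keys_radix(PARAM_MESSAGE_2_CARRY_2_KS_PBS, num_blocks as usize);

    // Unbounded OPRF: randomness spans all blocks of the output ciphertext.
    c.bench_function("unsigned_oprf::32_bits", |b| {
        b.iter(|| {
            black_box(sk.par_generate_oblivious_pseudo_random_unsigned_integer(
                Seed(0),
                num_blocks,
            ))
        })
    });

    // Bounded OPRF: same output width, but only bit_size bits are random.
    c.bench_function("unsigned_oprf_bounded::24_of_32_bits", |b| {
        b.iter(|| {
            black_box(sk.par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
                Seed(0),
                24, // bit_size
                num_blocks,
            ))
        })
    });
}

criterion_group!(oprf, oprf_benches);
criterion_main!(oprf);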

File 1 of 2

@@ -1386,15 +1386,13 @@ define_server_key_bench_default_fn!(
 mod cuda {
     use super::*;
     use benchmark::utilities::cuda_integer_utils::{cuda_local_keys, cuda_local_streams};
-    use criterion::{black_box, criterion_group};
+    use criterion::criterion_group;
     use std::cmp::max;
     use tfhe::core_crypto::gpu::{get_number_of_gpus, CudaStreams};
     use tfhe::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
     use tfhe::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext;
     use tfhe::integer::gpu::server_key::CudaServerKey;
     use tfhe::integer::{RadixCiphertext, ServerKey};
-    use tfhe::GpuIndex;
-    use tfhe_csprng::seeders::Seed;

     fn bench_cuda_server_key_unary_function_clean_inputs<F, G>(
         c: &mut Criterion,
@@ -1942,90 +1940,6 @@ mod cuda {
         bench_group.finish()
     }

-    pub fn cuda_unsigned_oprf(c: &mut Criterion) {
-        let bench_name = "integer::cuda::unsigned_oprf";
-        let mut bench_group = c.benchmark_group(bench_name);
-        bench_group
-            .sample_size(15)
-            .measurement_time(std::time::Duration::from_secs(30));
-
-        for (param, num_block, bit_size) in ParamsAndNumBlocksIter::default() {
-            let param_name = param.name();
-            let bench_id;
-
-            match get_bench_type() {
-                BenchmarkType::Latency => {
-                    let streams = CudaStreams::new_multi_gpu();
-                    bench_id = format!("{bench_name}::{param_name}::{bit_size}_bits");
-                    bench_group.bench_function(&bench_id, |b| {
-                        let (cks, _cpu_sks) =
-                            KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
-                        let gpu_sks = CudaServerKey::new(&cks, &streams);
-
-                        b.iter(|| {
-                            _ = black_box(
-                                gpu_sks
-                                    .par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
-                                        Seed(0),
-                                        bit_size as u64,
-                                        num_block as u64,
-                                        &streams,
-                                    ),
-                            );
-                        })
-                    });
-                }
-                BenchmarkType::Throughput => {
-                    let (cks, cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
-                    let gpu_sks_vec = cuda_local_keys(&cks);
-
-                    // Execute the operation once to know its cost.
-                    reset_pbs_count();
-                    cpu_sks.par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
-                        Seed(0),
-                        bit_size as u64,
-                        num_block as u64,
-                    );
-                    let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
-
-                    bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
-                    let elements = throughput_num_threads(num_block, pbs_count);
-                    bench_group.throughput(Throughput::Elements(elements));
-                    bench_group.bench_function(&bench_id, |b| {
-                        b.iter(|| {
-                            (0..elements).into_par_iter().for_each(|i| {
-                                let gpu_index: u32 = i as u32 % get_number_of_gpus();
-                                let stream = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
-                                gpu_sks_vec[gpu_index as usize]
-                                    .par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
-                                        Seed(0),
-                                        bit_size as u64,
-                                        num_block as u64,
-                                        &stream,
-                                    );
-                            })
-                        })
-                    });
-                }
-            }
-
-            write_to_json::<u64, _>(
-                &bench_id,
-                param,
-                param.name(),
-                "oprf",
-                &OperatorType::Atomic,
-                bit_size as u32,
-                vec![param.message_modulus().0.ilog2(); num_block],
-            );
-        }
-
-        bench_group.finish()
-    }
-
     macro_rules! define_cuda_server_key_bench_clean_input_unary_fn (
         (method_name: $server_key_method:ident, method_name_cpu: $server_key_method_cpu:ident, display_name: $name:ident) => {
             ::paste::paste!{
@@ -2772,7 +2686,7 @@ mod cuda {
         cuda_trailing_zeros,
         cuda_trailing_ones,
         cuda_ilog2,
-        cuda_unsigned_oprf,
+        oprf::cuda::cuda_unsigned_oprf,
     );

     criterion_group!(
@@ -2800,7 +2714,7 @@ mod cuda {
         cuda_scalar_mul,
         cuda_scalar_div,
         cuda_scalar_rem,
-        cuda_unsigned_oprf,
+        oprf::cuda::cuda_unsigned_oprf,
     );

     criterion_group!(

File 2 of 2

@@ -22,12 +22,27 @@ pub fn unsigned_oprf(c: &mut Criterion) {
     for (param, num_block, bit_size) in ParamsAndNumBlocksIter::default() {
         let param_name = param.name();
-        let bench_id;
+        let bench_id_oprf;
+        let bench_id_oprf_bounded;

         match get_bench_type() {
             BenchmarkType::Latency => {
-                bench_id = format!("{bench_name}::{param_name}::{bit_size}_bits");
-                bench_group.bench_function(&bench_id, |b| {
+                bench_id_oprf = format!("{bench_name}::{param_name}::{bit_size}_bits");
+                bench_id_oprf_bounded =
+                    format!("{bench_name}_bounded::{param_name}::{bit_size}_bits");
+                bench_group.bench_function(&bench_id_oprf, |b| {
+                    let (_, sk) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
+
+                    b.iter(|| {
+                        _ = black_box(sk.par_generate_oblivious_pseudo_random_unsigned_integer(
+                            Seed(0),
+                            num_block as u64,
+                        ));
+                    })
+                });
+                bench_group.bench_function(&bench_id_oprf_bounded, |b| {
                     let (_, sk) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);

                     b.iter(|| {
@@ -53,10 +68,25 @@ pub fn unsigned_oprf(c: &mut Criterion) {
                 );
                 let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default

-                bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
+                bench_id_oprf = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
+                bench_id_oprf_bounded =
+                    format!("{bench_name}_bounded::throughput::{param_name}::{bit_size}_bits");
                 let elements = throughput_num_threads(num_block, pbs_count);
                 bench_group.throughput(Throughput::Elements(elements));
-                bench_group.bench_function(&bench_id, |b| {
+                bench_group.bench_function(&bench_id_oprf, |b| {
+                    b.iter(|| {
+                        (0..elements).into_par_iter().for_each(|_| {
+                            sk.par_generate_oblivious_pseudo_random_unsigned_integer(
+                                Seed(0),
+                                num_block as u64,
+                            );
+                        })
+                    })
+                });
+                bench_group.bench_function(&bench_id_oprf_bounded, |b| {
                     b.iter(|| {
                         (0..elements).into_par_iter().for_each(|_| {
                             sk.par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
@@ -70,16 +100,161 @@ pub fn unsigned_oprf(c: &mut Criterion) {
             }
         }

-        write_to_json::<u64, _>(
-            &bench_id,
-            param,
-            param.name(),
-            "oprf",
-            &OperatorType::Atomic,
-            bit_size as u32,
-            vec![param.message_modulus().0.ilog2(); num_block],
-        );
+        for (bench_id, display_name) in [
+            (bench_id_oprf, "oprf"),
+            (bench_id_oprf_bounded, "oprf_bounded"),
+        ] {
+            write_to_json::<u64, _>(
+                &bench_id,
+                param,
+                param.name(),
+                display_name,
+                &OperatorType::Atomic,
+                bit_size as u32,
+                vec![param.message_modulus().0.ilog2(); num_block],
+            );
+        }
     }

     bench_group.finish()
 }
+
+#[cfg(feature = "gpu")]
+pub mod cuda {
+    use super::*;
+    use benchmark::utilities::cuda_integer_utils::cuda_local_keys;
+    use criterion::black_box;
+    use std::cmp::max;
+    use tfhe::core_crypto::gpu::{get_number_of_gpus, CudaStreams};
+    use tfhe::integer::gpu::server_key::CudaServerKey;
+    use tfhe::GpuIndex;
+    use tfhe_csprng::seeders::Seed;
+
+    pub fn cuda_unsigned_oprf(c: &mut Criterion) {
+        let bench_name = "integer::cuda::unsigned_oprf";
+        let mut bench_group = c.benchmark_group(bench_name);
+        bench_group
+            .sample_size(15)
+            .measurement_time(std::time::Duration::from_secs(30));
+
+        for (param, num_block, bit_size) in ParamsAndNumBlocksIter::default() {
+            let param_name = param.name();
+            let bench_id_oprf;
+            let bench_id_oprf_bounded;
+
+            match get_bench_type() {
+                BenchmarkType::Latency => {
+                    let streams = CudaStreams::new_multi_gpu();
+                    bench_id_oprf = format!("{bench_name}::{param_name}::{bit_size}_bits");
+                    bench_id_oprf_bounded =
+                        format!("{bench_name}_bounded::{param_name}::{bit_size}_bits");
+                    bench_group.bench_function(&bench_id_oprf, |b| {
+                        let (cks, _cpu_sks) =
+                            KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
+                        let gpu_sks = CudaServerKey::new(&cks, &streams);
+
+                        b.iter(|| {
+                            _ = black_box(
+                                gpu_sks.par_generate_oblivious_pseudo_random_unsigned_integer(
+                                    Seed(0),
+                                    num_block as u64,
+                                    &streams,
+                                ),
+                            );
+                        })
+                    });
+                    bench_group.bench_function(&bench_id_oprf_bounded, |b| {
+                        let (cks, _cpu_sks) =
+                            KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
+                        let gpu_sks = CudaServerKey::new(&cks, &streams);
+
+                        b.iter(|| {
+                            _ = black_box(
+                                gpu_sks
+                                    .par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
+                                        Seed(0),
+                                        bit_size as u64,
+                                        num_block as u64,
+                                        &streams,
+                                    ),
+                            );
+                        })
+                    });
+                }
+                BenchmarkType::Throughput => {
+                    let (cks, cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
+                    let gpu_sks_vec = cuda_local_keys(&cks);
+
+                    // Execute the operation once to know its cost.
+                    reset_pbs_count();
+                    cpu_sks.par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
+                        Seed(0),
+                        bit_size as u64,
+                        num_block as u64,
+                    );
+                    let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
+
+                    bench_id_oprf =
+                        format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
+                    bench_id_oprf_bounded =
+                        format!("{bench_name}_bounded::throughput::{param_name}::{bit_size}_bits");
+                    let elements = throughput_num_threads(num_block, pbs_count);
+                    bench_group.throughput(Throughput::Elements(elements));
+                    bench_group.bench_function(&bench_id_oprf, |b| {
+                        b.iter(|| {
+                            (0..elements).into_par_iter().for_each(|i| {
+                                let gpu_index: u32 = i as u32 % get_number_of_gpus();
+                                let stream = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
+                                gpu_sks_vec[gpu_index as usize]
+                                    .par_generate_oblivious_pseudo_random_unsigned_integer(
+                                        Seed(0),
+                                        num_block as u64,
+                                        &stream,
+                                    );
+                            })
+                        })
+                    });
+                    bench_group.bench_function(&bench_id_oprf_bounded, |b| {
+                        b.iter(|| {
+                            (0..elements).into_par_iter().for_each(|i| {
+                                let gpu_index: u32 = i as u32 % get_number_of_gpus();
+                                let stream = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
+                                gpu_sks_vec[gpu_index as usize]
+                                    .par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
+                                        Seed(0),
+                                        bit_size as u64,
+                                        num_block as u64,
+                                        &stream,
+                                    );
+                            })
+                        })
+                    });
+                }
+            }
+
+            for (bench_id, display_name) in [
+                (bench_id_oprf, "oprf"),
+                (bench_id_oprf_bounded, "oprf_bounded"),
+            ] {
+                write_to_json::<u64, _>(
+                    &bench_id,
+                    param,
+                    param.name(),
+                    display_name,
+                    &OperatorType::Atomic,
+                    bit_size as u32,
+                    vec![param.message_modulus().0.ilog2(); num_block],
+                );
+            }
+        }
+
+        bench_group.finish()
+    }
+}
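
As a closing usage note, the GPU path mirrors the CPU one but threads a CudaStreams handle through every call and builds the server key on-device from the CPU client key. Below is a minimal sketch of one latency-style invocation of each variant; the client-key construction and parameter constant are illustrative assumptions, while the stream, key, and OPRF calls are the ones used in the benchmark above.

use tfhe::core_crypto::gpu::CudaStreams;
use tfhe::integer::gpu::server_key::CudaServerKey;
use tfhe::integer::ClientKey; // assumed CPU-side integer client key
use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS; // illustrative
use tfhe_csprng::seeders::Seed;

fn main() {
    // One stream set spanning all visible GPUs, as in the latency benchmark.
    let streams = CudaStreams::new_multi_gpu();

    // GPU server key derived from a CPU client key (illustrative parameters).
    let cks = ClientKey::new(PARAM_MESSAGE_2_CARRY_2_KS_PBS);
    let gpu_sks = CudaServerKey::new(&cks, &streams);

    // Unbounded: 16 blocks of randomness, no bit-size cap.
    let _unbounded = gpu_sks.par_generate_oblivious_pseudo_random_unsigned_integer(
        Seed(0),
        16, // num_blocks
        &streams,
    );

    // Bounded: same 16-block output, capped at 24 random bits.
    let _bounded = gpu_sks.par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
        Seed(0),
        24, // bit_size
        16, // num_blocks
        &streams,
    );
}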