Compare commits

...

3 Commits

Author SHA1 Message Date
David Testé  cae938a75b  wip: measure latencies of a list of cts  2025-01-28 11:19:27 +01:00
David Testé  bae1d1cf77  WIP: fix gpu streams and use iter_batched  2025-01-22 10:56:08 +01:00
David Testé  a3bc1a9d9e  chore(bench): new heuristic to define elements for throughput  2025-01-20 15:21:05 +01:00
             This is done to feed the backend with enough elements to keep it saturated
             while avoiding excessively long execution times for heavy operations such as
             multiplication or division.
8 changed files with 871 additions and 236 deletions

View File

@@ -282,14 +282,14 @@ check_typos: install_typos_checker
.PHONY: clippy_gpu # Run clippy lints on tfhe with "gpu" enabled
clippy_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
--features=boolean,shortint,integer,internal-keycache,gpu \
--features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats \
--all-targets \
-p $(TFHE_SPEC) -- --no-deps -D warnings
.PHONY: check_gpu # Run check on tfhe with "gpu" enabled
check_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" check \
--features=boolean,shortint,integer,internal-keycache,gpu \
--features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats \
--all-targets \
-p $(TFHE_SPEC)
@@ -394,10 +394,10 @@ clippy_trivium: install_rs_check_toolchain
.PHONY: clippy_all_targets # Run clippy lints on all targets (benches, examples, etc.)
clippy_all_targets: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
--features=boolean,shortint,integer,internal-keycache,zk-pok,strings \
--features=boolean,shortint,integer,internal-keycache,zk-pok,strings,pbs-stats \
-p $(TFHE_SPEC) -- --no-deps -D warnings
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
--features=boolean,shortint,integer,internal-keycache,zk-pok,strings,experimental \
--features=boolean,shortint,integer,internal-keycache,zk-pok,strings,pbs-stats,experimental \
-p $(TFHE_SPEC) -- --no-deps -D warnings
.PHONY: clippy_tfhe_csprng # Run clippy lints on tfhe-csprng
@@ -1056,35 +1056,35 @@ bench_integer: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-bench \
--features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
--features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) --
.PHONY: bench_signed_integer # Run benchmarks for signed integer
bench_signed_integer: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-signed-bench \
--features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
--features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) --
.PHONY: bench_integer_gpu # Run benchmarks for integer on GPU backend
bench_integer_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-bench \
--features=integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
--features=integer,gpu,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) --
.PHONY: bench_integer_compression # Run benchmarks for unsigned integer compression
bench_integer_compression: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench glwe_packing_compression-integer-bench \
--features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
--features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) --
.PHONY: bench_integer_compression_gpu
bench_integer_compression_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench glwe_packing_compression-integer-bench \
--features=integer,internal-keycache,gpu -p $(TFHE_SPEC) --
--features=integer,internal-keycache,gpu,pbs-stats -p $(TFHE_SPEC) --
.PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters
bench_integer_multi_bit: install_rs_check_toolchain
@@ -1092,7 +1092,7 @@ bench_integer_multi_bit: install_rs_check_toolchain
__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-bench \
--features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
--features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) --
.PHONY: bench_signed_integer_multi_bit # Run benchmarks for signed integer using multi-bit parameters
bench_signed_integer_multi_bit: install_rs_check_toolchain
@@ -1100,7 +1100,7 @@ bench_signed_integer_multi_bit: install_rs_check_toolchain
__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-signed-bench \
--features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
--features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) --
.PHONY: bench_integer_multi_bit_gpu # Run benchmarks for integer on GPU backend using multi-bit parameters
bench_integer_multi_bit_gpu: install_rs_check_toolchain
@@ -1108,7 +1108,7 @@ bench_integer_multi_bit_gpu: install_rs_check_toolchain
__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-bench \
--features=integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
--features=integer,gpu,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) --
.PHONY: bench_unsigned_integer_multi_bit_gpu # Run benchmarks for unsigned integer on GPU backend using multi-bit parameters
bench_unsigned_integer_multi_bit_gpu: install_rs_check_toolchain
@@ -1116,14 +1116,14 @@ bench_unsigned_integer_multi_bit_gpu: install_rs_check_toolchain
__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-bench \
--features=integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) -- ::unsigned
--features=integer,gpu,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) -- ::unsigned
.PHONY: bench_integer_zk # Run benchmarks for integer encryption with ZK proofs
bench_integer_zk: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench zk-pke-bench \
--features=integer,internal-keycache,zk-pok,nightly-avx512 \
--features=integer,internal-keycache,zk-pok,nightly-avx512,pbs-stats \
-p $(TFHE_SPEC) --
.PHONY: bench_shortint # Run benchmarks for shortint
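The `pbs-stats` feature added to every clippy, check and bench target above gates tfhe-rs's global PBS counter, which the reworked throughput benches use to estimate the cost of one operation before sizing their batches. Below is a minimal sketch of that calibration step, using the `reset_pbs_count`/`get_pbs_count` helpers imported later in this diff; the key generation and the parameter constant name are illustrative assumptions and may differ between tfhe-rs versions.

// Minimal sketch: build with the `integer` and `pbs-stats` features enabled, as in the
// Makefile targets above. The parameter constant name is an assumption and may vary by version.
use std::cmp::max;
use tfhe::integer::gen_keys_radix;
use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS;
use tfhe::{get_pbs_count, reset_pbs_count};

fn main() {
    let num_blocks = 32; // 64-bit integer with 2_2 parameters
    let (cks, sks) = gen_keys_radix(PARAM_MESSAGE_2_CARRY_2_KS_PBS, num_blocks);
    let ct_0 = cks.encrypt(42u64);
    let ct_1 = cks.encrypt(69u64);

    // Calibration pattern used throughout the reworked benches:
    // reset the global counter, run the operation once, read the count back.
    reset_pbs_count();
    let _ = sks.add_parallelized(&ct_0, &ct_1);
    // The operation might not perform any PBS, so fall back to 1.
    let pbs_count = max(get_pbs_count(), 1);
    println!("one 64-bit addition performed ~{pbs_count} PBS");
}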

File diff suppressed because it is too large

View File

@@ -6,6 +6,7 @@ use crate::utilities::{
};
use criterion::{black_box, criterion_group, Criterion, Throughput};
use rayon::prelude::*;
use std::cmp::max;
use tfhe::integer::ciphertext::CompressedCiphertextListBuilder;
use tfhe::integer::{ClientKey, RadixCiphertext};
use tfhe::keycache::NamedParam;
@@ -77,9 +78,19 @@ fn cpu_glwe_packing(c: &mut Criterion) {
});
}
BenchmarkType::Throughput => {
// Execute the operation once to know its cost.
let ct = cks.encrypt_radix(0_u32, num_blocks);
let mut builder = CompressedCiphertextListBuilder::new();
builder.push(ct);
let compressed = builder.build(&compression_key);
reset_pbs_count();
let _: RadixCiphertext = compressed.get(0, &decompression_key).unwrap().unwrap();
let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
let num_block =
(bit_size as f64 / (param.message_modulus.0 as f64).log(2.0)).ceil() as usize;
let elements = throughput_num_threads(num_block);
let elements = throughput_num_threads(num_block, pbs_count);
// FIXME thread usage seemed to be somewhat more "efficient".
// For example, with bit_size = 2, my laptop is only using around 2/3 of the
// available threads. Thread usage increases with bit_size = 8 but
@@ -150,6 +161,7 @@ fn cpu_glwe_packing(c: &mut Criterion) {
#[cfg(feature = "gpu")]
mod cuda {
use super::*;
use std::cmp::max;
use tfhe::core_crypto::gpu::CudaStreams;
use tfhe::integer::gpu::ciphertext::compressed_ciphertext_list::CudaCompressedCiphertextListBuilder;
use tfhe::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext;
@@ -185,27 +197,26 @@ mod cuda {
let bench_id_pack;
let bench_id_unpack;
// Generate private compression key
let cks = ClientKey::new(param);
let private_compression_key = cks.new_compression_private_key(comp_param);
// Generate and convert compression keys
let (radix_cks, _) = gen_keys_radix_gpu(param, num_blocks, &stream);
let (compressed_compression_key, compressed_decompression_key) =
radix_cks.new_compressed_compression_decompression_keys(&private_compression_key);
let cuda_compression_key = compressed_compression_key.decompress_to_cuda(&stream);
let cuda_decompression_key = compressed_decompression_key.decompress_to_cuda(
radix_cks.parameters().glwe_dimension(),
radix_cks.parameters().polynomial_size(),
radix_cks.parameters().message_modulus(),
radix_cks.parameters().carry_modulus(),
radix_cks.parameters().ciphertext_modulus(),
&stream,
);
match BENCH_TYPE.get().unwrap() {
BenchmarkType::Latency => {
// Generate private compression key
let cks = ClientKey::new(param);
let private_compression_key = cks.new_compression_private_key(comp_param);
// Generate and convert compression keys
let (radix_cks, _) = gen_keys_radix_gpu(param, num_blocks, &stream);
let (compressed_compression_key, compressed_decompression_key) = radix_cks
.new_compressed_compression_decompression_keys(&private_compression_key);
let cuda_compression_key =
compressed_compression_key.decompress_to_cuda(&stream);
let cuda_decompression_key = compressed_decompression_key.decompress_to_cuda(
radix_cks.parameters().glwe_dimension(),
radix_cks.parameters().polynomial_size(),
radix_cks.parameters().message_modulus(),
radix_cks.parameters().carry_modulus(),
radix_cks.parameters().ciphertext_modulus(),
&stream,
);
// Encrypt
let ct = cks.encrypt_radix(0_u32, num_blocks);
let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &stream);
@@ -239,28 +250,25 @@ mod cuda {
});
}
BenchmarkType::Throughput => {
// Execute the operation once to know its cost.
let (cpu_compression_key, cpu_decompression_key) =
cks.new_compression_decompression_keys(&private_compression_key);
let ct = cks.encrypt_radix(0_u32, num_blocks);
let mut builder = CompressedCiphertextListBuilder::new();
builder.push(ct);
let compressed = builder.build(&cpu_compression_key);
reset_pbs_count();
// Use the CPU operation, as pbs_count does not count PBS on the GPU backend.
let _: RadixCiphertext =
compressed.get(0, &cpu_decompression_key).unwrap().unwrap();
let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
let num_block = (bit_size as f64 / (param.message_modulus.0 as f64).log(2.0))
.ceil() as usize;
let elements = throughput_num_threads(num_block);
let elements = throughput_num_threads(num_block, pbs_count);
bench_group.throughput(Throughput::Elements(elements));
let cks = ClientKey::new(param);
let private_compression_key = cks.new_compression_private_key(comp_param);
let (radix_cks, _) = gen_keys_radix_gpu(param, num_blocks, &stream);
let (compressed_compression_key, compressed_decompression_key) = radix_cks
.new_compressed_compression_decompression_keys(&private_compression_key);
let cuda_compression_key =
compressed_compression_key.decompress_to_cuda(&stream);
let cuda_decompression_key = compressed_decompression_key.decompress_to_cuda(
radix_cks.parameters().glwe_dimension(),
radix_cks.parameters().polynomial_size(),
radix_cks.parameters().message_modulus(),
radix_cks.parameters().carry_modulus(),
radix_cks.parameters().ciphertext_modulus(),
&stream,
);
// Encrypt
let ct = cks.encrypt_radix(0_u32, num_blocks);
let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &stream);
@@ -344,6 +352,7 @@ criterion_group!(cpu_glwe_packing2, cpu_glwe_packing);
#[cfg(feature = "gpu")]
use cuda::gpu_glwe_packing2;
use tfhe::{get_pbs_count, reset_pbs_count};
fn main() {
BENCH_TYPE.get_or_init(|| BenchmarkType::from_env().unwrap());
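The same calibration feeds Criterion's throughput reporting: each `Throughput` branch derives a batch size from `throughput_num_threads(num_block, pbs_count)`, declares it with `Throughput::Elements`, and then runs that many independent operations in parallel with rayon inside the measured closure. Below is a condensed, standalone sketch of that shape; the real benches pull keys from `KEY_CACHE`, share the helper from `utilities`, and refresh inputs between iterations, and the inlined element formula only mirrors the CPU branch of the helper shown further down in this diff. The parameter constant name is an assumption.

// Condensed sketch of the throughput-bench shape used in this PR. Assumes tfhe-rs
// built with the `integer` and `pbs-stats` features; constant names may differ by version.
use criterion::{criterion_group, criterion_main, Criterion, Throughput};
use rayon::prelude::*;
use std::cmp::max;
use tfhe::integer::gen_keys_radix;
use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS;
use tfhe::{get_pbs_count, reset_pbs_count};

fn add_throughput(c: &mut Criterion) {
    let num_block = 32; // 64-bit ciphertext with 2_2 parameters
    let (cks, sks) = gen_keys_radix(PARAM_MESSAGE_2_CARRY_2_KS_PBS, num_block);

    // Calibrate: count the PBS triggered by one addition.
    let ct_0 = cks.encrypt(1u64);
    let ct_1 = cks.encrypt(2u64);
    reset_pbs_count();
    sks.add_parallelized(&ct_0, &ct_1);
    let pbs_count = max(get_pbs_count(), 1);

    // Inlined equivalent of throughput_num_threads(num_block, pbs_count) for the
    // CPU case with num_block <= 32 (see the utilities diff below).
    let num_threads = rayon::current_num_threads() as f64;
    let elements = ((num_threads * 1.2) * (num_threads / pbs_count as f64).max(0.2)) as u64;

    let mut group = c.benchmark_group("integer::add_parallelized");
    // One measured iteration performs `elements` logical operations.
    group.throughput(Throughput::Elements(elements));
    group.bench_function("throughput::example::32_blocks", |b| {
        // Encrypt the inputs outside the measured closure.
        let cts: Vec<_> = (0..elements)
            .map(|i| (cks.encrypt(i), cks.encrypt(i + 1)))
            .collect();
        b.iter(|| {
            cts.par_iter().for_each(|(lhs, rhs)| {
                sks.add_parallelized(lhs, rhs);
            })
        })
    });
    group.finish();
}

criterion_group!(benches, add_throughput);
criterion_main!(benches);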

View File

@@ -4,9 +4,11 @@ use crate::utilities::{
};
use criterion::{black_box, Criterion, Throughput};
use rayon::prelude::*;
use std::cmp::max;
use tfhe::integer::keycache::KEY_CACHE;
use tfhe::integer::IntegerKeyKind;
use tfhe::keycache::NamedParam;
use tfhe::{get_pbs_count, reset_pbs_count};
use tfhe_csprng::seeders::Seed;
pub fn unsigned_oprf(c: &mut Criterion) {
@@ -40,12 +42,21 @@ pub fn unsigned_oprf(c: &mut Criterion) {
});
}
BenchmarkType::Throughput => {
let (_, sk) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
// Execute the operation once to know its cost.
reset_pbs_count();
sk.par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
Seed(0),
bit_size as u64,
num_block as u64,
);
let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
let elements = throughput_num_threads(num_block);
let elements = throughput_num_threads(num_block, pbs_count);
bench_group.throughput(Throughput::Elements(elements));
bench_group.bench_function(&bench_id, |b| {
let (_, sk) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
b.iter(|| {
(0..elements).into_par_iter().for_each(|_| {
sk.par_generate_oblivious_pseudo_random_unsigned_integer_bounded(

View File

@@ -8,6 +8,7 @@ use crate::utilities::{
use criterion::{criterion_group, Criterion, Throughput};
use rand::prelude::*;
use rayon::prelude::*;
use std::cmp::max;
use std::env;
use tfhe::integer::keycache::KEY_CACHE;
use tfhe::integer::prelude::*;
@@ -66,12 +67,20 @@ fn bench_server_key_signed_binary_function_clean_inputs<F>(
});
}
BenchmarkType::Throughput => {
let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
// Execute the operation once to know its cost.
let ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block);
let ct_1 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block);
reset_pbs_count();
binary_op(&sks, &ct_0, &ct_1);
let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
let elements = throughput_num_threads(num_block);
let elements = throughput_num_threads(num_block, pbs_count);
bench_group.throughput(Throughput::Elements(elements));
bench_group.bench_function(&bench_id, |b| {
let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let mut cts_0 = (0..elements)
.map(|_| cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block))
.collect::<Vec<_>>();
@@ -151,12 +160,21 @@ fn bench_server_key_signed_shift_function_clean_inputs<F>(
});
}
BenchmarkType::Throughput => {
let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
// Execute the operation once to know its cost.
let clear_1 = rng.gen_range(0u128..bit_size as u128);
let ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block);
let ct_1 = cks.encrypt_radix(clear_1, num_block);
reset_pbs_count();
binary_op(&sks, &ct_0, &ct_1);
let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
let elements = throughput_num_threads(num_block);
let elements = throughput_num_threads(num_block, pbs_count);
bench_group.throughput(Throughput::Elements(elements));
bench_group.bench_function(&bench_id, |b| {
let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let mut cts_0 = (0..elements)
.map(|_| cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block))
.collect::<Vec<_>>();
@@ -233,12 +251,19 @@ fn bench_server_key_unary_function_clean_inputs<F>(
});
}
BenchmarkType::Throughput => {
let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
// Execute the operation once to know its cost.
let ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block);
reset_pbs_count();
unary_fn(&sks, &ct_0);
let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
let elements = throughput_num_threads(num_block);
let elements = throughput_num_threads(num_block, pbs_count);
bench_group.throughput(Throughput::Elements(elements));
bench_group.bench_function(&bench_id, |b| {
let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let mut cts_0 = (0..elements)
.map(|_| cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block))
.collect::<Vec<_>>();
@@ -307,12 +332,21 @@ fn signed_if_then_else_parallelized(c: &mut Criterion) {
});
}
BenchmarkType::Throughput => {
let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
// Execute the operation once to know its cost.
let cond = sks.create_trivial_boolean_block(rng.gen_bool(0.5));
let ct_then = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block);
let ct_else = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block);
reset_pbs_count();
sks.if_then_else_parallelized(&cond, &ct_then, &ct_else);
let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
let elements = throughput_num_threads(num_block);
let elements = throughput_num_threads(num_block, pbs_count);
bench_group.throughput(Throughput::Elements(elements));
bench_group.bench_function(&bench_id, |b| {
let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let cts_cond = (0..elements)
.map(|_| sks.create_trivial_boolean_block(rng.gen_bool(0.5)))
.collect::<Vec<_>>();
@@ -830,12 +864,20 @@ fn bench_server_key_binary_scalar_function_clean_inputs<F, G>(
});
}
BenchmarkType::Throughput => {
let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
// Execute the operation once to know its cost.
let mut ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block);
let clear_1 = rng_func(&mut rng, bit_size);
reset_pbs_count();
binary_op(&sks, &mut ct_0, clear_1);
let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
let elements = throughput_num_threads(num_block);
let elements = throughput_num_threads(num_block, pbs_count);
bench_group.throughput(Throughput::Elements(elements));
bench_group.bench_function(&bench_id, |b| {
let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let mut cts_0 = (0..elements)
.map(|_| cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block))
.collect::<Vec<_>>();
@@ -1328,6 +1370,7 @@ mod cuda {
use super::*;
use criterion::criterion_group;
use rayon::iter::IntoParallelRefIterator;
use std::cmp::max;
use tfhe::core_crypto::gpu::CudaStreams;
use tfhe::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
use tfhe::integer::gpu::ciphertext::{CudaSignedRadixCiphertext, CudaUnsignedRadixCiphertext};
@@ -1335,11 +1378,12 @@ mod cuda {
/// Base function to bench a server key function that is a binary operation, input ciphertext
/// will contain only zero carries
fn bench_cuda_server_key_binary_signed_function_clean_inputs<F>(
fn bench_cuda_server_key_binary_signed_function_clean_inputs<F, G>(
c: &mut Criterion,
bench_name: &str,
display_name: &str,
binary_op: F,
binary_op_cpu: G,
) where
F: Fn(
&CudaServerKey,
@@ -1347,6 +1391,7 @@ mod cuda {
&mut CudaSignedRadixCiphertext,
&CudaStreams,
) + Sync,
G: Fn(&ServerKey, &SignedRadixCiphertext, &SignedRadixCiphertext) + Sync,
{
let mut bench_group = c.benchmark_group(bench_name);
bench_group
@@ -1401,14 +1446,22 @@ mod cuda {
});
}
BenchmarkType::Throughput => {
let (cks, cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let gpu_sks = CudaServerKey::new(&cks, &stream);
// Execute the operation once to know its cost.
let mut ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block);
let mut ct_1 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block);
reset_pbs_count();
// Use the CPU operation, as pbs_count does not count PBS on the GPU backend.
binary_op_cpu(&cpu_sks, &mut ct_0, &mut ct_1);
let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
let elements = throughput_num_threads(num_block);
let elements = throughput_num_threads(num_block, pbs_count);
bench_group.throughput(Throughput::Elements(elements));
bench_group.bench_function(&bench_id, |b| {
let (cks, _cpu_sks) =
KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let gpu_sks = CudaServerKey::new(&cks, &stream);
let mut cts_0 = (0..elements)
.map(|_| {
let clearlow = rng.gen::<u128>();
@@ -1460,7 +1513,7 @@ mod cuda {
}
macro_rules! define_cuda_server_key_bench_clean_input_signed_fn (
(method_name: $server_key_method:ident, display_name:$name:ident) => {
(method_name: $server_key_method:ident, method_name_cpu: $server_key_method_cpu:ident, display_name:$name:ident) => {
::paste::paste!{
fn [<cuda_ $server_key_method>](c: &mut Criterion) {
bench_cuda_server_key_binary_signed_function_clean_inputs(
@@ -1469,6 +1522,9 @@ mod cuda {
stringify!($name),
|server_key, lhs, rhs, stream| {
server_key.$server_key_method(lhs, rhs, stream);
},
|server_key_cpu, lhs, rhs| {
server_key_cpu.$server_key_method_cpu(lhs, rhs);
}
)
}
@@ -1478,13 +1534,15 @@ mod cuda {
/// Base function to bench a server key function that is a unary operation, input ciphertext
/// will contain only zero carries
fn bench_cuda_server_key_unary_signed_function_clean_inputs<F>(
fn bench_cuda_server_key_unary_signed_function_clean_inputs<F, G>(
c: &mut Criterion,
bench_name: &str,
display_name: &str,
unary_op: F,
unary_op_cpu: G,
) where
F: Fn(&CudaServerKey, &mut CudaSignedRadixCiphertext, &CudaStreams) + Sync,
G: Fn(&ServerKey, &SignedRadixCiphertext) + Sync,
{
let mut bench_group = c.benchmark_group(bench_name);
bench_group
@@ -1527,14 +1585,21 @@ mod cuda {
});
}
BenchmarkType::Throughput => {
let (cks, cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let gpu_sks = CudaServerKey::new(&cks, &stream);
// Execute the operation once to know its cost.
let ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block);
reset_pbs_count();
// Use the CPU operation, as pbs_count does not count PBS on the GPU backend.
unary_op_cpu(&cpu_sks, &ct_0);
let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
let elements = throughput_num_threads(num_block);
let elements = throughput_num_threads(num_block, pbs_count);
bench_group.throughput(Throughput::Elements(elements));
bench_group.bench_function(&bench_id, |b| {
let (cks, _cpu_sks) =
KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let gpu_sks = CudaServerKey::new(&cks, &stream);
let mut cts_0 = (0..elements)
.map(|_| {
let clearlow = rng.gen::<u128>();
@@ -1572,7 +1637,7 @@ mod cuda {
}
macro_rules! define_cuda_server_key_bench_clean_input_signed_unary_fn (
(method_name: $server_key_method:ident, display_name:$name:ident) => {
(method_name: $server_key_method:ident, method_name_cpu: $server_key_method_cpu:ident, display_name:$name:ident) => {
::paste::paste!{
fn [<cuda_ $server_key_method>](c: &mut Criterion) {
bench_cuda_server_key_unary_signed_function_clean_inputs(
@@ -1581,6 +1646,9 @@ mod cuda {
stringify!($name),
|server_key, input, stream| {
server_key.$server_key_method(input, stream);
},
|server_key_cpu, lhs| {
server_key_cpu.$server_key_method_cpu(lhs);
}
)
}
@@ -1588,15 +1656,17 @@ mod cuda {
}
);
fn bench_cuda_server_key_binary_scalar_signed_function_clean_inputs<F, G>(
fn bench_cuda_server_key_binary_scalar_signed_function_clean_inputs<F, G, H>(
c: &mut Criterion,
bench_name: &str,
display_name: &str,
binary_op: F,
rng_func: G,
binary_op_cpu: G,
rng_func: H,
) where
F: Fn(&CudaServerKey, &mut CudaSignedRadixCiphertext, ScalarType, &CudaStreams) + Sync,
G: Fn(&mut ThreadRng, usize) -> ScalarType,
G: Fn(&ServerKey, &mut SignedRadixCiphertext, ScalarType) + Sync,
H: Fn(&mut ThreadRng, usize) -> ScalarType,
{
let mut bench_group = c.benchmark_group(bench_name);
bench_group
@@ -1650,16 +1720,24 @@ mod cuda {
});
}
BenchmarkType::Throughput => {
let (cks, cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let gpu_sks = CudaServerKey::new(&cks, &stream);
// Execute the operation once to know its cost.
let mut ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block);
let clear_0 = rng_func(&mut rng, bit_size);
reset_pbs_count();
// Use the CPU operation, as pbs_count does not count PBS on the GPU backend.
binary_op_cpu(&cpu_sks, &mut ct_0, clear_0);
let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
bench_id = format!(
"{bench_name}::throughput::{param_name}::{bit_size}_bits_scalar_{bit_size}"
);
let elements = throughput_num_threads(num_block);
let elements = throughput_num_threads(num_block, pbs_count);
bench_group.throughput(Throughput::Elements(elements));
bench_group.bench_function(&bench_id, |b| {
let (cks, _cpu_sks) =
KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let gpu_sks = CudaServerKey::new(&cks, &stream);
let mut cts_0 = (0..elements)
.map(|_| {
let clearlow = rng.gen::<u128>();
@@ -1702,7 +1780,7 @@ mod cuda {
}
macro_rules! define_cuda_server_key_bench_clean_input_scalar_signed_fn (
(method_name: $server_key_method:ident, display_name:$name:ident, rng_func:$($rng_fn:tt)*) => {
(method_name: $server_key_method:ident, method_name_cpu: $server_key_method_cpu:ident, display_name:$name:ident, rng_func:$($rng_fn:tt)*) => {
::paste::paste!{
fn [<cuda_ $server_key_method>](c: &mut Criterion) {
bench_cuda_server_key_binary_scalar_signed_function_clean_inputs(
@@ -1712,6 +1790,9 @@ mod cuda {
|server_key, lhs, rhs, stream| {
server_key.$server_key_method(lhs, rhs, stream);
},
|server_key_cpu, lhs, rhs| {
server_key_cpu.$server_key_method_cpu(lhs, rhs);
},
$($rng_fn)*
)
}
@@ -1721,11 +1802,12 @@ mod cuda {
/// Base function to bench a server key function that is a binary operation for shift/rotate,
/// input ciphertext will contain only zero carries
fn bench_cuda_server_key_shift_rotate_signed_function_clean_inputs<F>(
fn bench_cuda_server_key_shift_rotate_signed_function_clean_inputs<F, G>(
c: &mut Criterion,
bench_name: &str,
display_name: &str,
binary_op: F,
binary_op_cpu: G,
) where
F: Fn(
&CudaServerKey,
@@ -1733,6 +1815,7 @@ mod cuda {
&mut CudaUnsignedRadixCiphertext,
&CudaStreams,
) + Sync,
G: Fn(&ServerKey, &SignedRadixCiphertext, &RadixCiphertext) + Sync,
{
let mut bench_group = c.benchmark_group(bench_name);
bench_group
@@ -1786,14 +1869,23 @@ mod cuda {
});
}
BenchmarkType::Throughput => {
let (cks, cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let gpu_sks = CudaServerKey::new(&cks, &stream);
// Execute the operation once to know its cost.
let clear_1 = rng.gen_range(0u128..bit_size as u128);
let ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block);
let ct_1 = cks.encrypt_radix(clear_1, num_block);
reset_pbs_count();
// Use the CPU operation, as pbs_count does not count PBS on the GPU backend.
binary_op_cpu(&cpu_sks, &ct_0, &ct_1);
let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
let elements = throughput_num_threads(num_block);
let elements = throughput_num_threads(num_block, pbs_count);
bench_group.throughput(Throughput::Elements(elements));
bench_group.bench_function(&bench_id, |b| {
let (cks, _cpu_sks) =
KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let gpu_sks = CudaServerKey::new(&cks, &stream);
let mut cts_0 = (0..elements)
.map(|_| {
let clearlow = rng.gen::<u128>();
@@ -1843,7 +1935,7 @@ mod cuda {
}
macro_rules! define_cuda_server_key_bench_clean_input_signed_shift_rotate (
(method_name: $server_key_method:ident, display_name:$name:ident) => {
(method_name: $server_key_method:ident, method_name_cpu: $server_key_method_cpu:ident, display_name:$name:ident) => {
::paste::paste!{
fn [<cuda_ $server_key_method>](c: &mut Criterion) {
bench_cuda_server_key_shift_rotate_signed_function_clean_inputs(
@@ -1852,6 +1944,9 @@ mod cuda {
stringify!($name),
|server_key, lhs, rhs, stream| {
server_key.$server_key_method(lhs, rhs, stream);
},
|server_key_cpu, lhs, rhs| {
server_key_cpu.$server_key_method_cpu(lhs, rhs);
}
)
}
@@ -1916,14 +2011,23 @@ mod cuda {
});
}
BenchmarkType::Throughput => {
let (cks, cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let gpu_sks = CudaServerKey::new(&cks, &stream);
// Execute the operation once to know its cost.
let cond = cpu_sks.create_trivial_boolean_block(rng.gen_bool(0.5));
let ct_then = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block);
let ct_else = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block);
reset_pbs_count();
// Use the CPU operation, as pbs_count does not count PBS on the GPU backend.
cpu_sks.if_then_else_parallelized(&cond, &ct_then, &ct_else);
let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
let elements = throughput_num_threads(num_block);
let elements = throughput_num_threads(num_block, pbs_count);
bench_group.throughput(Throughput::Elements(elements));
bench_group.bench_function(&bench_id, |b| {
let (cks, _cpu_sks) =
KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let gpu_sks = CudaServerKey::new(&cks, &stream);
let cts_cond = (0..elements)
.map(|_| {
let ct_cond = cks.encrypt_bool(rng.gen::<bool>());
@@ -1997,246 +2101,291 @@ mod cuda {
define_cuda_server_key_bench_clean_input_signed_fn!(
method_name: unchecked_add,
method_name_cpu: unchecked_add_parallelized,
display_name: add
);
define_cuda_server_key_bench_clean_input_signed_fn!(
method_name: unchecked_sub,
method_name_cpu: unchecked_sub,
display_name: sub
);
define_cuda_server_key_bench_clean_input_signed_unary_fn!(
method_name: unchecked_neg,
method_name_cpu: unchecked_neg,
display_name: neg
);
define_cuda_server_key_bench_clean_input_signed_unary_fn!(
method_name: unchecked_abs,
method_name_cpu: unchecked_abs_parallelized,
display_name: abs
);
define_cuda_server_key_bench_clean_input_signed_fn!(
method_name: unchecked_mul,
method_name_cpu: unchecked_mul_parallelized,
display_name: mul
);
define_cuda_server_key_bench_clean_input_signed_fn!(
method_name: unchecked_div_rem,
method_name_cpu: unchecked_div_rem_parallelized,
display_name: div_mod
);
define_cuda_server_key_bench_clean_input_signed_fn!(
method_name: unchecked_bitand,
method_name_cpu: unchecked_bitand_parallelized,
display_name: bitand
);
define_cuda_server_key_bench_clean_input_signed_fn!(
method_name: unchecked_bitor,
method_name_cpu: unchecked_bitor_parallelized,
display_name: bitor
);
define_cuda_server_key_bench_clean_input_signed_fn!(
method_name: unchecked_bitxor,
method_name_cpu: unchecked_bitxor_parallelized,
display_name: bitxor
);
define_cuda_server_key_bench_clean_input_signed_unary_fn!(
method_name: unchecked_bitnot,
method_name_cpu: bitnot,
display_name: bitnot
);
define_cuda_server_key_bench_clean_input_signed_shift_rotate!(
method_name: unchecked_rotate_left,
method_name_cpu: unchecked_rotate_left_parallelized,
display_name: rotate_left
);
define_cuda_server_key_bench_clean_input_signed_shift_rotate!(
method_name: unchecked_rotate_right,
method_name_cpu: unchecked_rotate_right_parallelized,
display_name: rotate_right
);
define_cuda_server_key_bench_clean_input_signed_shift_rotate!(
method_name: unchecked_left_shift,
method_name_cpu: unchecked_left_shift_parallelized,
display_name: left_shift
);
define_cuda_server_key_bench_clean_input_signed_shift_rotate!(
method_name: unchecked_right_shift,
method_name_cpu: unchecked_right_shift_parallelized,
display_name: right_shift
);
define_cuda_server_key_bench_clean_input_signed_fn!(
method_name: unchecked_eq,
method_name_cpu: unchecked_eq_parallelized,
display_name: eq
);
define_cuda_server_key_bench_clean_input_signed_fn!(
method_name: unchecked_ne,
method_name_cpu: unchecked_ne_parallelized,
display_name: ne
);
define_cuda_server_key_bench_clean_input_signed_fn!(
method_name: unchecked_gt,
method_name_cpu: unchecked_gt_parallelized,
display_name: gt
);
define_cuda_server_key_bench_clean_input_signed_fn!(
method_name: unchecked_ge,
method_name_cpu: unchecked_ge_parallelized,
display_name: ge
);
define_cuda_server_key_bench_clean_input_signed_fn!(
method_name: unchecked_lt,
method_name_cpu: unchecked_lt_parallelized,
display_name: lt
);
define_cuda_server_key_bench_clean_input_signed_fn!(
method_name: unchecked_le,
method_name_cpu: unchecked_le_parallelized,
display_name: le
);
define_cuda_server_key_bench_clean_input_signed_fn!(
method_name: unchecked_min,
method_name_cpu: unchecked_min_parallelized,
display_name: min
);
define_cuda_server_key_bench_clean_input_signed_fn!(
method_name: unchecked_max,
method_name_cpu: unchecked_max_parallelized,
display_name: max
);
define_cuda_server_key_bench_clean_input_signed_fn!(
method_name: unchecked_signed_overflowing_add,
method_name_cpu: unchecked_signed_overflowing_add_parallelized,
display_name: overflowing_add
);
define_cuda_server_key_bench_clean_input_signed_fn!(
method_name: unchecked_signed_overflowing_sub,
method_name_cpu: unchecked_signed_overflowing_sub_parallelized,
display_name: overflowing_sub
);
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
method_name: unchecked_scalar_add,
method_name_cpu: unchecked_scalar_add,
display_name: add,
rng_func: default_signed_scalar
);
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
method_name: unchecked_scalar_mul,
method_name_cpu: unchecked_scalar_mul_parallelized,
display_name: mul,
rng_func: mul_signed_scalar
);
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
method_name: unchecked_scalar_sub,
method_name_cpu: unchecked_scalar_sub,
display_name: sub,
rng_func: default_signed_scalar
);
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
method_name: unchecked_scalar_bitand,
method_name_cpu: unchecked_scalar_bitand_parallelized,
display_name: bitand,
rng_func: default_signed_scalar
);
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
method_name: unchecked_scalar_bitor,
method_name_cpu: unchecked_scalar_bitor_parallelized,
display_name: bitor,
rng_func: default_signed_scalar
);
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
method_name: unchecked_scalar_bitxor,
method_name_cpu: unchecked_scalar_bitxor_parallelized,
display_name: bitxor,
rng_func: default_signed_scalar
);
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
method_name: unchecked_scalar_right_shift,
method_name_cpu: unchecked_scalar_right_shift_parallelized,
display_name: right_shift,
rng_func: default_signed_scalar
);
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
method_name: unchecked_scalar_left_shift,
method_name_cpu: unchecked_scalar_left_shift_parallelized,
display_name: left_shift,
rng_func: shift_scalar
);
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
method_name: unchecked_scalar_rotate_right,
method_name_cpu: unchecked_scalar_rotate_right_parallelized,
display_name: rotate_right,
rng_func: shift_scalar
);
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
method_name: unchecked_scalar_rotate_left,
method_name_cpu: unchecked_scalar_rotate_left_parallelized,
display_name: rotate_left,
rng_func: shift_scalar
);
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
method_name: unchecked_scalar_eq,
method_name_cpu: unchecked_scalar_eq_parallelized,
display_name: eq,
rng_func: default_signed_scalar
);
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
method_name: unchecked_scalar_ne,
method_name_cpu: unchecked_scalar_ne_parallelized,
display_name: ne,
rng_func: default_signed_scalar
);
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
method_name: unchecked_scalar_gt,
method_name_cpu: unchecked_scalar_gt_parallelized,
display_name: gt,
rng_func: default_signed_scalar
);
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
method_name: unchecked_scalar_ge,
method_name_cpu: unchecked_scalar_ge_parallelized,
display_name: ge,
rng_func: default_signed_scalar
);
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
method_name: unchecked_scalar_lt,
method_name_cpu: unchecked_scalar_lt_parallelized,
display_name: lt,
rng_func: default_signed_scalar
);
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
method_name: unchecked_scalar_le,
method_name_cpu: unchecked_scalar_le_parallelized,
display_name: le,
rng_func: default_signed_scalar
);
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
method_name: unchecked_scalar_min,
method_name_cpu: unchecked_scalar_min_parallelized,
display_name: min,
rng_func: default_signed_scalar
);
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
method_name: unchecked_scalar_max,
method_name_cpu: unchecked_scalar_max_parallelized,
display_name: max,
rng_func: default_signed_scalar
);
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
method_name: signed_overflowing_scalar_add,
method_name_cpu: signed_overflowing_scalar_add_parallelized,
display_name: overflowing_add,
rng_func: default_signed_scalar
);
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
method_name: signed_overflowing_scalar_sub,
method_name_cpu: signed_overflowing_scalar_sub_parallelized,
display_name: overflowing_sub,
rng_func: default_signed_scalar
);
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
method_name: unchecked_signed_scalar_div_rem,
method_name_cpu: unchecked_signed_scalar_div_rem_parallelized,
display_name: div_rem,
rng_func: div_scalar
);
@@ -2247,234 +2396,277 @@ mod cuda {
define_cuda_server_key_bench_clean_input_signed_fn!(
method_name: add,
method_name_cpu: add_parallelized,
display_name: add
);
define_cuda_server_key_bench_clean_input_signed_fn!(
method_name: sub,
method_name_cpu: sub_parallelized,
display_name: sub
);
define_cuda_server_key_bench_clean_input_signed_unary_fn!(
method_name: neg,
method_name_cpu: neg_parallelized,
display_name: neg
);
define_cuda_server_key_bench_clean_input_signed_unary_fn!(
method_name: abs,
method_name_cpu: abs_parallelized,
display_name: abs
);
define_cuda_server_key_bench_clean_input_signed_fn!(
method_name: mul,
method_name_cpu: mul_parallelized,
display_name: mul
);
define_cuda_server_key_bench_clean_input_signed_fn!(
method_name: div_rem,
method_name_cpu: div_rem_parallelized,
display_name: div_mod
);
define_cuda_server_key_bench_clean_input_signed_fn!(
method_name: bitand,
method_name_cpu: bitand_parallelized,
display_name: bitand
);
define_cuda_server_key_bench_clean_input_signed_unary_fn!(
method_name: bitnot,
method_name_cpu: bitnot,
display_name: bitnot
);
define_cuda_server_key_bench_clean_input_signed_fn!(
method_name: bitor,
method_name_cpu: bitor_parallelized,
display_name: bitor
);
define_cuda_server_key_bench_clean_input_signed_fn!(
method_name: bitxor,
method_name_cpu: bitxor_parallelized,
display_name: bitxor
);
define_cuda_server_key_bench_clean_input_signed_shift_rotate!(
method_name: rotate_left,
method_name_cpu: rotate_left_parallelized,
display_name: rotate_left
);
define_cuda_server_key_bench_clean_input_signed_shift_rotate!(
method_name: rotate_right,
method_name_cpu: rotate_right_parallelized,
display_name: rotate_right
);
define_cuda_server_key_bench_clean_input_signed_shift_rotate!(
method_name: left_shift,
method_name_cpu: left_shift_parallelized,
display_name: left_shift
);
define_cuda_server_key_bench_clean_input_signed_shift_rotate!(
method_name: right_shift,
method_name_cpu: right_shift_parallelized,
display_name: right_shift
);
define_cuda_server_key_bench_clean_input_signed_fn!(
method_name: eq,
method_name_cpu: eq_parallelized,
display_name: eq
);
define_cuda_server_key_bench_clean_input_signed_fn!(
method_name: ne,
method_name_cpu: ne_parallelized,
display_name: ne
);
define_cuda_server_key_bench_clean_input_signed_fn!(
method_name: gt,
method_name_cpu: gt_parallelized,
display_name: gt
);
define_cuda_server_key_bench_clean_input_signed_fn!(
method_name: ge,
method_name_cpu: ge_parallelized,
display_name: ge
);
define_cuda_server_key_bench_clean_input_signed_fn!(
method_name: lt,
method_name_cpu: lt_parallelized,
display_name: lt
);
define_cuda_server_key_bench_clean_input_signed_fn!(
method_name: le,
method_name_cpu: le_parallelized,
display_name: le
);
define_cuda_server_key_bench_clean_input_signed_fn!(
method_name: min,
method_name_cpu: min_parallelized,
display_name: min
);
define_cuda_server_key_bench_clean_input_signed_fn!(
method_name: max,
method_name_cpu: max_parallelized,
display_name: max
);
define_cuda_server_key_bench_clean_input_signed_fn!(
method_name: signed_overflowing_add,
method_name_cpu: signed_overflowing_add_parallelized,
display_name: overflowing_add
);
define_cuda_server_key_bench_clean_input_signed_fn!(
method_name: signed_overflowing_sub,
method_name_cpu: signed_overflowing_sub_parallelized,
display_name: overflowing_sub
);
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
method_name: scalar_add,
method_name_cpu: scalar_add_parallelized,
display_name: add,
rng_func: default_signed_scalar
);
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
method_name: scalar_mul,
method_name_cpu: scalar_mul_parallelized,
display_name: mul,
rng_func: mul_signed_scalar
);
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
method_name: scalar_sub,
method_name_cpu: scalar_sub_parallelized,
display_name: sub,
rng_func: default_signed_scalar
);
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
method_name: scalar_bitand,
method_name_cpu: scalar_bitand_parallelized,
display_name: bitand,
rng_func: default_signed_scalar
);
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
method_name: scalar_bitor,
method_name_cpu: scalar_bitor_parallelized,
display_name: bitor,
rng_func: default_signed_scalar
);
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
method_name: scalar_bitxor,
method_name_cpu: scalar_bitxor_parallelized,
display_name: bitxor,
rng_func: default_signed_scalar
);
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
method_name: scalar_left_shift,
method_name_cpu: scalar_left_shift_parallelized,
display_name: left_shift,
rng_func: shift_scalar
);
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
method_name: scalar_right_shift,
method_name_cpu: scalar_right_shift_parallelized,
display_name: right_shift,
rng_func: shift_scalar
);
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
method_name: scalar_rotate_left,
method_name_cpu: scalar_rotate_left_parallelized,
display_name: rotate_left,
rng_func: shift_scalar
);
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
method_name: scalar_rotate_right,
method_name_cpu: scalar_rotate_right_parallelized,
display_name: rotate_right,
rng_func: shift_scalar
);
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
method_name: scalar_eq,
method_name_cpu: scalar_eq_parallelized,
display_name: eq,
rng_func: default_signed_scalar
);
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
method_name: scalar_ne,
method_name_cpu: scalar_ne_parallelized,
display_name: ne,
rng_func: default_signed_scalar
);
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
method_name: scalar_gt,
method_name_cpu: scalar_gt_parallelized,
display_name: gt,
rng_func: default_signed_scalar
);
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
method_name: scalar_ge,
method_name_cpu: scalar_ge_parallelized,
display_name: ge,
rng_func: default_signed_scalar
);
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
method_name: scalar_lt,
method_name_cpu: scalar_lt_parallelized,
display_name: lt,
rng_func: default_signed_scalar
);
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
method_name: scalar_le,
method_name_cpu: scalar_le_parallelized,
display_name: le,
rng_func: default_signed_scalar
);
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
method_name: scalar_min,
method_name_cpu: scalar_min_parallelized,
display_name: min,
rng_func: default_signed_scalar
);
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
method_name: scalar_max,
method_name_cpu: scalar_max_parallelized,
display_name: max,
rng_func: default_signed_scalar
);
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
method_name: signed_scalar_div_rem,
method_name_cpu: signed_scalar_div_rem_parallelized,
display_name: div_rem,
rng_func: div_scalar
);
@@ -2697,6 +2889,7 @@ use cuda::{
cuda_cast_ops, default_cuda_dedup_ops, default_cuda_ops, default_scalar_cuda_ops,
unchecked_cuda_ops, unchecked_scalar_cuda_ops,
};
use tfhe::{get_pbs_count, reset_pbs_count};
#[cfg(feature = "gpu")]
fn go_through_gpu_bench_groups(val: &str) {

View File

@@ -5,6 +5,7 @@ use crate::utilities::{throughput_num_threads, BenchmarkType, BENCH_TYPE};
use criterion::{criterion_group, Criterion, Throughput};
use rand::prelude::*;
use rayon::prelude::*;
use std::cmp::max;
use std::fs::{File, OpenOptions};
use std::io::Write;
use std::path::Path;
@@ -24,6 +25,7 @@ use tfhe::shortint::parameters::key_switching::p_fail_2_minus_64::ks_pbs::{
};
use tfhe::shortint::parameters::PBSParameters;
use tfhe::zk::{CompactPkeCrs, ZkComputeLoad};
use tfhe::{get_pbs_count, reset_pbs_count};
use utilities::{write_to_json, OperatorType};
fn write_result(file: &mut File, name: &str, value: usize) {
@@ -112,7 +114,17 @@ fn pke_zk_proof(c: &mut Criterion) {
});
}
BenchmarkType::Throughput => {
let elements = throughput_num_threads(num_block);
// Execute the operation once to know its cost.
let input_msg = rng.gen::<u64>();
let messages = vec![input_msg; fhe_uint_count];
reset_pbs_count();
let _ = tfhe::integer::ProvenCompactCiphertextList::builder(&pk)
.extend(messages.iter().copied())
.build_with_proof_packed(&crs, &metadata, compute_load);
let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
let elements = throughput_num_threads(num_block, pbs_count);
bench_group.throughput(Throughput::Elements(elements));
bench_id = format!(
@@ -330,7 +342,27 @@ fn pke_zk_verify(c: &mut Criterion, results_file: &Path) {
}
BenchmarkType::Throughput => {
// In throughput mode object sizes are not recorded.
let elements = throughput_num_threads(num_block);
// Execute the operation once to know its cost.
let input_msg = rng.gen::<u64>();
let messages = vec![input_msg; fhe_uint_count];
let ct1 = tfhe::integer::ProvenCompactCiphertextList::builder(&pk)
.extend(messages.iter().copied())
.build_with_proof_packed(&crs, &metadata, compute_load)
.unwrap();
reset_pbs_count();
let _ = ct1.verify_and_expand(
&crs,
&pk,
&metadata,
IntegerCompactCiphertextListExpansionMode::CastAndUnpackIfNecessary(
casting_key.as_view(),
),
);
let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
let elements = throughput_num_threads(num_block, pbs_count);
bench_group.throughput(Throughput::Elements(elements));
bench_id_verify = format!(

View File

@@ -389,29 +389,43 @@ pub mod integer_utils {
use super::*;
use std::sync::OnceLock;
#[cfg(feature = "gpu")]
use tfhe_cuda_backend::cuda_bind::cuda_get_number_of_gpus;
use tfhe::core_crypto::gpu::get_number_of_gpus;
/// Generate a number of threads to use to saturate the current machine for throughput measurements.
#[allow(dead_code)]
pub fn throughput_num_threads(num_block: usize) -> u64 {
pub fn throughput_num_threads(num_block: usize, op_pbs_count: u64) -> u64 {
let ref_block_count = 32; // Represents a 64-bit ciphertext for the 2_2 parameters set
let block_multiplicator = (ref_block_count as f64 / num_block as f64).ceil();
let block_multiplicator = (ref_block_count as f64 / num_block as f64).ceil().min(1.0);
// Some operations with a high PBS count (e.g. division) would yield an operation
// loading value so low that the resulting number of elements wouldn't be meaningful.
let minimum_loading = if num_block < 64 { 0.2 } else { 0.1 };
#[cfg(feature = "gpu")]
{
// This value is for Nvidia H100 GPU
let streaming_multiprocessors = 132;
let num_gpus = unsafe { cuda_get_number_of_gpus() };
((streaming_multiprocessors * num_gpus) as f64 * block_multiplicator) as u64
let total_num_sm = streaming_multiprocessors * get_number_of_gpus();
let operation_loading =
((total_num_sm as u64 / op_pbs_count) as f64).max(minimum_loading);
(total_num_sm as f64 * block_multiplicator * operation_loading) as u64
}
#[cfg(not(feature = "gpu"))]
{
let num_threads = rayon::current_num_threads() as f64;
let operation_loading = (num_threads / (op_pbs_count as f64)).max(minimum_loading);
// Add 20% more to maximum threads available.
((num_threads + (num_threads * 0.2)) * block_multiplicator) as u64
((num_threads + (num_threads * 0.2)) * block_multiplicator.min(1.0) * operation_loading)
as u64
}
}
/// Get number of streams usable for CUDA throughput benchmarks
#[allow(dead_code)]
#[cfg(feature = "gpu")]
pub fn cuda_num_streams(num_block: usize) -> u64 {
((192 / num_block) * get_number_of_gpus() as usize) as u64
}
#[allow(dead_code)]
pub static BENCH_TYPE: OnceLock<BenchmarkType> = OnceLock::new();
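To make the effect of the new `op_pbs_count` argument concrete, here is a small standalone reproduction of the CPU branch above, evaluated with illustrative numbers (the thread count and PBS counts are assumptions, not measurements).

// Standalone reproduction of the CPU branch of `throughput_num_threads` above,
// with the machine's thread count passed in explicitly so the arithmetic is visible.
fn elements(num_block: usize, op_pbs_count: u64, num_threads: f64) -> u64 {
    let ref_block_count = 32; // 64-bit ciphertext for the 2_2 parameters set
    let block_multiplicator = (ref_block_count as f64 / num_block as f64).ceil().min(1.0);
    let minimum_loading = if num_block < 64 { 0.2 } else { 0.1 };
    let operation_loading = (num_threads / (op_pbs_count as f64)).max(minimum_loading);
    ((num_threads + (num_threads * 0.2)) * block_multiplicator * operation_loading) as u64
}

fn main() {
    // 96-thread machine, cheap operation (~32 PBS on 32 blocks):
    // loading = max(96/32, 0.2) = 3.0, so 96 * 1.2 * 3.0 -> 345 elements.
    assert_eq!(elements(32, 32, 96.0), 345);
    // Same machine, heavy operation (e.g. division, thousands of PBS):
    // loading clamps to the 0.2 floor, so 96 * 1.2 * 0.2 -> 23 elements.
    assert_eq!(elements(32, 5_000, 96.0), 23);
}

In other words, cheap operations now get a batch large enough to saturate the machine, while PBS-heavy ones like division are clamped to a small but non-degenerate batch, which matches the intent stated in the commit message.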

View File

@@ -172,7 +172,7 @@ impl CudaBooleanBlock {
})
}
pub(crate) fn duplicate(&self, streams: &CudaStreams) -> Self {
pub fn duplicate(&self, streams: &CudaStreams) -> Self {
let ct = unsafe { self.duplicate_async(streams) };
streams.synchronize();
ct