mirror of
https://github.com/zama-ai/tfhe-rs.git
synced 2026-01-13 16:47:59 -05:00
Compare commits
3 Commits
pa/paralle
...
dt/bench/f
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
cae938a75b | ||
|
|
bae1d1cf77 | ||
|
|
a3bc1a9d9e |
28
Makefile
28
Makefile
@@ -282,14 +282,14 @@ check_typos: install_typos_checker
|
||||
.PHONY: clippy_gpu # Run clippy lints on tfhe with "gpu" enabled
|
||||
clippy_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=boolean,shortint,integer,internal-keycache,gpu \
|
||||
--features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats \
|
||||
--all-targets \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
|
||||
.PHONY: check_gpu # Run check on tfhe with "gpu" enabled
|
||||
check_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" check \
|
||||
--features=boolean,shortint,integer,internal-keycache,gpu \
|
||||
--features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats \
|
||||
--all-targets \
|
||||
-p $(TFHE_SPEC)
|
||||
|
||||
@@ -394,10 +394,10 @@ clippy_trivium: install_rs_check_toolchain
|
||||
.PHONY: clippy_all_targets # Run clippy lints on all targets (benches, examples, etc.)
|
||||
clippy_all_targets: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
|
||||
--features=boolean,shortint,integer,internal-keycache,zk-pok,strings \
|
||||
--features=boolean,shortint,integer,internal-keycache,zk-pok,strings,pbs-stats \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
|
||||
--features=boolean,shortint,integer,internal-keycache,zk-pok,strings,experimental \
|
||||
--features=boolean,shortint,integer,internal-keycache,zk-pok,strings,pbs-stats,experimental \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_tfhe_csprng # Run clippy lints on tfhe-csprng
|
||||
@@ -1056,35 +1056,35 @@ bench_integer: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench integer-bench \
|
||||
--features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
|
||||
--features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) --
|
||||
|
||||
.PHONY: bench_signed_integer # Run benchmarks for signed integer
|
||||
bench_signed_integer: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench integer-signed-bench \
|
||||
--features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
|
||||
--features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) --
|
||||
|
||||
.PHONY: bench_integer_gpu # Run benchmarks for integer on GPU backend
|
||||
bench_integer_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench integer-bench \
|
||||
--features=integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
|
||||
--features=integer,gpu,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) --
|
||||
|
||||
.PHONY: bench_integer_compression # Run benchmarks for unsigned integer compression
|
||||
bench_integer_compression: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench glwe_packing_compression-integer-bench \
|
||||
--features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
|
||||
--features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) --
|
||||
|
||||
.PHONY: bench_integer_compression_gpu
|
||||
bench_integer_compression_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench glwe_packing_compression-integer-bench \
|
||||
--features=integer,internal-keycache,gpu -p $(TFHE_SPEC) --
|
||||
--features=integer,internal-keycache,gpu,pbs-stats -p $(TFHE_SPEC) --
|
||||
|
||||
.PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters
|
||||
bench_integer_multi_bit: install_rs_check_toolchain
|
||||
@@ -1092,7 +1092,7 @@ bench_integer_multi_bit: install_rs_check_toolchain
|
||||
__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench integer-bench \
|
||||
--features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
|
||||
--features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) --
|
||||
|
||||
.PHONY: bench_signed_integer_multi_bit # Run benchmarks for signed integer using multi-bit parameters
|
||||
bench_signed_integer_multi_bit: install_rs_check_toolchain
|
||||
@@ -1100,7 +1100,7 @@ bench_signed_integer_multi_bit: install_rs_check_toolchain
|
||||
__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench integer-signed-bench \
|
||||
--features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
|
||||
--features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) --
|
||||
|
||||
.PHONY: bench_integer_multi_bit_gpu # Run benchmarks for integer on GPU backend using multi-bit parameters
|
||||
bench_integer_multi_bit_gpu: install_rs_check_toolchain
|
||||
@@ -1108,7 +1108,7 @@ bench_integer_multi_bit_gpu: install_rs_check_toolchain
|
||||
__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench integer-bench \
|
||||
--features=integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
|
||||
--features=integer,gpu,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) --
|
||||
|
||||
.PHONY: bench_unsigned_integer_multi_bit_gpu # Run benchmarks for unsigned integer on GPU backend using multi-bit parameters
|
||||
bench_unsigned_integer_multi_bit_gpu: install_rs_check_toolchain
|
||||
@@ -1116,14 +1116,14 @@ bench_unsigned_integer_multi_bit_gpu: install_rs_check_toolchain
|
||||
__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench integer-bench \
|
||||
--features=integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) -- ::unsigned
|
||||
--features=integer,gpu,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) -- ::unsigned
|
||||
|
||||
.PHONY: bench_integer_zk # Run benchmarks for integer encryption with ZK proofs
|
||||
bench_integer_zk: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench zk-pke-bench \
|
||||
--features=integer,internal-keycache,zk-pok,nightly-avx512 \
|
||||
--features=integer,internal-keycache,zk-pok,nightly-avx512,pbs-stats \
|
||||
-p $(TFHE_SPEC) --
|
||||
|
||||
.PHONY: bench_shortint # Run benchmarks for shortint
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -6,6 +6,7 @@ use crate::utilities::{
|
||||
};
|
||||
use criterion::{black_box, criterion_group, Criterion, Throughput};
|
||||
use rayon::prelude::*;
|
||||
use std::cmp::max;
|
||||
use tfhe::integer::ciphertext::CompressedCiphertextListBuilder;
|
||||
use tfhe::integer::{ClientKey, RadixCiphertext};
|
||||
use tfhe::keycache::NamedParam;
|
||||
@@ -77,9 +78,19 @@ fn cpu_glwe_packing(c: &mut Criterion) {
|
||||
});
|
||||
}
|
||||
BenchmarkType::Throughput => {
|
||||
// Execute the operation once to know its cost.
|
||||
let ct = cks.encrypt_radix(0_u32, num_blocks);
|
||||
let mut builder = CompressedCiphertextListBuilder::new();
|
||||
builder.push(ct);
|
||||
let compressed = builder.build(&compression_key);
|
||||
|
||||
reset_pbs_count();
|
||||
let _: RadixCiphertext = compressed.get(0, &decompression_key).unwrap().unwrap();
|
||||
let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
|
||||
|
||||
let num_block =
|
||||
(bit_size as f64 / (param.message_modulus.0 as f64).log(2.0)).ceil() as usize;
|
||||
let elements = throughput_num_threads(num_block);
|
||||
let elements = throughput_num_threads(num_block, pbs_count);
|
||||
// FIXME thread usage seemed to be somewhat more "efficient".
|
||||
// For example, with bit_size = 2, my laptop is only using around 2/3 of the
|
||||
// available threads Thread usage increases with bit_size = 8 but
|
||||
@@ -150,6 +161,7 @@ fn cpu_glwe_packing(c: &mut Criterion) {
|
||||
#[cfg(feature = "gpu")]
|
||||
mod cuda {
|
||||
use super::*;
|
||||
use std::cmp::max;
|
||||
use tfhe::core_crypto::gpu::CudaStreams;
|
||||
use tfhe::integer::gpu::ciphertext::compressed_ciphertext_list::CudaCompressedCiphertextListBuilder;
|
||||
use tfhe::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext;
|
||||
@@ -185,27 +197,26 @@ mod cuda {
|
||||
let bench_id_pack;
|
||||
let bench_id_unpack;
|
||||
|
||||
// Generate private compression key
|
||||
let cks = ClientKey::new(param);
|
||||
let private_compression_key = cks.new_compression_private_key(comp_param);
|
||||
|
||||
// Generate and convert compression keys
|
||||
let (radix_cks, _) = gen_keys_radix_gpu(param, num_blocks, &stream);
|
||||
let (compressed_compression_key, compressed_decompression_key) =
|
||||
radix_cks.new_compressed_compression_decompression_keys(&private_compression_key);
|
||||
let cuda_compression_key = compressed_compression_key.decompress_to_cuda(&stream);
|
||||
let cuda_decompression_key = compressed_decompression_key.decompress_to_cuda(
|
||||
radix_cks.parameters().glwe_dimension(),
|
||||
radix_cks.parameters().polynomial_size(),
|
||||
radix_cks.parameters().message_modulus(),
|
||||
radix_cks.parameters().carry_modulus(),
|
||||
radix_cks.parameters().ciphertext_modulus(),
|
||||
&stream,
|
||||
);
|
||||
|
||||
match BENCH_TYPE.get().unwrap() {
|
||||
BenchmarkType::Latency => {
|
||||
// Generate private compression key
|
||||
let cks = ClientKey::new(param);
|
||||
let private_compression_key = cks.new_compression_private_key(comp_param);
|
||||
|
||||
// Generate and convert compression keys
|
||||
let (radix_cks, _) = gen_keys_radix_gpu(param, num_blocks, &stream);
|
||||
let (compressed_compression_key, compressed_decompression_key) = radix_cks
|
||||
.new_compressed_compression_decompression_keys(&private_compression_key);
|
||||
let cuda_compression_key =
|
||||
compressed_compression_key.decompress_to_cuda(&stream);
|
||||
let cuda_decompression_key = compressed_decompression_key.decompress_to_cuda(
|
||||
radix_cks.parameters().glwe_dimension(),
|
||||
radix_cks.parameters().polynomial_size(),
|
||||
radix_cks.parameters().message_modulus(),
|
||||
radix_cks.parameters().carry_modulus(),
|
||||
radix_cks.parameters().ciphertext_modulus(),
|
||||
&stream,
|
||||
);
|
||||
|
||||
// Encrypt
|
||||
let ct = cks.encrypt_radix(0_u32, num_blocks);
|
||||
let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &stream);
|
||||
@@ -239,28 +250,25 @@ mod cuda {
|
||||
});
|
||||
}
|
||||
BenchmarkType::Throughput => {
|
||||
// Execute the operation once to know its cost.
|
||||
let (cpu_compression_key, cpu_decompression_key) =
|
||||
cks.new_compression_decompression_keys(&private_compression_key);
|
||||
let ct = cks.encrypt_radix(0_u32, num_blocks);
|
||||
let mut builder = CompressedCiphertextListBuilder::new();
|
||||
builder.push(ct);
|
||||
let compressed = builder.build(&cpu_compression_key);
|
||||
|
||||
reset_pbs_count();
|
||||
// Use CPU operation as pbs_count do not count PBS on GPU backend.
|
||||
let _: RadixCiphertext =
|
||||
compressed.get(0, &cpu_decompression_key).unwrap().unwrap();
|
||||
let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
|
||||
|
||||
let num_block = (bit_size as f64 / (param.message_modulus.0 as f64).log(2.0))
|
||||
.ceil() as usize;
|
||||
let elements = throughput_num_threads(num_block);
|
||||
let elements = throughput_num_threads(num_block, pbs_count);
|
||||
bench_group.throughput(Throughput::Elements(elements));
|
||||
|
||||
let cks = ClientKey::new(param);
|
||||
let private_compression_key = cks.new_compression_private_key(comp_param);
|
||||
|
||||
let (radix_cks, _) = gen_keys_radix_gpu(param, num_blocks, &stream);
|
||||
let (compressed_compression_key, compressed_decompression_key) = radix_cks
|
||||
.new_compressed_compression_decompression_keys(&private_compression_key);
|
||||
let cuda_compression_key =
|
||||
compressed_compression_key.decompress_to_cuda(&stream);
|
||||
let cuda_decompression_key = compressed_decompression_key.decompress_to_cuda(
|
||||
radix_cks.parameters().glwe_dimension(),
|
||||
radix_cks.parameters().polynomial_size(),
|
||||
radix_cks.parameters().message_modulus(),
|
||||
radix_cks.parameters().carry_modulus(),
|
||||
radix_cks.parameters().ciphertext_modulus(),
|
||||
&stream,
|
||||
);
|
||||
|
||||
// Encrypt
|
||||
let ct = cks.encrypt_radix(0_u32, num_blocks);
|
||||
let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &stream);
|
||||
@@ -344,6 +352,7 @@ criterion_group!(cpu_glwe_packing2, cpu_glwe_packing);
|
||||
|
||||
#[cfg(feature = "gpu")]
|
||||
use cuda::gpu_glwe_packing2;
|
||||
use tfhe::{get_pbs_count, reset_pbs_count};
|
||||
|
||||
fn main() {
|
||||
BENCH_TYPE.get_or_init(|| BenchmarkType::from_env().unwrap());
|
||||
|
||||
@@ -4,9 +4,11 @@ use crate::utilities::{
|
||||
};
|
||||
use criterion::{black_box, Criterion, Throughput};
|
||||
use rayon::prelude::*;
|
||||
use std::cmp::max;
|
||||
use tfhe::integer::keycache::KEY_CACHE;
|
||||
use tfhe::integer::IntegerKeyKind;
|
||||
use tfhe::keycache::NamedParam;
|
||||
use tfhe::{get_pbs_count, reset_pbs_count};
|
||||
use tfhe_csprng::seeders::Seed;
|
||||
|
||||
pub fn unsigned_oprf(c: &mut Criterion) {
|
||||
@@ -40,12 +42,21 @@ pub fn unsigned_oprf(c: &mut Criterion) {
|
||||
});
|
||||
}
|
||||
BenchmarkType::Throughput => {
|
||||
let (_, sk) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
|
||||
|
||||
// Execute the operation once to know its cost.
|
||||
reset_pbs_count();
|
||||
sk.par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
|
||||
Seed(0),
|
||||
bit_size as u64,
|
||||
num_block as u64,
|
||||
);
|
||||
let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
|
||||
|
||||
bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
|
||||
let elements = throughput_num_threads(num_block);
|
||||
let elements = throughput_num_threads(num_block, pbs_count);
|
||||
bench_group.throughput(Throughput::Elements(elements));
|
||||
bench_group.bench_function(&bench_id, |b| {
|
||||
let (_, sk) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
|
||||
|
||||
b.iter(|| {
|
||||
(0..elements).into_par_iter().for_each(|_| {
|
||||
sk.par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
|
||||
|
||||
@@ -8,6 +8,7 @@ use crate::utilities::{
|
||||
use criterion::{criterion_group, Criterion, Throughput};
|
||||
use rand::prelude::*;
|
||||
use rayon::prelude::*;
|
||||
use std::cmp::max;
|
||||
use std::env;
|
||||
use tfhe::integer::keycache::KEY_CACHE;
|
||||
use tfhe::integer::prelude::*;
|
||||
@@ -66,12 +67,20 @@ fn bench_server_key_signed_binary_function_clean_inputs<F>(
|
||||
});
|
||||
}
|
||||
BenchmarkType::Throughput => {
|
||||
let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
|
||||
|
||||
// Execute the operation once to know its cost.
|
||||
let ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block);
|
||||
let ct_1 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block);
|
||||
|
||||
reset_pbs_count();
|
||||
binary_op(&sks, &ct_0, &ct_1);
|
||||
let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
|
||||
|
||||
bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
|
||||
let elements = throughput_num_threads(num_block);
|
||||
let elements = throughput_num_threads(num_block, pbs_count);
|
||||
bench_group.throughput(Throughput::Elements(elements));
|
||||
bench_group.bench_function(&bench_id, |b| {
|
||||
let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
|
||||
|
||||
let mut cts_0 = (0..elements)
|
||||
.map(|_| cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block))
|
||||
.collect::<Vec<_>>();
|
||||
@@ -151,12 +160,21 @@ fn bench_server_key_signed_shift_function_clean_inputs<F>(
|
||||
});
|
||||
}
|
||||
BenchmarkType::Throughput => {
|
||||
let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
|
||||
|
||||
// Execute the operation once to know its cost.
|
||||
let clear_1 = rng.gen_range(0u128..bit_size as u128);
|
||||
let ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block);
|
||||
let ct_1 = cks.encrypt_radix(clear_1, num_block);
|
||||
|
||||
reset_pbs_count();
|
||||
binary_op(&sks, &ct_0, &ct_1);
|
||||
let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
|
||||
|
||||
bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
|
||||
let elements = throughput_num_threads(num_block);
|
||||
let elements = throughput_num_threads(num_block, pbs_count);
|
||||
bench_group.throughput(Throughput::Elements(elements));
|
||||
bench_group.bench_function(&bench_id, |b| {
|
||||
let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
|
||||
|
||||
let mut cts_0 = (0..elements)
|
||||
.map(|_| cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block))
|
||||
.collect::<Vec<_>>();
|
||||
@@ -233,12 +251,19 @@ fn bench_server_key_unary_function_clean_inputs<F>(
|
||||
});
|
||||
}
|
||||
BenchmarkType::Throughput => {
|
||||
let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
|
||||
|
||||
// Execute the operation once to know its cost.
|
||||
let ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block);
|
||||
|
||||
reset_pbs_count();
|
||||
unary_fn(&sks, &ct_0);
|
||||
let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
|
||||
|
||||
bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
|
||||
let elements = throughput_num_threads(num_block);
|
||||
let elements = throughput_num_threads(num_block, pbs_count);
|
||||
bench_group.throughput(Throughput::Elements(elements));
|
||||
bench_group.bench_function(&bench_id, |b| {
|
||||
let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
|
||||
|
||||
let mut cts_0 = (0..elements)
|
||||
.map(|_| cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block))
|
||||
.collect::<Vec<_>>();
|
||||
@@ -307,12 +332,21 @@ fn signed_if_then_else_parallelized(c: &mut Criterion) {
|
||||
});
|
||||
}
|
||||
BenchmarkType::Throughput => {
|
||||
let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
|
||||
|
||||
// Execute the operation once to know its cost.
|
||||
let cond = sks.create_trivial_boolean_block(rng.gen_bool(0.5));
|
||||
let ct_then = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block);
|
||||
let ct_else = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block);
|
||||
|
||||
reset_pbs_count();
|
||||
sks.if_then_else_parallelized(&cond, &ct_then, &ct_else);
|
||||
let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
|
||||
|
||||
bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
|
||||
let elements = throughput_num_threads(num_block);
|
||||
let elements = throughput_num_threads(num_block, pbs_count);
|
||||
bench_group.throughput(Throughput::Elements(elements));
|
||||
bench_group.bench_function(&bench_id, |b| {
|
||||
let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
|
||||
|
||||
let cts_cond = (0..elements)
|
||||
.map(|_| sks.create_trivial_boolean_block(rng.gen_bool(0.5)))
|
||||
.collect::<Vec<_>>();
|
||||
@@ -830,12 +864,20 @@ fn bench_server_key_binary_scalar_function_clean_inputs<F, G>(
|
||||
});
|
||||
}
|
||||
BenchmarkType::Throughput => {
|
||||
let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
|
||||
|
||||
// Execute the operation once to know its cost.
|
||||
let mut ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block);
|
||||
let clear_1 = rng_func(&mut rng, bit_size);
|
||||
|
||||
reset_pbs_count();
|
||||
binary_op(&sks, &mut ct_0, clear_1);
|
||||
let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
|
||||
|
||||
bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
|
||||
let elements = throughput_num_threads(num_block);
|
||||
let elements = throughput_num_threads(num_block, pbs_count);
|
||||
bench_group.throughput(Throughput::Elements(elements));
|
||||
bench_group.bench_function(&bench_id, |b| {
|
||||
let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
|
||||
|
||||
let mut cts_0 = (0..elements)
|
||||
.map(|_| cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block))
|
||||
.collect::<Vec<_>>();
|
||||
@@ -1328,6 +1370,7 @@ mod cuda {
|
||||
use super::*;
|
||||
use criterion::criterion_group;
|
||||
use rayon::iter::IntoParallelRefIterator;
|
||||
use std::cmp::max;
|
||||
use tfhe::core_crypto::gpu::CudaStreams;
|
||||
use tfhe::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
|
||||
use tfhe::integer::gpu::ciphertext::{CudaSignedRadixCiphertext, CudaUnsignedRadixCiphertext};
|
||||
@@ -1335,11 +1378,12 @@ mod cuda {
|
||||
|
||||
/// Base function to bench a server key function that is a binary operation, input ciphertext
|
||||
/// will contain only zero carries
|
||||
fn bench_cuda_server_key_binary_signed_function_clean_inputs<F>(
|
||||
fn bench_cuda_server_key_binary_signed_function_clean_inputs<F, G>(
|
||||
c: &mut Criterion,
|
||||
bench_name: &str,
|
||||
display_name: &str,
|
||||
binary_op: F,
|
||||
binary_op_cpu: G,
|
||||
) where
|
||||
F: Fn(
|
||||
&CudaServerKey,
|
||||
@@ -1347,6 +1391,7 @@ mod cuda {
|
||||
&mut CudaSignedRadixCiphertext,
|
||||
&CudaStreams,
|
||||
) + Sync,
|
||||
G: Fn(&ServerKey, &SignedRadixCiphertext, &SignedRadixCiphertext) + Sync,
|
||||
{
|
||||
let mut bench_group = c.benchmark_group(bench_name);
|
||||
bench_group
|
||||
@@ -1401,14 +1446,22 @@ mod cuda {
|
||||
});
|
||||
}
|
||||
BenchmarkType::Throughput => {
|
||||
let (cks, cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
|
||||
let gpu_sks = CudaServerKey::new(&cks, &stream);
|
||||
|
||||
// Execute the operation once to know its cost.
|
||||
let mut ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block);
|
||||
let mut ct_1 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block);
|
||||
|
||||
reset_pbs_count();
|
||||
// Use CPU operation as pbs_count do not count PBS on GPU backend.
|
||||
binary_op_cpu(&cpu_sks, &mut ct_0, &mut ct_1);
|
||||
let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
|
||||
|
||||
bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
|
||||
let elements = throughput_num_threads(num_block);
|
||||
let elements = throughput_num_threads(num_block, pbs_count);
|
||||
bench_group.throughput(Throughput::Elements(elements));
|
||||
bench_group.bench_function(&bench_id, |b| {
|
||||
let (cks, _cpu_sks) =
|
||||
KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
|
||||
let gpu_sks = CudaServerKey::new(&cks, &stream);
|
||||
|
||||
let mut cts_0 = (0..elements)
|
||||
.map(|_| {
|
||||
let clearlow = rng.gen::<u128>();
|
||||
@@ -1460,7 +1513,7 @@ mod cuda {
|
||||
}
|
||||
|
||||
macro_rules! define_cuda_server_key_bench_clean_input_signed_fn (
|
||||
(method_name: $server_key_method:ident, display_name:$name:ident) => {
|
||||
(method_name: $server_key_method:ident, method_name_cpu: $server_key_method_cpu:ident, display_name:$name:ident) => {
|
||||
::paste::paste!{
|
||||
fn [<cuda_ $server_key_method>](c: &mut Criterion) {
|
||||
bench_cuda_server_key_binary_signed_function_clean_inputs(
|
||||
@@ -1469,6 +1522,9 @@ mod cuda {
|
||||
stringify!($name),
|
||||
|server_key, lhs, rhs, stream| {
|
||||
server_key.$server_key_method(lhs, rhs, stream);
|
||||
},
|
||||
|server_key_cpu, lhs, rhs| {
|
||||
server_key_cpu.$server_key_method_cpu(lhs, rhs);
|
||||
}
|
||||
)
|
||||
}
|
||||
@@ -1478,13 +1534,15 @@ mod cuda {
|
||||
|
||||
/// Base function to bench a server key function that is a unary operation, input ciphertext
|
||||
/// will contain only zero carries
|
||||
fn bench_cuda_server_key_unary_signed_function_clean_inputs<F>(
|
||||
fn bench_cuda_server_key_unary_signed_function_clean_inputs<F, G>(
|
||||
c: &mut Criterion,
|
||||
bench_name: &str,
|
||||
display_name: &str,
|
||||
unary_op: F,
|
||||
unary_op_cpu: G,
|
||||
) where
|
||||
F: Fn(&CudaServerKey, &mut CudaSignedRadixCiphertext, &CudaStreams) + Sync,
|
||||
G: Fn(&ServerKey, &SignedRadixCiphertext) + Sync,
|
||||
{
|
||||
let mut bench_group = c.benchmark_group(bench_name);
|
||||
bench_group
|
||||
@@ -1527,14 +1585,21 @@ mod cuda {
|
||||
});
|
||||
}
|
||||
BenchmarkType::Throughput => {
|
||||
let (cks, cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
|
||||
let gpu_sks = CudaServerKey::new(&cks, &stream);
|
||||
|
||||
// Execute the operation once to know its cost.
|
||||
let ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block);
|
||||
|
||||
reset_pbs_count();
|
||||
// Use CPU operation as pbs_count do not count PBS on GPU backend.
|
||||
unary_op_cpu(&cpu_sks, &ct_0);
|
||||
let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
|
||||
|
||||
bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
|
||||
let elements = throughput_num_threads(num_block);
|
||||
let elements = throughput_num_threads(num_block, pbs_count);
|
||||
bench_group.throughput(Throughput::Elements(elements));
|
||||
bench_group.bench_function(&bench_id, |b| {
|
||||
let (cks, _cpu_sks) =
|
||||
KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
|
||||
let gpu_sks = CudaServerKey::new(&cks, &stream);
|
||||
|
||||
let mut cts_0 = (0..elements)
|
||||
.map(|_| {
|
||||
let clearlow = rng.gen::<u128>();
|
||||
@@ -1572,7 +1637,7 @@ mod cuda {
|
||||
}
|
||||
|
||||
macro_rules! define_cuda_server_key_bench_clean_input_signed_unary_fn (
|
||||
(method_name: $server_key_method:ident, display_name:$name:ident) => {
|
||||
(method_name: $server_key_method:ident, method_name_cpu: $server_key_method_cpu:ident, display_name:$name:ident) => {
|
||||
::paste::paste!{
|
||||
fn [<cuda_ $server_key_method>](c: &mut Criterion) {
|
||||
bench_cuda_server_key_unary_signed_function_clean_inputs(
|
||||
@@ -1581,6 +1646,9 @@ mod cuda {
|
||||
stringify!($name),
|
||||
|server_key, input, stream| {
|
||||
server_key.$server_key_method(input, stream);
|
||||
},
|
||||
|server_key_cpu, lhs| {
|
||||
server_key_cpu.$server_key_method_cpu(lhs);
|
||||
}
|
||||
)
|
||||
}
|
||||
@@ -1588,15 +1656,17 @@ mod cuda {
|
||||
}
|
||||
);
|
||||
|
||||
fn bench_cuda_server_key_binary_scalar_signed_function_clean_inputs<F, G>(
|
||||
fn bench_cuda_server_key_binary_scalar_signed_function_clean_inputs<F, G, H>(
|
||||
c: &mut Criterion,
|
||||
bench_name: &str,
|
||||
display_name: &str,
|
||||
binary_op: F,
|
||||
rng_func: G,
|
||||
binary_op_cpu: G,
|
||||
rng_func: H,
|
||||
) where
|
||||
F: Fn(&CudaServerKey, &mut CudaSignedRadixCiphertext, ScalarType, &CudaStreams) + Sync,
|
||||
G: Fn(&mut ThreadRng, usize) -> ScalarType,
|
||||
G: Fn(&ServerKey, &mut SignedRadixCiphertext, ScalarType) + Sync,
|
||||
H: Fn(&mut ThreadRng, usize) -> ScalarType,
|
||||
{
|
||||
let mut bench_group = c.benchmark_group(bench_name);
|
||||
bench_group
|
||||
@@ -1650,16 +1720,24 @@ mod cuda {
|
||||
});
|
||||
}
|
||||
BenchmarkType::Throughput => {
|
||||
let (cks, cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
|
||||
let gpu_sks = CudaServerKey::new(&cks, &stream);
|
||||
|
||||
// Execute the operation once to know its cost.
|
||||
let mut ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block);
|
||||
let clear_0 = rng_func(&mut rng, bit_size);
|
||||
|
||||
reset_pbs_count();
|
||||
// Use CPU operation as pbs_count do not count PBS on GPU backend.
|
||||
binary_op_cpu(&cpu_sks, &mut ct_0, clear_0);
|
||||
let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
|
||||
|
||||
bench_id = format!(
|
||||
"{bench_name}::throughput::{param_name}::{bit_size}_bits_scalar_{bit_size}"
|
||||
);
|
||||
let elements = throughput_num_threads(num_block);
|
||||
let elements = throughput_num_threads(num_block, pbs_count);
|
||||
bench_group.throughput(Throughput::Elements(elements));
|
||||
bench_group.bench_function(&bench_id, |b| {
|
||||
let (cks, _cpu_sks) =
|
||||
KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
|
||||
let gpu_sks = CudaServerKey::new(&cks, &stream);
|
||||
|
||||
let mut cts_0 = (0..elements)
|
||||
.map(|_| {
|
||||
let clearlow = rng.gen::<u128>();
|
||||
@@ -1702,7 +1780,7 @@ mod cuda {
|
||||
}
|
||||
|
||||
macro_rules! define_cuda_server_key_bench_clean_input_scalar_signed_fn (
|
||||
(method_name: $server_key_method:ident, display_name:$name:ident, rng_func:$($rng_fn:tt)*) => {
|
||||
(method_name: $server_key_method:ident, method_name_cpu: $server_key_method_cpu:ident, display_name:$name:ident, rng_func:$($rng_fn:tt)*) => {
|
||||
::paste::paste!{
|
||||
fn [<cuda_ $server_key_method>](c: &mut Criterion) {
|
||||
bench_cuda_server_key_binary_scalar_signed_function_clean_inputs(
|
||||
@@ -1712,6 +1790,9 @@ mod cuda {
|
||||
|server_key, lhs, rhs, stream| {
|
||||
server_key.$server_key_method(lhs, rhs, stream);
|
||||
},
|
||||
|server_key_cpu, lhs, rhs| {
|
||||
server_key_cpu.$server_key_method_cpu(lhs, rhs);
|
||||
},
|
||||
$($rng_fn)*
|
||||
)
|
||||
}
|
||||
@@ -1721,11 +1802,12 @@ mod cuda {
|
||||
|
||||
/// Base function to bench a server key function that is a binary operation for shift/rotate,
|
||||
/// input ciphertext will contain only zero carries
|
||||
fn bench_cuda_server_key_shift_rotate_signed_function_clean_inputs<F>(
|
||||
fn bench_cuda_server_key_shift_rotate_signed_function_clean_inputs<F, G>(
|
||||
c: &mut Criterion,
|
||||
bench_name: &str,
|
||||
display_name: &str,
|
||||
binary_op: F,
|
||||
binary_op_cpu: G,
|
||||
) where
|
||||
F: Fn(
|
||||
&CudaServerKey,
|
||||
@@ -1733,6 +1815,7 @@ mod cuda {
|
||||
&mut CudaUnsignedRadixCiphertext,
|
||||
&CudaStreams,
|
||||
) + Sync,
|
||||
G: Fn(&ServerKey, &SignedRadixCiphertext, &RadixCiphertext) + Sync,
|
||||
{
|
||||
let mut bench_group = c.benchmark_group(bench_name);
|
||||
bench_group
|
||||
@@ -1786,14 +1869,23 @@ mod cuda {
|
||||
});
|
||||
}
|
||||
BenchmarkType::Throughput => {
|
||||
let (cks, cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
|
||||
let gpu_sks = CudaServerKey::new(&cks, &stream);
|
||||
|
||||
// Execute the operation once to know its cost.
|
||||
let clear_1 = rng.gen_range(0u128..bit_size as u128);
|
||||
let ct_0 = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block);
|
||||
let ct_1 = cks.encrypt_radix(clear_1, num_block);
|
||||
|
||||
reset_pbs_count();
|
||||
// Use CPU operation as pbs_count do not count PBS on GPU backend.
|
||||
binary_op_cpu(&cpu_sks, &ct_0, &ct_1);
|
||||
let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
|
||||
|
||||
bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
|
||||
let elements = throughput_num_threads(num_block);
|
||||
let elements = throughput_num_threads(num_block, pbs_count);
|
||||
bench_group.throughput(Throughput::Elements(elements));
|
||||
bench_group.bench_function(&bench_id, |b| {
|
||||
let (cks, _cpu_sks) =
|
||||
KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
|
||||
let gpu_sks = CudaServerKey::new(&cks, &stream);
|
||||
|
||||
let mut cts_0 = (0..elements)
|
||||
.map(|_| {
|
||||
let clearlow = rng.gen::<u128>();
|
||||
@@ -1843,7 +1935,7 @@ mod cuda {
|
||||
}
|
||||
|
||||
macro_rules! define_cuda_server_key_bench_clean_input_signed_shift_rotate (
|
||||
(method_name: $server_key_method:ident, display_name:$name:ident) => {
|
||||
(method_name: $server_key_method:ident, method_name_cpu: $server_key_method_cpu:ident, display_name:$name:ident) => {
|
||||
::paste::paste!{
|
||||
fn [<cuda_ $server_key_method>](c: &mut Criterion) {
|
||||
bench_cuda_server_key_shift_rotate_signed_function_clean_inputs(
|
||||
@@ -1852,6 +1944,9 @@ mod cuda {
|
||||
stringify!($name),
|
||||
|server_key, lhs, rhs, stream| {
|
||||
server_key.$server_key_method(lhs, rhs, stream);
|
||||
},
|
||||
|server_key_cpu, lhs, rhs| {
|
||||
server_key_cpu.$server_key_method_cpu(lhs, rhs);
|
||||
}
|
||||
)
|
||||
}
|
||||
@@ -1916,14 +2011,23 @@ mod cuda {
|
||||
});
|
||||
}
|
||||
BenchmarkType::Throughput => {
|
||||
let (cks, cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
|
||||
let gpu_sks = CudaServerKey::new(&cks, &stream);
|
||||
|
||||
// Execute the operation once to know its cost.
|
||||
let cond = cpu_sks.create_trivial_boolean_block(rng.gen_bool(0.5));
|
||||
let ct_then = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block);
|
||||
let ct_else = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_block);
|
||||
|
||||
reset_pbs_count();
|
||||
// Use CPU operation as pbs_count do not count PBS on GPU backend.
|
||||
cpu_sks.if_then_else_parallelized(&cond, &ct_then, &ct_else);
|
||||
let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
|
||||
|
||||
bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
|
||||
let elements = throughput_num_threads(num_block);
|
||||
let elements = throughput_num_threads(num_block, pbs_count);
|
||||
bench_group.throughput(Throughput::Elements(elements));
|
||||
bench_group.bench_function(&bench_id, |b| {
|
||||
let (cks, _cpu_sks) =
|
||||
KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
|
||||
let gpu_sks = CudaServerKey::new(&cks, &stream);
|
||||
|
||||
let cts_cond = (0..elements)
|
||||
.map(|_| {
|
||||
let ct_cond = cks.encrypt_bool(rng.gen::<bool>());
|
||||
@@ -1997,246 +2101,291 @@ mod cuda {
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_fn!(
|
||||
method_name: unchecked_add,
|
||||
method_name_cpu: unchecked_add_parallelized,
|
||||
display_name: add
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_fn!(
|
||||
method_name: unchecked_sub,
|
||||
method_name_cpu: unchecked_sub,
|
||||
display_name: sub
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_unary_fn!(
|
||||
method_name: unchecked_neg,
|
||||
method_name_cpu: unchecked_neg,
|
||||
display_name: neg
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_unary_fn!(
|
||||
method_name: unchecked_abs,
|
||||
method_name_cpu: unchecked_abs_parallelized,
|
||||
display_name: abs
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_fn!(
|
||||
method_name: unchecked_mul,
|
||||
method_name_cpu: unchecked_mul_parallelized,
|
||||
display_name: mul
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_fn!(
|
||||
method_name: unchecked_div_rem,
|
||||
method_name_cpu: unchecked_div_rem_parallelized,
|
||||
display_name: div_mod
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_fn!(
|
||||
method_name: unchecked_bitand,
|
||||
method_name_cpu: unchecked_bitand_parallelized,
|
||||
display_name: bitand
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_fn!(
|
||||
method_name: unchecked_bitor,
|
||||
method_name_cpu: unchecked_bitor_parallelized,
|
||||
display_name: bitor
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_fn!(
|
||||
method_name: unchecked_bitxor,
|
||||
method_name_cpu: unchecked_bitxor_parallelized,
|
||||
display_name: bitxor
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_unary_fn!(
|
||||
method_name: unchecked_bitnot,
|
||||
method_name_cpu: bitnot,
|
||||
display_name: bitnot
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_shift_rotate!(
|
||||
method_name: unchecked_rotate_left,
|
||||
method_name_cpu: unchecked_rotate_left_parallelized,
|
||||
display_name: rotate_left
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_shift_rotate!(
|
||||
method_name: unchecked_rotate_right,
|
||||
method_name_cpu: unchecked_rotate_right_parallelized,
|
||||
display_name: rotate_right
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_shift_rotate!(
|
||||
method_name: unchecked_left_shift,
|
||||
method_name_cpu: unchecked_left_shift_parallelized,
|
||||
display_name: left_shift
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_shift_rotate!(
|
||||
method_name: unchecked_right_shift,
|
||||
method_name_cpu: unchecked_right_shift_parallelized,
|
||||
display_name: right_shift
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_fn!(
|
||||
method_name: unchecked_eq,
|
||||
method_name_cpu: unchecked_eq_parallelized,
|
||||
display_name: eq
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_fn!(
|
||||
method_name: unchecked_ne,
|
||||
method_name_cpu: unchecked_ne_parallelized,
|
||||
display_name: ne
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_fn!(
|
||||
method_name: unchecked_gt,
|
||||
method_name_cpu: unchecked_gt_parallelized,
|
||||
display_name: gt
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_fn!(
|
||||
method_name: unchecked_ge,
|
||||
method_name_cpu: unchecked_ge_parallelized,
|
||||
display_name: ge
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_fn!(
|
||||
method_name: unchecked_lt,
|
||||
method_name_cpu: unchecked_lt_parallelized,
|
||||
display_name: lt
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_fn!(
|
||||
method_name: unchecked_le,
|
||||
method_name_cpu: unchecked_le_parallelized,
|
||||
display_name: le
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_fn!(
|
||||
method_name: unchecked_min,
|
||||
method_name_cpu: unchecked_min_parallelized,
|
||||
display_name: min
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_fn!(
|
||||
method_name: unchecked_max,
|
||||
method_name_cpu: unchecked_max_parallelized,
|
||||
display_name: max
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_fn!(
|
||||
method_name: unchecked_signed_overflowing_add,
|
||||
method_name_cpu: unchecked_signed_overflowing_add_parallelized,
|
||||
display_name: overflowing_add
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_fn!(
|
||||
method_name: unchecked_signed_overflowing_sub,
|
||||
method_name_cpu: unchecked_signed_overflowing_sub_parallelized,
|
||||
display_name: overflowing_sub
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
|
||||
method_name: unchecked_scalar_add,
|
||||
method_name_cpu: unchecked_scalar_add,
|
||||
display_name: add,
|
||||
rng_func: default_signed_scalar
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
|
||||
method_name: unchecked_scalar_mul,
|
||||
method_name_cpu: unchecked_scalar_mul_parallelized,
|
||||
display_name: mul,
|
||||
rng_func: mul_signed_scalar
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
|
||||
method_name: unchecked_scalar_sub,
|
||||
method_name_cpu: unchecked_scalar_sub,
|
||||
display_name: sub,
|
||||
rng_func: default_signed_scalar
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
|
||||
method_name: unchecked_scalar_bitand,
|
||||
method_name_cpu: unchecked_scalar_bitand_parallelized,
|
||||
display_name: bitand,
|
||||
rng_func: default_signed_scalar
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
|
||||
method_name: unchecked_scalar_bitor,
|
||||
method_name_cpu: unchecked_scalar_bitor_parallelized,
|
||||
display_name: bitor,
|
||||
rng_func: default_signed_scalar
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
|
||||
method_name: unchecked_scalar_bitxor,
|
||||
method_name_cpu: unchecked_scalar_bitxor_parallelized,
|
||||
display_name: bitxor,
|
||||
rng_func: default_signed_scalar
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
|
||||
method_name: unchecked_scalar_right_shift,
|
||||
method_name_cpu: unchecked_scalar_right_shift_parallelized,
|
||||
display_name: right_shift,
|
||||
rng_func: default_signed_scalar
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
|
||||
method_name: unchecked_scalar_left_shift,
|
||||
method_name_cpu: unchecked_scalar_left_shift_parallelized,
|
||||
display_name: left_shift,
|
||||
rng_func: shift_scalar
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
|
||||
method_name: unchecked_scalar_rotate_right,
|
||||
method_name_cpu: unchecked_scalar_rotate_right_parallelized,
|
||||
display_name: rotate_right,
|
||||
rng_func: shift_scalar
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
|
||||
method_name: unchecked_scalar_rotate_left,
|
||||
method_name_cpu: unchecked_scalar_rotate_left_parallelized,
|
||||
display_name: rotate_left,
|
||||
rng_func: shift_scalar
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
|
||||
method_name: unchecked_scalar_eq,
|
||||
method_name_cpu: unchecked_scalar_eq_parallelized,
|
||||
display_name: eq,
|
||||
rng_func: default_signed_scalar
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
|
||||
method_name: unchecked_scalar_ne,
|
||||
method_name_cpu: unchecked_scalar_ne_parallelized,
|
||||
display_name: ne,
|
||||
rng_func: default_signed_scalar
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
|
||||
method_name: unchecked_scalar_gt,
|
||||
method_name_cpu: unchecked_scalar_gt_parallelized,
|
||||
display_name: gt,
|
||||
rng_func: default_signed_scalar
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
|
||||
method_name: unchecked_scalar_ge,
|
||||
method_name_cpu: unchecked_scalar_ge_parallelized,
|
||||
display_name: ge,
|
||||
rng_func: default_signed_scalar
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
|
||||
method_name: unchecked_scalar_lt,
|
||||
method_name_cpu: unchecked_scalar_lt_parallelized,
|
||||
display_name: lt,
|
||||
rng_func: default_signed_scalar
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
|
||||
method_name: unchecked_scalar_le,
|
||||
method_name_cpu: unchecked_scalar_le_parallelized,
|
||||
display_name: le,
|
||||
rng_func: default_signed_scalar
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
|
||||
method_name: unchecked_scalar_min,
|
||||
method_name_cpu: unchecked_scalar_min_parallelized,
|
||||
display_name: min,
|
||||
rng_func: default_signed_scalar
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
|
||||
method_name: unchecked_scalar_max,
|
||||
method_name_cpu: unchecked_scalar_max_parallelized,
|
||||
display_name: max,
|
||||
rng_func: default_signed_scalar
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
|
||||
method_name: signed_overflowing_scalar_add,
|
||||
method_name_cpu: signed_overflowing_scalar_add_parallelized,
|
||||
display_name: overflowing_add,
|
||||
rng_func: default_signed_scalar
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
|
||||
method_name: signed_overflowing_scalar_sub,
|
||||
method_name_cpu: signed_overflowing_scalar_sub_parallelized,
|
||||
display_name: overflowing_sub,
|
||||
rng_func: default_signed_scalar
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
|
||||
method_name: unchecked_signed_scalar_div_rem,
|
||||
method_name_cpu: unchecked_signed_scalar_div_rem_parallelized,
|
||||
display_name: div_rem,
|
||||
rng_func: div_scalar
|
||||
);
|
||||
@@ -2247,234 +2396,277 @@ mod cuda {
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_fn!(
|
||||
method_name: add,
|
||||
method_name_cpu: add_parallelized,
|
||||
display_name: add
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_fn!(
|
||||
method_name: sub,
|
||||
method_name_cpu: sub_parallelized,
|
||||
display_name: sub
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_unary_fn!(
|
||||
method_name: neg,
|
||||
method_name_cpu: neg_parallelized,
|
||||
display_name: neg
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_unary_fn!(
|
||||
method_name: abs,
|
||||
method_name_cpu: abs_parallelized,
|
||||
display_name: abs
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_fn!(
|
||||
method_name: mul,
|
||||
method_name_cpu: mul_parallelized,
|
||||
display_name: mul
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_fn!(
|
||||
method_name: div_rem,
|
||||
method_name_cpu: div_rem_parallelized,
|
||||
display_name: div_mod
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_fn!(
|
||||
method_name: bitand,
|
||||
method_name_cpu: bitand_parallelized,
|
||||
display_name: bitand
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_unary_fn!(
|
||||
method_name: bitnot,
|
||||
method_name_cpu: bitnot,
|
||||
display_name: bitnot
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_fn!(
|
||||
method_name: bitor,
|
||||
method_name_cpu: bitor_parallelized,
|
||||
display_name: bitor
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_fn!(
|
||||
method_name: bitxor,
|
||||
method_name_cpu: bitxor_parallelized,
|
||||
display_name: bitxor
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_shift_rotate!(
|
||||
method_name: rotate_left,
|
||||
method_name_cpu: rotate_left_parallelized,
|
||||
display_name: rotate_left
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_shift_rotate!(
|
||||
method_name: rotate_right,
|
||||
method_name_cpu: rotate_right_parallelized,
|
||||
display_name: rotate_right
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_shift_rotate!(
|
||||
method_name: left_shift,
|
||||
method_name_cpu: left_shift_parallelized,
|
||||
display_name: left_shift
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_shift_rotate!(
|
||||
method_name: right_shift,
|
||||
method_name_cpu: right_shift_parallelized,
|
||||
display_name: right_shift
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_fn!(
|
||||
method_name: eq,
|
||||
method_name_cpu: eq_parallelized,
|
||||
display_name: eq
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_fn!(
|
||||
method_name: ne,
|
||||
method_name_cpu: ne_parallelized,
|
||||
display_name: ne
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_fn!(
|
||||
method_name: gt,
|
||||
method_name_cpu: gt_parallelized,
|
||||
display_name: gt
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_fn!(
|
||||
method_name: ge,
|
||||
method_name_cpu: ge_parallelized,
|
||||
display_name: ge
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_fn!(
|
||||
method_name: lt,
|
||||
method_name_cpu: lt_parallelized,
|
||||
display_name: lt
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_fn!(
|
||||
method_name: le,
|
||||
method_name_cpu: le_parallelized,
|
||||
display_name: le
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_fn!(
|
||||
method_name: min,
|
||||
method_name_cpu: min_parallelized,
|
||||
display_name: min
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_fn!(
|
||||
method_name: max,
|
||||
method_name_cpu: max_parallelized,
|
||||
display_name: max
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_fn!(
|
||||
method_name: signed_overflowing_add,
|
||||
method_name_cpu: signed_overflowing_add_parallelized,
|
||||
display_name: overflowing_add
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_signed_fn!(
|
||||
method_name: signed_overflowing_sub,
|
||||
method_name_cpu: signed_overflowing_sub_parallelized,
|
||||
display_name: overflowing_sub
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
|
||||
method_name: scalar_add,
|
||||
method_name_cpu: scalar_add_parallelized,
|
||||
display_name: add,
|
||||
rng_func: default_signed_scalar
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
|
||||
method_name: scalar_mul,
|
||||
method_name_cpu: scalar_mul_parallelized,
|
||||
display_name: mul,
|
||||
rng_func: mul_signed_scalar
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
|
||||
method_name: scalar_sub,
|
||||
method_name_cpu: scalar_sub_parallelized,
|
||||
display_name: sub,
|
||||
rng_func: default_signed_scalar
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
|
||||
method_name: scalar_bitand,
|
||||
method_name_cpu: scalar_bitand_parallelized,
|
||||
display_name: bitand,
|
||||
rng_func: default_signed_scalar
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
|
||||
method_name: scalar_bitor,
|
||||
method_name_cpu: scalar_bitor_parallelized,
|
||||
display_name: bitor,
|
||||
rng_func: default_signed_scalar
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
|
||||
method_name: scalar_bitxor,
|
||||
method_name_cpu: scalar_bitxor_parallelized,
|
||||
display_name: bitxor,
|
||||
rng_func: default_signed_scalar
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
|
||||
method_name: scalar_left_shift,
|
||||
method_name_cpu: scalar_left_shift_parallelized,
|
||||
display_name: left_shift,
|
||||
rng_func: shift_scalar
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
|
||||
method_name: scalar_right_shift,
|
||||
method_name_cpu: scalar_right_shift_parallelized,
|
||||
display_name: right_shift,
|
||||
rng_func: shift_scalar
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
|
||||
method_name: scalar_rotate_left,
|
||||
method_name_cpu: scalar_rotate_left_parallelized,
|
||||
display_name: rotate_left,
|
||||
rng_func: shift_scalar
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
|
||||
method_name: scalar_rotate_right,
|
||||
method_name_cpu: scalar_rotate_right_parallelized,
|
||||
display_name: rotate_right,
|
||||
rng_func: shift_scalar
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
|
||||
method_name: scalar_eq,
|
||||
method_name_cpu: scalar_eq_parallelized,
|
||||
display_name: eq,
|
||||
rng_func: default_signed_scalar
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
|
||||
method_name: scalar_ne,
|
||||
method_name_cpu: scalar_ne_parallelized,
|
||||
display_name: ne,
|
||||
rng_func: default_signed_scalar
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
|
||||
method_name: scalar_gt,
|
||||
method_name_cpu: scalar_gt_parallelized,
|
||||
display_name: gt,
|
||||
rng_func: default_signed_scalar
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
|
||||
method_name: scalar_ge,
|
||||
method_name_cpu: scalar_ge_parallelized,
|
||||
display_name: ge,
|
||||
rng_func: default_signed_scalar
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
|
||||
method_name: scalar_lt,
|
||||
method_name_cpu: scalar_lt_parallelized,
|
||||
display_name: lt,
|
||||
rng_func: default_signed_scalar
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
|
||||
method_name: scalar_le,
|
||||
method_name_cpu: scalar_le_parallelized,
|
||||
display_name: le,
|
||||
rng_func: default_signed_scalar
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
|
||||
method_name: scalar_min,
|
||||
method_name_cpu: scalar_min_parallelized,
|
||||
display_name: min,
|
||||
rng_func: default_signed_scalar
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
|
||||
method_name: scalar_max,
|
||||
method_name_cpu: scalar_max_parallelized,
|
||||
display_name: max,
|
||||
rng_func: default_signed_scalar
|
||||
);
|
||||
|
||||
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
|
||||
method_name: signed_scalar_div_rem,
|
||||
method_name_cpu: signed_scalar_div_rem_parallelized,
|
||||
display_name: div_rem,
|
||||
rng_func: div_scalar
|
||||
);
|
||||
@@ -2697,6 +2889,7 @@ use cuda::{
|
||||
cuda_cast_ops, default_cuda_dedup_ops, default_cuda_ops, default_scalar_cuda_ops,
|
||||
unchecked_cuda_ops, unchecked_scalar_cuda_ops,
|
||||
};
|
||||
use tfhe::{get_pbs_count, reset_pbs_count};
|
||||
|
||||
#[cfg(feature = "gpu")]
|
||||
fn go_through_gpu_bench_groups(val: &str) {
|
||||
|
||||
@@ -5,6 +5,7 @@ use crate::utilities::{throughput_num_threads, BenchmarkType, BENCH_TYPE};
|
||||
use criterion::{criterion_group, Criterion, Throughput};
|
||||
use rand::prelude::*;
|
||||
use rayon::prelude::*;
|
||||
use std::cmp::max;
|
||||
use std::fs::{File, OpenOptions};
|
||||
use std::io::Write;
|
||||
use std::path::Path;
|
||||
@@ -24,6 +25,7 @@ use tfhe::shortint::parameters::key_switching::p_fail_2_minus_64::ks_pbs::{
|
||||
};
|
||||
use tfhe::shortint::parameters::PBSParameters;
|
||||
use tfhe::zk::{CompactPkeCrs, ZkComputeLoad};
|
||||
use tfhe::{get_pbs_count, reset_pbs_count};
|
||||
use utilities::{write_to_json, OperatorType};
|
||||
|
||||
fn write_result(file: &mut File, name: &str, value: usize) {
|
||||
@@ -112,7 +114,17 @@ fn pke_zk_proof(c: &mut Criterion) {
|
||||
});
|
||||
}
|
||||
BenchmarkType::Throughput => {
|
||||
let elements = throughput_num_threads(num_block);
|
||||
// Execute the operation once to know its cost.
|
||||
let input_msg = rng.gen::<u64>();
|
||||
let messages = vec![input_msg; fhe_uint_count];
|
||||
|
||||
reset_pbs_count();
|
||||
let _ = tfhe::integer::ProvenCompactCiphertextList::builder(&pk)
|
||||
.extend(messages.iter().copied())
|
||||
.build_with_proof_packed(&crs, &metadata, compute_load);
|
||||
let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
|
||||
|
||||
let elements = throughput_num_threads(num_block, pbs_count);
|
||||
bench_group.throughput(Throughput::Elements(elements));
|
||||
|
||||
bench_id = format!(
|
||||
@@ -330,7 +342,27 @@ fn pke_zk_verify(c: &mut Criterion, results_file: &Path) {
|
||||
}
|
||||
BenchmarkType::Throughput => {
|
||||
// In throughput mode object sizes are not recorded.
|
||||
let elements = throughput_num_threads(num_block);
|
||||
|
||||
// Execute the operation once to know its cost.
|
||||
let input_msg = rng.gen::<u64>();
|
||||
let messages = vec![input_msg; fhe_uint_count];
|
||||
let ct1 = tfhe::integer::ProvenCompactCiphertextList::builder(&pk)
|
||||
.extend(messages.iter().copied())
|
||||
.build_with_proof_packed(&crs, &metadata, compute_load)
|
||||
.unwrap();
|
||||
|
||||
reset_pbs_count();
|
||||
let _ = ct1.verify_and_expand(
|
||||
&crs,
|
||||
&pk,
|
||||
&metadata,
|
||||
IntegerCompactCiphertextListExpansionMode::CastAndUnpackIfNecessary(
|
||||
casting_key.as_view(),
|
||||
),
|
||||
);
|
||||
let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
|
||||
|
||||
let elements = throughput_num_threads(num_block, pbs_count);
|
||||
bench_group.throughput(Throughput::Elements(elements));
|
||||
|
||||
bench_id_verify = format!(
|
||||
|
||||
@@ -389,29 +389,43 @@ pub mod integer_utils {
|
||||
use super::*;
|
||||
use std::sync::OnceLock;
|
||||
#[cfg(feature = "gpu")]
|
||||
use tfhe_cuda_backend::cuda_bind::cuda_get_number_of_gpus;
|
||||
use tfhe::core_crypto::gpu::get_number_of_gpus;
|
||||
|
||||
/// Generate a number of threads to use to saturate current machine for throughput measurements.
|
||||
#[allow(dead_code)]
|
||||
pub fn throughput_num_threads(num_block: usize) -> u64 {
|
||||
pub fn throughput_num_threads(num_block: usize, op_pbs_count: u64) -> u64 {
|
||||
let ref_block_count = 32; // Represent a ciphertext of 64 bits for 2_2 parameters set
|
||||
let block_multiplicator = (ref_block_count as f64 / num_block as f64).ceil();
|
||||
let block_multiplicator = (ref_block_count as f64 / num_block as f64).ceil().min(1.0);
|
||||
// Some operations with a high count of PBS (e.g. division) would yield an operation
|
||||
// loading value so low that the number of elements in the end wouldn't be meaningful.
|
||||
let minimum_loading = if num_block < 64 { 0.2 } else { 0.1 };
|
||||
|
||||
#[cfg(feature = "gpu")]
|
||||
{
|
||||
// This value is for Nvidia H100 GPU
|
||||
let streaming_multiprocessors = 132;
|
||||
let num_gpus = unsafe { cuda_get_number_of_gpus() };
|
||||
((streaming_multiprocessors * num_gpus) as f64 * block_multiplicator) as u64
|
||||
let total_num_sm = streaming_multiprocessors * get_number_of_gpus();
|
||||
let operation_loading =
|
||||
((total_num_sm as u64 / op_pbs_count) as f64).max(minimum_loading);
|
||||
(total_num_sm as f64 * block_multiplicator * operation_loading) as u64
|
||||
}
|
||||
#[cfg(not(feature = "gpu"))]
|
||||
{
|
||||
let num_threads = rayon::current_num_threads() as f64;
|
||||
let operation_loading = (num_threads / (op_pbs_count as f64)).max(minimum_loading);
|
||||
// Add 20% more to maximum threads available.
|
||||
((num_threads + (num_threads * 0.2)) * block_multiplicator) as u64
|
||||
((num_threads + (num_threads * 0.2)) * block_multiplicator.min(1.0) * operation_loading)
|
||||
as u64
|
||||
}
|
||||
}
|
||||
|
||||
/// Get number of streams usable for CUDA throughput benchmarks
|
||||
#[allow(dead_code)]
|
||||
#[cfg(feature = "gpu")]
|
||||
pub fn cuda_num_streams(num_block: usize) -> u64 {
|
||||
((192 / num_block) * get_number_of_gpus() as usize) as u64
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub static BENCH_TYPE: OnceLock<BenchmarkType> = OnceLock::new();
|
||||
|
||||
|
||||
@@ -172,7 +172,7 @@ impl CudaBooleanBlock {
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn duplicate(&self, streams: &CudaStreams) -> Self {
|
||||
pub fn duplicate(&self, streams: &CudaStreams) -> Self {
|
||||
let ct = unsafe { self.duplicate_async(streams) };
|
||||
streams.synchronize();
|
||||
ct
|
||||
|
||||
Reference in New Issue
Block a user