mirror of https://github.com/zama-ai/tfhe-rs.git
synced 2026-01-08 22:28:01 -05:00
feat(hpu): Add Hpu backend implementation
This backend abstracts communication with the HPU FPGA hardware.
It defines its own entities to avoid circular dependencies with
tfhe-rs.
Object lifetimes are handled through Arc<Mutex<T>> wrappers, which
enforce that all objects currently alive in the HPU hardware are also
kept valid on the host side.
It contains the second version of the HPU instruction set (HIS_V2.0):
* DOps have the following properties:
  + Templates as first-class citizens
  + Support for immediate templates
  + Direct parsing and conversion between Asm/Hex
  + deku (and its associated endianness limitation) replaced by
    bitfield_struct and manual parsing
* IOps have the following properties:
  + Support for a variable number of destinations
  + Support for a variable number of sources
  + Support for a variable number of immediate values
  + Support for multiple bitwidths (not yet implemented in the FPGA
    firmware)
Details can be found in `backends/tfhe-hpu-backend/Readme.md`.
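
For context, here is a minimal usage sketch of the new backend, assembled from the benchmark code touched by this commit (the ShellString config path, HpuDevice::from_config, Config::from_hpu_device, the (HpuDevice, CompressedServerKey) form of set_server_key, and the FheWait::wait call all appear in the diff below). The exact API surface may evolve, so treat it as illustrative rather than normative:

// Usage sketch only (not part of this commit's diff): mirrors the HPU setup
// used by the benchmarks modified below.
use tfhe::prelude::*;
use tfhe::tfhe_hpu_backend::prelude::*;
use tfhe::{set_server_key, ClientKey, CompressedServerKey, Config, FheUint64};

fn main() {
    // The backend configuration file is resolved through environment
    // variables, exactly as done in the benchmarks.
    let config_path = ShellString::new(
        "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_config.toml".to_string(),
    );
    let hpu_device = HpuDevice::from_config(&config_path.expand());

    // Keys are generated from a Config derived from the device parameters.
    let config = Config::from_hpu_device(&hpu_device);
    let cks = ClientKey::generate(config);
    let compressed_sks = CompressedServerKey::new(&cks);

    // Registering the (device, compressed server key) pair routes
    // subsequent high-level operations to the HPU.
    set_server_key((hpu_device, compressed_sks));

    let a = FheUint64::encrypt(3u64, &cks);
    let b = FheUint64::encrypt(5u64, &cks);
    let sum = &a + &b;
    // HPU operations are asynchronous; wait() blocks until the result is ready.
    sum.wait();
    let dec: u64 = sum.decrypt(&cks);
    assert_eq!(dec, 8);
}

Because HPU execution is asynchronous, the benchmarks below call wait() on results before black_box to ensure the measured work has actually completed on the device.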
@@ -35,6 +35,8 @@ boolean = ["tfhe/boolean"]
shortint = ["tfhe/shortint"]
integer = ["shortint", "tfhe/integer"]
gpu = ["tfhe/gpu"]
hpu = ["tfhe/hpu"]
hpu-v80 = ["tfhe/hpu-v80"]
internal-keycache = ["tfhe/internal-keycache"]
nightly-avx512 = ["tfhe/nightly-avx512"]
pbs-stats = ["tfhe/pbs-stats"]

@@ -726,7 +726,11 @@ fn mem_optimized_pbs_ntt(c: &mut Criterion) {
bsk.ciphertext_modulus(),
);

par_convert_standard_lwe_bootstrap_key_to_ntt64(&bsk, &mut nbsk);
par_convert_standard_lwe_bootstrap_key_to_ntt64(
&bsk,
&mut nbsk,
NttLweBootstrapKeyOption::Normalize,
);

drop(bsk);

@@ -1,17 +1,17 @@
use benchmark::params_aliases::BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
use criterion::{black_box, Criterion};
use rand::prelude::*;
use std::fmt::Write;
use std::ops::*;
use tfhe::prelude::*;
use tfhe::{
set_server_key, ClientKey, CompressedServerKey, ConfigBuilder, FheUint10, FheUint12,
FheUint128, FheUint14, FheUint16, FheUint2, FheUint32, FheUint4, FheUint6, FheUint64, FheUint8,
ClientKey, CompressedServerKey, FheUint10, FheUint12, FheUint128, FheUint14, FheUint16,
FheUint2, FheUint32, FheUint4, FheUint6, FheUint64, FheUint8,
};

fn bench_fhe_type<FheType>(c: &mut Criterion, client_key: &ClientKey, type_name: &str)
where
FheType: FheEncrypt<u128, ClientKey>,
FheType: FheWait,
for<'a> &'a FheType: Add<&'a FheType, Output = FheType>
+ Sub<&'a FheType, Output = FheType>
+ Mul<&'a FheType, Output = FheType>

@@ -35,54 +35,133 @@ where
let mut name = String::with_capacity(255);

write!(name, "add({type_name}, {type_name})").unwrap();
bench_group.bench_function(&name, |b| b.iter(|| black_box(&lhs + &rhs)));
name.clear();

write!(name, "overflowing_add({type_name}, {type_name})").unwrap();
bench_group.bench_function(&name, |b| {
b.iter(|| black_box((&lhs).overflowing_add(&rhs)))
b.iter(|| {
let res = &lhs + &rhs;
res.wait();
black_box(res)
})
});
name.clear();

write!(name, "overflowing_sub({type_name}, {type_name})").unwrap();
bench_group.bench_function(&name, |b| b.iter(|| black_box(lhs.overflowing_sub(&rhs))));
name.clear();
#[cfg(not(feature = "hpu"))]
{
write!(name, "overflowing_add({type_name}, {type_name})").unwrap();
bench_group.bench_function(&name, |b| {
b.iter(|| {
let (res, flag) = lhs.overflowing_add(&rhs);
res.wait();
black_box((res, flag))
})
});
name.clear();
}

#[cfg(not(feature = "hpu"))]
{
write!(name, "overflowing_sub({type_name}, {type_name})").unwrap();
bench_group.bench_function(&name, |b| {
b.iter(|| {
let (res, flag) = lhs.overflowing_sub(&rhs);
res.wait();
black_box((res, flag))
})
});
name.clear();
}

write!(name, "sub({type_name}, {type_name})").unwrap();
bench_group.bench_function(&name, |b| b.iter(|| black_box(&lhs - &rhs)));
bench_group.bench_function(&name, |b| {
b.iter(|| {
let res = &lhs - &rhs;
res.wait();
black_box(res)
})
});
name.clear();

write!(name, "mul({type_name}, {type_name})").unwrap();
bench_group.bench_function(&name, |b| b.iter(|| black_box(&lhs * &rhs)));
bench_group.bench_function(&name, |b| {
b.iter(|| {
let res = &lhs * &rhs;
res.wait();
black_box(res)
})
});
name.clear();

write!(name, "bitand({type_name}, {type_name})").unwrap();
bench_group.bench_function(&name, |b| b.iter(|| black_box(&lhs & &rhs)));
bench_group.bench_function(&name, |b| {
b.iter(|| {
let res = &lhs & &rhs;
res.wait();
black_box(res)
})
});
name.clear();

write!(name, "bitor({type_name}, {type_name})").unwrap();
bench_group.bench_function(&name, |b| b.iter(|| black_box(&lhs | &rhs)));
bench_group.bench_function(&name, |b| {
b.iter(|| {
let res = &lhs | &rhs;
res.wait();
black_box(res)
})
});
name.clear();

write!(name, "bitxor({type_name}, {type_name})").unwrap();
bench_group.bench_function(&name, |b| b.iter(|| black_box(&lhs ^ &rhs)));
bench_group.bench_function(&name, |b| {
b.iter(|| {
let res = &lhs ^ &rhs;
res.wait();
black_box(res)
})
});
name.clear();

write!(name, "shl({type_name}, {type_name})").unwrap();
bench_group.bench_function(&name, |b| b.iter(|| black_box(&lhs << &rhs)));
name.clear();
#[cfg(not(feature = "hpu"))]
{
write!(name, "shl({type_name}, {type_name})").unwrap();
bench_group.bench_function(&name, |b| {
b.iter(|| {
let res = &lhs << &rhs;
res.wait();
black_box(res)
})
});
name.clear();

write!(name, "shr({type_name}, {type_name})").unwrap();
bench_group.bench_function(&name, |b| b.iter(|| black_box(&lhs >> &rhs)));
name.clear();
write!(name, "shr({type_name}, {type_name})").unwrap();
bench_group.bench_function(&name, |b| {
b.iter(|| {
let res = &lhs >> &rhs;
res.wait();
black_box(res)
})
});
name.clear();

write!(name, "rotl({type_name}, {type_name})").unwrap();
bench_group.bench_function(&name, |b| b.iter(|| black_box((&lhs).rotate_left(&rhs))));
name.clear();
write!(name, "rotl({type_name}, {type_name})").unwrap();
bench_group.bench_function(&name, |b| {
b.iter(|| {
let res = (&lhs).rotate_left(&rhs);
res.wait();
black_box(res)
})
});
name.clear();

write!(name, "rotr({type_name}, {type_name})").unwrap();
bench_group.bench_function(&name, |b| b.iter(|| black_box((&lhs).rotate_right(&rhs))));
name.clear();
write!(name, "rotr({type_name}, {type_name})").unwrap();
bench_group.bench_function(&name, |b| {
b.iter(|| {
let res = (&lhs).rotate_right(&rhs);
res.wait();
black_box(res)
})
});
name.clear();
}
}

macro_rules! bench_type {

@@ -108,13 +187,39 @@ bench_type!(FheUint64);
bench_type!(FheUint128);

fn main() {
let config =
ConfigBuilder::with_custom_parameters(BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128)
.build();
let cks = ClientKey::generate(config);
let compressed_sks = CompressedServerKey::new(&cks);
#[cfg(feature = "hpu")]
let cks = {
// Hpu is enable, start benchmark on Hpu hw accelerator
use tfhe::tfhe_hpu_backend::prelude::*;
use tfhe::{set_server_key, Config};

set_server_key(compressed_sks.decompress());
// Use environment variable to construct path to configuration file
let config_path = ShellString::new(
"${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_config.toml".to_string(),
);
let hpu_device = HpuDevice::from_config(&config_path.expand());

let config = Config::from_hpu_device(&hpu_device);
let cks = ClientKey::generate(config);
let compressed_sks = CompressedServerKey::new(&cks);

set_server_key((hpu_device, compressed_sks));
cks
};
#[cfg(not(feature = "hpu"))]
let cks = {
use benchmark::params_aliases::BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
use tfhe::{set_server_key, ConfigBuilder};
let config = ConfigBuilder::with_custom_parameters(
BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
)
.build();
let cks = ClientKey::generate(config);
let compressed_sks = CompressedServerKey::new(&cks);

set_server_key(compressed_sks.decompress());
cks
};

let mut c = Criterion::default().configure_from_args();

@@ -1,21 +1,22 @@
#[cfg(feature = "gpu")]
use benchmark::params_aliases::BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
#[cfg(not(feature = "gpu"))]
use benchmark::params_aliases::BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
#[cfg(feature = "gpu")]
use benchmark::utilities::configure_gpu;
use benchmark::utilities::{write_to_json, OperatorType};
use criterion::measurement::WallTime;
use criterion::{BenchmarkGroup, Criterion, Throughput};
use rand::prelude::*;
use rand::thread_rng;
#[cfg(not(feature = "hpu"))]
use rayon::prelude::*;
use std::ops::{Add, Mul, Sub};
#[cfg(not(feature = "hpu"))]
use std::ops::Mul;
use std::ops::{Add, Sub};
#[cfg(feature = "gpu")]
use tfhe::core_crypto::gpu::get_number_of_gpus;
use tfhe::keycache::NamedParam;
use tfhe::prelude::*;
#[cfg(feature = "gpu")]
use tfhe::GpuIndex;
use tfhe::{set_server_key, ClientKey, CompressedServerKey, ConfigBuilder, FheBool, FheUint64};
use tfhe::{set_server_key, ClientKey, CompressedServerKey, FheBool, FheUint64};

/// Transfer as written in the original FHEvm white-paper,
/// it uses a comparison to check if the sender has enough,

@@ -25,6 +26,28 @@ pub fn transfer_whitepaper<FheType>(
to_amount: &FheType,
amount: &FheType,
) -> (FheType, FheType)
where
FheType: Add<Output = FheType> + for<'a> FheOrd<&'a FheType>,
FheBool: IfThenElse<FheType>,
for<'a> &'a FheType: Add<Output = FheType> + Sub<Output = FheType>,
{
let has_enough_funds = (from_amount).ge(amount);

let mut new_to_amount = to_amount + amount;
new_to_amount = has_enough_funds.if_then_else(&new_to_amount, to_amount);

let mut new_from_amount = from_amount - amount;
new_from_amount = has_enough_funds.if_then_else(&new_from_amount, from_amount);

(new_from_amount, new_to_amount)
}

/// Parallel variant of [`transfer_whitepaper`].
pub fn par_transfer_whitepaper<FheType>(
from_amount: &FheType,
to_amount: &FheType,
amount: &FheType,
) -> (FheType, FheType)
where
FheType: Add<Output = FheType> + for<'a> FheOrd<&'a FheType> + Send + Sync,
FheBool: IfThenElse<FheType>,

@@ -48,6 +71,7 @@ where

/// This one also uses a comparison, but it leverages the 'boolean' multiplication
/// instead of cmuxes, so it is faster
#[cfg(not(feature = "hpu"))]
fn transfer_no_cmux<FheType>(
from_amount: &FheType,
to_amount: &FheType,

@@ -71,6 +95,7 @@ where

/// This one uses overflowing sub to remove the need for comparison
/// it also uses the 'boolean' multiplication
#[cfg(not(feature = "hpu"))]
fn transfer_overflow<FheType>(
from_amount: &FheType,
to_amount: &FheType,

@@ -97,6 +122,7 @@ where

/// This ones uses both overflowing_add/sub to check that both
/// the sender has enough funds, and the receiver will not overflow its balance
#[cfg(not(feature = "hpu"))]
fn transfer_safe<FheType>(
from_amount: &FheType,
to_amount: &FheType,

@@ -123,7 +149,30 @@ where
(new_from_amount, new_to_amount)
}

#[cfg(feature = "pbs-stats")]
#[cfg(feature = "hpu")]
/// This one use a dedicated IOp inside Hpu
fn transfer_hpu<FheType>(
from_amount: &FheType,
to_amount: &FheType,
amount: &FheType,
) -> (FheType, FheType)
where
FheType: FheHpu,
{
use tfhe::tfhe_hpu_backend::prelude::hpu_asm;
let src = HpuHandle {
native: vec![from_amount, to_amount, amount],
boolean: vec![],
imm: vec![],
};
let mut res_handle = FheHpu::iop_exec(&hpu_asm::iop::IOP_ERC_20, src);
// Iop erc_20 return new_from, new_to
let new_to = res_handle.native.pop().unwrap();
let new_from = res_handle.native.pop().unwrap();
(new_from, new_to)
}

#[cfg(all(feature = "pbs-stats", not(feature = "hpu")))]
mod pbs_stats {
use super::*;
use std::fs::{File, OpenOptions};

@@ -200,6 +249,7 @@ fn bench_transfer_latency<FheType, F>(
transfer_func: F,
) where
FheType: FheEncrypt<u64, ClientKey>,
FheType: FheWait,
F: for<'a> Fn(&'a FheType, &'a FheType, &'a FheType) -> (FheType, FheType),
{
#[cfg(feature = "gpu")]

@@ -214,7 +264,11 @@ fn bench_transfer_latency<FheType, F>(
let amount = FheType::encrypt(rng.gen::<u64>(), client_key);

b.iter(|| {
let (_, _) = transfer_func(&from_amount, &to_amount, &amount);
let (new_from, new_to) = transfer_func(&from_amount, &to_amount, &amount);
new_from.wait();
criterion::black_box(new_from);
new_to.wait();
criterion::black_box(new_to);
})
});

@@ -231,7 +285,7 @@ fn bench_transfer_latency<FheType, F>(
);
}

#[cfg(not(feature = "gpu"))]
#[cfg(not(any(feature = "gpu", feature = "hpu")))]
fn bench_transfer_throughput<FheType, F>(
group: &mut BenchmarkGroup<'_, WallTime>,
client_key: &ClientKey,

@@ -283,6 +337,7 @@ fn bench_transfer_throughput<FheType, F>(
);
}
}

#[cfg(feature = "gpu")]
fn cuda_bench_transfer_throughput<FheType, F>(
group: &mut BenchmarkGroup<'_, WallTime>,

@@ -370,16 +425,75 @@ fn cuda_bench_transfer_throughput<FheType, F>(
}
}

#[cfg(feature = "pbs-stats")]
use pbs_stats::print_transfer_pbs_counts;
#[cfg(feature = "gpu")]
use tfhe::core_crypto::gpu::get_number_of_gpus;
#[cfg(feature = "hpu")]
fn hpu_bench_transfer_throughput<FheType, F>(
group: &mut BenchmarkGroup<'_, WallTime>,
client_key: &ClientKey,
bench_name: &str,
type_name: &str,
fn_name: &str,
transfer_func: F,
) where
FheType: FheEncrypt<u64, ClientKey> + Send + Sync,
FheType: FheWait,
F: for<'a> Fn(&'a FheType, &'a FheType, &'a FheType) -> (FheType, FheType) + Sync,
{
let mut rng = thread_rng();

#[cfg(not(feature = "gpu"))]
for num_elems in [10, 100] {
group.throughput(Throughput::Elements(num_elems));
let bench_id =
format!("{bench_name}::throughput::{fn_name}::{type_name}::{num_elems}_elems");
group.bench_with_input(&bench_id, &num_elems, |b, &num_elems| {
let from_amounts = (0..num_elems)
.map(|_| FheType::encrypt(rng.gen::<u64>(), client_key))
.collect::<Vec<_>>();
let to_amounts = (0..num_elems)
.map(|_| FheType::encrypt(rng.gen::<u64>(), client_key))
.collect::<Vec<_>>();
let amounts = (0..num_elems)
.map(|_| FheType::encrypt(rng.gen::<u64>(), client_key))
.collect::<Vec<_>>();

b.iter(|| {
let (last_new_from, last_new_to) = std::iter::zip(
from_amounts.iter(),
std::iter::zip(to_amounts.iter(), amounts.iter()),
)
.map(|(from_amount, (to_amount, amount))| {
transfer_func(from_amount, to_amount, amount)
})
.last()
.unwrap();

// Wait on last result to enforce all computation is over
last_new_from.wait();
criterion::black_box(last_new_from);
last_new_to.wait();
criterion::black_box(last_new_to);
});
});

let params = client_key.computation_parameters();

write_to_json::<u64, _>(
&bench_id,
params,
params.name(),
"erc20-transfer",
&OperatorType::Atomic,
64,
vec![],
);
}
}

#[cfg(not(any(feature = "gpu", feature = "hpu")))]
fn main() {
let params = BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
use crate::pbs_stats::print_transfer_pbs_counts;
let params = benchmark::params_aliases::BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;

let config = ConfigBuilder::with_custom_parameters(params).build();
let config = tfhe::ConfigBuilder::with_custom_parameters(params).build();
let cks = ClientKey::generate(config);
let compressed_sks = CompressedServerKey::new(&cks);

@@ -401,7 +515,7 @@ fn main() {
&cks,
"FheUint64",
"transfer::whitepaper",
transfer_whitepaper::<FheUint64>,
par_transfer_whitepaper::<FheUint64>,
);
print_transfer_pbs_counts(&cks, "FheUint64", "no_cmux", transfer_no_cmux::<FheUint64>);
print_transfer_pbs_counts(

@@ -422,7 +536,7 @@ fn main() {
bench_name,
"FheUint64",
"transfer::whitepaper",
transfer_whitepaper::<FheUint64>,
par_transfer_whitepaper::<FheUint64>,
);
bench_transfer_latency(
&mut group,

@@ -461,7 +575,7 @@ fn main() {
bench_name,
"FheUint64",
"transfer::whitepaper",
transfer_whitepaper::<FheUint64>,
par_transfer_whitepaper::<FheUint64>,
);
bench_transfer_throughput(
&mut group,

@@ -496,9 +610,10 @@ fn main() {

#[cfg(feature = "gpu")]
fn main() {
let params = BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
use crate::pbs_stats::print_transfer_pbs_counts;
let params = benchmark::params_aliases::BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;

let config = ConfigBuilder::with_custom_parameters(params).build();
let config = tfhe::ConfigBuilder::with_custom_parameters(params).build();
let cks = ClientKey::generate(config);

let mut c = Criterion::default().sample_size(10).configure_from_args();

@@ -514,7 +629,7 @@ fn main() {
&cks,
"FheUint64",
"transfer::whitepaper",
transfer_whitepaper::<FheUint64>,
par_transfer_whitepaper::<FheUint64>,
);
print_transfer_pbs_counts(&cks, "FheUint64", "no_cmux", transfer_no_cmux::<FheUint64>);
print_transfer_pbs_counts(

@@ -535,7 +650,7 @@ fn main() {
bench_name,
"FheUint64",
"transfer::whitepaper",
transfer_whitepaper::<FheUint64>,
par_transfer_whitepaper::<FheUint64>,
);
bench_transfer_latency(
&mut group,

@@ -574,7 +689,7 @@ fn main() {
bench_name,
"FheUint64",
"transfer::whitepaper",
transfer_whitepaper::<FheUint64>,
par_transfer_whitepaper::<FheUint64>,
);
cuda_bench_transfer_throughput(
&mut group,

@@ -605,3 +720,76 @@ fn main() {

c.final_summary();
}
#[cfg(feature = "hpu")]
fn main() {
let cks = {
// Hpu is enable, start benchmark on Hpu hw accelerator
use tfhe::tfhe_hpu_backend::prelude::*;
use tfhe::Config;

// Use environment variable to construct path to configuration file
let config_path = ShellString::new(
"${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_config.toml".to_string(),
);
let hpu_device = HpuDevice::from_config(&config_path.expand());

let config = Config::from_hpu_device(&hpu_device);
let cks = ClientKey::generate(config);
let compressed_sks = CompressedServerKey::new(&cks);

set_server_key((hpu_device, compressed_sks));
cks
};

let mut c = Criterion::default().sample_size(10).configure_from_args();

let bench_name = "hlapi::hpu::erc20::transfer";

// FheUint64 latency
{
let mut group = c.benchmark_group(bench_name);
bench_transfer_latency(
&mut group,
&cks,
bench_name,
"FheUint64",
"whitepaper",
transfer_whitepaper::<FheUint64>,
);
// Erc20 optimized instruction only available on Hpu
bench_transfer_latency(
&mut group,
&cks,
bench_name,
"FheUint64",
"hpu_optim",
transfer_hpu::<FheUint64>,
);
group.finish();
}

// FheUint64 Throughput
{
let mut group = c.benchmark_group(bench_name);
hpu_bench_transfer_throughput(
&mut group,
&cks,
bench_name,
"FheUint64",
"whitepaper",
transfer_whitepaper::<FheUint64>,
);
// Erc20 optimized instruction only available on Hpu
hpu_bench_transfer_throughput(
&mut group,
&cks,
bench_name,
"FheUint64",
"hpu_optim",
transfer_hpu::<FheUint64>,
);
group.finish();
}

c.final_summary();
}

@@ -2931,6 +2931,323 @@ use cuda::{
unchecked_cuda_ops, unchecked_scalar_cuda_ops,
};

#[cfg(feature = "hpu")]
mod hpu {
use super::*;
use criterion::{black_box, criterion_group};
use tfhe::integer::hpu::ciphertext::HpuRadixCiphertext;
use tfhe::prelude::CastFrom;
use tfhe::tfhe_hpu_backend::prelude::*;

/// Base function to bench an hpu operations.
/// Inputs/Output types and length are inferred based on associated iop prototype
fn bench_hpu_iop_clean_inputs(
c: &mut Criterion,
bench_name: &str,
display_name: &str,
iop: &hpu_asm::AsmIOpcode,
) {
let mut bench_group = c.benchmark_group(bench_name);
bench_group
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(60));
let mut rng = rand::thread_rng();

for (param, num_block, bit_size) in ParamsAndNumBlocksIter::default() {
if bit_size > ScalarType::BITS as usize {
break;
}
let param_name = param.name();

let max_value_for_bit_size = ScalarType::MAX >> (ScalarType::BITS as usize - bit_size);

let bench_id;

let proto = if let Some(format) = iop.format() {
format.proto.clone()
} else {
panic!("HPU only IOp with defined prototype could be benched");
};

match get_bench_type() {
BenchmarkType::Latency => {
bench_id = format!("{bench_name}::{param_name}::{bit_size}_bits");
bench_group.bench_function(&bench_id, |b| {
let (cks, _sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let hpu_device_mutex = KEY_CACHE.get_hpu_device(param);
let hpu_device = hpu_device_mutex.lock().unwrap();

let gen_inputs = || {
let srcs = proto
.src
.iter()
.map(|mode| {
let (bw, block) = match mode {
hpu_asm::iop::VarMode::Native => (bit_size, num_block),
hpu_asm::iop::VarMode::Half => {
(bit_size / 2, num_block / 2)
}
hpu_asm::iop::VarMode::Bool => (1, 1),
};

let clear = rng
.gen_range(0..u128::cast_from(max_value_for_bit_size))
& if bw < u128::BITS as usize {
(1_u128 << bw) - 1
} else {
!0_u128
};
let fhe = cks.encrypt_radix(clear, block);
HpuRadixCiphertext::from_radix_ciphertext(&fhe, &hpu_device)
})
.collect::<Vec<_>>();

let imms = (0..proto.imm)
.map(|_| rng.gen_range(0..u128::cast_from(max_value_for_bit_size)))
.collect::<Vec<_>>();
(srcs, imms)
};

b.iter_batched(
gen_inputs,
|(srcs, imms)| {
let res =
HpuRadixCiphertext::exec(&proto, iop.opcode(), &srcs, &imms);
res.into_iter().for_each(|ct| {
ct.wait();
black_box(ct);
});
},
criterion::BatchSize::SmallInput,
)
});
}
BenchmarkType::Throughput => {
bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
bench_group
.sample_size(10)
.measurement_time(std::time::Duration::from_secs(30));
let elements = throughput_num_threads(num_block, 1);
bench_group.throughput(Throughput::Elements(elements));
bench_group.bench_function(&bench_id, |b| {
let (cks, _sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let hpu_device_mutex = KEY_CACHE.get_hpu_device(param);
let hpu_device = hpu_device_mutex.lock().unwrap();

let inputs = (0..elements)
.map(|_| {
let srcs = proto
.src
.iter()
.map(|mode| {
let (bw, block) = match mode {
hpu_asm::iop::VarMode::Native => (bit_size, num_block),
hpu_asm::iop::VarMode::Half => {
(bit_size / 2, num_block / 2)
}
hpu_asm::iop::VarMode::Bool => (1, 1),
};

let clear = rng
.gen_range(0..u128::cast_from(max_value_for_bit_size))
& if bw < u128::BITS as usize {
(1_u128 << bw) - 1
} else {
!0_u128
};
let fhe = cks.encrypt_radix(clear, block);
HpuRadixCiphertext::from_radix_ciphertext(&fhe, &hpu_device)
})
.collect::<Vec<_>>();

let imms = (0..proto.imm)
.map(|_| {
rng.gen_range(0..u128::cast_from(max_value_for_bit_size))
})
.collect::<Vec<_>>();
(srcs, imms)
})
.collect::<Vec<_>>();

b.iter(|| {
let last_res = inputs
.iter()
.map(|input| {
HpuRadixCiphertext::exec(
&proto,
iop.opcode(),
&input.0,
&input.1,
)
})
.next_back()
.unwrap();
last_res.into_iter().for_each(|ct| {
ct.wait();
black_box(ct);
});
})
});
}
}

write_to_json::<u64, _>(
&bench_id,
param,
param.name(),
display_name,
&OperatorType::Atomic,
bit_size as u32,
vec![param.message_modulus().0.ilog2(); num_block],
);
}

bench_group.finish()
}

macro_rules! define_hpu_bench_default_fn (
(iop_name: $iop:ident, display_name:$name:ident) => {
::paste::paste!{
fn [< default_hpu_ $iop:lower >](c: &mut Criterion) {
bench_hpu_iop_clean_inputs(
c,
concat!("integer::hpu::", stringify!($iop)),
stringify!($name),
&hpu_asm::iop::[< IOP_ $iop:upper >],
)
}
}
}
);

macro_rules! define_hpu_bench_default_fn_scalar (
(iop_name: $iop:ident, display_name:$name:ident) => {
::paste::paste!{
fn [< default_hpu_ $iop:lower >](c: &mut Criterion) {
bench_hpu_iop_clean_inputs(
c,
concat!("integer::hpu::scalar::", stringify!($iop)),
stringify!($name),
&hpu_asm::iop::[< IOP_ $iop:upper >],
)
}
}
}
);

// Alu ------------------------------------------------------------------------
define_hpu_bench_default_fn!(
iop_name: add,
display_name: add
);
define_hpu_bench_default_fn!(
iop_name: sub,
display_name: sub
);
define_hpu_bench_default_fn!(
iop_name: mul,
display_name: mul
);
criterion_group!(
default_hpu_ops,
default_hpu_add,
default_hpu_sub,
default_hpu_mul
);

// Alu Scalar -----------------------------------------------------------------
define_hpu_bench_default_fn_scalar!(
iop_name: adds,
display_name: add
);
define_hpu_bench_default_fn_scalar!(
iop_name: subs,
display_name: sub
);
//define_hpu_bench_default_fn!(
// iop_name: ssub,
// display_name: scalar_sub
//);
define_hpu_bench_default_fn_scalar!(
iop_name: muls,
display_name: mul
);
criterion_group!(
default_hpu_ops_scalar,
default_hpu_adds,
default_hpu_subs,
//default_hpu_ssub,
default_hpu_muls
);
// Bitwise --------------------------------------------------------------------
define_hpu_bench_default_fn!(
iop_name: bw_and,
display_name: bitand
);
define_hpu_bench_default_fn!(
iop_name: bw_or,
display_name: bitor
);
define_hpu_bench_default_fn!(
iop_name: bw_xor,
display_name: bitxor
);
criterion_group!(
default_hpu_bitwise,
default_hpu_bw_and,
default_hpu_bw_or,
default_hpu_bw_xor,
);
// Comparison ----------------------------------------------------------------
define_hpu_bench_default_fn!(
iop_name: cmp_eq,
display_name: equal
);
define_hpu_bench_default_fn!(
iop_name: cmp_neq,
display_name: not_equal
);
define_hpu_bench_default_fn!(
iop_name: cmp_gt,
display_name: greater_than
);
define_hpu_bench_default_fn!(
iop_name: cmp_gte,
display_name: greater_or_equal
);
define_hpu_bench_default_fn!(
iop_name: cmp_lt,
display_name: lower_than
);
define_hpu_bench_default_fn!(
iop_name: cmp_lte,
display_name: lower_or_equal
);
criterion_group!(
default_hpu_cmp,
default_hpu_cmp_eq,
default_hpu_cmp_neq,
default_hpu_cmp_gt,
default_hpu_cmp_gte,
default_hpu_cmp_lt,
default_hpu_cmp_lte,
);
// Ternary --------------------------------------------------------------------
define_hpu_bench_default_fn!(
iop_name: if_then_else,
display_name: if_then_else
);
define_hpu_bench_default_fn!(
iop_name: if_then_zero,
display_name: if_then_zero
);
criterion_group!(
default_hpu_select,
default_hpu_if_then_else,
default_hpu_if_then_zero,
);
}

criterion_group!(
smart_ops,
smart_neg,

@@ -3297,6 +3614,23 @@ fn go_through_gpu_bench_groups(val: &str) {
};
}

#[cfg(feature = "hpu")]
fn go_through_hpu_bench_groups(val: &str) {
match val.to_lowercase().as_str() {
"default" => {
hpu::default_hpu_ops();
hpu::default_hpu_ops_scalar();
hpu::default_hpu_bitwise();
hpu::default_hpu_cmp();
hpu::default_hpu_select();
}
"fast_default" => {
hpu::default_hpu_ops();
}
_ => panic!("unknown benchmark operations flavor"),
};
}

fn go_through_cpu_bench_groups(val: &str) {
match val.to_lowercase().as_str() {
"default" => {

@@ -3336,7 +3670,9 @@ fn main() {
Ok(val) => {
#[cfg(feature = "gpu")]
go_through_gpu_bench_groups(&val);
#[cfg(not(feature = "gpu"))]
#[cfg(feature = "hpu")]
go_through_hpu_bench_groups(&val);
#[cfg(not(any(feature = "gpu", feature = "hpu")))]
go_through_cpu_bench_groups(&val);
}
Err(_) => {

@@ -33,7 +33,8 @@ pub mod shortint_params {
use tfhe::core_crypto::prelude::{DynamicDistribution, LweBskGroupingFactor};
use tfhe::keycache::NamedParam;
use tfhe::shortint::{
CarryModulus, ClassicPBSParameters, MessageModulus, MultiBitPBSParameters, PBSParameters,
AtomicPatternParameters, CarryModulus, ClassicPBSParameters, MessageModulus,
MultiBitPBSParameters,
};

pub const SHORTINT_BENCH_PARAMS_TUNIFORM: [ClassicPBSParameters; 4] = [

@@ -78,7 +79,7 @@ pub mod shortint_params {
.map(|params| {
(
params.name(),
<ClassicPBSParameters as Into<PBSParameters>>::into(*params)
<ClassicPBSParameters as Into<AtomicPatternParameters>>::into(*params)
.to_owned()
.into(),
)

@@ -94,7 +95,7 @@ pub mod shortint_params {
.map(|(params, name)| {
(
name.to_string(),
<ClassicPBSParameters as Into<PBSParameters>>::into(*params)
<ClassicPBSParameters as Into<AtomicPatternParameters>>::into(*params)
.to_owned()
.into(),
)

@@ -111,7 +112,7 @@ pub mod shortint_params {
.map(|params| {
(
params.name(),
<MultiBitPBSParameters as Into<PBSParameters>>::into(*params)
<MultiBitPBSParameters as Into<AtomicPatternParameters>>::into(*params)
.to_owned()
.into(),
)

@@ -132,7 +133,7 @@ pub mod shortint_params {
.map(|(params, name)| {
(
name.to_string(),
<MultiBitPBSParameters as Into<PBSParameters>>::into(*params)
<MultiBitPBSParameters as Into<AtomicPatternParameters>>::into(*params)
.to_owned()
.into(),
)

@@ -150,7 +151,7 @@ pub mod shortint_params {
.map(|params| {
(
params.name(),
<MultiBitPBSParameters as Into<PBSParameters>>::into(*params)
<MultiBitPBSParameters as Into<AtomicPatternParameters>>::into(*params)
.to_owned()
.into(),
params.grouping_factor,

@@ -172,7 +173,7 @@ pub mod shortint_params {
.map(|(params, name)| {
(
name.to_string(),
<MultiBitPBSParameters as Into<PBSParameters>>::into(*params)
<MultiBitPBSParameters as Into<AtomicPatternParameters>>::into(*params)
.to_owned()
.into(),
params.grouping_factor,

@@ -183,7 +184,7 @@ pub mod shortint_params {
}
}

pub fn raw_benchmark_parameters() -> Vec<PBSParameters> {
pub fn raw_benchmark_parameters() -> Vec<AtomicPatternParameters> {
let is_multi_bit = match env::var("__TFHE_RS_PARAM_TYPE") {
Ok(val) => val.to_lowercase() == "multi_bit",
Err(_) => false,

@@ -351,7 +352,7 @@ pub mod shortint_params {
}
}

pub fn filter_parameters<'a, P: Copy + Into<PBSParameters>>(
pub fn filter_parameters<'a, P: Copy + Into<AtomicPatternParameters>>(
params: &[(&'a P, &'a str)],
desired_noise_distribution: DesiredNoiseDistribution,
desired_backend: DesiredBackend,

@@ -359,7 +360,7 @@ pub mod shortint_params {
params
.iter()
.filter_map(|(p, name)| {
let temp_param: PBSParameters = (**p).into();
let temp_param: AtomicPatternParameters = (**p).into();

match (
temp_param.lwe_noise_distribution(),

@@ -391,13 +392,14 @@ mod integer_params {
use crate::utilities::EnvConfig;
use itertools::iproduct;
use std::vec::IntoIter;
use tfhe::shortint::PBSParameters;
use tfhe::shortint::AtomicPatternParameters;

/// An iterator that yields a succession of combinations
/// of parameters and a num_block to achieve a certain bit_size ciphertext
/// in radix decomposition
pub struct ParamsAndNumBlocksIter {
params_and_bit_sizes: itertools::Product<IntoIter<PBSParameters>, IntoIter<usize>>,
params_and_bit_sizes:
itertools::Product<IntoIter<AtomicPatternParameters>, IntoIter<usize>>,
}

impl Default for ParamsAndNumBlocksIter {

@@ -405,23 +407,33 @@ mod integer_params {
let env_config = EnvConfig::new();

if env_config.is_multi_bit {
#[cfg(feature = "gpu")]
let params = vec![
BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128
.into(),
];
#[cfg(not(feature = "gpu"))]
let params = vec![
BENCH_PARAM_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128.into(),
];
#[cfg(feature = "hpu")]
panic!("Hpu doesn't implement MultiBit");

let params_and_bit_sizes = iproduct!(params, env_config.bit_sizes());
Self {
params_and_bit_sizes,
#[cfg(not(feature = "hpu"))]
{
#[cfg(feature = "gpu")]
let params = vec![
BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128
.into(),
];
#[cfg(not(feature = "gpu"))]
let params = vec![
BENCH_PARAM_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128
.into(),
];

let params_and_bit_sizes = iproduct!(params, env_config.bit_sizes());
Self {
params_and_bit_sizes,
}
}
} else {
// FIXME One set of parameter is tested since we want to benchmark only quickest
// operations.
#[cfg(feature = "hpu")]
let params = vec![BENCH_HPU_PARAM_MESSAGE_2_CARRY_2_KS32_PBS_TUNIFORM_2M64.into()];
#[cfg(not(feature = "hpu"))]
let params = vec![BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128.into()];

let params_and_bit_sizes = iproduct!(params, env_config.bit_sizes());

@@ -433,7 +445,7 @@ mod integer_params {
}

impl Iterator for ParamsAndNumBlocksIter {
type Item = (PBSParameters, usize, usize);
type Item = (AtomicPatternParameters, usize, usize);

fn next(&mut self) -> Option<Self::Item> {
let (param, bit_size) = self.params_and_bit_sizes.next()?;

@@ -1,6 +1,8 @@
#[cfg(any(feature = "shortint", feature = "integer"))]
pub mod shortint_params_aliases {
use tfhe::shortint::parameters::current_params::*;
#[cfg(feature = "hpu")]
use tfhe::shortint::parameters::KeySwitch32PBSParameters;
use tfhe::shortint::parameters::{
ClassicPBSParameters, CompactPublicKeyEncryptionParameters, CompressionParameters,
MultiBitPBSParameters, NoiseSquashingParameters, ShortintKeySwitchingParameters,

@@ -136,6 +138,15 @@ pub mod shortint_params_aliases {
pub const BENCH_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128:
NoiseSquashingParameters =
V1_2_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;

#[cfg(feature = "hpu")]
// KS PBS Gaussian for Hpu
pub const BENCH_HPU_PARAM_MESSAGE_2_CARRY_2_KS32_PBS_GAUSSIAN_2M64: KeySwitch32PBSParameters =
V1_2_HPU_PARAM_MESSAGE_2_CARRY_2_KS32_PBS_GAUSSIAN_2M64;
#[cfg(feature = "hpu")]
// KS PBS TUniform
pub const BENCH_HPU_PARAM_MESSAGE_2_CARRY_2_KS32_PBS_TUNIFORM_2M64: KeySwitch32PBSParameters =
V1_2_HPU_PARAM_MESSAGE_2_CARRY_2_KS32_PBS_TUNIFORM_2M64;
}

#[cfg(any(feature = "shortint", feature = "integer"))]

@@ -312,6 +312,7 @@ pub fn write_to_json<

const FAST_BENCH_BIT_SIZES: [usize; 1] = [64];
const BENCH_BIT_SIZES: [usize; 8] = [4, 8, 16, 32, 40, 64, 128, 256];
const HPU_BENCH_BIT_SIZES: [usize; 5] = [8, 16, 32, 64, 128];
const MULTI_BIT_CPU_SIZES: [usize; 6] = [4, 8, 16, 32, 40, 64];

/// User configuration in which benchmarks must be run.

@@ -349,6 +350,8 @@ impl EnvConfig {
} else {
MULTI_BIT_CPU_SIZES.to_vec()
}
} else if cfg!(feature = "hpu") {
HPU_BENCH_BIT_SIZES.to_vec()
} else {
BENCH_BIT_SIZES.to_vec()
}

@@ -397,7 +400,15 @@ pub fn throughput_num_threads(num_block: usize, op_pbs_count: u64) -> u64 {
elements.min(1500) // This threshold is useful for operation with both a small number of
// block and low PBs count.
}
#[cfg(not(feature = "gpu"))]
#[cfg(feature = "hpu")]
{
// NB: unused with HPU
let _ = minimum_loading;
let _ = op_pbs_count;
// Enforce that a minimum of 64 IOp is sent
block_multiplicator.min(64.0) as u64
}
#[cfg(not(any(feature = "gpu", feature = "hpu")))]
{
let num_threads = rayon::current_num_threads() as f64;
let operation_loading = (num_threads / (op_pbs_count as f64)).max(minimum_loading);