feat: add KVStore to the high level api

* Added the Value type name to the crate::integer::KVStore impl of the Named
  trait, as well as a bool to check that we deserialize the correct value type
  (Radix vs SignedRadix)
* Add KVStore to high_level_api
* Add KVStore hlapi benches
* Remove the specialized `[add,mul,sub]_to_slot` methods, as `map` is now the
  intended API (see the usage sketch below).
    - mul_to_slot was way slower than using `map`
    - add/sub_to_slot were a bit faster (~5% latency-wise), but returned
      less information (no old_value, no new_value, no boolean to check
      if the key matched)
    - Some known improvements can be made to `map`, which should end up
      making it better than add/sub_to_slot
* Add an FheIntegerType trait to make the KVStore generic over
  FheUint/FheInt, which should also make GPU integration "easy"
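
A minimal usage sketch of the new high-level KVStore, built only from the API that
appears in the diff below (insert_with_clear_key, get, update, map). The exact return
value of `map` is an assumption here: per the notes above it should expose the old
value, the new value and a flag telling whether a key matched. The `+ 1u32` in the
closure assumes the usual high-level scalar operators, and thanks to FheIntegerType an
FheInt value type is expected to work the same way:

    use tfhe::prelude::*;
    use tfhe::{
        set_server_key, ClientKey, CompressedServerKey, ConfigBuilder, FheUint32, FheUint64,
        KVStore,
    };

    fn kv_store_sketch() {
        let config = ConfigBuilder::default().build();
        let cks = ClientKey::generate(config);
        let compressed_sks = CompressedServerKey::new(&cks);
        set_server_key(compressed_sks.decompress());

        // Clear u64 keys mapped to encrypted FheUint32 values
        let mut kv_store = KVStore::new();
        kv_store.insert_with_clear_key(42u64, FheUint32::encrypt(7u32, &cks));

        // Look up and modify slots through an encrypted key
        let encrypted_key = FheUint64::encrypt(42u64, &cks);
        let _current = kv_store.get(&encrypted_key);
        kv_store.update(&encrypted_key, &FheUint32::encrypt(10u32, &cks));

        // `map` applies a function to the slot whose key matches; its result is
        // assumed to carry the old value, the new value and a match flag
        let _result = kv_store.map(&encrypted_key, |v| v + 1u32);
    }
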
Author: Thomas Montaigu
Date: 2025-09-29 11:34:47 +02:00
Committed by: tmontaigu
Parent: 33dee7673c
Commit: e523fd2cb6
17 changed files with 1271 additions and 430 deletions


@@ -70,7 +70,7 @@ required-features = ["shortint", "internal-keycache"]
name = "hlapi"
path = "benches/high_level_api/bench.rs"
harness = false
required-features = ["integer", "internal-keycache"]
required-features = ["integer", "internal-keycache", "pbs-stats"]
[[bench]]
name = "hlapi-erc20"


@@ -1,14 +1,22 @@
use benchmark::utilities::{write_to_json, OperatorType};
use criterion::{black_box, Criterion};
use benchmark::utilities::{hlapi_throughput_num_ops, write_to_json, BenchmarkType, OperatorType};
use criterion::{black_box, Criterion, Throughput};
use rand::prelude::*;
use std::hash::Hash;
use std::marker::PhantomData;
use std::ops::*;
use tfhe::core_crypto::prelude::Numeric;
use tfhe::integer::block_decomposition::DecomposableInto;
use tfhe::keycache::NamedParam;
use tfhe::named::Named;
use tfhe::prelude::*;
use tfhe::{
ClientKey, CompressedServerKey, FheUint10, FheUint12, FheUint128, FheUint14, FheUint16,
FheUint2, FheUint32, FheUint4, FheUint6, FheUint64, FheUint8,
ClientKey, CompressedServerKey, FheIntegerType, FheUint10, FheUint12, FheUint128, FheUint14,
FheUint16, FheUint2, FheUint32, FheUint4, FheUint6, FheUint64, FheUint8, FheUintId, IntegerId,
KVStore,
};
use rayon::prelude::*;
fn bench_fhe_type<FheType>(
c: &mut Criterion,
client_key: &ClientKey,
@@ -225,6 +233,170 @@ bench_type!(FheUint32);
bench_type!(FheUint64);
bench_type!(FheUint128);
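// Helper to display a short type name (e.g. "u64" or "FheUint32") in benchmark ids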
trait TypeDisplay {
fn fmt(f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let name = std::any::type_name::<Self>();
let pos = name.rfind(":").map_or(0, |p| p + 1);
write!(f, "{}", &name[pos..])
}
}
impl TypeDisplay for u8 {}
impl TypeDisplay for u16 {}
impl TypeDisplay for u32 {}
impl TypeDisplay for u64 {}
impl TypeDisplay for u128 {}
impl<Id: FheUintId> TypeDisplay for tfhe::FheUint<Id> {
fn fmt(f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write_fhe_type_name::<Self>(f)
}
}
impl<Id: tfhe::FheIntId> TypeDisplay for tfhe::FheInt<Id> {
fn fmt(f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write_fhe_type_name::<Self>(f)
}
}
struct TypeDisplayer<T: TypeDisplay>(PhantomData<T>);
impl<T: TypeDisplay> Default for TypeDisplayer<T> {
fn default() -> Self {
Self(PhantomData)
}
}
impl<T: TypeDisplay> std::fmt::Display for TypeDisplayer<T> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
T::fmt(f)
}
}
fn write_fhe_type_name<'a, FheType>(f: &mut std::fmt::Formatter<'a>) -> std::fmt::Result
where
FheType: FheIntegerType + Named,
{
let full_name = FheType::NAME;
let i = full_name.rfind(":").map_or(0, |p| p + 1);
write!(f, "{}{}", &full_name[i..], FheType::Id::num_bits())
}
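// Benchmarks KVStore get/update/map (latency or throughput, depending on BenchmarkType)
// on a store filled with `num_elements` random entries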
fn bench_kv_store<Key, FheKey, Value>(c: &mut Criterion, cks: &ClientKey, num_elements: usize)
where
rand::distributions::Standard: rand::distributions::Distribution<Key>,
Key: Numeric + DecomposableInto<u64> + Eq + Hash + CastInto<usize> + TypeDisplay,
Value: FheEncrypt<u128, ClientKey> + FheIntegerType + Clone + Send + Sync + TypeDisplay,
Value::Id: FheUintId,
FheKey: FheEncrypt<Key, ClientKey> + FheIntegerType + Send + Sync,
FheKey::Id: FheUintId,
{
let mut kv_store = KVStore::new();
let mut rng = rand::thread_rng();
let format_id_bench = |op_name: &str| -> String {
format!(
"KVStore::<{}, {}>::{op_name}/{num_elements}",
TypeDisplayer::<Key>::default(),
TypeDisplayer::<Value>::default(),
)
};
match BenchmarkType::from_env().unwrap() {
BenchmarkType::Latency => {
while kv_store.len() != num_elements {
let key = rng.gen::<Key>();
let value = rng.gen::<u128>();
let encrypted_value = Value::encrypt(value, cks);
kv_store.insert_with_clear_key(key, encrypted_value);
}
let key = rng.gen::<Key>();
let encrypted_key = FheKey::encrypt(key, cks);
let value = rng.gen::<u128>();
let value_to_add = Value::encrypt(value, cks);
c.bench_function(&format_id_bench("Get"), |b| {
b.iter(|| {
let _ = kv_store.get(&encrypted_key);
})
});
c.bench_function(&format_id_bench("Update"), |b| {
b.iter(|| {
let _ = kv_store.update(&encrypted_key, &value_to_add);
})
});
c.bench_function(&format_id_bench("Map"), |b| {
b.iter(|| {
kv_store.map(&encrypted_key, |v| v);
})
});
}
BenchmarkType::Throughput => {
while kv_store.len() != num_elements {
let key = rng.gen::<Key>();
let value = rng.gen::<u128>();
let encrypted_value = Value::encrypt(value, cks);
kv_store.insert_with_clear_key(key, encrypted_value);
}
let key = rng.gen::<Key>();
let encrypted_key = FheKey::encrypt(key, cks);
let value = rng.gen::<u128>();
let value_to_add = Value::encrypt(value, cks);
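// Estimate how many stores should be processed in parallel to saturate the CPU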
let factor = hlapi_throughput_num_ops(
|| {
kv_store.map(&encrypted_key, |v| v);
},
cks,
);
let mut kv_stores = vec![];
for _ in 0..factor.saturating_sub(1) {
kv_stores.push(kv_store.clone());
}
kv_stores.push(kv_store);
let mut group = c.benchmark_group("KVStore Throughput");
group.throughput(Throughput::Elements(kv_stores.len() as u64));
group.bench_function(format_id_bench("Map"), |b| {
b.iter(|| {
kv_stores.par_iter_mut().for_each(|kv_store| {
kv_store.map(&encrypted_key, |v| v);
})
})
});
group.bench_function(format_id_bench("Update"), |b| {
b.iter(|| {
kv_stores.par_iter_mut().for_each(|kv_store| {
kv_store.update(&encrypted_key, &value_to_add);
})
})
});
group.bench_function(format_id_bench("Get"), |b| {
b.iter(|| {
kv_stores.par_iter_mut().for_each(|kv_store| {
kv_store.get(&encrypted_key);
})
})
});
group.finish();
}
}
}
fn main() {
#[cfg(feature = "hpu")]
let cks = {
@@ -256,7 +428,9 @@ fn main() {
let cks = ClientKey::generate(config);
let compressed_sks = CompressedServerKey::new(&cks);
set_server_key(compressed_sks.decompress());
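// Install the server key on every rayon worker thread as well as the main thread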
let sks = compressed_sks.decompress();
rayon::broadcast(|_| set_server_key(sks.clone()));
set_server_key(sks);
cks
};
@@ -274,5 +448,17 @@ fn main() {
bench_fhe_uint64(&mut c, &cks);
bench_fhe_uint128(&mut c, &cks);
for pow in 1..=10 {
bench_kv_store::<u64, FheUint64, FheUint32>(&mut c, &cks, 1 << pow);
}
for pow in 1..=10 {
bench_kv_store::<u64, FheUint64, FheUint64>(&mut c, &cks, 1 << pow);
}
for pow in 1..=10 {
bench_kv_store::<u128, FheUint128, FheUint64>(&mut c, &cks, 1 << pow);
}
c.final_summary();
}


@@ -5,6 +5,8 @@ use std::{env, fs};
#[cfg(feature = "gpu")]
use tfhe::core_crypto::gpu::{get_number_of_gpus, get_number_of_sms};
use tfhe::core_crypto::prelude::*;
#[cfg(feature = "integer")]
use tfhe::prelude::*;
#[cfg(feature = "boolean")]
pub mod boolean_utils {
@@ -466,6 +468,39 @@ pub fn throughput_num_threads(num_block: usize, op_pbs_count: u64) -> u64 {
}
}
// Given an `Op`, this returns how many such ops should be run in parallel
// to saturate the CPU and get a better throughput measurement
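// e.g. if the op took 400 ms and needed 640 PBS on a 64-thread machine where a
// single PBS takes 10 ms, full occupancy would take ceil(640 / 64) * 10 ms = 100 ms,
// so ceil(400 / 100) = 4 such ops should run in parallel (illustrative numbers only)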
#[cfg(feature = "integer")]
pub fn hlapi_throughput_num_ops<Op>(op: Op, cks: &tfhe::ClientKey) -> usize
where
Op: FnOnce(),
{
tfhe::reset_pbs_count();
let t = std::time::Instant::now();
op();
let time_for_op = t.elapsed();
let pbs_count_for_op = tfhe::get_pbs_count();
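// Time a single FheBool AND as a proxy for the latency of one PBS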
let a = tfhe::FheBool::encrypt(true, cks);
let b = tfhe::FheBool::encrypt(true, cks);
let t = std::time::Instant::now();
let _ = a & b;
let time_for_single_pbs = t.elapsed();
// Round up to whole milliseconds using the nanosecond count
let pbs_time_in_ms =
time_for_single_pbs.as_millis() + u128::from(time_for_single_pbs.as_nanos() != 0);
// Theoretical time if the op was just 1 layer of PBS all in parallel
let time_if_full_occupancy =
pbs_count_for_op.div_ceil(rayon::current_num_threads() as u64) as u128 * pbs_time_in_ms;
// Then find how many ops we should do to have full occupancy
let factor = time_for_op.as_millis().div_ceil(time_if_full_occupancy);
factor as usize
}
#[cfg(feature = "gpu")]
mod cuda_utils {
use tfhe::core_crypto::entities::{