Files
tfhe-rs/tfhe-benchmark/src/utilities.rs
Arthur Meyre 205b767fc1 chore: fix various target issues for benchmarks following renames
- Renames were done to make names uniform and to make it easier to set up perf
regression measurements; some names were not updated, and this PR fixes that.
2025-10-20 13:45:27 +02:00

747 lines
27 KiB
Rust

use serde::Serialize;
use std::path::PathBuf;
use std::sync::OnceLock;
use std::{env, fs};
#[cfg(feature = "gpu")]
use tfhe::core_crypto::gpu::{get_number_of_gpus, get_number_of_sms};
use tfhe::core_crypto::prelude::*;
#[cfg(feature = "integer")]
use tfhe::prelude::*;
#[cfg(feature = "boolean")]
pub mod boolean_utils {
use super::*;
use tfhe::boolean::parameters::BooleanParameters;
/// Converts boolean parameters into the generic benchmark record.
///
/// Only the fields read from `params` below are filled in; every other field keeps its
/// `Default` value. The ciphertext modulus is always recorded as the native `u32` modulus.
impl From<BooleanParameters> for CryptoParametersRecord<u32> {
    fn from(params: BooleanParameters) -> Self {
        let native_modulus = CiphertextModulus::<u32>::new_native();
        Self {
            // Dimensions and polynomial size.
            lwe_dimension: Some(params.lwe_dimension),
            glwe_dimension: Some(params.glwe_dimension),
            polynomial_size: Some(params.polynomial_size),
            // Noise distributions.
            lwe_noise_distribution: Some(params.lwe_noise_distribution),
            glwe_noise_distribution: Some(params.glwe_noise_distribution),
            // Bootstrap and keyswitch decompositions.
            pbs_base_log: Some(params.pbs_base_log),
            pbs_level: Some(params.pbs_level),
            ks_base_log: Some(params.ks_base_log),
            ks_level: Some(params.ks_level),
            ciphertext_modulus: Some(native_modulus),
            ..Self::default()
        }
    }
}
}
#[allow(unused_imports)]
#[cfg(feature = "boolean")]
pub use boolean_utils::*;
#[cfg(feature = "shortint")]
pub mod shortint_utils {
use super::*;
use tfhe::shortint::parameters::compact_public_key_only::CompactPublicKeyEncryptionParameters;
use tfhe::shortint::parameters::list_compression::CompressionParameters;
use tfhe::shortint::parameters::{
NoiseSquashingCompressionParameters, NoiseSquashingParameters,
ShortintKeySwitchingParameters,
};
use tfhe::shortint::{AtomicPatternParameters, PBSParameters};
/// Records PBS parameters by first converting them to [`AtomicPatternParameters`] and then
/// reusing that type's conversion.
impl From<PBSParameters> for CryptoParametersRecord<u64> {
    fn from(params: PBSParameters) -> Self {
        let atomic_pattern_params = AtomicPatternParameters::from(params);
        Self::from(atomic_pattern_params)
    }
}
impl From<AtomicPatternParameters> for CryptoParametersRecord<u64> {
fn from(params: AtomicPatternParameters) -> Self {
CryptoParametersRecord {
lwe_dimension: Some(params.lwe_dimension()),
glwe_dimension: Some(params.glwe_dimension()),
polynomial_size: Some(params.polynomial_size()),
lwe_noise_distribution: Some(params.lwe_noise_distribution()),
glwe_noise_distribution: Some(params.glwe_noise_distribution()),
pbs_base_log: Some(params.pbs_base_log()),
pbs_level: Some(params.pbs_level()),
ks_base_log: Some(params.ks_base_log()),
ks_level: Some(params.ks_level()),
message_modulus: Some(params.message_modulus().0),
carry_modulus: Some(params.carry_modulus().0),
ciphertext_modulus: Some(
params
.ciphertext_modulus()
.try_to()
.expect("failed to convert ciphertext modulus"),
),
error_probability: Some(2f64.powf(params.log2_p_fail())),
..Default::default()
}
}
}
impl From<ShortintKeySwitchingParameters> for CryptoParametersRecord<u64> {
fn from(params: ShortintKeySwitchingParameters) -> Self {
CryptoParametersRecord {
ks_base_log: Some(params.ks_base_log),
ks_level: Some(params.ks_level),
..Default::default()
}
}
}
impl From<CompactPublicKeyEncryptionParameters> for CryptoParametersRecord<u64> {
fn from(params: CompactPublicKeyEncryptionParameters) -> Self {
CryptoParametersRecord {
message_modulus: Some(params.message_modulus.0),
carry_modulus: Some(params.carry_modulus.0),
ciphertext_modulus: Some(params.ciphertext_modulus),
..Default::default()
}
}
}
impl From<(CompressionParameters, AtomicPatternParameters)> for CryptoParametersRecord<u64> {
fn from(
(comp_params, pbs_params): (CompressionParameters, AtomicPatternParameters),
) -> Self {
CryptoParametersRecord {
lwe_dimension: Some(pbs_params.lwe_dimension()),
br_level: Some(comp_params.br_level),
br_base_log: Some(comp_params.br_base_log),
packing_ks_level: Some(comp_params.packing_ks_level),
packing_ks_base_log: Some(comp_params.packing_ks_base_log),
packing_ks_polynomial_size: Some(comp_params.packing_ks_polynomial_size),
packing_ks_glwe_dimension: Some(comp_params.packing_ks_glwe_dimension),
lwe_per_glwe: Some(comp_params.lwe_per_glwe),
storage_log_modulus: Some(comp_params.storage_log_modulus),
lwe_noise_distribution: Some(pbs_params.encryption_noise_distribution()),
packing_ks_key_noise_distribution: Some(
comp_params.packing_ks_key_noise_distribution,
),
ciphertext_modulus: Some(pbs_params.ciphertext_modulus()),
error_probability: Some(2f64.powf(pbs_params.log2_p_fail())),
..Default::default()
}
}
}
impl From<(NoiseSquashingParameters, AtomicPatternParameters)> for CryptoParametersRecord<u64> {
fn from(
(noise_squash_params, pbs_params): (NoiseSquashingParameters, AtomicPatternParameters),
) -> Self {
CryptoParametersRecord {
lwe_dimension: Some(pbs_params.lwe_dimension()),
glwe_dimension: Some(noise_squash_params.glwe_dimension()),
polynomial_size: Some(noise_squash_params.polynomial_size()),
pbs_level: Some(noise_squash_params.decomp_level_count()),
pbs_base_log: Some(noise_squash_params.decomp_base_log()),
lwe_noise_distribution: Some(pbs_params.encryption_noise_distribution()),
message_modulus: Some(noise_squash_params.message_modulus().0),
carry_modulus: Some(noise_squash_params.carry_modulus().0),
error_probability: Some(2f64.powf(pbs_params.log2_p_fail())),
..Default::default()
}
}
}
impl From<(NoiseSquashingCompressionParameters, AtomicPatternParameters)>
for CryptoParametersRecord<u64>
{
fn from(
(comp_params, pbs_params): (
NoiseSquashingCompressionParameters,
AtomicPatternParameters,
),
) -> Self {
CryptoParametersRecord {
lwe_dimension: Some(pbs_params.lwe_dimension()),
br_level: None,
br_base_log: None,
packing_ks_level: Some(comp_params.packing_ks_level),
packing_ks_base_log: Some(comp_params.packing_ks_base_log),
packing_ks_polynomial_size: Some(comp_params.packing_ks_polynomial_size),
packing_ks_glwe_dimension: Some(comp_params.packing_ks_glwe_dimension),
lwe_per_glwe: Some(comp_params.lwe_per_glwe),
storage_log_modulus: Some(comp_params.ciphertext_modulus.into_modulus_log()),
lwe_noise_distribution: Some(pbs_params.encryption_noise_distribution()),
packing_ks_key_noise_distribution: None,
ciphertext_modulus: Some(pbs_params.ciphertext_modulus()),
error_probability: Some(2f64.powf(pbs_params.log2_p_fail())),
..Default::default()
}
}
}
}
#[allow(unused_imports)]
#[cfg(feature = "shortint")]
pub use shortint_utils::*;
/// Flat, serializable record of the cryptographic parameters attached to a benchmark.
///
/// Every field is optional: the `From` conversions in this file fill only the fields their
/// source parameter type provides and leave the rest at their `Default` value (`None`).
/// Fields are serialized in declaration order, so reordering them changes the emitted JSON.
#[derive(Clone, Copy, Default, Serialize)]
pub struct CryptoParametersRecord<Scalar: UnsignedInteger> {
pub lwe_dimension: Option<LweDimension>,
pub glwe_dimension: Option<GlweDimension>,
pub packing_ks_glwe_dimension: Option<GlweDimension>,
pub polynomial_size: Option<PolynomialSize>,
pub packing_ks_polynomial_size: Option<PolynomialSize>,
// The three noise distributions below are serialized as strings through
// `serialize_distribution` (e.g. "Gaussian(std, mean)" or "TUniform(bound_log2)").
#[serde(serialize_with = "CryptoParametersRecord::serialize_distribution")]
pub lwe_noise_distribution: Option<DynamicDistribution<Scalar>>,
#[serde(serialize_with = "CryptoParametersRecord::serialize_distribution")]
pub glwe_noise_distribution: Option<DynamicDistribution<Scalar>>,
#[serde(serialize_with = "CryptoParametersRecord::serialize_distribution")]
pub packing_ks_key_noise_distribution: Option<DynamicDistribution<Scalar>>,
pub pbs_base_log: Option<DecompositionBaseLog>,
pub pbs_level: Option<DecompositionLevelCount>,
pub ks_base_log: Option<DecompositionBaseLog>,
pub ks_level: Option<DecompositionLevelCount>,
// NOTE(review): the `pfks_*` and `cbs_*` fields are not set by any conversion visible in this
// file; presumably they are filled directly by callers — confirm.
pub pfks_level: Option<DecompositionLevelCount>,
pub pfks_base_log: Option<DecompositionBaseLog>,
pub pfks_std_dev: Option<StandardDev>,
pub cbs_level: Option<DecompositionLevelCount>,
pub cbs_base_log: Option<DecompositionBaseLog>,
// `br_*`, `packing_ks_*`, `lwe_per_glwe` and `storage_log_modulus` are filled by the
// compression-related conversions in this file.
pub br_level: Option<DecompositionLevelCount>,
pub br_base_log: Option<DecompositionBaseLog>,
pub packing_ks_level: Option<DecompositionLevelCount>,
pub packing_ks_base_log: Option<DecompositionBaseLog>,
// Inner integer value (`.0`) of the shortint message / carry modulus.
pub message_modulus: Option<u64>,
pub carry_modulus: Option<u64>,
pub ciphertext_modulus: Option<CiphertextModulus<Scalar>>,
pub lwe_per_glwe: Option<LweCiphertextCount>,
pub storage_log_modulus: Option<CiphertextModulusLog>,
// Failure probability itself (the shortint conversions store `2^log2_p_fail`).
pub error_probability: Option<f64>,
}
impl<Scalar: UnsignedInteger> CryptoParametersRecord<Scalar> {
    /// Renders a noise distribution as a short human-readable string.
    ///
    /// Gaussian distributions become `Gaussian(<std>, <mean>)` and TUniform distributions become
    /// `TUniform(<bound_log2>)`.
    pub fn noise_distribution_as_string(noise_distribution: DynamicDistribution<Scalar>) -> String {
        match noise_distribution {
            DynamicDistribution::Gaussian(gaussian) => {
                let (std, mean) = (gaussian.std, gaussian.mean);
                format!("Gaussian({}, {})", std, mean)
            }
            DynamicDistribution::TUniform(t_uniform) => {
                let bound_log2 = t_uniform.bound_log2();
                format!("TUniform({})", bound_log2)
            }
        }
    }

    /// Serde helper for the optional noise distribution fields.
    ///
    /// A present distribution is serialized as the string produced by
    /// [`Self::noise_distribution_as_string`]; an absent one is serialized as `none`.
    pub fn serialize_distribution<S>(
        noise_distribution: &Option<DynamicDistribution<Scalar>>,
        serializer: S,
    ) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        if let Some(distribution) = noise_distribution {
            let rendered = Self::noise_distribution_as_string(*distribution);
            serializer.serialize_some(&rendered)
        } else {
            serializer.serialize_none()
        }
    }
}
/// Polynomial multiplication method recorded with a benchmark.
///
/// `write_to_json` always records `Fft`; the variant name is what gets serialized.
#[derive(Serialize)]
enum PolynomialMultiplication {
Fft,
// Ntt,
}
/// Integer representation recorded with a benchmark.
///
/// `write_to_json` always records `Radix`; the variant name is what gets serialized.
#[derive(Serialize)]
enum IntegerRepresentation {
Radix,
// Crt,
// Hybrid,
}
/// Execution mode recorded with a benchmark.
///
/// `write_to_json` records `Parallel` when the benchmark id contains "parallelized" and
/// `Sequential` otherwise.
#[derive(Serialize)]
enum ExecutionType {
Sequential,
Parallel,
}
/// Key set kind recorded with a benchmark.
///
/// `write_to_json` always records `Single`; the variant name is what gets serialized.
#[derive(Serialize)]
enum KeySetType {
Single,
// Multi,
}
/// Operand kind recorded with a benchmark.
///
/// `write_to_json` records `PlainText` when the benchmark id contains "scalar" and `CipherText`
/// otherwise.
#[derive(Serialize)]
enum OperandType {
CipherText,
PlainText,
}
/// Operator kind recorded with a benchmark; chosen by the caller of `write_to_json`.
///
/// Only `Atomic` exists at the moment; the variant name is what gets serialized.
#[derive(Clone, Serialize)]
pub enum OperatorType {
Atomic,
// AtomicPattern,
}
/// Full description of one benchmark, serialized to JSON by `write_to_json`.
///
/// Fields are serialized in declaration order, so reordering them changes the emitted JSON.
#[derive(Serialize)]
struct BenchmarkParametersRecord<Scalar: UnsignedInteger> {
display_name: String,
crypto_parameters_alias: String,
crypto_parameters: CryptoParametersRecord<Scalar>,
// Copies of the corresponding `crypto_parameters` fields.
message_modulus: Option<u64>,
carry_modulus: Option<u64>,
// `write_to_json` always writes 64 here.
ciphertext_modulus: usize,
bit_size: u32,
polynomial_multiplication: PolynomialMultiplication,
// Base-2 logarithm of the message modulus (a modulus of 2 is assumed when it is absent).
precision: u32,
// Falls back to 2^-41 when the crypto parameters carry no error probability.
error_probability: f64,
integer_representation: IntegerRepresentation,
decomposition_basis: Vec<u32>,
// Currently always `None` in `write_to_json`.
pbs_algorithm: Option<String>,
execution_type: ExecutionType,
key_set_type: KeySetType,
operand_type: OperandType,
operator_type: OperatorType,
}
/// Writes benchmarks parameters to disk in JSON format.
///
/// The record is written to `benchmarks_parameters/<bench_id>/parameters.json`, relative to the
/// current working directory; missing directories are created.
///
/// * `bench_id` - benchmark identifier. It also drives two recorded fields: an id containing
///   "parallelized" is recorded as parallel execution, and an id containing "scalar" is recorded
///   as having a plaintext operand.
/// * `params` - anything convertible into a [`CryptoParametersRecord`].
/// * `params_alias` - name of the parameter set.
/// * `display_name` - human-readable name of the benchmarked operation.
/// * `operator_type` - operator kind to record.
/// * `bit_size` - bit size of the benchmarked values.
/// * `decomposition_basis` - decomposition basis to record.
///
/// # Panics
///
/// Panics if the message modulus is zero, if the directory or file cannot be written, or if the
/// record cannot be serialized.
pub fn write_to_json<
    Scalar: UnsignedInteger + Serialize,
    T: Into<CryptoParametersRecord<Scalar>>,
>(
    bench_id: &str,
    params: T,
    params_alias: impl Into<String>,
    display_name: impl Into<String>,
    operator_type: &OperatorType,
    bit_size: u32,
    decomposition_basis: Vec<u32>,
) {
    let params = params.into();

    let execution_type = if bench_id.contains("parallelized") {
        ExecutionType::Parallel
    } else {
        ExecutionType::Sequential
    };
    let operand_type = if bench_id.contains("scalar") {
        OperandType::PlainText
    } else {
        OperandType::CipherText
    };

    // Take the logarithm on the `u64` value directly: narrowing to `u32` first would truncate
    // any modulus >= 2^32 and yield a wrong precision (or a panic when it truncates to zero).
    let precision = params
        .message_modulus
        .unwrap_or(2)
        .checked_ilog2()
        .expect("message modulus must be non-zero to derive a precision");

    let record = BenchmarkParametersRecord {
        display_name: display_name.into(),
        crypto_parameters_alias: params_alias.into(),
        message_modulus: params.message_modulus,
        carry_modulus: params.carry_modulus,
        ciphertext_modulus: 64,
        bit_size,
        polynomial_multiplication: PolynomialMultiplication::Fft,
        precision,
        error_probability: params.error_probability.unwrap_or(2f64.powf(-41.0)),
        integer_representation: IntegerRepresentation::Radix,
        decomposition_basis,
        pbs_algorithm: None, // To be added in future version
        execution_type,
        key_set_type: KeySetType::Single,
        operand_type,
        operator_type: operator_type.clone(),
        // Moved last so the field reads above do not depend on the record being `Copy`.
        crypto_parameters: params,
    };

    let params_directory: PathBuf = ["benchmarks_parameters", bench_id].iter().collect();
    fs::create_dir_all(&params_directory).unwrap_or_else(|err| {
        panic!(
            "failed to create directory {}: {err}",
            params_directory.display()
        )
    });

    let params_file = params_directory.join("parameters.json");
    let json = serde_json::to_string(&record)
        .expect("failed to serialize benchmark parameters record");
    fs::write(&params_file, json)
        .unwrap_or_else(|err| panic!("failed to write {}: {err}", params_file.display()));
}
/// Bit sizes benchmarked when the fast-bench mode is enabled.
const FAST_BENCH_BIT_SIZES: [usize; 1] = [64];
/// Default bit sizes when the `gpu` feature is disabled.
#[cfg(not(feature = "gpu"))]
const BENCH_BIT_SIZES: [usize; 8] = [2, 8, 16, 32, 40, 64, 128, 256];
/// Default bit sizes when the `gpu` feature is enabled.
#[cfg(feature = "gpu")]
const BENCH_BIT_SIZES: [usize; 7] = [8, 16, 32, 40, 64, 128, 256];
/// Bit sizes benchmarked when the `hpu` feature is enabled.
const HPU_BENCH_BIT_SIZES: [usize; 5] = [8, 16, 32, 64, 128];
/// Bit sizes benchmarked for multi-bit parameters without the `gpu` feature.
const MULTI_BIT_CPU_SIZES: [usize; 5] = [8, 16, 32, 40, 64];

/// User configuration in which benchmarks must be run.
#[derive(Default)]
pub struct EnvConfig {
    /// Set when `__TFHE_RS_PARAM_TYPE` equals "multi_bit" (case-insensitive).
    pub is_multi_bit: bool,
    /// Set when `__TFHE_RS_FAST_BENCH` equals "true" (case-insensitive).
    pub is_fast_bench: bool,
}

impl EnvConfig {
    /// Builds the configuration from the `__TFHE_RS_PARAM_TYPE` and `__TFHE_RS_FAST_BENCH`
    /// environment variables. A variable that cannot be read counts as "flag not set".
    pub fn new() -> Self {
        let flag_matches = |var_name: &str, expected: &str| {
            env::var(var_name)
                .map(|value| value.to_lowercase() == expected)
                .unwrap_or(false)
        };

        Self {
            is_multi_bit: flag_matches("__TFHE_RS_PARAM_TYPE", "multi_bit"),
            is_fast_bench: flag_matches("__TFHE_RS_FAST_BENCH", "true"),
        }
    }

    /// Get precisions values to benchmark.
    ///
    /// Fast-bench mode takes precedence over everything else, then multi-bit mode, then the
    /// `hpu` feature; otherwise the default list is returned.
    pub fn bit_sizes(&self) -> Vec<usize> {
        let sizes: &[usize] = match (self.is_fast_bench, self.is_multi_bit) {
            (true, _) => &FAST_BENCH_BIT_SIZES[..],
            (false, true) if cfg!(feature = "gpu") => &BENCH_BIT_SIZES[..],
            (false, true) => &MULTI_BIT_CPU_SIZES[..],
            (false, false) if cfg!(feature = "hpu") => &HPU_BENCH_BIT_SIZES[..],
            (false, false) => &BENCH_BIT_SIZES[..],
        };
        sizes.to_vec()
    }
}
/// Process-wide cache of the benchmark type, filled on the first call to [`get_bench_type`].
pub static BENCH_TYPE: OnceLock<BenchmarkType> = OnceLock::new();

/// Kind of measurement requested through the `__TFHE_RS_BENCH_TYPE` environment variable.
pub enum BenchmarkType {
    Latency,
    Throughput,
}

impl BenchmarkType {
    /// Reads the benchmark type from `__TFHE_RS_BENCH_TYPE` (case-insensitive).
    ///
    /// A variable that cannot be read is treated as "latency".
    ///
    /// # Errors
    ///
    /// Returns a message quoting the raw value when it is neither "latency" nor "throughput".
    pub fn from_env() -> Result<Self, String> {
        let raw_value =
            env::var("__TFHE_RS_BENCH_TYPE").unwrap_or_else(|_| String::from("latency"));
        let normalized = raw_value.to_lowercase();

        if normalized == "latency" {
            Ok(Self::Latency)
        } else if normalized == "throughput" {
            Ok(Self::Throughput)
        } else {
            Err(format!("benchmark type '{raw_value}' is not supported"))
        }
    }
}

/// Returns the benchmark type, reading the environment only on the first call.
///
/// # Panics
///
/// Panics on the first call if [`BenchmarkType::from_env`] returns an error.
pub fn get_bench_type() -> &'static BenchmarkType {
    let read_from_env = || BenchmarkType::from_env().unwrap();
    BENCH_TYPE.get_or_init(read_from_env)
}
/// Generate a number of threads to use to saturate current machine for throughput measurements.
///
/// * `num_block` - number of blocks of the benchmarked ciphertext.
/// * `op_pbs_count` - number of PBS performed by one operation. A value of zero is treated as
///   one, so that operations performing no PBS do not trigger a division by zero.
///
/// The computation depends on the enabled backend feature (`gpu`, `hpu`, or neither).
pub fn throughput_num_threads(num_block: usize, op_pbs_count: u64) -> u64 {
    // Without this clamp a zero PBS count panics on the integer division of the GPU path and
    // saturates the CPU path to `u64::MAX` (float division by zero followed by `as u64`).
    let op_pbs_count = op_pbs_count.max(1);

    let ref_block_count = 32; // Represent a ciphertext of 64 bits for 2_2 parameters set
    // NOTE(review): after `ceil()` the ratio is always >= 1, so `.min(1.0)` makes this factor
    // 1.0 for every `num_block`. Kept as is; confirm whether another clamp was intended.
    let block_multiplicator = (ref_block_count as f64 / num_block as f64).ceil().min(1.0);
    // Some operations with a high serial workload (e.g. division) would yield an operation
    // loading value so low that the number of elements in the end wouldn't be meaningful.
    let minimum_loading = if num_block < 64 { 1.0 } else { 0.015 };

    #[cfg(feature = "gpu")]
    {
        let num_sms_per_gpu = get_number_of_sms();
        let total_num_sm = num_sms_per_gpu * get_number_of_gpus();
        let total_blocks_per_sm = 4u64; // Assume each SM can handle 4 blocks concurrently
        let min_num_waves = 4u64; //Enforce at least 4 waves in the GPU
        // Clamped to 1 so that `num_block == 0` cannot cause a division by zero below.
        let block_factor = (((2.0f64 * num_block as f64) / 4.0f64).ceil() as u64).max(1);
        let elements_per_wave = total_blocks_per_sm * total_num_sm as u64 / block_factor;
        // We need to enable the new load for pbs benches and for sizes larger than 16 blocks in
        // demanding operations for the rest of operations we maintain a minimum of 200
        // elements
        let min_elements = if op_pbs_count == 1
            || (op_pbs_count > (num_block * num_block) as u64 && num_block >= 16)
        {
            elements_per_wave * min_num_waves
        } else {
            200u64
        };
        let operation_loading = ((total_num_sm as u64 / op_pbs_count) as f64).max(minimum_loading);
        let elements = (total_num_sm as f64 * block_multiplicator * operation_loading) as u64;
        // This threshold is useful for operation with both a small number of block and low PBs
        // count.
        // NOTE(review): despite its name, `min_elements` acts as an upper bound here
        // (`min`, not `max`); confirm this is the intended direction.
        elements.min(min_elements)
    }
    #[cfg(feature = "hpu")]
    {
        // NB: unused with HPU
        let _ = minimum_loading;
        let _ = op_pbs_count;
        // Enforce that a minimum of 64 IOp is sent
        // NOTE(review): `block_multiplicator` is at most 1.0, so this evaluates to 1 rather
        // than enforcing a minimum of 64; confirm the intended behavior.
        block_multiplicator.min(64.0) as u64
    }
    #[cfg(not(any(feature = "gpu", feature = "hpu")))]
    {
        let num_threads = rayon::current_num_threads() as f64;
        let operation_loading = (num_threads / (op_pbs_count as f64)).max(minimum_loading);
        // Add 20% more to maximum threads available.
        ((num_threads + (num_threads * 0.2)) * block_multiplicator.min(1.0) * operation_loading)
            as u64
    }
}
/// Given an `Op` this returns how many more ops should be done in parallel
/// to saturate the CPU and have a better throughput measurement.
///
/// `op` is run once to measure its duration and its PBS count. The duration of a single PBS is
/// estimated by timing one `FheBool` AND encrypted with `cks`. The result is the ratio between
/// the measured duration and the duration the op would take if all of its PBS ran fully in
/// parallel on the rayon thread pool.
///
/// An `op` that performs no PBS is handled by clamping the theoretical duration to 1 ms instead
/// of dividing by zero.
#[cfg(all(feature = "integer", feature = "pbs-stats"))]
pub fn hlapi_throughput_num_ops<Op>(op: Op, cks: &tfhe::ClientKey) -> usize
where
    Op: FnOnce(),
{
    tfhe::reset_pbs_count();
    let t = std::time::Instant::now();
    op();
    let time_for_op = t.elapsed();
    let pbs_count_for_op = tfhe::get_pbs_count();

    let a = tfhe::FheBool::encrypt(true, cks);
    let b = tfhe::FheBool::encrypt(true, cks);
    let t = std::time::Instant::now();
    let _ = a & b;
    let time_for_single_pbs = t.elapsed();

    // Round-up with nano seconds
    let pbs_time_in_ms =
        time_for_single_pbs.as_millis() + u128::from(time_for_single_pbs.as_nanos() != 0);

    // Theoretical time if the op was just 1 layer of PBS all in parallel
    let pbs_layers = pbs_count_for_op.div_ceil(rayon::current_num_threads() as u64) as u128;
    // Clamp to 1 ms: a zero PBS count would otherwise make the division below panic.
    let time_if_full_occupancy = (pbs_layers * pbs_time_in_ms).max(1);

    // Then find how many ops we should do to have full occupancy
    let factor = time_for_op.as_millis().div_ceil(time_if_full_occupancy);
    factor as usize
}
#[cfg(feature = "gpu")]
mod cuda_utils {
use tfhe::core_crypto::entities::{
LweBootstrapKeyOwned, LweKeyswitchKeyOwned, LweMultiBitBootstrapKeyOwned,
LwePackingKeyswitchKeyOwned,
};
use tfhe::core_crypto::gpu::lwe_bootstrap_key::{
CudaLweBootstrapKey, CudaModulusSwitchNoiseReductionConfiguration,
};
use tfhe::core_crypto::gpu::lwe_keyswitch_key::CudaLweKeyswitchKey;
use tfhe::core_crypto::gpu::lwe_multi_bit_bootstrap_key::CudaLweMultiBitBootstrapKey;
use tfhe::core_crypto::gpu::lwe_packing_keyswitch_key::CudaLwePackingKeyswitchKey;
use tfhe::core_crypto::gpu::vec::{CudaVec, GpuIndex};
use tfhe::core_crypto::gpu::{get_number_of_gpus, CudaStreams};
use tfhe::core_crypto::prelude::{Numeric, UnsignedInteger};
/// Largest polynomial size supported on GPU, going by the constant's name.
/// NOTE(review): not referenced in this file; confirm the limit against the GPU backend.
pub const GPU_MAX_SUPPORTED_POLYNOMIAL_SIZE: usize = 16384;
/// Get vector of CUDA streams that can be directly used for throughput benchmarks in
/// core_crypto layer.
///
/// One single-GPU stream set is created per GPU reported by `get_number_of_gpus`, in GPU index
/// order.
pub fn cuda_local_streams_core() -> Vec<CudaStreams> {
    let gpu_count = get_number_of_gpus();
    let mut streams = Vec::with_capacity(gpu_count as usize);
    for gpu_index in 0..gpu_count {
        streams.push(CudaStreams::new_single_gpu(GpuIndex::new(gpu_index)));
    }
    streams
}
/// Computing keys in their CPU flavor.
///
/// Every key is optional; instances are assembled through [`CpuKeysBuilder`].
pub struct CpuKeys<T: UnsignedInteger> {
    /// Keyswitch key, if any.
    ksk: Option<LweKeyswitchKeyOwned<T>>,
    /// Packing keyswitch key, if any.
    pksk: Option<LwePackingKeyswitchKeyOwned<T>>,
    /// Bootstrap key, if any.
    bsk: Option<LweBootstrapKeyOwned<T>>,
    /// Multi-bit bootstrap key, if any.
    multi_bit_bsk: Option<LweMultiBitBootstrapKeyOwned<T>>,
}

impl<T: UnsignedInteger> CpuKeys<T> {
    /// Returns an empty builder.
    pub fn builder() -> CpuKeysBuilder<T> {
        CpuKeysBuilder::default()
    }
}
/// Builder collecting the optional CPU keys before producing a [`CpuKeys`].
pub struct CpuKeysBuilder<T: UnsignedInteger> {
    ksk: Option<LweKeyswitchKeyOwned<T>>,
    pksk: Option<LwePackingKeyswitchKeyOwned<T>>,
    bsk: Option<LweBootstrapKeyOwned<T>>,
    multi_bit_bsk: Option<LweMultiBitBootstrapKeyOwned<T>>,
}

impl<T: UnsignedInteger> CpuKeysBuilder<T> {
    /// Creates a builder with no key set.
    pub fn new() -> Self {
        Self {
            ksk: None,
            pksk: None,
            bsk: None,
            multi_bit_bsk: None,
        }
    }

    /// Sets the keyswitch key.
    pub fn keyswitch_key(self, ksk: LweKeyswitchKeyOwned<T>) -> Self {
        Self {
            ksk: Some(ksk),
            ..self
        }
    }

    /// Sets the packing keyswitch key.
    pub fn packing_keyswitch_key(self, pksk: LwePackingKeyswitchKeyOwned<T>) -> Self {
        Self {
            pksk: Some(pksk),
            ..self
        }
    }

    /// Sets the bootstrap key.
    pub fn bootstrap_key(self, bsk: LweBootstrapKeyOwned<T>) -> Self {
        Self {
            bsk: Some(bsk),
            ..self
        }
    }

    /// Sets the multi-bit bootstrap key.
    pub fn multi_bit_bootstrap_key(self, mb_bsk: LweMultiBitBootstrapKeyOwned<T>) -> Self {
        Self {
            multi_bit_bsk: Some(mb_bsk),
            ..self
        }
    }

    /// Consumes the builder and returns the collected keys.
    pub fn build(self) -> CpuKeys<T> {
        let Self {
            ksk,
            pksk,
            bsk,
            multi_bit_bsk,
        } = self;
        CpuKeys {
            ksk,
            pksk,
            bsk,
            multi_bit_bsk,
        }
    }
}

impl<T: UnsignedInteger> Default for CpuKeysBuilder<T> {
    /// Same as [`CpuKeysBuilder::new`].
    fn default() -> Self {
        Self::new()
    }
}
/// Computing keys in their Cuda flavor.
///
/// Each field mirrors the field of the same name in [`CpuKeys`].
#[allow(dead_code)]
pub struct CudaLocalKeys<T: UnsignedInteger> {
    pub ksk: Option<CudaLweKeyswitchKey<T>>,
    pub pksk: Option<CudaLwePackingKeyswitchKey<T>>,
    pub bsk: Option<CudaLweBootstrapKey>,
    pub multi_bit_bsk: Option<CudaLweMultiBitBootstrapKey<T>>,
}

#[allow(dead_code)]
impl<T: UnsignedInteger> CudaLocalKeys<T> {
    /// Converts every key present in `cpu_keys` to its Cuda counterpart using `stream`.
    ///
    /// Keys absent from `cpu_keys` stay `None`. `ms_noise_reduction` is only passed to the
    /// bootstrap key conversion.
    pub fn from_cpu_keys(
        cpu_keys: &CpuKeys<T>,
        ms_noise_reduction: Option<CudaModulusSwitchNoiseReductionConfiguration>,
        stream: &CudaStreams,
    ) -> Self {
        let ksk = cpu_keys
            .ksk
            .as_ref()
            .map(|key| CudaLweKeyswitchKey::from_lwe_keyswitch_key(key, stream));
        let pksk = cpu_keys
            .pksk
            .as_ref()
            .map(|key| CudaLwePackingKeyswitchKey::from_lwe_packing_keyswitch_key(key, stream));
        let bsk = cpu_keys.bsk.as_ref().map(|key| {
            CudaLweBootstrapKey::from_lwe_bootstrap_key(key, ms_noise_reduction, stream)
        });
        let multi_bit_bsk = cpu_keys.multi_bit_bsk.as_ref().map(|key| {
            CudaLweMultiBitBootstrapKey::from_lwe_multi_bit_bootstrap_key(key, stream)
        });

        Self {
            ksk,
            pksk,
            bsk,
            multi_bit_bsk,
        }
    }
}
/// Instantiate Cuda computing keys to each available GPU.
///
/// Returns one [`CudaLocalKeys`] per GPU, in GPU index order. Each one is converted from
/// `cpu_keys` on a single-GPU stream set and receives its own clone of `ms_noise_reduction`.
pub fn cuda_local_keys_core<T: UnsignedInteger>(
    cpu_keys: &CpuKeys<T>,
    ms_noise_reduction: Option<CudaModulusSwitchNoiseReductionConfiguration>,
) -> Vec<CudaLocalKeys<T>> {
    (0..get_number_of_gpus())
        .map(|gpu_index| {
            let stream = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
            CudaLocalKeys::from_cpu_keys(cpu_keys, ms_noise_reduction.clone(), &stream)
        })
        .collect()
}
/// Device-side index vectors (input, output and LUT indexes).
pub struct CudaIndexes<T: Numeric> {
pub d_input: CudaVec<T>,
pub d_output: CudaVec<T>,
pub d_lut: CudaVec<T>,
}
impl<T: Numeric> CudaIndexes<T> {
/// Allocates the three device vectors with the same length as `indexes` and fills them.
///
/// `d_input` and `d_output` both receive a copy of `indexes`, while `d_lut` is filled with
/// zeros. The stream is synchronized before returning.
pub fn new(indexes: &[T], stream: &CudaStreams, stream_index: u32) -> Self {
let length = indexes.len();
// NOTE(review): the safety contract of `new_async` is not visible in this file; confirm it is
// upheld by the synchronization below.
let mut d_input = unsafe { CudaVec::<T>::new_async(length, stream, stream_index) };
let mut d_output = unsafe { CudaVec::<T>::new_async(length, stream, stream_index) };
let mut d_lut = unsafe { CudaVec::<T>::new_async(length, stream, stream_index) };
let zeros = vec![T::ZERO; length];
// NOTE(review): presumably sound because `stream.synchronize()` runs before `zeros` is dropped
// and before `indexes` can be invalidated; confirm against the async copy contract.
unsafe {
d_input.copy_from_cpu_async(indexes.as_ref(), stream, stream_index);
d_output.copy_from_cpu_async(indexes.as_ref(), stream, stream_index);
d_lut.copy_from_cpu_async(zeros.as_ref(), stream, stream_index);
}
// Wait for the asynchronous allocations and copies issued above.
stream.synchronize();
Self {
d_input,
d_output,
d_lut,
}
}
}
#[cfg(feature = "integer")]
pub mod cuda_integer_utils {
use tfhe::core_crypto::gpu::{get_number_of_gpus, CudaStreams};
use tfhe::integer::gpu::CudaServerKey;
use tfhe::integer::ClientKey;
use tfhe::{set_server_key, CompressedServerKey, GpuIndex};
/// Get number of streams usable for CUDA throughput benchmarks
///
/// For a power-of-two block count between 2 and 128, each GPU gets `128 / num_block` streams;
/// any other block count gets 8 streams per GPU. The result is that per-GPU count multiplied by
/// the number of GPUs.
fn cuda_num_streams(num_block: usize) -> u64 {
    let num_streams_per_gpu: u32 = match num_block {
        2 | 4 | 8 | 16 | 32 | 64 | 128 => (128 / num_block) as u32,
        _ => 8,
    };
    let total_streams = num_streams_per_gpu * get_number_of_gpus();
    u64::from(total_streams)
}
/// Get vector of CUDA streams that can be directly used for throughput benchmarks.
///
/// Returns `throughput_elements` single-GPU stream sets. Stream slots are numbered from 0 to
/// `cuda_num_streams(num_block) - 1` and reused cyclically; each slot is mapped to the GPU whose
/// index is the slot number modulo the number of GPUs. The result is empty when there is no
/// stream slot.
pub fn cuda_local_streams(num_block: usize, throughput_elements: usize) -> Vec<CudaStreams> {
    let stream_slots = cuda_num_streams(num_block);
    (0..stream_slots)
        .cycle()
        .take(throughput_elements)
        .map(|slot| {
            let gpu_index = (slot % get_number_of_gpus() as u64) as u32;
            CudaStreams::new_single_gpu(GpuIndex::new(gpu_index))
        })
        .collect()
}
/// Instantiate Cuda server key to each available GPU.
///
/// Returns one [`CudaServerKey`] per GPU, in GPU index order, each created from `cks` on a
/// single-GPU stream set.
pub fn cuda_local_keys(cks: &ClientKey) -> Vec<CudaServerKey> {
    (0..get_number_of_gpus())
        .map(|gpu_index| {
            let stream = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
            CudaServerKey::new(cks, &stream)
        })
        .collect()
}
/// Builds a GPU server key from `client_key` and installs it with `set_server_key`.
///
/// The key is installed from inside `rayon::broadcast` (with a clone per invocation) and then
/// on the calling thread.
pub fn configure_gpu(client_key: &tfhe::ClientKey) {
    let gpu_server_key = CompressedServerKey::new(client_key).decompress_to_gpu();
    rayon::broadcast(|_| set_server_key(gpu_server_key.clone()));
    set_server_key(gpu_server_key);
}
}
#[allow(unused_imports)]
#[cfg(feature = "integer")]
pub use cuda_integer_utils::*;
}
#[cfg(feature = "gpu")]
pub use cuda_utils::*;