mirror of
https://github.com/zama-ai/tfhe-rs.git
synced 2026-01-09 22:57:59 -05:00
fix(gpu): fix 128-bit compression benchmark
This commit is contained in:
7
Makefile
7
Makefile
@@ -1390,6 +1390,13 @@ bench_integer_compression_gpu: install_rs_check_toolchain
|
|||||||
--bench integer-glwe_packing_compression \
|
--bench integer-glwe_packing_compression \
|
||||||
--features=integer,internal-keycache,gpu,pbs-stats -p tfhe-benchmark --
|
--features=integer,internal-keycache,gpu,pbs-stats -p tfhe-benchmark --
|
||||||
|
|
||||||
|
.PHONY: bench_integer_compression_128b_gpu
|
||||||
|
bench_integer_compression_128b_gpu: install_rs_check_toolchain
|
||||||
|
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
||||||
|
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||||
|
--bench glwe_packing_compression_128b-integer-bench \
|
||||||
|
--features=integer,internal-keycache,gpu,pbs-stats -p tfhe-benchmark --
|
||||||
|
|
||||||
.PHONY: bench_integer_zk_gpu
|
.PHONY: bench_integer_zk_gpu
|
||||||
bench_integer_zk_gpu: install_rs_check_toolchain
|
bench_integer_zk_gpu: install_rs_check_toolchain
|
||||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
||||||
|
|||||||
@@ -96,6 +96,12 @@ path = "benches/integer/glwe_packing_compression.rs"
|
|||||||
harness = false
|
harness = false
|
||||||
required-features = ["integer", "pbs-stats", "internal-keycache"]
|
required-features = ["integer", "pbs-stats", "internal-keycache"]
|
||||||
|
|
||||||
|
[[bench]]
|
||||||
|
name = "glwe_packing_compression_128b-integer-bench"
|
||||||
|
path = "benches/integer/glwe_packing_compression_128b.rs"
|
||||||
|
harness = false
|
||||||
|
required-features = ["integer", "pbs-stats", "internal-keycache"]
|
||||||
|
|
||||||
[[bench]]
|
[[bench]]
|
||||||
name = "integer"
|
name = "integer"
|
||||||
path = "benches/integer/bench.rs"
|
path = "benches/integer/bench.rs"
|
||||||
|
|||||||
@@ -159,20 +159,30 @@ fn cpu_glwe_packing(c: &mut Criterion) {
|
|||||||
mod cuda {
|
mod cuda {
|
||||||
use super::*;
|
use super::*;
|
||||||
use benchmark::utilities::cuda_integer_utils::cuda_local_streams;
|
use benchmark::utilities::cuda_integer_utils::cuda_local_streams;
|
||||||
use itertools::Itertools;
|
use tfhe::core_crypto::gpu::{get_number_of_gpus, CudaStreams};
|
||||||
use std::cmp::max;
|
use tfhe::integer::compression_keys::CompressionPrivateKeys;
|
||||||
use tfhe::core_crypto::gpu::CudaStreams;
|
|
||||||
use tfhe::integer::ciphertext::NoiseSquashingCompressionPrivateKey;
|
|
||||||
use tfhe::integer::gpu::ciphertext::compressed_ciphertext_list::CudaCompressedCiphertextListBuilder;
|
use tfhe::integer::gpu::ciphertext::compressed_ciphertext_list::CudaCompressedCiphertextListBuilder;
|
||||||
use tfhe::integer::gpu::ciphertext::squashed_noise::CudaSquashedNoiseRadixCiphertext;
|
use tfhe::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext;
|
||||||
use tfhe::integer::gpu::ciphertext::{
|
|
||||||
CudaCompressedSquashedNoiseCiphertextList, CudaUnsignedRadixCiphertext,
|
|
||||||
};
|
|
||||||
use tfhe::integer::gpu::gen_keys_radix_gpu;
|
use tfhe::integer::gpu::gen_keys_radix_gpu;
|
||||||
use tfhe::integer::gpu::list_compression::server_keys::CudaNoiseSquashingCompressionKey;
|
use tfhe::shortint::parameters::CompressionParameters;
|
||||||
use tfhe::integer::noise_squashing::NoiseSquashingPrivateKey;
|
use tfhe::shortint::PBSParameters;
|
||||||
|
|
||||||
fn gpu_glwe_packing(c: &mut Criterion) {
|
#[derive(Clone)]
|
||||||
|
struct BenchConfig {
|
||||||
|
param: PBSParameters,
|
||||||
|
comp_param: CompressionParameters,
|
||||||
|
bit_size: usize,
|
||||||
|
cks: ClientKey,
|
||||||
|
private_compression_key: CompressionPrivateKeys,
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_num_elements_per_gpu(_bit_size: usize) -> usize {
|
||||||
|
// 200 elements per GPU seems enough to saturate H100s
|
||||||
|
// This is an empirical value and might need to be adjusted in the future
|
||||||
|
200
|
||||||
|
}
|
||||||
|
|
||||||
|
fn execute_gpu_glwe_packing(c: &mut Criterion, config: BenchConfig) {
|
||||||
let bench_name = "integer::cuda::packing_compression";
|
let bench_name = "integer::cuda::packing_compression";
|
||||||
let mut bench_group = c.benchmark_group(bench_name);
|
let mut bench_group = c.benchmark_group(bench_name);
|
||||||
bench_group
|
bench_group
|
||||||
@@ -181,6 +191,279 @@ mod cuda {
|
|||||||
|
|
||||||
let stream = CudaStreams::new_multi_gpu();
|
let stream = CudaStreams::new_multi_gpu();
|
||||||
|
|
||||||
|
let BenchConfig {
|
||||||
|
param,
|
||||||
|
comp_param,
|
||||||
|
bit_size,
|
||||||
|
cks,
|
||||||
|
private_compression_key,
|
||||||
|
} = config;
|
||||||
|
|
||||||
|
let log_message_modulus = param.message_modulus().0.ilog2() as usize;
|
||||||
|
|
||||||
|
assert_eq!(bit_size % log_message_modulus, 0);
|
||||||
|
let num_blocks = bit_size / log_message_modulus;
|
||||||
|
|
||||||
|
let bench_id_pack;
|
||||||
|
|
||||||
|
match get_bench_type() {
|
||||||
|
BenchmarkType::Latency => {
|
||||||
|
// Generate and convert compression keys
|
||||||
|
let (radix_cks, _) = gen_keys_radix_gpu(param, num_blocks, &stream);
|
||||||
|
let (compressed_compression_key, _) = radix_cks
|
||||||
|
.new_compressed_compression_decompression_keys(&private_compression_key);
|
||||||
|
|
||||||
|
let cuda_compression_key = compressed_compression_key.decompress_to_cuda(&stream);
|
||||||
|
|
||||||
|
// Encrypt
|
||||||
|
let ct = cks.encrypt_radix(0_u32, num_blocks);
|
||||||
|
let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &stream);
|
||||||
|
|
||||||
|
// Benchmark
|
||||||
|
let mut builder = CudaCompressedCiphertextListBuilder::new();
|
||||||
|
|
||||||
|
builder.push(d_ct, &stream);
|
||||||
|
|
||||||
|
bench_id_pack = format!("{bench_name}::pack_u{bit_size}");
|
||||||
|
bench_group.bench_function(&bench_id_pack, |b| {
|
||||||
|
b.iter(|| {
|
||||||
|
let compressed = builder.build(&cuda_compression_key, &stream);
|
||||||
|
|
||||||
|
_ = black_box(compressed);
|
||||||
|
})
|
||||||
|
});
|
||||||
|
}
|
||||||
|
BenchmarkType::Throughput => {
|
||||||
|
// Generate and convert compression keys
|
||||||
|
let (radix_cks, _) = gen_keys_radix_gpu(param, num_blocks, &stream);
|
||||||
|
let (compressed_compression_key, _) = radix_cks
|
||||||
|
.new_compressed_compression_decompression_keys(&private_compression_key);
|
||||||
|
|
||||||
|
let elements_per_gpu = get_num_elements_per_gpu(bit_size) as u64;
|
||||||
|
let elements = elements_per_gpu * get_number_of_gpus() as u64;
|
||||||
|
|
||||||
|
let num_block =
|
||||||
|
(bit_size as f64 / (param.message_modulus().0 as f64).log(2.0)).ceil() as usize;
|
||||||
|
bench_group.throughput(Throughput::Elements(elements));
|
||||||
|
|
||||||
|
// Encrypt
|
||||||
|
let local_streams = cuda_local_streams(num_block, elements as usize);
|
||||||
|
|
||||||
|
bench_id_pack = format!("{bench_name}::throughput::pack_u{bit_size}");
|
||||||
|
let cuda_compression_key_vec = (0..get_number_of_gpus())
|
||||||
|
.into_par_iter()
|
||||||
|
.map(|i| {
|
||||||
|
let local_stream = &local_streams[i as usize];
|
||||||
|
compressed_compression_key.decompress_to_cuda(local_stream)
|
||||||
|
})
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
|
// Benchmark
|
||||||
|
let builders = (0..elements)
|
||||||
|
.into_par_iter()
|
||||||
|
.map(|i| {
|
||||||
|
let ct = cks.encrypt_radix(0_u32, num_blocks);
|
||||||
|
let local_stream = &local_streams[i as usize % local_streams.len()];
|
||||||
|
let d_ct =
|
||||||
|
CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, local_stream);
|
||||||
|
let mut builder = CudaCompressedCiphertextListBuilder::new();
|
||||||
|
builder.push(d_ct, local_stream);
|
||||||
|
|
||||||
|
builder
|
||||||
|
})
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
|
bench_group.bench_function(&bench_id_pack, |b| {
|
||||||
|
b.iter(|| {
|
||||||
|
builders.par_iter().enumerate().for_each(|(i, builder)| {
|
||||||
|
let local_stream = &local_streams[i % local_streams.len()];
|
||||||
|
let cuda_compression_key =
|
||||||
|
&cuda_compression_key_vec[i % get_number_of_gpus() as usize];
|
||||||
|
|
||||||
|
let _ = builder.build(cuda_compression_key, local_stream);
|
||||||
|
})
|
||||||
|
})
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
write_to_json::<u64, _>(
|
||||||
|
&bench_id_pack,
|
||||||
|
(comp_param, param.into()),
|
||||||
|
comp_param.name(),
|
||||||
|
"pack",
|
||||||
|
&OperatorType::Atomic,
|
||||||
|
bit_size as u32,
|
||||||
|
vec![param.message_modulus().0.ilog2(); num_blocks],
|
||||||
|
);
|
||||||
|
|
||||||
|
bench_group.finish()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn execute_gpu_glwe_unpacking(c: &mut Criterion, config: BenchConfig) {
|
||||||
|
let bench_name = "integer::cuda::packing_compression";
|
||||||
|
let mut bench_group = c.benchmark_group(bench_name);
|
||||||
|
bench_group
|
||||||
|
.sample_size(15)
|
||||||
|
.measurement_time(std::time::Duration::from_secs(30));
|
||||||
|
|
||||||
|
let stream = CudaStreams::new_multi_gpu();
|
||||||
|
|
||||||
|
let BenchConfig {
|
||||||
|
param,
|
||||||
|
comp_param,
|
||||||
|
bit_size,
|
||||||
|
cks,
|
||||||
|
private_compression_key,
|
||||||
|
} = config;
|
||||||
|
|
||||||
|
let log_message_modulus = param.message_modulus().0.ilog2() as usize;
|
||||||
|
|
||||||
|
assert_eq!(bit_size % log_message_modulus, 0);
|
||||||
|
let num_blocks = bit_size / log_message_modulus;
|
||||||
|
|
||||||
|
let bench_id_unpack;
|
||||||
|
|
||||||
|
match get_bench_type() {
|
||||||
|
BenchmarkType::Latency => {
|
||||||
|
// Generate and convert compression keys
|
||||||
|
let (radix_cks, _) = gen_keys_radix_gpu(param, num_blocks, &stream);
|
||||||
|
let (compressed_compression_key, compressed_decompression_key) = radix_cks
|
||||||
|
.new_compressed_compression_decompression_keys(&private_compression_key);
|
||||||
|
|
||||||
|
let cuda_compression_key = compressed_compression_key.decompress_to_cuda(&stream);
|
||||||
|
let cuda_decompression_key = compressed_decompression_key.decompress_to_cuda(
|
||||||
|
radix_cks.parameters().glwe_dimension(),
|
||||||
|
radix_cks.parameters().polynomial_size(),
|
||||||
|
radix_cks.parameters().message_modulus(),
|
||||||
|
radix_cks.parameters().carry_modulus(),
|
||||||
|
radix_cks.parameters().ciphertext_modulus(),
|
||||||
|
&stream,
|
||||||
|
);
|
||||||
|
|
||||||
|
// Encrypt
|
||||||
|
let ct = cks.encrypt_radix(0_u32, num_blocks);
|
||||||
|
let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &stream);
|
||||||
|
|
||||||
|
// Benchmark
|
||||||
|
let mut builder = CudaCompressedCiphertextListBuilder::new();
|
||||||
|
|
||||||
|
builder.push(d_ct, &stream);
|
||||||
|
|
||||||
|
let compressed = builder.build(&cuda_compression_key, &stream);
|
||||||
|
|
||||||
|
bench_id_unpack = format!("{bench_name}::unpack_u{bit_size}");
|
||||||
|
bench_group.bench_function(&bench_id_unpack, |b| {
|
||||||
|
b.iter(|| {
|
||||||
|
let unpacked: CudaUnsignedRadixCiphertext = compressed
|
||||||
|
.get(0, &cuda_decompression_key, &stream)
|
||||||
|
.unwrap()
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
_ = black_box(unpacked);
|
||||||
|
})
|
||||||
|
});
|
||||||
|
}
|
||||||
|
BenchmarkType::Throughput => {
|
||||||
|
// Generate and convert compression keys
|
||||||
|
let (radix_cks, _) = gen_keys_radix_gpu(param, num_blocks, &stream);
|
||||||
|
let (compressed_compression_key, compressed_decompression_key) = radix_cks
|
||||||
|
.new_compressed_compression_decompression_keys(&private_compression_key);
|
||||||
|
|
||||||
|
let elements_per_gpu = get_num_elements_per_gpu(bit_size) as u64;
|
||||||
|
let elements = elements_per_gpu * get_number_of_gpus() as u64;
|
||||||
|
|
||||||
|
let num_block =
|
||||||
|
(bit_size as f64 / (param.message_modulus().0 as f64).log(2.0)).ceil() as usize;
|
||||||
|
bench_group.throughput(Throughput::Elements(elements));
|
||||||
|
|
||||||
|
// Encrypt
|
||||||
|
let local_streams = cuda_local_streams(num_block, elements as usize);
|
||||||
|
|
||||||
|
bench_id_unpack = format!("{bench_name}::throughput::unpack_u{bit_size}");
|
||||||
|
let builders = (0..elements)
|
||||||
|
.into_par_iter()
|
||||||
|
.map(|i| {
|
||||||
|
let ct = cks.encrypt_radix(0_u32, num_blocks);
|
||||||
|
let local_stream = &local_streams[i as usize % local_streams.len()];
|
||||||
|
let d_ct =
|
||||||
|
CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, local_stream);
|
||||||
|
let mut builder = CudaCompressedCiphertextListBuilder::new();
|
||||||
|
builder.push(d_ct, local_stream);
|
||||||
|
|
||||||
|
builder
|
||||||
|
})
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
|
let cuda_compression_key_vec = (0..get_number_of_gpus())
|
||||||
|
.into_par_iter()
|
||||||
|
.map(|i| {
|
||||||
|
let local_stream = &local_streams[i as usize];
|
||||||
|
compressed_compression_key.decompress_to_cuda(local_stream)
|
||||||
|
})
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
|
let cuda_decompression_key_vec = (0..get_number_of_gpus())
|
||||||
|
.into_par_iter()
|
||||||
|
.map(|i| {
|
||||||
|
let local_stream = &local_streams[i as usize];
|
||||||
|
compressed_decompression_key.decompress_to_cuda(
|
||||||
|
radix_cks.parameters().glwe_dimension(),
|
||||||
|
radix_cks.parameters().polynomial_size(),
|
||||||
|
radix_cks.parameters().message_modulus(),
|
||||||
|
radix_cks.parameters().carry_modulus(),
|
||||||
|
radix_cks.parameters().ciphertext_modulus(),
|
||||||
|
local_stream,
|
||||||
|
)
|
||||||
|
})
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
|
let compressed = builders
|
||||||
|
.par_iter()
|
||||||
|
.enumerate()
|
||||||
|
.map(|(i, builder)| {
|
||||||
|
let local_stream = &local_streams[i % local_streams.len()];
|
||||||
|
let cuda_compression_key =
|
||||||
|
&cuda_compression_key_vec[i % get_number_of_gpus() as usize];
|
||||||
|
builder.build(cuda_compression_key, local_stream)
|
||||||
|
})
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
|
bench_group.bench_function(&bench_id_unpack, |b| {
|
||||||
|
b.iter(|| {
|
||||||
|
compressed.par_iter().enumerate().for_each(|(i, comp)| {
|
||||||
|
let local_stream = &local_streams[i % local_streams.len()];
|
||||||
|
let cuda_decompression_key =
|
||||||
|
&cuda_decompression_key_vec[i % get_number_of_gpus() as usize];
|
||||||
|
|
||||||
|
let _ = comp
|
||||||
|
.get::<CudaUnsignedRadixCiphertext>(
|
||||||
|
0,
|
||||||
|
cuda_decompression_key,
|
||||||
|
local_stream,
|
||||||
|
)
|
||||||
|
.unwrap()
|
||||||
|
.unwrap();
|
||||||
|
})
|
||||||
|
})
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
write_to_json::<u64, _>(
|
||||||
|
&bench_id_unpack,
|
||||||
|
(comp_param, param.into()),
|
||||||
|
comp_param.name(),
|
||||||
|
"unpack",
|
||||||
|
&OperatorType::Atomic,
|
||||||
|
bit_size as u32,
|
||||||
|
vec![param.message_modulus().0.ilog2(); num_blocks],
|
||||||
|
);
|
||||||
|
|
||||||
|
bench_group.finish()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn gpu_glwe_packing(c: &mut Criterion) {
|
||||||
let param = BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
|
let param = BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
|
||||||
let comp_param =
|
let comp_param =
|
||||||
BENCH_COMP_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
|
BENCH_COMP_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
|
||||||
@@ -190,6 +473,13 @@ mod cuda {
|
|||||||
let cks = ClientKey::new(param);
|
let cks = ClientKey::new(param);
|
||||||
let private_compression_key = cks.new_compression_private_key(comp_param);
|
let private_compression_key = cks.new_compression_private_key(comp_param);
|
||||||
|
|
||||||
|
let mut config = BenchConfig {
|
||||||
|
param: tfhe::shortint::PBSParameters::MultiBitPBS(param),
|
||||||
|
comp_param,
|
||||||
|
cks,
|
||||||
|
private_compression_key,
|
||||||
|
bit_size: 0,
|
||||||
|
};
|
||||||
for bit_size in [
|
for bit_size in [
|
||||||
2,
|
2,
|
||||||
8,
|
8,
|
||||||
@@ -200,218 +490,28 @@ mod cuda {
|
|||||||
256,
|
256,
|
||||||
comp_param.lwe_per_glwe().0 * log_message_modulus,
|
comp_param.lwe_per_glwe().0 * log_message_modulus,
|
||||||
] {
|
] {
|
||||||
assert_eq!(bit_size % log_message_modulus, 0);
|
config.bit_size = bit_size;
|
||||||
let num_blocks = bit_size / log_message_modulus;
|
execute_gpu_glwe_packing(c, config.clone());
|
||||||
|
|
||||||
let bench_id_pack;
|
|
||||||
let bench_id_unpack;
|
|
||||||
|
|
||||||
// Generate and convert compression keys
|
|
||||||
let (radix_cks, _) = gen_keys_radix_gpu(param, num_blocks, &stream);
|
|
||||||
let (compressed_compression_key, compressed_decompression_key) =
|
|
||||||
radix_cks.new_compressed_compression_decompression_keys(&private_compression_key);
|
|
||||||
|
|
||||||
match get_bench_type() {
|
|
||||||
BenchmarkType::Latency => {
|
|
||||||
let cuda_compression_key =
|
|
||||||
compressed_compression_key.decompress_to_cuda(&stream);
|
|
||||||
let cuda_decompression_key = compressed_decompression_key.decompress_to_cuda(
|
|
||||||
radix_cks.parameters().glwe_dimension(),
|
|
||||||
radix_cks.parameters().polynomial_size(),
|
|
||||||
radix_cks.parameters().message_modulus(),
|
|
||||||
radix_cks.parameters().carry_modulus(),
|
|
||||||
radix_cks.parameters().ciphertext_modulus(),
|
|
||||||
&stream,
|
|
||||||
);
|
|
||||||
|
|
||||||
// Encrypt
|
|
||||||
let ct = cks.encrypt_radix(0_u32, num_blocks);
|
|
||||||
let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &stream);
|
|
||||||
|
|
||||||
// Benchmark
|
|
||||||
let mut builder = CudaCompressedCiphertextListBuilder::new();
|
|
||||||
|
|
||||||
builder.push(d_ct, &stream);
|
|
||||||
|
|
||||||
bench_id_pack = format!("{bench_name}::pack_u{bit_size}");
|
|
||||||
bench_group.bench_function(&bench_id_pack, |b| {
|
|
||||||
b.iter(|| {
|
|
||||||
let compressed = builder.build(&cuda_compression_key, &stream);
|
|
||||||
|
|
||||||
_ = black_box(compressed);
|
|
||||||
})
|
|
||||||
});
|
|
||||||
|
|
||||||
let compressed = builder.build(&cuda_compression_key, &stream);
|
|
||||||
|
|
||||||
bench_id_unpack = format!("{bench_name}::unpack_u{bit_size}");
|
|
||||||
bench_group.bench_function(&bench_id_unpack, |b| {
|
|
||||||
b.iter(|| {
|
|
||||||
let unpacked: CudaUnsignedRadixCiphertext = compressed
|
|
||||||
.get(0, &cuda_decompression_key, &stream)
|
|
||||||
.unwrap()
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
_ = black_box(unpacked);
|
|
||||||
})
|
|
||||||
});
|
|
||||||
}
|
|
||||||
BenchmarkType::Throughput => {
|
|
||||||
// Execute the operation once to know its cost.
|
|
||||||
let (cpu_compression_key, cpu_decompression_key) =
|
|
||||||
cks.new_compression_decompression_keys(&private_compression_key);
|
|
||||||
let ct = cks.encrypt_radix(0_u32, num_blocks);
|
|
||||||
let mut builder = CompressedCiphertextListBuilder::new();
|
|
||||||
builder.push(ct);
|
|
||||||
let compressed = builder.build(&cpu_compression_key);
|
|
||||||
|
|
||||||
reset_pbs_count();
|
|
||||||
// Use CPU operation as pbs_count do not count PBS on GPU backend.
|
|
||||||
let _: RadixCiphertext =
|
|
||||||
compressed.get(0, &cpu_decompression_key).unwrap().unwrap();
|
|
||||||
let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
|
|
||||||
|
|
||||||
let num_block = (bit_size as f64 / (param.message_modulus.0 as f64).log(2.0))
|
|
||||||
.ceil() as usize;
|
|
||||||
let elements = throughput_num_threads(num_block, pbs_count);
|
|
||||||
bench_group.throughput(Throughput::Elements(elements));
|
|
||||||
|
|
||||||
// Encrypt
|
|
||||||
let local_streams = cuda_local_streams(num_block, elements as usize);
|
|
||||||
|
|
||||||
let cuda_compression_key_vec = local_streams
|
|
||||||
.iter()
|
|
||||||
.map(|local_stream| {
|
|
||||||
compressed_compression_key.decompress_to_cuda(local_stream)
|
|
||||||
})
|
|
||||||
.collect_vec();
|
|
||||||
let cuda_decompression_key_vec = local_streams
|
|
||||||
.iter()
|
|
||||||
.map(|local_stream| {
|
|
||||||
compressed_decompression_key.decompress_to_cuda(
|
|
||||||
radix_cks.parameters().glwe_dimension(),
|
|
||||||
radix_cks.parameters().polynomial_size(),
|
|
||||||
radix_cks.parameters().message_modulus(),
|
|
||||||
radix_cks.parameters().carry_modulus(),
|
|
||||||
radix_cks.parameters().ciphertext_modulus(),
|
|
||||||
local_stream,
|
|
||||||
)
|
|
||||||
})
|
|
||||||
.collect_vec();
|
|
||||||
|
|
||||||
// Benchmark
|
|
||||||
let builders = (0..elements)
|
|
||||||
.map(|i| {
|
|
||||||
let ct = cks.encrypt_radix(0_u32, num_blocks);
|
|
||||||
let local_stream = &local_streams[i as usize % local_streams.len()];
|
|
||||||
let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(
|
|
||||||
&ct,
|
|
||||||
local_stream,
|
|
||||||
);
|
|
||||||
let mut builder = CudaCompressedCiphertextListBuilder::new();
|
|
||||||
builder.push(d_ct, local_stream);
|
|
||||||
|
|
||||||
builder
|
|
||||||
})
|
|
||||||
.collect::<Vec<_>>();
|
|
||||||
|
|
||||||
bench_id_pack = format!("{bench_name}::throughput::pack_u{bit_size}");
|
|
||||||
bench_group.bench_function(&bench_id_pack, |b| {
|
|
||||||
b.iter(|| {
|
|
||||||
builders.par_iter().enumerate().for_each(|(i, builder)| {
|
|
||||||
let local_stream = &local_streams[i % local_streams.len()];
|
|
||||||
let cuda_compression_key =
|
|
||||||
&cuda_compression_key_vec[i % local_streams.len()];
|
|
||||||
|
|
||||||
builder.build(cuda_compression_key, local_stream);
|
|
||||||
})
|
|
||||||
})
|
|
||||||
});
|
|
||||||
|
|
||||||
let compressed = builders
|
|
||||||
.iter()
|
|
||||||
.enumerate()
|
|
||||||
.map(|(i, builder)| {
|
|
||||||
let local_stream = &local_streams[i % local_streams.len()];
|
|
||||||
let cuda_compression_key =
|
|
||||||
&cuda_compression_key_vec[i % local_streams.len()];
|
|
||||||
builder.build(cuda_compression_key, local_stream)
|
|
||||||
})
|
|
||||||
.collect::<Vec<_>>();
|
|
||||||
|
|
||||||
bench_id_unpack = format!("{bench_name}::throughput::unpack_u{bit_size}");
|
|
||||||
bench_group.bench_function(&bench_id_unpack, |b| {
|
|
||||||
b.iter(|| {
|
|
||||||
compressed.par_iter().enumerate().for_each(|(i, comp)| {
|
|
||||||
let local_stream = &local_streams[i % local_streams.len()];
|
|
||||||
let cuda_decompression_key =
|
|
||||||
&cuda_decompression_key_vec[i % local_streams.len()];
|
|
||||||
|
|
||||||
comp.get::<CudaUnsignedRadixCiphertext>(
|
|
||||||
0,
|
|
||||||
cuda_decompression_key,
|
|
||||||
local_stream,
|
|
||||||
)
|
|
||||||
.unwrap()
|
|
||||||
.unwrap();
|
|
||||||
})
|
|
||||||
})
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
write_to_json::<u64, _>(
|
|
||||||
&bench_id_pack,
|
|
||||||
(comp_param, param.into()),
|
|
||||||
comp_param.name(),
|
|
||||||
"pack",
|
|
||||||
&OperatorType::Atomic,
|
|
||||||
bit_size as u32,
|
|
||||||
vec![param.message_modulus.0.ilog2(); num_blocks],
|
|
||||||
);
|
|
||||||
|
|
||||||
write_to_json::<u64, _>(
|
|
||||||
&bench_id_unpack,
|
|
||||||
(comp_param, param.into()),
|
|
||||||
comp_param.name(),
|
|
||||||
"unpack",
|
|
||||||
&OperatorType::Atomic,
|
|
||||||
bit_size as u32,
|
|
||||||
vec![param.message_modulus.0.ilog2(); num_blocks],
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bench_group.finish()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn gpu_glwe_packing_128(c: &mut Criterion) {
|
fn gpu_glwe_unpacking(c: &mut Criterion) {
|
||||||
let bench_name = "integer::cuda::128b_packing_compression";
|
let param = BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
|
||||||
let mut bench_group = c.benchmark_group(bench_name);
|
let comp_param =
|
||||||
bench_group
|
BENCH_COMP_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
|
||||||
.sample_size(15)
|
|
||||||
.measurement_time(std::time::Duration::from_secs(30));
|
|
||||||
|
|
||||||
let stream = CudaStreams::new_multi_gpu();
|
|
||||||
|
|
||||||
let param = BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
|
|
||||||
let noise_squashing_compression_parameters =
|
|
||||||
BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
|
|
||||||
let noise_squashing_parameters =
|
|
||||||
BENCH_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
|
|
||||||
|
|
||||||
let log_message_modulus = param.message_modulus.0.ilog2() as usize;
|
let log_message_modulus = param.message_modulus.0.ilog2() as usize;
|
||||||
|
|
||||||
let noise_squashing_compression_private_key =
|
let cks = ClientKey::new(param);
|
||||||
NoiseSquashingCompressionPrivateKey::new(noise_squashing_compression_parameters);
|
let private_compression_key = cks.new_compression_private_key(comp_param);
|
||||||
let noise_squashing_private_key = NoiseSquashingPrivateKey::new(noise_squashing_parameters);
|
|
||||||
let noise_squashing_compression_key = noise_squashing_private_key
|
|
||||||
.new_noise_squashing_compression_key(&noise_squashing_compression_private_key);
|
|
||||||
let cuda_noise_squashing_compression_key =
|
|
||||||
CudaNoiseSquashingCompressionKey::from_noise_squashing_compression_key(
|
|
||||||
&noise_squashing_compression_key,
|
|
||||||
&stream,
|
|
||||||
);
|
|
||||||
|
|
||||||
|
let mut config = BenchConfig {
|
||||||
|
param: PBSParameters::MultiBitPBS(param),
|
||||||
|
comp_param,
|
||||||
|
bit_size: 0,
|
||||||
|
cks,
|
||||||
|
private_compression_key,
|
||||||
|
};
|
||||||
for bit_size in [
|
for bit_size in [
|
||||||
2,
|
2,
|
||||||
8,
|
8,
|
||||||
@@ -419,180 +519,31 @@ mod cuda {
|
|||||||
32,
|
32,
|
||||||
64,
|
64,
|
||||||
128,
|
128,
|
||||||
// we don't need 256 here since
|
256,
|
||||||
// noise_squashing_compression_parameters.lwe_per_glwe.0 * log_message_modulus == 256
|
comp_param.lwe_per_glwe().0 * log_message_modulus,
|
||||||
// with current parameters 256,
|
|
||||||
noise_squashing_compression_parameters.lwe_per_glwe.0 * log_message_modulus,
|
|
||||||
] {
|
] {
|
||||||
assert_eq!(bit_size % log_message_modulus, 0);
|
config.bit_size = bit_size;
|
||||||
let num_blocks = bit_size / log_message_modulus;
|
execute_gpu_glwe_unpacking(c, config.clone());
|
||||||
|
|
||||||
let bench_id_pack;
|
|
||||||
let bench_id_unpack;
|
|
||||||
|
|
||||||
// Generate and convert compression keys
|
|
||||||
let cks = ClientKey::new(param);
|
|
||||||
let (_, cuda_sks) = gen_keys_radix_gpu(param, num_blocks, &stream);
|
|
||||||
let compressed_noise_squashing_compression_key =
|
|
||||||
cks.new_compressed_noise_squashing_key(&noise_squashing_private_key);
|
|
||||||
|
|
||||||
match get_bench_type() {
|
|
||||||
BenchmarkType::Latency => {
|
|
||||||
let cuda_noise_squashing_key =
|
|
||||||
compressed_noise_squashing_compression_key.decompress_to_cuda(&stream);
|
|
||||||
|
|
||||||
// Encrypt
|
|
||||||
let ct = cks.encrypt_radix(0_u32, num_blocks);
|
|
||||||
let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &stream);
|
|
||||||
let d_ns_ct = cuda_noise_squashing_key
|
|
||||||
.squash_radix_ciphertext_noise(&cuda_sks, &d_ct.ciphertext, &stream)
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
// Benchmark
|
|
||||||
let mut builder = CudaCompressedSquashedNoiseCiphertextList::builder();
|
|
||||||
|
|
||||||
builder.push(d_ns_ct, &stream);
|
|
||||||
|
|
||||||
bench_id_pack = format!("{bench_name}::pack_u{bit_size}");
|
|
||||||
bench_group.bench_function(&bench_id_pack, |b| {
|
|
||||||
b.iter(|| {
|
|
||||||
let compressed =
|
|
||||||
builder.build(&cuda_noise_squashing_compression_key, &stream);
|
|
||||||
|
|
||||||
_ = black_box(compressed);
|
|
||||||
})
|
|
||||||
});
|
|
||||||
|
|
||||||
let compressed = builder.build(&cuda_noise_squashing_compression_key, &stream);
|
|
||||||
|
|
||||||
bench_id_unpack = format!("{bench_name}::unpack_u{bit_size}");
|
|
||||||
bench_group.bench_function(&bench_id_unpack, |b| {
|
|
||||||
b.iter(|| {
|
|
||||||
let unpacked: CudaSquashedNoiseRadixCiphertext =
|
|
||||||
compressed.get(0, &stream).unwrap().unwrap();
|
|
||||||
|
|
||||||
_ = black_box(unpacked);
|
|
||||||
})
|
|
||||||
});
|
|
||||||
}
|
|
||||||
BenchmarkType::Throughput => {
|
|
||||||
let num_block = (bit_size as f64 / (param.message_modulus.0 as f64).log(2.0))
|
|
||||||
.ceil() as usize;
|
|
||||||
let elements = 100;
|
|
||||||
bench_group.throughput(Throughput::Elements(elements));
|
|
||||||
|
|
||||||
// Encrypt
|
|
||||||
let local_streams = cuda_local_streams(num_block, elements as usize);
|
|
||||||
|
|
||||||
let cuda_compression_key_vec = local_streams
|
|
||||||
.iter()
|
|
||||||
.map(|local_stream| {
|
|
||||||
compressed_noise_squashing_compression_key
|
|
||||||
.decompress_to_cuda(local_stream)
|
|
||||||
})
|
|
||||||
.collect_vec();
|
|
||||||
|
|
||||||
let cuda_noise_squashing_compression_key =
|
|
||||||
CudaNoiseSquashingCompressionKey::from_noise_squashing_compression_key(
|
|
||||||
&noise_squashing_compression_key,
|
|
||||||
&stream,
|
|
||||||
);
|
|
||||||
|
|
||||||
// Benchmark
|
|
||||||
let builders = (0..elements)
|
|
||||||
.map(|i| {
|
|
||||||
let ct = cks.encrypt_radix(0_u32, num_blocks);
|
|
||||||
let local_stream = &local_streams[i as usize % local_streams.len()];
|
|
||||||
let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(
|
|
||||||
&ct,
|
|
||||||
local_stream,
|
|
||||||
);
|
|
||||||
let cuda_noise_squashing_key =
|
|
||||||
&cuda_compression_key_vec[(i as usize) % local_streams.len()];
|
|
||||||
let d_ns_ct = cuda_noise_squashing_key
|
|
||||||
.squash_radix_ciphertext_noise(&cuda_sks, &d_ct.ciphertext, &stream)
|
|
||||||
.unwrap();
|
|
||||||
let mut builder = CudaCompressedSquashedNoiseCiphertextList::builder();
|
|
||||||
builder.push(d_ns_ct, local_stream);
|
|
||||||
|
|
||||||
builder
|
|
||||||
})
|
|
||||||
.collect::<Vec<_>>();
|
|
||||||
|
|
||||||
bench_id_pack = format!("{bench_name}::throughput::pack_u{bit_size}");
|
|
||||||
bench_group.bench_function(&bench_id_pack, |b| {
|
|
||||||
b.iter(|| {
|
|
||||||
builders.par_iter().enumerate().for_each(|(i, builder)| {
|
|
||||||
let local_stream = &local_streams[i % local_streams.len()];
|
|
||||||
|
|
||||||
builder.build(&cuda_noise_squashing_compression_key, local_stream);
|
|
||||||
})
|
|
||||||
})
|
|
||||||
});
|
|
||||||
|
|
||||||
let compressed = builders
|
|
||||||
.iter()
|
|
||||||
.enumerate()
|
|
||||||
.map(|(i, builder)| {
|
|
||||||
let local_stream = &local_streams[i % local_streams.len()];
|
|
||||||
|
|
||||||
builder.build(&cuda_noise_squashing_compression_key, local_stream)
|
|
||||||
})
|
|
||||||
.collect::<Vec<_>>();
|
|
||||||
|
|
||||||
bench_id_unpack = format!("{bench_name}::throughput::unpack_u{bit_size}");
|
|
||||||
bench_group.bench_function(&bench_id_unpack, |b| {
|
|
||||||
b.iter(|| {
|
|
||||||
compressed.par_iter().enumerate().for_each(|(i, comp)| {
|
|
||||||
let local_stream = &local_streams[i % local_streams.len()];
|
|
||||||
|
|
||||||
comp.get::<CudaSquashedNoiseRadixCiphertext>(0, local_stream)
|
|
||||||
.unwrap()
|
|
||||||
.unwrap();
|
|
||||||
})
|
|
||||||
})
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
write_to_json::<u64, _>(
|
|
||||||
&bench_id_pack,
|
|
||||||
(noise_squashing_compression_parameters, param.into()),
|
|
||||||
noise_squashing_compression_parameters.name(),
|
|
||||||
"pack",
|
|
||||||
&OperatorType::Atomic,
|
|
||||||
bit_size as u32,
|
|
||||||
vec![param.message_modulus.0.ilog2(); num_blocks],
|
|
||||||
);
|
|
||||||
|
|
||||||
write_to_json::<u64, _>(
|
|
||||||
&bench_id_unpack,
|
|
||||||
(noise_squashing_compression_parameters, param.into()),
|
|
||||||
noise_squashing_compression_parameters.name(),
|
|
||||||
"unpack",
|
|
||||||
&OperatorType::Atomic,
|
|
||||||
bit_size as u32,
|
|
||||||
vec![param.message_modulus.0.ilog2(); num_blocks],
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bench_group.finish()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
criterion_group!(gpu_glwe_packing2, gpu_glwe_packing);
|
criterion_group!(gpu_glwe_packing2, gpu_glwe_packing);
|
||||||
criterion_group!(gpu_glwe_packing_128_2, gpu_glwe_packing_128);
|
criterion_group!(gpu_glwe_unpacking2, gpu_glwe_unpacking);
|
||||||
}
|
}
|
||||||
|
|
||||||
criterion_group!(cpu_glwe_packing2, cpu_glwe_packing);
|
criterion_group!(cpu_glwe_packing2, cpu_glwe_packing);
|
||||||
|
|
||||||
#[cfg(feature = "gpu")]
|
#[cfg(feature = "gpu")]
|
||||||
use cuda::{gpu_glwe_packing2, gpu_glwe_packing_128_2};
|
use cuda::gpu_glwe_packing2;
|
||||||
|
#[cfg(feature = "gpu")]
|
||||||
|
use cuda::gpu_glwe_unpacking2;
|
||||||
|
|
||||||
fn main() {
|
fn main() {
|
||||||
#[cfg(feature = "gpu")]
|
#[cfg(feature = "gpu")]
|
||||||
gpu_glwe_packing2();
|
{
|
||||||
#[cfg(feature = "gpu")]
|
gpu_glwe_packing2();
|
||||||
gpu_glwe_packing_128_2();
|
gpu_glwe_unpacking2();
|
||||||
|
}
|
||||||
#[cfg(not(feature = "gpu"))]
|
#[cfg(not(feature = "gpu"))]
|
||||||
cpu_glwe_packing2();
|
cpu_glwe_packing2();
|
||||||
|
|
||||||
|
|||||||
444
tfhe-benchmark/benches/integer/glwe_packing_compression_128b.rs
Normal file
444
tfhe-benchmark/benches/integer/glwe_packing_compression_128b.rs
Normal file
@@ -0,0 +1,444 @@
|
|||||||
|
#[cfg(feature = "gpu")]
|
||||||
|
mod cuda {
|
||||||
|
use benchmark::params_aliases::*;
|
||||||
|
use benchmark::utilities::cuda_integer_utils::cuda_local_streams;
|
||||||
|
use benchmark::utilities::{
|
||||||
|
cuda_local_keys, get_bench_type, write_to_json, BenchmarkType, OperatorType,
|
||||||
|
};
|
||||||
|
use criterion::{black_box, criterion_group, Criterion, Throughput};
|
||||||
|
use rayon::prelude::*;
|
||||||
|
use tfhe::core_crypto::gpu::{get_number_of_gpus, CudaStreams};
|
||||||
|
use tfhe::integer::ciphertext::{
|
||||||
|
NoiseSquashingCompressionKey, NoiseSquashingCompressionPrivateKey,
|
||||||
|
};
|
||||||
|
use tfhe::integer::gpu::ciphertext::squashed_noise::CudaSquashedNoiseRadixCiphertext;
|
||||||
|
use tfhe::integer::gpu::ciphertext::{
|
||||||
|
CudaCompressedSquashedNoiseCiphertextList, CudaUnsignedRadixCiphertext,
|
||||||
|
};
|
||||||
|
use tfhe::integer::gpu::gen_keys_radix_gpu;
|
||||||
|
use tfhe::integer::gpu::list_compression::server_keys::CudaNoiseSquashingCompressionKey;
|
||||||
|
use tfhe::integer::gpu::noise_squashing::keys::CudaNoiseSquashingKey;
|
||||||
|
use tfhe::integer::noise_squashing::{CompressedNoiseSquashingKey, NoiseSquashingPrivateKey};
|
||||||
|
use tfhe::integer::ClientKey;
|
||||||
|
use tfhe::keycache::NamedParam;
|
||||||
|
use tfhe::shortint::parameters::NoiseSquashingCompressionParameters;
|
||||||
|
use tfhe::shortint::PBSParameters;
|
||||||
|
|
||||||
|
#[derive(Clone)]
|
||||||
|
struct BenchConfig {
|
||||||
|
param: PBSParameters,
|
||||||
|
noise_squashing_compression_parameters: NoiseSquashingCompressionParameters,
|
||||||
|
noise_squashing_compression_key: NoiseSquashingCompressionKey,
|
||||||
|
compressed_noise_squashing_compression_key: CompressedNoiseSquashingKey,
|
||||||
|
bit_size: usize,
|
||||||
|
cks: ClientKey,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the number of elements processed by a throughput benchmark run.
///
/// The bit size is currently ignored; the same count is used for every width.
fn get_num_elements_per_gpu(_bit_size: usize) -> usize {
    // Empirical value: 200 elements per GPU seems enough to saturate H100s.
    // It might need to be adjusted in the future.
    const ELEMENTS_PER_GPU: usize = 200;

    ELEMENTS_PER_GPU
}
|
||||||
|
|
||||||
|
fn execute_gpu_glwe_packing_128(c: &mut Criterion, config: BenchConfig) {
|
||||||
|
let bench_name = "integer::cuda::128b_packing_compression";
|
||||||
|
let mut bench_group = c.benchmark_group(bench_name);
|
||||||
|
bench_group
|
||||||
|
.sample_size(15)
|
||||||
|
.measurement_time(std::time::Duration::from_secs(30));
|
||||||
|
|
||||||
|
let stream = CudaStreams::new_multi_gpu();
|
||||||
|
|
||||||
|
let BenchConfig {
|
||||||
|
param,
|
||||||
|
noise_squashing_compression_parameters,
|
||||||
|
noise_squashing_compression_key,
|
||||||
|
compressed_noise_squashing_compression_key,
|
||||||
|
bit_size,
|
||||||
|
cks,
|
||||||
|
} = config;
|
||||||
|
|
||||||
|
let log_message_modulus = param.message_modulus().0.ilog2() as usize;
|
||||||
|
|
||||||
|
assert_eq!(bit_size % log_message_modulus, 0);
|
||||||
|
let num_blocks = bit_size / log_message_modulus;
|
||||||
|
|
||||||
|
let bench_id_pack;
|
||||||
|
|
||||||
|
match get_bench_type() {
|
||||||
|
BenchmarkType::Latency => {
|
||||||
|
let (_, cuda_sks) = gen_keys_radix_gpu(param, num_blocks, &stream);
|
||||||
|
let cuda_noise_squashing_key =
|
||||||
|
compressed_noise_squashing_compression_key.decompress_to_cuda(&stream);
|
||||||
|
|
||||||
|
// Encrypt
|
||||||
|
let ct = cks.encrypt_radix(0_u32, num_blocks);
|
||||||
|
let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &stream);
|
||||||
|
let d_ns_ct = cuda_noise_squashing_key
|
||||||
|
.squash_radix_ciphertext_noise(&cuda_sks, &d_ct.ciphertext, &stream)
|
||||||
|
.unwrap();
|
||||||
|
let cuda_noise_squashing_compression_key =
|
||||||
|
CudaNoiseSquashingCompressionKey::from_noise_squashing_compression_key(
|
||||||
|
&noise_squashing_compression_key,
|
||||||
|
&stream,
|
||||||
|
);
|
||||||
|
|
||||||
|
// Benchmark
|
||||||
|
let mut builder = CudaCompressedSquashedNoiseCiphertextList::builder();
|
||||||
|
|
||||||
|
builder.push(d_ns_ct, &stream);
|
||||||
|
|
||||||
|
bench_id_pack = format!("{bench_name}::pack_u{bit_size}");
|
||||||
|
bench_group.bench_function(&bench_id_pack, |b| {
|
||||||
|
b.iter(|| {
|
||||||
|
let compressed =
|
||||||
|
builder.build(&cuda_noise_squashing_compression_key, &stream);
|
||||||
|
|
||||||
|
_ = black_box(compressed);
|
||||||
|
})
|
||||||
|
});
|
||||||
|
}
|
||||||
|
BenchmarkType::Throughput => {
|
||||||
|
let cuda_sks = cuda_local_keys(&cks);
|
||||||
|
let num_block =
|
||||||
|
(bit_size as f64 / (param.message_modulus().0 as f64).log(2.0)).ceil() as usize;
|
||||||
|
let elements = get_num_elements_per_gpu(bit_size) as u64;
|
||||||
|
bench_group.throughput(Throughput::Elements(elements));
|
||||||
|
|
||||||
|
// Encrypt
|
||||||
|
let local_streams = cuda_local_streams(num_block, elements as usize);
|
||||||
|
|
||||||
|
let num_gpus = get_number_of_gpus() as usize;
|
||||||
|
|
||||||
|
let cuda_compression_key_vec: Vec<CudaNoiseSquashingKey> = (0..num_gpus)
|
||||||
|
.into_par_iter()
|
||||||
|
.map(|i| {
|
||||||
|
let local_stream = &local_streams[i % local_streams.len()];
|
||||||
|
compressed_noise_squashing_compression_key.decompress_to_cuda(local_stream)
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
let cuda_noise_squashing_compression_key_vec: Vec<
|
||||||
|
CudaNoiseSquashingCompressionKey,
|
||||||
|
> = (0..num_gpus)
|
||||||
|
.into_par_iter()
|
||||||
|
.map(|i| {
|
||||||
|
let local_stream = &local_streams[i % local_streams.len()];
|
||||||
|
CudaNoiseSquashingCompressionKey::from_noise_squashing_compression_key(
|
||||||
|
&noise_squashing_compression_key,
|
||||||
|
local_stream,
|
||||||
|
)
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
// Benchmark
|
||||||
|
let builders = (0..elements)
|
||||||
|
.into_par_iter()
|
||||||
|
.map(|i| {
|
||||||
|
let ct = cks.encrypt_radix(0_u32, num_blocks);
|
||||||
|
let local_stream = &local_streams[i as usize % local_streams.len()];
|
||||||
|
let d_ct =
|
||||||
|
CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, local_stream);
|
||||||
|
let cuda_noise_squashing_key =
|
||||||
|
&cuda_compression_key_vec[(i as usize) % num_gpus];
|
||||||
|
let cuda_noise_squashing_compression_key =
|
||||||
|
&cuda_noise_squashing_compression_key_vec[(i as usize) % num_gpus];
|
||||||
|
let d_ns_ct = cuda_noise_squashing_key
|
||||||
|
.squash_radix_ciphertext_noise(
|
||||||
|
&cuda_sks[(i as usize) % num_gpus],
|
||||||
|
&d_ct.ciphertext,
|
||||||
|
local_stream,
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
let mut builder = CudaCompressedSquashedNoiseCiphertextList::builder();
|
||||||
|
builder.push(d_ns_ct, local_stream);
|
||||||
|
|
||||||
|
(builder, cuda_noise_squashing_compression_key, local_stream)
|
||||||
|
})
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
|
bench_id_pack = format!("{bench_name}::throughput::pack_u{bit_size}");
|
||||||
|
bench_group.bench_function(&bench_id_pack, |b| {
|
||||||
|
b.iter(|| {
|
||||||
|
builders.par_iter().for_each(
|
||||||
|
|(builder, cuda_noise_squashing_compression_key, local_stream)| {
|
||||||
|
builder.build(cuda_noise_squashing_compression_key, local_stream);
|
||||||
|
},
|
||||||
|
)
|
||||||
|
})
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
write_to_json::<u64, _>(
|
||||||
|
&bench_id_pack,
|
||||||
|
(noise_squashing_compression_parameters, param.into()),
|
||||||
|
noise_squashing_compression_parameters.name(),
|
||||||
|
"pack",
|
||||||
|
&OperatorType::Atomic,
|
||||||
|
bit_size as u32,
|
||||||
|
vec![param.message_modulus().0.ilog2(); num_blocks],
|
||||||
|
);
|
||||||
|
|
||||||
|
bench_group.finish()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn execute_gpu_glwe_unpacking_128(c: &mut Criterion, config: BenchConfig) {
|
||||||
|
let bench_name = "integer::cuda::128b_packing_compression";
|
||||||
|
let mut bench_group = c.benchmark_group(bench_name);
|
||||||
|
bench_group
|
||||||
|
.sample_size(15)
|
||||||
|
.measurement_time(std::time::Duration::from_secs(30));
|
||||||
|
|
||||||
|
let stream = CudaStreams::new_multi_gpu();
|
||||||
|
|
||||||
|
let BenchConfig {
|
||||||
|
param,
|
||||||
|
noise_squashing_compression_parameters,
|
||||||
|
noise_squashing_compression_key,
|
||||||
|
compressed_noise_squashing_compression_key,
|
||||||
|
bit_size,
|
||||||
|
cks,
|
||||||
|
} = config;
|
||||||
|
|
||||||
|
let log_message_modulus = param.message_modulus().0.ilog2() as usize;
|
||||||
|
|
||||||
|
assert_eq!(bit_size % log_message_modulus, 0);
|
||||||
|
let num_blocks = bit_size / log_message_modulus;
|
||||||
|
|
||||||
|
let bench_id_unpack;
|
||||||
|
|
||||||
|
match get_bench_type() {
|
||||||
|
BenchmarkType::Latency => {
|
||||||
|
let (_, cuda_sks) = gen_keys_radix_gpu(param, num_blocks, &stream);
|
||||||
|
let cuda_noise_squashing_key =
|
||||||
|
compressed_noise_squashing_compression_key.decompress_to_cuda(&stream);
|
||||||
|
|
||||||
|
// Encrypt
|
||||||
|
let ct = cks.encrypt_radix(0_u32, num_blocks);
|
||||||
|
let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &stream);
|
||||||
|
let d_ns_ct = cuda_noise_squashing_key
|
||||||
|
.squash_radix_ciphertext_noise(&cuda_sks, &d_ct.ciphertext, &stream)
|
||||||
|
.unwrap();
|
||||||
|
let cuda_noise_squashing_compression_key =
|
||||||
|
CudaNoiseSquashingCompressionKey::from_noise_squashing_compression_key(
|
||||||
|
&noise_squashing_compression_key,
|
||||||
|
&stream,
|
||||||
|
);
|
||||||
|
|
||||||
|
// Benchmark
|
||||||
|
let mut builder = CudaCompressedSquashedNoiseCiphertextList::builder();
|
||||||
|
|
||||||
|
builder.push(d_ns_ct, &stream);
|
||||||
|
|
||||||
|
let compressed = builder.build(&cuda_noise_squashing_compression_key, &stream);
|
||||||
|
|
||||||
|
bench_id_unpack = format!("{bench_name}::unpack_u{bit_size}");
|
||||||
|
bench_group.bench_function(&bench_id_unpack, |b| {
|
||||||
|
b.iter(|| {
|
||||||
|
let unpacked: CudaSquashedNoiseRadixCiphertext =
|
||||||
|
compressed.get(0, &stream).unwrap().unwrap();
|
||||||
|
|
||||||
|
_ = black_box(unpacked);
|
||||||
|
})
|
||||||
|
});
|
||||||
|
}
|
||||||
|
BenchmarkType::Throughput => {
|
||||||
|
let cuda_sks = cuda_local_keys(&cks);
|
||||||
|
let num_block =
|
||||||
|
(bit_size as f64 / (param.message_modulus().0 as f64).log(2.0)).ceil() as usize;
|
||||||
|
let elements = get_num_elements_per_gpu(bit_size) as u64;
|
||||||
|
bench_group.throughput(Throughput::Elements(elements));
|
||||||
|
|
||||||
|
// Encrypt
|
||||||
|
let local_streams = cuda_local_streams(num_block, elements as usize);
|
||||||
|
|
||||||
|
let num_gpus = get_number_of_gpus() as usize;
|
||||||
|
|
||||||
|
let cuda_compression_key_vec: Vec<CudaNoiseSquashingKey> = (0..num_gpus)
|
||||||
|
.into_par_iter()
|
||||||
|
.map(|i| {
|
||||||
|
let local_stream = &local_streams[i % local_streams.len()];
|
||||||
|
compressed_noise_squashing_compression_key.decompress_to_cuda(local_stream)
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
let cuda_noise_squashing_compression_key_vec: Vec<
|
||||||
|
CudaNoiseSquashingCompressionKey,
|
||||||
|
> = (0..num_gpus)
|
||||||
|
.into_par_iter()
|
||||||
|
.map(|i| {
|
||||||
|
let local_stream = &local_streams[i % local_streams.len()];
|
||||||
|
CudaNoiseSquashingCompressionKey::from_noise_squashing_compression_key(
|
||||||
|
&noise_squashing_compression_key,
|
||||||
|
local_stream,
|
||||||
|
)
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
// Benchmark
|
||||||
|
let builders = (0..elements)
|
||||||
|
.into_par_iter()
|
||||||
|
.map(|i| {
|
||||||
|
let ct = cks.encrypt_radix(0_u32, num_blocks);
|
||||||
|
let local_stream = &local_streams[i as usize % local_streams.len()];
|
||||||
|
let d_ct =
|
||||||
|
CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, local_stream);
|
||||||
|
let cuda_noise_squashing_key =
|
||||||
|
&cuda_compression_key_vec[(i as usize) % num_gpus];
|
||||||
|
let cuda_noise_squashing_compression_key =
|
||||||
|
&cuda_noise_squashing_compression_key_vec[(i as usize) % num_gpus];
|
||||||
|
let d_ns_ct = cuda_noise_squashing_key
|
||||||
|
.squash_radix_ciphertext_noise(
|
||||||
|
&cuda_sks[(i as usize) % num_gpus],
|
||||||
|
&d_ct.ciphertext,
|
||||||
|
local_stream,
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
let mut builder = CudaCompressedSquashedNoiseCiphertextList::builder();
|
||||||
|
builder.push(d_ns_ct, local_stream);
|
||||||
|
|
||||||
|
(builder, cuda_noise_squashing_compression_key, local_stream)
|
||||||
|
})
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
|
let compressed = builders
|
||||||
|
.into_par_iter()
|
||||||
|
.map(
|
||||||
|
|(builder, cuda_noise_squashing_compression_key, local_stream)| {
|
||||||
|
builder.build(cuda_noise_squashing_compression_key, local_stream)
|
||||||
|
},
|
||||||
|
)
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
|
bench_id_unpack = format!("{bench_name}::throughput::unpack_u{bit_size}");
|
||||||
|
bench_group.bench_function(&bench_id_unpack, |b| {
|
||||||
|
b.iter(|| {
|
||||||
|
compressed.par_iter().enumerate().for_each(|(i, comp)| {
|
||||||
|
let local_stream = &local_streams[i % local_streams.len()];
|
||||||
|
|
||||||
|
comp.get::<CudaSquashedNoiseRadixCiphertext>(0, local_stream)
|
||||||
|
.unwrap()
|
||||||
|
.unwrap();
|
||||||
|
})
|
||||||
|
})
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
write_to_json::<u64, _>(
|
||||||
|
&bench_id_unpack,
|
||||||
|
(noise_squashing_compression_parameters, param.into()),
|
||||||
|
noise_squashing_compression_parameters.name(),
|
||||||
|
"unpack",
|
||||||
|
&OperatorType::Atomic,
|
||||||
|
bit_size as u32,
|
||||||
|
vec![param.message_modulus().0.ilog2(); num_blocks],
|
||||||
|
);
|
||||||
|
|
||||||
|
bench_group.finish()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn gpu_glwe_packing_128(c: &mut Criterion) {
|
||||||
|
let param = BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
|
||||||
|
let noise_squashing_compression_parameters =
|
||||||
|
BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
|
||||||
|
let noise_squashing_parameters =
|
||||||
|
BENCH_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
|
||||||
|
|
||||||
|
let log_message_modulus = param.message_modulus.0.ilog2() as usize;
|
||||||
|
|
||||||
|
let cks = ClientKey::new(param);
|
||||||
|
|
||||||
|
let noise_squashing_compression_private_key =
|
||||||
|
NoiseSquashingCompressionPrivateKey::new(noise_squashing_compression_parameters);
|
||||||
|
let noise_squashing_private_key = NoiseSquashingPrivateKey::new(noise_squashing_parameters);
|
||||||
|
let noise_squashing_compression_key = noise_squashing_private_key
|
||||||
|
.new_noise_squashing_compression_key(&noise_squashing_compression_private_key);
|
||||||
|
|
||||||
|
// Generate and convert compression keys
|
||||||
|
let compressed_noise_squashing_compression_key =
|
||||||
|
cks.new_compressed_noise_squashing_key(&noise_squashing_private_key);
|
||||||
|
|
||||||
|
let mut config = BenchConfig {
|
||||||
|
param: PBSParameters::PBS(param),
|
||||||
|
noise_squashing_compression_key,
|
||||||
|
noise_squashing_compression_parameters,
|
||||||
|
compressed_noise_squashing_compression_key,
|
||||||
|
bit_size: 0,
|
||||||
|
cks,
|
||||||
|
};
|
||||||
|
for bit_size in [
|
||||||
|
2,
|
||||||
|
8,
|
||||||
|
16,
|
||||||
|
32,
|
||||||
|
64,
|
||||||
|
128,
|
||||||
|
256,
|
||||||
|
noise_squashing_compression_parameters.lwe_per_glwe.0 * log_message_modulus,
|
||||||
|
] {
|
||||||
|
config.bit_size = bit_size;
|
||||||
|
execute_gpu_glwe_packing_128(c, config.clone());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn gpu_glwe_unpacking_128(c: &mut Criterion) {
|
||||||
|
let param = BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
|
||||||
|
let noise_squashing_compression_parameters =
|
||||||
|
BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
|
||||||
|
let noise_squashing_parameters =
|
||||||
|
BENCH_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
|
||||||
|
|
||||||
|
let log_message_modulus = param.message_modulus.0.ilog2() as usize;
|
||||||
|
|
||||||
|
let cks = ClientKey::new(param);
|
||||||
|
|
||||||
|
let noise_squashing_compression_private_key =
|
||||||
|
NoiseSquashingCompressionPrivateKey::new(noise_squashing_compression_parameters);
|
||||||
|
let noise_squashing_private_key = NoiseSquashingPrivateKey::new(noise_squashing_parameters);
|
||||||
|
let noise_squashing_compression_key = noise_squashing_private_key
|
||||||
|
.new_noise_squashing_compression_key(&noise_squashing_compression_private_key);
|
||||||
|
|
||||||
|
// Generate and convert compression keys
|
||||||
|
let compressed_noise_squashing_compression_key =
|
||||||
|
cks.new_compressed_noise_squashing_key(&noise_squashing_private_key);
|
||||||
|
|
||||||
|
let mut config = BenchConfig {
|
||||||
|
param: PBSParameters::PBS(param),
|
||||||
|
noise_squashing_compression_key,
|
||||||
|
noise_squashing_compression_parameters,
|
||||||
|
compressed_noise_squashing_compression_key,
|
||||||
|
bit_size: 0,
|
||||||
|
cks,
|
||||||
|
};
|
||||||
|
for bit_size in [
|
||||||
|
2,
|
||||||
|
8,
|
||||||
|
16,
|
||||||
|
32,
|
||||||
|
64,
|
||||||
|
128,
|
||||||
|
256,
|
||||||
|
noise_squashing_compression_parameters.lwe_per_glwe.0 * log_message_modulus,
|
||||||
|
] {
|
||||||
|
config.bit_size = bit_size;
|
||||||
|
execute_gpu_glwe_unpacking_128(c, config.clone());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
criterion_group!(gpu_glwe_packing_128_2, gpu_glwe_packing_128);
|
||||||
|
criterion_group!(gpu_glwe_unpacking_128_2, gpu_glwe_unpacking_128);
|
||||||
|
}
|
||||||
|
|
||||||
|
use criterion::Criterion;
|
||||||
|
#[cfg(feature = "gpu")]
|
||||||
|
use cuda::gpu_glwe_packing_128_2;
|
||||||
|
#[cfg(feature = "gpu")]
|
||||||
|
use cuda::gpu_glwe_unpacking_128_2;
|
||||||
|
|
||||||
|
fn main() {
|
||||||
|
#[cfg(feature = "gpu")]
|
||||||
|
gpu_glwe_packing_128_2();
|
||||||
|
#[cfg(feature = "gpu")]
|
||||||
|
gpu_glwe_unpacking_128_2();
|
||||||
|
Criterion::default().configure_from_args().final_summary();
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user