feat(gpu): implement re-randomization

- exposed to integer and HL API
- test on the HL API
- benchmarks for GPU and CPU implementation
This commit is contained in:
Pedro Alves
2025-10-24 12:46:14 -03:00
committed by Pedro Alves
parent 3c32b15d02
commit 867f8fb579
24 changed files with 1258 additions and 80 deletions

View File

@@ -96,6 +96,12 @@ path = "benches/integer/glwe_packing_compression.rs"
harness = false
required-features = ["integer", "pbs-stats", "internal-keycache"]
[[bench]]
name = "integer-rerand"
path = "benches/integer/rerand.rs"
harness = false
required-features = ["integer", "pbs-stats", "internal-keycache"]
[[bench]]
name = "glwe_packing_compression_128b-integer-bench"
path = "benches/integer/glwe_packing_compression_128b.rs"

View File

@@ -3,6 +3,8 @@
mod aes;
mod oprf;
mod rerand;
use benchmark::params::ParamsAndNumBlocksIter;
use benchmark::utilities::{
get_bench_type, throughput_num_threads, write_to_json, BenchmarkType, EnvConfig, OperatorType,

View File

@@ -0,0 +1,417 @@
use benchmark::params_aliases::{
BENCH_COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
BENCH_PARAM_KEYSWITCH_PKE_TO_BIG_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
BENCH_PARAM_PKE_TO_BIG_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128_ZKV1,
};
use benchmark::utilities::{
get_bench_type, throughput_num_threads, write_to_json, BenchmarkType, OperatorType,
};
use criterion::{black_box, criterion_group, BatchSize, Criterion, Throughput};
#[cfg(feature = "gpu")]
use cuda::gpu_re_randomize_group;
use rayon::iter::{IndexedParallelIterator, ParallelIterator};
use rayon::prelude::{IntoParallelIterator, IntoParallelRefMutIterator};
use tfhe::integer::ciphertext::{CompressedCiphertextListBuilder, ReRandomizationContext};
use tfhe::integer::key_switching_key::{KeySwitchingKey, KeySwitchingKeyMaterial};
use tfhe::integer::{gen_keys_radix, CompactPrivateKey, CompactPublicKey, RadixCiphertext};
use tfhe::keycache::NamedParam;
fn execute_cpu_re_randomize(c: &mut Criterion, bit_size: usize) {
let bench_name = "integer::re_randomize";
let mut bench_group = c.benchmark_group(bench_name);
bench_group
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(30));
let param = BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
let comp_param = BENCH_COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
let cpk_param = BENCH_PARAM_PKE_TO_BIG_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128_ZKV1;
let ks_param = BENCH_PARAM_KEYSWITCH_PKE_TO_BIG_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
let num_blocks = (bit_size as f64 / (param.message_modulus.0 as f64).log(2.0)).ceil() as usize;
let (radix_cks, sks) = gen_keys_radix(param, num_blocks);
let cks = radix_cks.as_ref();
let private_compression_key = cks.new_compression_private_key(comp_param);
let (compressed_compression_key, compressed_decompression_key) =
radix_cks.new_compressed_compression_decompression_keys(&private_compression_key);
let compression_key = compressed_compression_key.decompress();
let decompression_key = compressed_decompression_key.decompress();
let cpk_private_key = CompactPrivateKey::new(cpk_param);
let cpk = CompactPublicKey::new(&cpk_private_key);
let ksk = KeySwitchingKey::new((&cpk_private_key, None), ((&cks), (&sks)), ks_param);
let ksk = ksk.into_raw_parts();
let (ksk_material, _, _) = ksk.into_raw_parts();
let ksk_material = KeySwitchingKeyMaterial::from_raw_parts(ksk_material);
let rerand_domain_separator = *b"TFHE_Rrd";
let compact_public_encryption_domain_separator = *b"TFHE_Enc";
let metadata = b"bench".as_slice();
let bench_id;
match get_bench_type() {
BenchmarkType::Latency => {
// Encrypt and compress a single ciphertext
let message = 42u64;
let ct = cks.encrypt_radix(message, num_blocks);
let mut builder = CompressedCiphertextListBuilder::new();
builder.push(ct);
let compressed = builder.build(&compression_key);
let decompressed: RadixCiphertext =
compressed.get(0, &decompression_key).unwrap().unwrap();
let mut d_re_randomized = decompressed.clone();
bench_id = format!("{bench_name}::latency_u{bit_size}");
bench_group.bench_function(&bench_id, |b| {
b.iter_batched(
|| {
let mut re_randomizer_context = ReRandomizationContext::new(
rerand_domain_separator,
[metadata],
compact_public_encryption_domain_separator,
);
re_randomizer_context.add_ciphertext(&decompressed);
re_randomizer_context.finalize()
},
|mut seed_gen| {
d_re_randomized
.re_randomize(
&cpk,
&ksk_material.as_view(),
seed_gen.next_seed().unwrap(),
)
.unwrap();
_ = black_box(&d_re_randomized);
},
BatchSize::SmallInput,
)
});
}
BenchmarkType::Throughput => {
let elements = throughput_num_threads(num_blocks, 1);
bench_group.throughput(Throughput::Elements(elements));
// Pre-generate and compress ciphertexts for throughput test
let decompressed_cts: Vec<RadixCiphertext> = (0..elements as usize)
.into_par_iter()
.map(|_| {
let message = 42u64;
let ct = cks.encrypt_radix(message, num_blocks);
let mut builder = CompressedCiphertextListBuilder::new();
builder.push(ct);
let compressed = builder.build(&compression_key);
compressed.get(0, &decompression_key).unwrap().unwrap()
})
.collect();
bench_id = format!("{bench_name}::throughput_u{bit_size}");
bench_group.bench_function(&bench_id, |b| {
b.iter_batched(
|| {
// Create a fresh context for each benchmark iteration
let mut ctx = ReRandomizationContext::new(
rerand_domain_separator,
[metadata],
compact_public_encryption_domain_separator,
);
// Add all ciphertexts to the context
for ct in &decompressed_cts {
ctx.add_ciphertext(ct);
}
// Return a new seed generator for this iteration
(ctx.finalize(), decompressed_cts.clone())
},
|(mut seed_gen, mut cts_to_rerand)| {
let seeds: Vec<_> = (0..cts_to_rerand.len())
.map(|_| seed_gen.next_seed().unwrap())
.collect();
cts_to_rerand
.par_iter_mut()
.zip(seeds.into_par_iter())
.for_each(|(d_re_randomized, seed)| {
d_re_randomized
.re_randomize(&cpk, &ksk_material.as_view(), seed)
.unwrap();
_ = black_box(&d_re_randomized);
})
},
BatchSize::SmallInput,
)
});
}
}
write_to_json::<u64, _>(
&bench_id,
(comp_param, param.into()),
comp_param.name(),
"re_randomize",
&OperatorType::Atomic,
bit_size as u32,
vec![param.message_modulus.0.ilog2(); num_blocks],
);
bench_group.finish()
}
fn cpu_re_randomize(c: &mut Criterion) {
let bit_sizes = [2, 4, 8, 16, 32, 64, 128, 256];
for bit_size in bit_sizes.iter() {
execute_cpu_re_randomize(c, *bit_size);
}
}
criterion_group!(cpu_re_randomize_group, cpu_re_randomize);
#[cfg(feature = "gpu")]
mod cuda {
use benchmark::params_aliases::{
BENCH_COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
BENCH_PARAM_KEYSWITCH_PKE_TO_BIG_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
BENCH_PARAM_PKE_TO_BIG_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128_ZKV1,
};
use benchmark::utilities::cuda_integer_utils::cuda_local_streams;
use benchmark::utilities::{
get_bench_type, throughput_num_threads, write_to_json, BenchmarkType, OperatorType,
};
use criterion::{black_box, criterion_group, BatchSize, Criterion, Throughput};
use rayon::prelude::*;
use tfhe::core_crypto::gpu::{get_number_of_gpus, CudaStreams};
use tfhe::integer::ciphertext::ReRandomizationContext;
use tfhe::integer::gpu::ciphertext::compressed_ciphertext_list::CudaCompressedCiphertextListBuilder;
use tfhe::integer::gpu::ciphertext::{CudaIntegerRadixCiphertext, CudaUnsignedRadixCiphertext};
use tfhe::integer::key_switching_key::KeySwitchingKey;
use tfhe::integer::{gen_keys_radix, CompactPrivateKey, CompactPublicKey};
use tfhe::keycache::NamedParam;
use tfhe::shortint::key_switching_key::CudaKeySwitchingKeyMaterial;
fn execute_gpu_re_randomize(c: &mut Criterion, bit_size: usize) {
let bench_name = "integer::cuda::re_randomize";
let mut bench_group = c.benchmark_group(bench_name);
bench_group
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(30));
let param = BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128;
let comp_param = BENCH_COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
let cpk_param = BENCH_PARAM_PKE_TO_BIG_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128_ZKV1;
let ks_param = BENCH_PARAM_KEYSWITCH_PKE_TO_BIG_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
let streams = CudaStreams::new_multi_gpu();
let num_blocks =
(bit_size as f64 / (param.message_modulus.0 as f64).log(2.0)).ceil() as usize;
let (radix_cks, sks) = gen_keys_radix(param, num_blocks);
let cks = radix_cks.as_ref();
let private_compression_key = cks.new_compression_private_key(comp_param);
let (cuda_compression_key, cuda_decompression_key) =
radix_cks.new_cuda_compression_decompression_keys(&private_compression_key, &streams);
let cpk_private_key = CompactPrivateKey::new(cpk_param);
let cpk = CompactPublicKey::new(&cpk_private_key);
let ksk = KeySwitchingKey::new((&cpk_private_key, None), (&cks, &sks), ks_param);
let d_ksk_material = CudaKeySwitchingKeyMaterial::from_key_switching_key(&ksk, &streams);
let rerand_domain_separator = *b"TFHE_Rrd";
let compact_public_encryption_domain_separator = *b"TFHE_Enc";
let metadata = b"bench".as_slice();
let bench_id;
match get_bench_type() {
BenchmarkType::Latency => {
// Encrypt and compress a single ciphertext
let message = 42u64;
let ct = cks.encrypt_radix(message, num_blocks);
let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &streams);
let mut builder = CudaCompressedCiphertextListBuilder::new();
builder.push(d_ct, &streams);
let compressed = builder.build(&cuda_compression_key, &streams);
let d_decompressed: CudaUnsignedRadixCiphertext = compressed
.get(0, &cuda_decompression_key, &streams)
.unwrap()
.unwrap();
let decompressed = d_decompressed.to_radix_ciphertext(&streams);
let mut d_re_randomized = d_decompressed.duplicate(&streams);
bench_id = format!("{bench_name}::latency_u{bit_size}");
bench_group.bench_function(&bench_id, |b| {
b.iter_batched(
|| {
let mut re_randomizer_context = ReRandomizationContext::new(
rerand_domain_separator,
[metadata],
compact_public_encryption_domain_separator,
);
re_randomizer_context.add_ciphertext(&decompressed);
re_randomizer_context.finalize()
},
|mut seed_gen| {
d_re_randomized
.re_randomize(
&cpk,
&d_ksk_material,
seed_gen.next_seed().unwrap(),
&streams,
)
.unwrap();
_ = black_box(&d_re_randomized);
},
BatchSize::SmallInput,
)
});
}
BenchmarkType::Throughput => {
let elements = throughput_num_threads(num_blocks, 1);
bench_group.throughput(Throughput::Elements(elements));
let local_streams = cuda_local_streams(num_blocks, elements as usize);
let num_gpus = get_number_of_gpus() as usize;
let d_ksk_material_vec: Vec<CudaKeySwitchingKeyMaterial> = (0..num_gpus)
.map(|i| {
let local_stream = &local_streams[i % local_streams.len()];
CudaKeySwitchingKeyMaterial::from_key_switching_key(&ksk, local_stream)
})
.collect();
// Pre-generate and compress ciphertexts for throughput test
let d_compressed_cts: Vec<CudaUnsignedRadixCiphertext> = (0..elements as usize)
.into_par_iter()
.map(|i| {
let message = 42u64;
let ct = cks.encrypt_radix(message, num_blocks);
let local_stream = &local_streams[i % local_streams.len()];
let d_ct =
CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, local_stream);
let mut builder = CudaCompressedCiphertextListBuilder::new();
builder.push(d_ct, local_stream);
let compressed = builder.build(&cuda_compression_key, local_stream);
compressed
.get(0, &cuda_decompression_key, local_stream)
.unwrap()
.unwrap()
})
.collect();
// Prepare decompressed ciphertexts once
let h_decompressed_cts: Vec<_> = d_compressed_cts
.iter()
.enumerate()
.map(|(i, d_ct)| {
let local_stream = &local_streams[i % local_streams.len()];
d_ct.to_radix_ciphertext(local_stream)
})
.collect();
bench_id = format!("{bench_name}::throughput_u{bit_size}");
bench_group.bench_function(&bench_id, |b| {
b.iter_batched(
|| {
// Create a fresh context for each benchmark iteration
let mut ctx = ReRandomizationContext::new(
rerand_domain_separator,
[metadata],
compact_public_encryption_domain_separator,
);
// Add all ciphertexts to the context
for ct in &h_decompressed_cts {
ctx.add_ciphertext(ct);
}
let d_cts_to_rerand = d_compressed_cts
.iter()
.enumerate()
.map(|(i, d_ct)| {
let local_stream = &local_streams[i % local_streams.len()];
d_ct.duplicate(local_stream)
})
.collect::<Vec<_>>();
// Return a new seed generator for this iteration
(ctx.finalize(), h_decompressed_cts.clone(), d_cts_to_rerand)
},
|(mut seed_gen, h_cts_to_rerand, mut d_cts_to_rerand)| {
let seeds: Vec<_> = (0..h_cts_to_rerand.len())
.map(|_| seed_gen.next_seed().unwrap())
.collect();
d_cts_to_rerand
.par_iter_mut()
.zip(seeds.into_par_iter())
.enumerate()
.for_each(|(i, (d_re_randomized, seed))| {
let local_stream = &local_streams[i % local_streams.len()];
let d_ksk = &d_ksk_material_vec[i % num_gpus];
d_re_randomized
.re_randomize(&cpk, d_ksk, seed, local_stream)
.unwrap();
_ = black_box(&d_re_randomized);
})
},
BatchSize::SmallInput,
)
});
}
}
write_to_json::<u64, _>(
&bench_id,
(comp_param, param.into()),
comp_param.name(),
"re_randomize",
&OperatorType::Atomic,
bit_size as u32,
vec![param.message_modulus.0.ilog2(); num_blocks],
);
bench_group.finish()
}
fn gpu_re_randomize(c: &mut Criterion) {
let bit_sizes = [2, 4, 16, 32, 64, 128, 256];
for bit_size in bit_sizes.iter() {
execute_gpu_re_randomize(c, *bit_size);
}
}
criterion_group!(gpu_re_randomize_group, gpu_re_randomize);
}
fn main() {
#[cfg(feature = "gpu")]
gpu_re_randomize_group();
#[cfg(not(feature = "gpu"))]
cpu_re_randomize_group();
Criterion::default().configure_from_args().final_summary();
}