diff --git a/Makefile b/Makefile
index 2ea875d91..ca3470247 100644
--- a/Makefile
+++ b/Makefile
@@ -1390,6 +1390,13 @@ bench_integer_compression_gpu: install_rs_check_toolchain
 		--bench integer-glwe_packing_compression \
 		--features=integer,internal-keycache,gpu,pbs-stats -p tfhe-benchmark --
 
+.PHONY: bench_integer_compression_128b_gpu
+bench_integer_compression_128b_gpu: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+		cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+		--bench glwe_packing_compression_128b-integer-bench \
+		--features=integer,internal-keycache,gpu,pbs-stats -p tfhe-benchmark --
+
 .PHONY: bench_integer_zk_gpu
 bench_integer_zk_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
diff --git a/tfhe-benchmark/Cargo.toml b/tfhe-benchmark/Cargo.toml
index b57944706..5a76bce9b 100644
--- a/tfhe-benchmark/Cargo.toml
+++ b/tfhe-benchmark/Cargo.toml
@@ -96,6 +96,12 @@ path = "benches/integer/glwe_packing_compression.rs"
 harness = false
 required-features = ["integer", "pbs-stats", "internal-keycache"]
 
+[[bench]]
+name = "glwe_packing_compression_128b-integer-bench"
+path = "benches/integer/glwe_packing_compression_128b.rs"
+harness = false
+required-features = ["integer", "pbs-stats", "internal-keycache"]
+
 [[bench]]
 name = "integer"
 path = "benches/integer/bench.rs"
diff --git a/tfhe-benchmark/benches/integer/glwe_packing_compression.rs b/tfhe-benchmark/benches/integer/glwe_packing_compression.rs
index 41b7b420a..199ef2886 100644
--- a/tfhe-benchmark/benches/integer/glwe_packing_compression.rs
+++ b/tfhe-benchmark/benches/integer/glwe_packing_compression.rs
@@ -159,20 +159,30 @@ fn cpu_glwe_packing(c: &mut Criterion) {
 mod cuda {
     use super::*;
     use benchmark::utilities::cuda_integer_utils::cuda_local_streams;
-    use itertools::Itertools;
-    use std::cmp::max;
-    use tfhe::core_crypto::gpu::CudaStreams;
-    use tfhe::integer::ciphertext::NoiseSquashingCompressionPrivateKey;
+    use tfhe::core_crypto::gpu::{get_number_of_gpus, CudaStreams};
+    use tfhe::integer::compression_keys::CompressionPrivateKeys;
     use tfhe::integer::gpu::ciphertext::compressed_ciphertext_list::CudaCompressedCiphertextListBuilder;
-    use tfhe::integer::gpu::ciphertext::squashed_noise::CudaSquashedNoiseRadixCiphertext;
-    use tfhe::integer::gpu::ciphertext::{
-        CudaCompressedSquashedNoiseCiphertextList, CudaUnsignedRadixCiphertext,
-    };
+    use tfhe::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext;
     use tfhe::integer::gpu::gen_keys_radix_gpu;
-    use tfhe::integer::gpu::list_compression::server_keys::CudaNoiseSquashingCompressionKey;
-    use tfhe::integer::noise_squashing::NoiseSquashingPrivateKey;
+    use tfhe::shortint::parameters::CompressionParameters;
+    use tfhe::shortint::PBSParameters;
 
-    fn gpu_glwe_packing(c: &mut Criterion) {
+    #[derive(Clone)]
+    struct BenchConfig {
+        param: PBSParameters,
+        comp_param: CompressionParameters,
+        bit_size: usize,
+        cks: ClientKey,
+        private_compression_key: CompressionPrivateKeys,
+    }
+
+    fn get_num_elements_per_gpu(_bit_size: usize) -> usize {
+        // 200 elements per GPU seems enough to saturate H100s
+        // This is an empirical value and might need to be adjusted in the future
+        200
+    }
+
+    fn execute_gpu_glwe_packing(c: &mut Criterion, config: BenchConfig) {
         let bench_name = "integer::cuda::packing_compression";
         let mut bench_group = c.benchmark_group(bench_name);
         bench_group
@@ -181,6 +191,279 @@ mod cuda {
 
         let stream = CudaStreams::new_multi_gpu();
 
+        let BenchConfig {
+            param,
+            comp_param,
+            bit_size,
+            cks,
+            private_compression_key,
+        } = config;
+
+        let log_message_modulus = param.message_modulus().0.ilog2() as usize;
+
+        assert_eq!(bit_size % log_message_modulus, 0);
+        let num_blocks = bit_size / log_message_modulus;
+
+        let bench_id_pack;
+
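+        // Latency mode times one compression on the shared multi-GPU stream;
+        // throughput mode spreads many independent compressions over per-GPU
+        // local streams (see `get_num_elements_per_gpu` above).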
+        match get_bench_type() {
+            BenchmarkType::Latency => {
+                // Generate and convert compression keys
+                let (radix_cks, _) = gen_keys_radix_gpu(param, num_blocks, &stream);
+                let (compressed_compression_key, _) = radix_cks
+                    .new_compressed_compression_decompression_keys(&private_compression_key);
+
+                let cuda_compression_key = compressed_compression_key.decompress_to_cuda(&stream);
+
+                // Encrypt
+                let ct = cks.encrypt_radix(0_u32, num_blocks);
+                let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &stream);
+
+                // Benchmark
+                let mut builder = CudaCompressedCiphertextListBuilder::new();
+
+                builder.push(d_ct, &stream);
+
+                bench_id_pack = format!("{bench_name}::pack_u{bit_size}");
+                bench_group.bench_function(&bench_id_pack, |b| {
+                    b.iter(|| {
+                        let compressed = builder.build(&cuda_compression_key, &stream);
+
+                        _ = black_box(compressed);
+                    })
+                });
+            }
+            BenchmarkType::Throughput => {
+                // Generate and convert compression keys
+                let (radix_cks, _) = gen_keys_radix_gpu(param, num_blocks, &stream);
+                let (compressed_compression_key, _) = radix_cks
+                    .new_compressed_compression_decompression_keys(&private_compression_key);
+
+                let elements_per_gpu = get_num_elements_per_gpu(bit_size) as u64;
+                let elements = elements_per_gpu * get_number_of_gpus() as u64;
+
+                let num_block =
+                    (bit_size as f64 / (param.message_modulus().0 as f64).log(2.0)).ceil() as usize;
+                bench_group.throughput(Throughput::Elements(elements));
+
+                // Encrypt
+                let local_streams = cuda_local_streams(num_block, elements as usize);
+
+                bench_id_pack = format!("{bench_name}::throughput::pack_u{bit_size}");
+                let cuda_compression_key_vec = (0..get_number_of_gpus())
+                    .into_par_iter()
+                    .map(|i| {
+                        let local_stream = &local_streams[i as usize];
+                        compressed_compression_key.decompress_to_cuda(local_stream)
+                    })
+                    .collect::<Vec<_>>();
+
+                // Benchmark
+                let builders = (0..elements)
+                    .into_par_iter()
+                    .map(|i| {
+                        let ct = cks.encrypt_radix(0_u32, num_blocks);
+                        let local_stream = &local_streams[i as usize % local_streams.len()];
+                        let d_ct =
+                            CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, local_stream);
+                        let mut builder = CudaCompressedCiphertextListBuilder::new();
+                        builder.push(d_ct, local_stream);
+
+                        builder
+                    })
+                    .collect::<Vec<_>>();
+
+                bench_group.bench_function(&bench_id_pack, |b| {
+                    b.iter(|| {
+                        builders.par_iter().enumerate().for_each(|(i, builder)| {
+                            let local_stream = &local_streams[i % local_streams.len()];
+                            let cuda_compression_key =
+                                &cuda_compression_key_vec[i % get_number_of_gpus() as usize];
+
+                            let _ = builder.build(cuda_compression_key, local_stream);
+                        })
+                    })
+                });
+            }
+        }
+
+        write_to_json::<u64, _>(
+            &bench_id_pack,
+            (comp_param, param.into()),
+            comp_param.name(),
+            "pack",
+            &OperatorType::Atomic,
+            bit_size as u32,
+            vec![param.message_modulus().0.ilog2(); num_blocks],
+        );
+
+        bench_group.finish()
+    }
+
+    fn execute_gpu_glwe_unpacking(c: &mut Criterion, config: BenchConfig) {
+        let bench_name = "integer::cuda::packing_compression";
+        let mut bench_group = c.benchmark_group(bench_name);
+        bench_group
+            .sample_size(15)
+            .measurement_time(std::time::Duration::from_secs(30));
+
+        let stream = CudaStreams::new_multi_gpu();
+
+        let BenchConfig {
+            param,
+            comp_param,
+            bit_size,
+            cks,
+            private_compression_key,
+        } = config;
+
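+        // Each radix block encodes log2(message_modulus) plaintext bits, so
+        // `bit_size` must be an exact multiple of that width.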
+        let log_message_modulus = param.message_modulus().0.ilog2() as usize;
+
+        assert_eq!(bit_size % log_message_modulus, 0);
+        let num_blocks = bit_size / log_message_modulus;
+
+        let bench_id_unpack;
+
+        match get_bench_type() {
+            BenchmarkType::Latency => {
+                // Generate and convert compression keys
+                let (radix_cks, _) = gen_keys_radix_gpu(param, num_blocks, &stream);
+                let (compressed_compression_key, compressed_decompression_key) = radix_cks
+                    .new_compressed_compression_decompression_keys(&private_compression_key);
+
+                let cuda_compression_key = compressed_compression_key.decompress_to_cuda(&stream);
+                let cuda_decompression_key = compressed_decompression_key.decompress_to_cuda(
+                    radix_cks.parameters().glwe_dimension(),
+                    radix_cks.parameters().polynomial_size(),
+                    radix_cks.parameters().message_modulus(),
+                    radix_cks.parameters().carry_modulus(),
+                    radix_cks.parameters().ciphertext_modulus(),
+                    &stream,
+                );
+
+                // Encrypt
+                let ct = cks.encrypt_radix(0_u32, num_blocks);
+                let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &stream);
+
+                // Benchmark
+                let mut builder = CudaCompressedCiphertextListBuilder::new();
+
+                builder.push(d_ct, &stream);
+
+                let compressed = builder.build(&cuda_compression_key, &stream);
+
+                bench_id_unpack = format!("{bench_name}::unpack_u{bit_size}");
+                bench_group.bench_function(&bench_id_unpack, |b| {
+                    b.iter(|| {
+                        let unpacked: CudaUnsignedRadixCiphertext = compressed
+                            .get(0, &cuda_decompression_key, &stream)
+                            .unwrap()
+                            .unwrap();
+
+                        _ = black_box(unpacked);
+                    })
+                });
+            }
+            BenchmarkType::Throughput => {
+                // Generate and convert compression keys
+                let (radix_cks, _) = gen_keys_radix_gpu(param, num_blocks, &stream);
+                let (compressed_compression_key, compressed_decompression_key) = radix_cks
+                    .new_compressed_compression_decompression_keys(&private_compression_key);
+
+                let elements_per_gpu = get_num_elements_per_gpu(bit_size) as u64;
+                let elements = elements_per_gpu * get_number_of_gpus() as u64;
+
+                let num_block =
+                    (bit_size as f64 / (param.message_modulus().0 as f64).log(2.0)).ceil() as usize;
+                bench_group.throughput(Throughput::Elements(elements));
+
+                // Encrypt
+                let local_streams = cuda_local_streams(num_block, elements as usize);
+
+                bench_id_unpack = format!("{bench_name}::throughput::unpack_u{bit_size}");
+                let builders = (0..elements)
+                    .into_par_iter()
+                    .map(|i| {
+                        let ct = cks.encrypt_radix(0_u32, num_blocks);
+                        let local_stream = &local_streams[i as usize % local_streams.len()];
+                        let d_ct =
+                            CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, local_stream);
+                        let mut builder = CudaCompressedCiphertextListBuilder::new();
+                        builder.push(d_ct, local_stream);
+
+                        builder
+                    })
+                    .collect::<Vec<_>>();
+
+                let cuda_compression_key_vec = (0..get_number_of_gpus())
+                    .into_par_iter()
+                    .map(|i| {
+                        let local_stream = &local_streams[i as usize];
+                        compressed_compression_key.decompress_to_cuda(local_stream)
+                    })
+                    .collect::<Vec<_>>();
+
+                let cuda_decompression_key_vec = (0..get_number_of_gpus())
+                    .into_par_iter()
+                    .map(|i| {
+                        let local_stream = &local_streams[i as usize];
+                        compressed_decompression_key.decompress_to_cuda(
+                            radix_cks.parameters().glwe_dimension(),
+                            radix_cks.parameters().polynomial_size(),
+                            radix_cks.parameters().message_modulus(),
+                            radix_cks.parameters().carry_modulus(),
+                            radix_cks.parameters().ciphertext_modulus(),
+                            local_stream,
+                        )
+                    })
+                    .collect::<Vec<_>>();
+
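+                // Build the compressed lists once up front so the timed loop
+                // below measures only the unpacking (`get`) path.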
+                let compressed = builders
+                    .par_iter()
+                    .enumerate()
+                    .map(|(i, builder)| {
+                        let local_stream = &local_streams[i % local_streams.len()];
+                        let cuda_compression_key =
+                            &cuda_compression_key_vec[i % get_number_of_gpus() as usize];
+                        builder.build(cuda_compression_key, local_stream)
+                    })
+                    .collect::<Vec<_>>();
+
+                bench_group.bench_function(&bench_id_unpack, |b| {
+                    b.iter(|| {
+                        compressed.par_iter().enumerate().for_each(|(i, comp)| {
+                            let local_stream = &local_streams[i % local_streams.len()];
+                            let cuda_decompression_key =
+                                &cuda_decompression_key_vec[i % get_number_of_gpus() as usize];
+
+                            let _ = comp
+                                .get::<CudaUnsignedRadixCiphertext>(
+                                    0,
+                                    cuda_decompression_key,
+                                    local_stream,
+                                )
+                                .unwrap()
+                                .unwrap();
+                        })
+                    })
+                });
+            }
+        }
+
+        write_to_json::<u64, _>(
+            &bench_id_unpack,
+            (comp_param, param.into()),
+            comp_param.name(),
+            "unpack",
+            &OperatorType::Atomic,
+            bit_size as u32,
+            vec![param.message_modulus().0.ilog2(); num_blocks],
+        );
+
+        bench_group.finish()
+    }
+
+    fn gpu_glwe_packing(c: &mut Criterion) {
         let param = BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
         let comp_param =
             BENCH_COMP_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
@@ -190,6 +473,13 @@ mod cuda {
         let cks = ClientKey::new(param);
         let private_compression_key = cks.new_compression_private_key(comp_param);
 
+        let mut config = BenchConfig {
+            param: tfhe::shortint::PBSParameters::MultiBitPBS(param),
+            comp_param,
+            cks,
+            private_compression_key,
+            bit_size: 0,
+        };
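+        // The client key and private compression key are created once and
+        // shared across the whole sweep; each iteration only updates `bit_size`.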
         for bit_size in [
             2,
             8,
@@ -200,218 +490,28 @@ mod cuda {
             256,
             comp_param.lwe_per_glwe().0 * log_message_modulus,
         ] {
-            assert_eq!(bit_size % log_message_modulus, 0);
-            let num_blocks = bit_size / log_message_modulus;
-
-            let bench_id_pack;
-            let bench_id_unpack;
-
-            // Generate and convert compression keys
-            let (radix_cks, _) = gen_keys_radix_gpu(param, num_blocks, &stream);
-            let (compressed_compression_key, compressed_decompression_key) =
-                radix_cks.new_compressed_compression_decompression_keys(&private_compression_key);
-
-            match get_bench_type() {
-                BenchmarkType::Latency => {
-                    let cuda_compression_key =
-                        compressed_compression_key.decompress_to_cuda(&stream);
-                    let cuda_decompression_key = compressed_decompression_key.decompress_to_cuda(
-                        radix_cks.parameters().glwe_dimension(),
-                        radix_cks.parameters().polynomial_size(),
-                        radix_cks.parameters().message_modulus(),
-                        radix_cks.parameters().carry_modulus(),
-                        radix_cks.parameters().ciphertext_modulus(),
-                        &stream,
-                    );
-
-                    // Encrypt
-                    let ct = cks.encrypt_radix(0_u32, num_blocks);
-                    let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &stream);
-
-                    // Benchmark
-                    let mut builder = CudaCompressedCiphertextListBuilder::new();
-
-                    builder.push(d_ct, &stream);
-
-                    bench_id_pack = format!("{bench_name}::pack_u{bit_size}");
-                    bench_group.bench_function(&bench_id_pack, |b| {
-                        b.iter(|| {
-                            let compressed = builder.build(&cuda_compression_key, &stream);
-
-                            _ = black_box(compressed);
-                        })
-                    });
-
-                    let compressed = builder.build(&cuda_compression_key, &stream);
-
-                    bench_id_unpack = format!("{bench_name}::unpack_u{bit_size}");
-                    bench_group.bench_function(&bench_id_unpack, |b| {
-                        b.iter(|| {
-                            let unpacked: CudaUnsignedRadixCiphertext = compressed
-                                .get(0, &cuda_decompression_key, &stream)
-                                .unwrap()
-                                .unwrap();
-
-                            _ = black_box(unpacked);
-                        })
-                    });
-                }
-                BenchmarkType::Throughput => {
-                    // Execute the operation once to know its cost.
-                    let (cpu_compression_key, cpu_decompression_key) =
-                        cks.new_compression_decompression_keys(&private_compression_key);
-                    let ct = cks.encrypt_radix(0_u32, num_blocks);
-                    let mut builder = CompressedCiphertextListBuilder::new();
-                    builder.push(ct);
-                    let compressed = builder.build(&cpu_compression_key);
-
-                    reset_pbs_count();
-                    // Use CPU operation as pbs_count do not count PBS on GPU backend.
-                    let _: RadixCiphertext =
-                        compressed.get(0, &cpu_decompression_key).unwrap().unwrap();
-                    let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
-
-                    let num_block = (bit_size as f64 / (param.message_modulus.0 as f64).log(2.0))
-                        .ceil() as usize;
-                    let elements = throughput_num_threads(num_block, pbs_count);
-                    bench_group.throughput(Throughput::Elements(elements));
-
-                    // Encrypt
-                    let local_streams = cuda_local_streams(num_block, elements as usize);
-
-                    let cuda_compression_key_vec = local_streams
-                        .iter()
-                        .map(|local_stream| {
-                            compressed_compression_key.decompress_to_cuda(local_stream)
-                        })
-                        .collect_vec();
-                    let cuda_decompression_key_vec = local_streams
-                        .iter()
-                        .map(|local_stream| {
-                            compressed_decompression_key.decompress_to_cuda(
-                                radix_cks.parameters().glwe_dimension(),
-                                radix_cks.parameters().polynomial_size(),
-                                radix_cks.parameters().message_modulus(),
-                                radix_cks.parameters().carry_modulus(),
-                                radix_cks.parameters().ciphertext_modulus(),
-                                local_stream,
-                            )
-                        })
-                        .collect_vec();
-
-                    // Benchmark
-                    let builders = (0..elements)
-                        .map(|i| {
-                            let ct = cks.encrypt_radix(0_u32, num_blocks);
-                            let local_stream = &local_streams[i as usize % local_streams.len()];
-                            let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(
-                                &ct,
-                                local_stream,
-                            );
-                            let mut builder = CudaCompressedCiphertextListBuilder::new();
-                            builder.push(d_ct, local_stream);
-
-                            builder
-                        })
-                        .collect::<Vec<_>>();
-
-                    bench_id_pack = format!("{bench_name}::throughput::pack_u{bit_size}");
-                    bench_group.bench_function(&bench_id_pack, |b| {
-                        b.iter(|| {
-                            builders.par_iter().enumerate().for_each(|(i, builder)| {
-                                let local_stream = &local_streams[i % local_streams.len()];
-                                let cuda_compression_key =
-                                    &cuda_compression_key_vec[i % local_streams.len()];
-
-                                builder.build(cuda_compression_key, local_stream);
-                            })
-                        })
-                    });
-
-                    let compressed = builders
-                        .iter()
-                        .enumerate()
-                        .map(|(i, builder)| {
-                            let local_stream = &local_streams[i % local_streams.len()];
-                            let cuda_compression_key =
-                                &cuda_compression_key_vec[i % local_streams.len()];
-                            builder.build(cuda_compression_key, local_stream)
-                        })
-                        .collect::<Vec<_>>();
-
-                    bench_id_unpack = format!("{bench_name}::throughput::unpack_u{bit_size}");
-                    bench_group.bench_function(&bench_id_unpack, |b| {
-                        b.iter(|| {
-                            compressed.par_iter().enumerate().for_each(|(i, comp)| {
-                                let local_stream = &local_streams[i % local_streams.len()];
-                                let cuda_decompression_key =
-                                    &cuda_decompression_key_vec[i % local_streams.len()];
-
-                                comp.get::<CudaUnsignedRadixCiphertext>(
-                                    0,
-                                    cuda_decompression_key,
-                                    local_stream,
-                                )
-                                .unwrap()
-                                .unwrap();
-                            })
-                        })
-                    });
-                }
-            }
-
-            write_to_json::<u64, _>(
-                &bench_id_pack,
-                (comp_param, param.into()),
-                comp_param.name(),
-                "pack",
-                &OperatorType::Atomic,
-                bit_size as u32,
-                vec![param.message_modulus.0.ilog2(); num_blocks],
-            );
-
-            write_to_json::<u64, _>(
-                &bench_id_unpack,
-                (comp_param, param.into()),
-                comp_param.name(),
-                "unpack",
-                &OperatorType::Atomic,
-                bit_size as u32,
-                vec![param.message_modulus.0.ilog2(); num_blocks],
-            );
+            config.bit_size = bit_size;
+            execute_gpu_glwe_packing(c, config.clone());
         }
-
-        bench_group.finish()
     }
 
-    fn gpu_glwe_packing_128(c: &mut Criterion) {
-        let bench_name = "integer::cuda::128b_packing_compression";
-        let mut bench_group = c.benchmark_group(bench_name);
-        bench_group
-            .sample_size(15)
-            .measurement_time(std::time::Duration::from_secs(30));
-
-        let stream = CudaStreams::new_multi_gpu();
-
-        let param = BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
-        let noise_squashing_compression_parameters =
-            BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
-        let noise_squashing_parameters =
-            BENCH_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
+    fn gpu_glwe_unpacking(c: &mut Criterion) {
+        let param = BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
+        let comp_param =
+            BENCH_COMP_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
 
         let log_message_modulus = param.message_modulus.0.ilog2() as usize;
 
-        let noise_squashing_compression_private_key =
-            NoiseSquashingCompressionPrivateKey::new(noise_squashing_compression_parameters);
-        let noise_squashing_private_key = NoiseSquashingPrivateKey::new(noise_squashing_parameters);
-        let noise_squashing_compression_key = noise_squashing_private_key
-            .new_noise_squashing_compression_key(&noise_squashing_compression_private_key);
-        let cuda_noise_squashing_compression_key =
-            CudaNoiseSquashingCompressionKey::from_noise_squashing_compression_key(
-                &noise_squashing_compression_key,
-                &stream,
-            );
+        let cks = ClientKey::new(param);
+        let private_compression_key = cks.new_compression_private_key(comp_param);
 
+        let mut config = BenchConfig {
+            param: PBSParameters::MultiBitPBS(param),
+            comp_param,
+            bit_size: 0,
+            cks,
+            private_compression_key,
+        };
         for bit_size in [
             2,
             8,
@@ -419,180 +519,31 @@
             32,
             64,
             128,
-            // we don't need 256 here since
-            // noise_squashing_compression_parameters.lwe_per_glwe.0 * log_message_modulus == 256
-            // with current parameters
-            256,
-            noise_squashing_compression_parameters.lwe_per_glwe.0 * log_message_modulus,
+            256,
+            comp_param.lwe_per_glwe().0 * log_message_modulus,
         ] {
-            assert_eq!(bit_size % log_message_modulus, 0);
-            let num_blocks = bit_size / log_message_modulus;
-
-            let bench_id_pack;
-            let bench_id_unpack;
-
-            // Generate and convert compression keys
-            let cks = ClientKey::new(param);
-            let (_, cuda_sks) = gen_keys_radix_gpu(param, num_blocks, &stream);
-            let compressed_noise_squashing_compression_key =
-                cks.new_compressed_noise_squashing_key(&noise_squashing_private_key);
-
-            match get_bench_type() {
-                BenchmarkType::Latency => {
-                    let cuda_noise_squashing_key =
-                        compressed_noise_squashing_compression_key.decompress_to_cuda(&stream);
-
-                    // Encrypt
-                    let ct = cks.encrypt_radix(0_u32, num_blocks);
-                    let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &stream);
-                    let d_ns_ct = cuda_noise_squashing_key
-                        .squash_radix_ciphertext_noise(&cuda_sks, &d_ct.ciphertext, &stream)
-                        .unwrap();
-
-                    // Benchmark
-                    let mut builder = CudaCompressedSquashedNoiseCiphertextList::builder();
-
-                    builder.push(d_ns_ct, &stream);
-
-                    bench_id_pack = format!("{bench_name}::pack_u{bit_size}");
-                    bench_group.bench_function(&bench_id_pack, |b| {
-                        b.iter(|| {
-                            let compressed =
-                                builder.build(&cuda_noise_squashing_compression_key, &stream);
-
-                            _ = black_box(compressed);
-                        })
-                    });
-
-                    let compressed = builder.build(&cuda_noise_squashing_compression_key, &stream);
-
-                    bench_id_unpack = format!("{bench_name}::unpack_u{bit_size}");
-                    bench_group.bench_function(&bench_id_unpack, |b| {
-                        b.iter(|| {
-                            let unpacked: CudaSquashedNoiseRadixCiphertext =
-                                compressed.get(0, &stream).unwrap().unwrap();
-
-                            _ = black_box(unpacked);
-                        })
-                    });
-                }
-                BenchmarkType::Throughput => {
-                    let num_block = (bit_size as f64 / (param.message_modulus.0 as f64).log(2.0))
-                        .ceil() as usize;
-                    let elements = 100;
-                    bench_group.throughput(Throughput::Elements(elements));
-
-                    // Encrypt
-                    let local_streams = cuda_local_streams(num_block, elements as usize);
-
-                    let cuda_compression_key_vec = local_streams
-                        .iter()
-                        .map(|local_stream| {
-                            compressed_noise_squashing_compression_key
-                                .decompress_to_cuda(local_stream)
-                        })
-                        .collect_vec();
-
-                    let cuda_noise_squashing_compression_key =
-                        CudaNoiseSquashingCompressionKey::from_noise_squashing_compression_key(
-                            &noise_squashing_compression_key,
-                            &stream,
-                        );
-
-                    // Benchmark
-                    let builders = (0..elements)
-                        .map(|i| {
-                            let ct = cks.encrypt_radix(0_u32, num_blocks);
-                            let local_stream = &local_streams[i as usize % local_streams.len()];
-                            let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(
-                                &ct,
-                                local_stream,
-                            );
-                            let cuda_noise_squashing_key =
-                                &cuda_compression_key_vec[(i as usize) % local_streams.len()];
-                            let d_ns_ct = cuda_noise_squashing_key
-                                .squash_radix_ciphertext_noise(&cuda_sks, &d_ct.ciphertext, &stream)
-                                .unwrap();
-                            let mut builder = CudaCompressedSquashedNoiseCiphertextList::builder();
-                            builder.push(d_ns_ct, local_stream);
-
-                            builder
-                        })
-                        .collect::<Vec<_>>();
-
-                    bench_id_pack = format!("{bench_name}::throughput::pack_u{bit_size}");
-                    bench_group.bench_function(&bench_id_pack, |b| {
-                        b.iter(|| {
-                            builders.par_iter().enumerate().for_each(|(i, builder)| {
-                                let local_stream = &local_streams[i % local_streams.len()];
-
-                                builder.build(&cuda_noise_squashing_compression_key, local_stream);
-                            })
-                        })
-                    });
-
-                    let compressed = builders
-                        .iter()
-                        .enumerate()
-                        .map(|(i, builder)| {
-                            let local_stream = &local_streams[i % local_streams.len()];
-
-                            builder.build(&cuda_noise_squashing_compression_key, local_stream)
-                        })
-                        .collect::<Vec<_>>();
-
-                    bench_id_unpack = format!("{bench_name}::throughput::unpack_u{bit_size}");
-                    bench_group.bench_function(&bench_id_unpack, |b| {
-                        b.iter(|| {
-                            compressed.par_iter().enumerate().for_each(|(i, comp)| {
-                                let local_stream = &local_streams[i % local_streams.len()];
-
-                                comp.get::<CudaSquashedNoiseRadixCiphertext>(0, local_stream)
-                                    .unwrap()
-                                    .unwrap();
-                            })
-                        })
-                    });
-                }
-            }
-
-            write_to_json::<u64, _>(
-                &bench_id_pack,
-                (noise_squashing_compression_parameters, param.into()),
-                noise_squashing_compression_parameters.name(),
-                "pack",
-                &OperatorType::Atomic,
-                bit_size as u32,
-                vec![param.message_modulus.0.ilog2(); num_blocks],
-            );
-
-            write_to_json::<u64, _>(
-                &bench_id_unpack,
-                (noise_squashing_compression_parameters, param.into()),
-                noise_squashing_compression_parameters.name(),
-                "unpack",
-                &OperatorType::Atomic,
-                bit_size as u32,
-                vec![param.message_modulus.0.ilog2(); num_blocks],
-            );
+            config.bit_size = bit_size;
+            execute_gpu_glwe_unpacking(c, config.clone());
         }
-
-        bench_group.finish()
     }
 
     criterion_group!(gpu_glwe_packing2, gpu_glwe_packing);
-    criterion_group!(gpu_glwe_packing_128_2, gpu_glwe_packing_128);
+    criterion_group!(gpu_glwe_unpacking2, gpu_glwe_unpacking);
 }
 
 criterion_group!(cpu_glwe_packing2, cpu_glwe_packing);
 
 #[cfg(feature = "gpu")]
-use cuda::{gpu_glwe_packing2, gpu_glwe_packing_128_2};
+use cuda::gpu_glwe_packing2;
+#[cfg(feature = "gpu")]
+use cuda::gpu_glwe_unpacking2;
 
 fn main() {
     #[cfg(feature = "gpu")]
-    gpu_glwe_packing2();
-    #[cfg(feature = "gpu")]
-    gpu_glwe_packing_128_2();
+    {
+        gpu_glwe_packing2();
+        gpu_glwe_unpacking2();
+    }
 
     #[cfg(not(feature = "gpu"))]
     cpu_glwe_packing2();
diff --git a/tfhe-benchmark/benches/integer/glwe_packing_compression_128b.rs b/tfhe-benchmark/benches/integer/glwe_packing_compression_128b.rs
new file mode 100644
index 000000000..8fd92a595
--- /dev/null
+++ b/tfhe-benchmark/benches/integer/glwe_packing_compression_128b.rs
@@ -0,0 +1,444 @@
+#[cfg(feature = "gpu")]
+mod cuda {
+    use benchmark::params_aliases::*;
+    use benchmark::utilities::cuda_integer_utils::cuda_local_streams;
+    use benchmark::utilities::{
+        cuda_local_keys, get_bench_type, write_to_json, BenchmarkType, OperatorType,
+    };
+    use criterion::{black_box, criterion_group, Criterion, Throughput};
+    use rayon::prelude::*;
+    use tfhe::core_crypto::gpu::{get_number_of_gpus, CudaStreams};
+    use tfhe::integer::ciphertext::{
+        NoiseSquashingCompressionKey, NoiseSquashingCompressionPrivateKey,
+    };
+    use tfhe::integer::gpu::ciphertext::squashed_noise::CudaSquashedNoiseRadixCiphertext;
+    use tfhe::integer::gpu::ciphertext::{
+        CudaCompressedSquashedNoiseCiphertextList, CudaUnsignedRadixCiphertext,
+    };
+    use tfhe::integer::gpu::gen_keys_radix_gpu;
+    use tfhe::integer::gpu::list_compression::server_keys::CudaNoiseSquashingCompressionKey;
+    use tfhe::integer::gpu::noise_squashing::keys::CudaNoiseSquashingKey;
+    use tfhe::integer::noise_squashing::{CompressedNoiseSquashingKey, NoiseSquashingPrivateKey};
+    use tfhe::integer::ClientKey;
+    use tfhe::keycache::NamedParam;
+    use tfhe::shortint::parameters::NoiseSquashingCompressionParameters;
+    use tfhe::shortint::PBSParameters;
+
+    #[derive(Clone)]
+    struct BenchConfig {
+        param: PBSParameters,
+        noise_squashing_compression_parameters: NoiseSquashingCompressionParameters,
+        noise_squashing_compression_key: NoiseSquashingCompressionKey,
+        compressed_noise_squashing_compression_key: CompressedNoiseSquashingKey,
+        bit_size: usize,
+        cks: ClientKey,
+    }
+
+    fn get_num_elements_per_gpu(_bit_size: usize) -> usize {
+        // 200 elements per GPU seems enough to saturate H100s
+        // This is an empirical value and might need to be adjusted in the future
+        200
+    }
+
+    fn execute_gpu_glwe_packing_128(c: &mut Criterion, config: BenchConfig) {
+        let bench_name = "integer::cuda::128b_packing_compression";
+        let mut bench_group = c.benchmark_group(bench_name);
+        bench_group
+            .sample_size(15)
+            .measurement_time(std::time::Duration::from_secs(30));
+
+        let stream = CudaStreams::new_multi_gpu();
+
+        let BenchConfig {
+            param,
+            noise_squashing_compression_parameters,
+            noise_squashing_compression_key,
+            compressed_noise_squashing_compression_key,
+            bit_size,
+            cks,
+        } = config;
+
+        let log_message_modulus = param.message_modulus().0.ilog2() as usize;
+
+        assert_eq!(bit_size % log_message_modulus, 0);
+        let num_blocks = bit_size / log_message_modulus;
+
+        let bench_id_pack;
+
+        match get_bench_type() {
+            BenchmarkType::Latency => {
+                let (_, cuda_sks) = gen_keys_radix_gpu(param, num_blocks, &stream);
+                let cuda_noise_squashing_key =
+                    compressed_noise_squashing_compression_key.decompress_to_cuda(&stream);
+
+                // Encrypt
+                let ct = cks.encrypt_radix(0_u32, num_blocks);
+                let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &stream);
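+                // Noise squashing converts the radix ciphertext into its
+                // squashed (128-bit) form; the list builder then packs those
+                // squashed blocks.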
+                let d_ns_ct = cuda_noise_squashing_key
+                    .squash_radix_ciphertext_noise(&cuda_sks, &d_ct.ciphertext, &stream)
+                    .unwrap();
+                let cuda_noise_squashing_compression_key =
+                    CudaNoiseSquashingCompressionKey::from_noise_squashing_compression_key(
+                        &noise_squashing_compression_key,
+                        &stream,
+                    );
+
+                // Benchmark
+                let mut builder = CudaCompressedSquashedNoiseCiphertextList::builder();
+
+                builder.push(d_ns_ct, &stream);
+
+                bench_id_pack = format!("{bench_name}::pack_u{bit_size}");
+                bench_group.bench_function(&bench_id_pack, |b| {
+                    b.iter(|| {
+                        let compressed =
+                            builder.build(&cuda_noise_squashing_compression_key, &stream);
+
+                        _ = black_box(compressed);
+                    })
+                });
+            }
+            BenchmarkType::Throughput => {
+                let cuda_sks = cuda_local_keys(&cks);
+                let num_block =
+                    (bit_size as f64 / (param.message_modulus().0 as f64).log(2.0)).ceil() as usize;
+                let elements = get_num_elements_per_gpu(bit_size) as u64;
+                bench_group.throughput(Throughput::Elements(elements));
+
+                // Encrypt
+                let local_streams = cuda_local_streams(num_block, elements as usize);
+
+                let num_gpus = get_number_of_gpus() as usize;
+
+                let cuda_compression_key_vec: Vec<CudaNoiseSquashingKey> = (0..num_gpus)
+                    .into_par_iter()
+                    .map(|i| {
+                        let local_stream = &local_streams[i % local_streams.len()];
+                        compressed_noise_squashing_compression_key.decompress_to_cuda(local_stream)
+                    })
+                    .collect();
+                let cuda_noise_squashing_compression_key_vec: Vec<
+                    CudaNoiseSquashingCompressionKey,
+                > = (0..num_gpus)
+                    .into_par_iter()
+                    .map(|i| {
+                        let local_stream = &local_streams[i % local_streams.len()];
+                        CudaNoiseSquashingCompressionKey::from_noise_squashing_compression_key(
+                            &noise_squashing_compression_key,
+                            local_stream,
+                        )
+                    })
+                    .collect();
+
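+                // Each element gets its own pre-squashed input and builder,
+                // pinned to the GPU/stream that will later process it, so the
+                // timed loop below measures only `build`.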
+                // Benchmark
+                let builders = (0..elements)
+                    .into_par_iter()
+                    .map(|i| {
+                        let ct = cks.encrypt_radix(0_u32, num_blocks);
+                        let local_stream = &local_streams[i as usize % local_streams.len()];
+                        let d_ct =
+                            CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, local_stream);
+                        let cuda_noise_squashing_key =
+                            &cuda_compression_key_vec[(i as usize) % num_gpus];
+                        let cuda_noise_squashing_compression_key =
+                            &cuda_noise_squashing_compression_key_vec[(i as usize) % num_gpus];
+                        let d_ns_ct = cuda_noise_squashing_key
+                            .squash_radix_ciphertext_noise(
+                                &cuda_sks[(i as usize) % num_gpus],
+                                &d_ct.ciphertext,
+                                local_stream,
+                            )
+                            .unwrap();
+                        let mut builder = CudaCompressedSquashedNoiseCiphertextList::builder();
+                        builder.push(d_ns_ct, local_stream);
+
+                        (builder, cuda_noise_squashing_compression_key, local_stream)
+                    })
+                    .collect::<Vec<_>>();
+
+                bench_id_pack = format!("{bench_name}::throughput::pack_u{bit_size}");
+                bench_group.bench_function(&bench_id_pack, |b| {
+                    b.iter(|| {
+                        builders.par_iter().for_each(
+                            |(builder, cuda_noise_squashing_compression_key, local_stream)| {
+                                builder.build(cuda_noise_squashing_compression_key, local_stream);
+                            },
+                        )
+                    })
+                });
+            }
+        }
+
+        write_to_json::<u64, _>(
+            &bench_id_pack,
+            (noise_squashing_compression_parameters, param.into()),
+            noise_squashing_compression_parameters.name(),
+            "pack",
+            &OperatorType::Atomic,
+            bit_size as u32,
+            vec![param.message_modulus().0.ilog2(); num_blocks],
+        );
+
+        bench_group.finish()
+    }
+
+    fn execute_gpu_glwe_unpacking_128(c: &mut Criterion, config: BenchConfig) {
+        let bench_name = "integer::cuda::128b_packing_compression";
+        let mut bench_group = c.benchmark_group(bench_name);
+        bench_group
+            .sample_size(15)
+            .measurement_time(std::time::Duration::from_secs(30));
+
+        let stream = CudaStreams::new_multi_gpu();
+
+        let BenchConfig {
+            param,
+            noise_squashing_compression_parameters,
+            noise_squashing_compression_key,
+            compressed_noise_squashing_compression_key,
+            bit_size,
+            cks,
+        } = config;
+
+        let log_message_modulus = param.message_modulus().0.ilog2() as usize;
+
+        assert_eq!(bit_size % log_message_modulus, 0);
+        let num_blocks = bit_size / log_message_modulus;
+
+        let bench_id_unpack;
+
+        match get_bench_type() {
+            BenchmarkType::Latency => {
+                let (_, cuda_sks) = gen_keys_radix_gpu(param, num_blocks, &stream);
+                let cuda_noise_squashing_key =
+                    compressed_noise_squashing_compression_key.decompress_to_cuda(&stream);
+
+                // Encrypt
+                let ct = cks.encrypt_radix(0_u32, num_blocks);
+                let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &stream);
+                let d_ns_ct = cuda_noise_squashing_key
+                    .squash_radix_ciphertext_noise(&cuda_sks, &d_ct.ciphertext, &stream)
+                    .unwrap();
+                let cuda_noise_squashing_compression_key =
+                    CudaNoiseSquashingCompressionKey::from_noise_squashing_compression_key(
+                        &noise_squashing_compression_key,
+                        &stream,
+                    );
+
+                // Benchmark
+                let mut builder = CudaCompressedSquashedNoiseCiphertextList::builder();
+
+                builder.push(d_ns_ct, &stream);
+
+                let compressed = builder.build(&cuda_noise_squashing_compression_key, &stream);
+
+                bench_id_unpack = format!("{bench_name}::unpack_u{bit_size}");
+                bench_group.bench_function(&bench_id_unpack, |b| {
+                    b.iter(|| {
+                        let unpacked: CudaSquashedNoiseRadixCiphertext =
+                            compressed.get(0, &stream).unwrap().unwrap();
+
+                        _ = black_box(unpacked);
+                    })
+                });
+            }
+            BenchmarkType::Throughput => {
+                let cuda_sks = cuda_local_keys(&cks);
+                let num_block =
+                    (bit_size as f64 / (param.message_modulus().0 as f64).log(2.0)).ceil() as usize;
+                let elements = get_num_elements_per_gpu(bit_size) as u64;
+                bench_group.throughput(Throughput::Elements(elements));
+
+                // Encrypt
+                let local_streams = cuda_local_streams(num_block, elements as usize);
+
+                let num_gpus = get_number_of_gpus() as usize;
+
+                let cuda_compression_key_vec: Vec<CudaNoiseSquashingKey> = (0..num_gpus)
+                    .into_par_iter()
+                    .map(|i| {
+                        let local_stream = &local_streams[i % local_streams.len()];
+                        compressed_noise_squashing_compression_key.decompress_to_cuda(local_stream)
+                    })
+                    .collect();
+                let cuda_noise_squashing_compression_key_vec: Vec<
+                    CudaNoiseSquashingCompressionKey,
+                > = (0..num_gpus)
+                    .into_par_iter()
+                    .map(|i| {
+                        let local_stream = &local_streams[i % local_streams.len()];
+                        CudaNoiseSquashingCompressionKey::from_noise_squashing_compression_key(
+                            &noise_squashing_compression_key,
+                            local_stream,
+                        )
+                    })
+                    .collect();
+
+                // Benchmark
+                let builders = (0..elements)
+                    .into_par_iter()
+                    .map(|i| {
+                        let ct = cks.encrypt_radix(0_u32, num_blocks);
+                        let local_stream = &local_streams[i as usize % local_streams.len()];
+                        let d_ct =
+                            CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, local_stream);
+                        let cuda_noise_squashing_key =
+                            &cuda_compression_key_vec[(i as usize) % num_gpus];
+                        let cuda_noise_squashing_compression_key =
+                            &cuda_noise_squashing_compression_key_vec[(i as usize) % num_gpus];
+                        let d_ns_ct = cuda_noise_squashing_key
+                            .squash_radix_ciphertext_noise(
+                                &cuda_sks[(i as usize) % num_gpus],
+                                &d_ct.ciphertext,
+                                local_stream,
+                            )
+                            .unwrap();
+                        let mut builder = CudaCompressedSquashedNoiseCiphertextList::builder();
+                        builder.push(d_ns_ct, local_stream);
+
+                        (builder, cuda_noise_squashing_compression_key, local_stream)
+                    })
+                    .collect::<Vec<_>>();
+
+                let compressed = builders
+                    .into_par_iter()
+                    .map(
+                        |(builder, cuda_noise_squashing_compression_key, local_stream)| {
+                            builder.build(cuda_noise_squashing_compression_key, local_stream)
+                        },
+                    )
+                    .collect::<Vec<_>>();
+
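+                // Only `get` (decompression of a single entry) runs inside the
+                // timed closure; the compressed lists were built above.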
+                bench_id_unpack = format!("{bench_name}::throughput::unpack_u{bit_size}");
+                bench_group.bench_function(&bench_id_unpack, |b| {
+                    b.iter(|| {
+                        compressed.par_iter().enumerate().for_each(|(i, comp)| {
+                            let local_stream = &local_streams[i % local_streams.len()];
+
+                            comp.get::<CudaSquashedNoiseRadixCiphertext>(0, local_stream)
+                                .unwrap()
+                                .unwrap();
+                        })
+                    })
+                });
+            }
+        }
+
+        write_to_json::<u64, _>(
+            &bench_id_unpack,
+            (noise_squashing_compression_parameters, param.into()),
+            noise_squashing_compression_parameters.name(),
+            "unpack",
+            &OperatorType::Atomic,
+            bit_size as u32,
+            vec![param.message_modulus().0.ilog2(); num_blocks],
+        );
+
+        bench_group.finish()
+    }
+
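+    // Drivers: parameters and keys are created once, then reused for every
+    // bit size in the sweep.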
+    fn gpu_glwe_packing_128(c: &mut Criterion) {
+        let param = BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
+        let noise_squashing_compression_parameters =
+            BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
+        let noise_squashing_parameters =
+            BENCH_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
+
+        let log_message_modulus = param.message_modulus.0.ilog2() as usize;
+
+        let cks = ClientKey::new(param);
+
+        let noise_squashing_compression_private_key =
+            NoiseSquashingCompressionPrivateKey::new(noise_squashing_compression_parameters);
+        let noise_squashing_private_key = NoiseSquashingPrivateKey::new(noise_squashing_parameters);
+        let noise_squashing_compression_key = noise_squashing_private_key
+            .new_noise_squashing_compression_key(&noise_squashing_compression_private_key);
+
+        // Generate and convert compression keys
+        let compressed_noise_squashing_compression_key =
+            cks.new_compressed_noise_squashing_key(&noise_squashing_private_key);
+
+        let mut config = BenchConfig {
+            param: PBSParameters::PBS(param),
+            noise_squashing_compression_key,
+            noise_squashing_compression_parameters,
+            compressed_noise_squashing_compression_key,
+            bit_size: 0,
+            cks,
+        };
+        for bit_size in [
+            2,
+            8,
+            16,
+            32,
+            64,
+            128,
+            256,
+            noise_squashing_compression_parameters.lwe_per_glwe.0 * log_message_modulus,
+        ] {
+            config.bit_size = bit_size;
+            execute_gpu_glwe_packing_128(c, config.clone());
+        }
+    }
+
+    fn gpu_glwe_unpacking_128(c: &mut Criterion) {
+        let param = BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
+        let noise_squashing_compression_parameters =
+            BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
+        let noise_squashing_parameters =
+            BENCH_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
+
+        let log_message_modulus = param.message_modulus.0.ilog2() as usize;
+
+        let cks = ClientKey::new(param);
+
+        let noise_squashing_compression_private_key =
+            NoiseSquashingCompressionPrivateKey::new(noise_squashing_compression_parameters);
+        let noise_squashing_private_key = NoiseSquashingPrivateKey::new(noise_squashing_parameters);
+        let noise_squashing_compression_key = noise_squashing_private_key
+            .new_noise_squashing_compression_key(&noise_squashing_compression_private_key);
+
+        // Generate and convert compression keys
+        let compressed_noise_squashing_compression_key =
+            cks.new_compressed_noise_squashing_key(&noise_squashing_private_key);
+
+        let mut config = BenchConfig {
+            param: PBSParameters::PBS(param),
+            noise_squashing_compression_key,
+            noise_squashing_compression_parameters,
+            compressed_noise_squashing_compression_key,
+            bit_size: 0,
+            cks,
+        };
+        for bit_size in [
+            2,
+            8,
+            16,
+            32,
+            64,
+            128,
+            256,
+            noise_squashing_compression_parameters.lwe_per_glwe.0 * log_message_modulus,
+        ] {
+            config.bit_size = bit_size;
+            execute_gpu_glwe_unpacking_128(c, config.clone());
+        }
+    }
+
+    criterion_group!(gpu_glwe_packing_128_2, gpu_glwe_packing_128);
+    criterion_group!(gpu_glwe_unpacking_128_2, gpu_glwe_unpacking_128);
+}
+
+use criterion::Criterion;
+#[cfg(feature = "gpu")]
+use cuda::gpu_glwe_packing_128_2;
+#[cfg(feature = "gpu")]
+use cuda::gpu_glwe_unpacking_128_2;
+
+fn main() {
+    #[cfg(feature = "gpu")]
+    gpu_glwe_packing_128_2();
+    #[cfg(feature = "gpu")]
+    gpu_glwe_unpacking_128_2();
+    Criterion::default().configure_from_args().final_summary();
+}