diff --git a/tfhe-benchmark/benches/integer/zk_pke.rs b/tfhe-benchmark/benches/integer/zk_pke.rs
index d512ddb1a..b3c0c0854 100644
--- a/tfhe-benchmark/benches/integer/zk_pke.rs
+++ b/tfhe-benchmark/benches/integer/zk_pke.rs
@@ -421,8 +421,6 @@ mod cuda {
         .sample_size(15)
         .measurement_time(std::time::Duration::from_secs(60));
 
-    let streams = CudaStreams::new_multi_gpu();
-
     File::create(results_file).expect("create results file failed");
     let mut file = OpenOptions::new()
         .append(true)
@@ -439,17 +437,10 @@ mod cuda {
         let cks = ClientKey::new(param_fhe);
         let compressed_server_key = CompressedServerKey::new_radix_compressed_server_key(&cks);
         let sk = compressed_server_key.decompress();
-        let gpu_sks = CudaServerKey::decompress_from_cpu(&compressed_server_key, &streams);
 
         let compact_private_key = CompactPrivateKey::new(param_pke);
         let pk = CompactPublicKey::new(&compact_private_key);
         let ksk = KeySwitchingKey::new((&compact_private_key, None), (&cks, &sk), param_ksk);
-        let d_ksk_material =
-            CudaKeySwitchingKeyMaterial::from_key_switching_key(&ksk, &streams);
-        let d_ksk = CudaKeySwitchingKey::from_cuda_key_switching_key_material(
-            &d_ksk_material,
-            &gpu_sks,
-        );
 
         // We have a use case with 320 bits of metadata
         let mut metadata = [0u8; (320 / u8::BITS) as usize];
@@ -509,6 +500,18 @@ mod cuda {
 
         match get_bench_type() {
             BenchmarkType::Latency => {
+                let streams = CudaStreams::new_multi_gpu();
+                let gpu_sks = CudaServerKey::decompress_from_cpu(
+                    &compressed_server_key,
+                    &streams,
+                );
+                let d_ksk_material =
+                    CudaKeySwitchingKeyMaterial::from_key_switching_key(&ksk, &streams);
+                let d_ksk = CudaKeySwitchingKey::from_cuda_key_switching_key_material(
+                    &d_ksk_material,
+                    &gpu_sks,
+                );
+
                 bench_id_verify = format!(
                     "{bench_name}::{param_name}_{bits}_bits_packed_{zk_load}_ZK{zk_vers:?}"
                 );
@@ -599,9 +602,7 @@ mod cuda {
                 });
             }
             BenchmarkType::Throughput => {
-                let gpu_count = get_number_of_gpus() as usize;
-
-                let elements = zk_throughput_num_elements();
+                let elements = 100 * get_number_of_gpus() as u64; // This value, found empirically, ensures saturation of 8x H100 SXM5
                 bench_group.throughput(Throughput::Elements(elements));
                 bench_id_verify = format!(
@@ -636,8 +637,6 @@ mod cuda {
                     })
                     .collect::<Vec<_>>();
 
-                assert_eq!(d_ksk_material_vec.len(), gpu_count);
-
                 bench_group.bench_function(&bench_id_verify, |b| {
                     b.iter(|| {
                         cts.par_iter().for_each(|ct1| {
                 bench_group.bench_function(&bench_id_expand_without_verify, |b| {
                     let setup_encrypted_values = || {
-                        let local_streams = cuda_local_streams(num_block, elements as usize);
-
                         let gpu_cts = cts.iter().enumerate().map(|(i, ct)| {
+                            let local_stream = &local_streams[i % local_streams.len()];
                             CudaProvenCompactCiphertextList::from_proven_compact_ciphertext_list(
-                                ct, &local_streams[i],
+                                ct, local_stream,
                             )
                         }).collect_vec();
 
-                        (gpu_cts, local_streams)
+                        gpu_cts
                    };
 
                     b.iter_batched(setup_encrypted_values,
-                        |(gpu_cts, local_streams)| {
-                            gpu_cts.par_iter().zip(local_streams.par_iter()).enumerate().for_each
-                            (|(i, (gpu_ct, local_stream))| {
+                        |gpu_cts| {
+                            gpu_cts.par_iter().enumerate().for_each
+                            (|(i, gpu_ct)| {
+                                let local_stream = &local_streams[i % local_streams.len()];
+
+                                let gpu_sk = CudaServerKey::decompress_from_cpu(&compressed_server_key, local_stream);
                                 let d_ksk =
-                                    CudaKeySwitchingKey::from_cuda_key_switching_key_material(&d_ksk_material_vec[i % gpu_count], &gpu_sks);
+                                    CudaKeySwitchingKey::from_cuda_key_switching_key_material(&d_ksk_material_vec[i % local_streams.len()], &gpu_sk);
                                 gpu_ct
                                     .expand_without_verification(&d_ksk, local_stream)
@@ -675,21 +676,24 @@ mod cuda {
                 bench_group.bench_function(&bench_id_verify_and_expand, |b| {
                     let setup_encrypted_values = || {
-                        let local_streams = cuda_local_streams(num_block, elements as usize);
-
                         let gpu_cts = cts.iter().enumerate().map(|(i, ct)| {
                             CudaProvenCompactCiphertextList::from_proven_compact_ciphertext_list(
-                                ct, &local_streams[i],
+                                ct, &local_streams[i % local_streams.len()],
                             )
                         }).collect_vec();
 
-                        (gpu_cts, local_streams)
+                        gpu_cts
                     };
 
                     b.iter_batched(setup_encrypted_values,
-                        |(gpu_cts, local_streams)| {
-                            gpu_cts.par_iter().zip(local_streams.par_iter()).for_each
-                            (|(gpu_ct, local_stream)| {
+                        |gpu_cts| {
+                            gpu_cts.par_iter().enumerate().for_each
+                            (|(i, gpu_ct)| {
+                                let local_stream = &local_streams[i % local_streams.len()];
+                                let gpu_sk = CudaServerKey::decompress_from_cpu(&compressed_server_key, local_stream);
+                                let d_ksk =
+                                    CudaKeySwitchingKey::from_cuda_key_switching_key_material(&d_ksk_material_vec[i % local_streams.len()], &gpu_sk);
+                                gpu_ct
                                     .verify_and_expand(
                                         &crs, &pk, &metadata, &d_ksk, local_stream,
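Both throughput benches in the hunk above now share one pre-built `local_streams` vector and pick a stream by round-robin indexing (`i % local_streams.len()`), instead of constructing streams inside the batched setup and zipping them into the parallel iteration. A minimal sketch of that scheduling pattern, with a stand-in `Stream` type in place of tfhe-rs's `CudaStreams` and a made-up `gpu_count` (the real code queries `get_number_of_gpus` and builds streams with `cuda_local_streams`):

```rust
use rayon::prelude::*;

// Stand-in for a per-GPU stream handle; the real bench builds these once,
// outside the measured closure.
struct Stream {
    gpu_id: usize,
}

fn main() {
    let gpu_count: usize = 4; // assumption: number of visible devices
    let local_streams: Vec<Stream> = (0..gpu_count).map(|gpu_id| Stream { gpu_id }).collect();

    // Mirrors `elements = 100 * get_number_of_gpus()`: enough work items to
    // keep every stream busy at once.
    let work_items: Vec<u64> = (0..100 * gpu_count as u64).collect();

    work_items.par_iter().enumerate().for_each(|(i, item)| {
        // Item i always lands on stream i % stream_count, so the stream list
        // stays borrowed and never has to move into the closure.
        let stream = &local_streams[i % local_streams.len()];
        let _ = (item, stream.gpu_id); // the real bench expands a ciphertext here
    });
}
```

Indexing by `i % len` keeps the mapping stable between the setup and measurement phases, which is what lets the setup closure return only the ciphertexts rather than a `(ciphertexts, streams)` tuple.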
diff --git a/tfhe/src/integer/gpu/ciphertext/compact_list.rs b/tfhe/src/integer/gpu/ciphertext/compact_list.rs
index ed5ae6325..727b3448f 100644
--- a/tfhe/src/integer/gpu/ciphertext/compact_list.rs
+++ b/tfhe/src/integer/gpu/ciphertext/compact_list.rs
@@ -8,12 +8,10 @@ use crate::core_crypto::prelude::{
 use crate::integer::ciphertext::{CompactCiphertextListExpander, DataKind};
 use crate::integer::gpu::ciphertext::compressed_ciphertext_list::CudaExpandable;
 use crate::integer::gpu::ciphertext::info::{CudaBlockInfo, CudaRadixCiphertextInfo};
-use crate::integer::gpu::ciphertext::{
-    expand_async, CudaRadixCiphertext, CudaVec, KsType, LweDimension,
-};
+use crate::integer::gpu::ciphertext::{CudaRadixCiphertext, CudaVec, KsType, LweDimension};
 use crate::integer::gpu::key_switching_key::CudaKeySwitchingKey;
 use crate::integer::gpu::server_key::CudaBootstrappingKey;
-use crate::integer::gpu::PBSType;
+use crate::integer::gpu::{expand_async, PBSType};
 use crate::shortint::ciphertext::CompactCiphertextList;
 use crate::shortint::parameters::{
     CompactCiphertextListExpansionKind, Degree, LweBskGroupingFactor, NoiseLevel,
diff --git a/tfhe/src/integer/gpu/ciphertext/mod.rs b/tfhe/src/integer/gpu/ciphertext/mod.rs
index 80aa2d745..f880ab4c6 100644
--- a/tfhe/src/integer/gpu/ciphertext/mod.rs
+++ b/tfhe/src/integer/gpu/ciphertext/mod.rs
@@ -4,27 +4,15 @@ pub mod compressed_ciphertext_list;
 pub mod info;
 pub mod squashed_noise;
 
-use crate::core_crypto::gpu::lwe_bootstrap_key::{
-    prepare_cuda_ms_noise_reduction_key_ffi, CudaModulusSwitchNoiseReductionKey,
-};
 use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
 use crate::core_crypto::gpu::vec::CudaVec;
 use crate::core_crypto::gpu::CudaStreams;
-use crate::core_crypto::prelude::{
-    LweBskGroupingFactor, LweCiphertextList, LweCiphertextOwned, Numeric, UnsignedInteger,
-};
+use crate::core_crypto::prelude::{LweCiphertextList, LweCiphertextOwned};
 use crate::integer::gpu::ciphertext::info::{CudaBlockInfo, CudaRadixCiphertextInfo};
-use crate::integer::gpu::PBSType;
-use crate::integer::parameters::{
-    DecompositionBaseLog, DecompositionLevelCount, GlweDimension, LweDimension, PolynomialSize,
-};
+use crate::integer::parameters::LweDimension;
 use crate::integer::{IntegerCiphertext, RadixCiphertext, SignedRadixCiphertext};
-use crate::shortint::{CarryModulus, Ciphertext, EncryptionKeyChoice, MessageModulus};
+use crate::shortint::{Ciphertext, EncryptionKeyChoice};
 use crate::GpuIndex;
-use tfhe_cuda_backend::bindings::{
-    cleanup_expand_without_verification_64, cuda_expand_without_verification_64,
-    scratch_cuda_expand_without_verification_64,
-};
 
 pub trait CudaIntegerRadixCiphertext: Sized {
     const IS_SIGNED: bool;
@@ -527,100 +515,3 @@ impl From<EncryptionKeyChoice> for KsType {
         }
     }
 }
-
-#[allow(clippy::too_many_arguments)]
-/// # Safety
-///
-/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must not
-///   be dropped until stream is synchronised
-///
-///
-/// In this method, the input `lwe_flattened_compact_array_in` represents a flattened compact list.
-/// Instead of receiving a `Vec`, it takes a concatenation of all LWEs
-/// that were inside that vector of compact list. Handling the input this way removes the need
-/// to process multiple compact lists separately, simplifying GPU-based operations. The variable
-/// name `lwe_flattened_compact_array_in` makes this intent explicit.
-pub unsafe fn expand_async<T: UnsignedInteger, B: Numeric>(
-    streams: &CudaStreams,
-    lwe_array_out: &mut CudaLweCiphertextList<T>,
-    lwe_flattened_compact_array_in: &CudaVec<T>,
-    bootstrapping_key: &CudaVec<B>,
-    computing_ks_key: &CudaVec<T>,
-    casting_key: &CudaVec<T>,
-    message_modulus: MessageModulus,
-    carry_modulus: CarryModulus,
-    computing_glwe_dimension: GlweDimension,
-    computing_polynomial_size: PolynomialSize,
-    computing_lwe_dimension: LweDimension,
-    computing_ks_level: DecompositionLevelCount,
-    computing_ks_base_log: DecompositionBaseLog,
-    casting_input_lwe_dimension: LweDimension,
-    casting_output_lwe_dimension: LweDimension,
-    casting_ks_level: DecompositionLevelCount,
-    casting_ks_base_log: DecompositionBaseLog,
-    pbs_level: DecompositionLevelCount,
-    pbs_base_log: DecompositionBaseLog,
-    pbs_type: PBSType,
-    casting_key_type: KsType,
-    grouping_factor: LweBskGroupingFactor,
-    num_lwes_per_compact_list: &[u32],
-    is_boolean: &[bool],
-    noise_reduction_key: Option<&CudaModulusSwitchNoiseReductionKey>,
-) {
-    let ct_modulus = lwe_array_out.ciphertext_modulus().raw_modulus_float();
-    let mut mem_ptr: *mut i8 = std::ptr::null_mut();
-    let num_compact_lists = num_lwes_per_compact_list.len();
-
-    let ms_noise_reduction_key_ffi =
-        prepare_cuda_ms_noise_reduction_key_ffi(noise_reduction_key, ct_modulus);
-    let allocate_ms_noise_array = noise_reduction_key.is_some();
-
-    scratch_cuda_expand_without_verification_64(
-        streams.ptr.as_ptr(),
-        streams.gpu_indexes_ptr(),
-        streams.len() as u32,
-        std::ptr::addr_of_mut!(mem_ptr),
-        computing_glwe_dimension.0 as u32,
-        computing_polynomial_size.0 as u32,
-        computing_glwe_dimension
-            .to_equivalent_lwe_dimension(computing_polynomial_size)
-            .0 as u32,
-        computing_lwe_dimension.0 as u32,
-        computing_ks_level.0 as u32,
-        computing_ks_base_log.0 as u32,
-        casting_input_lwe_dimension.0 as u32,
-        casting_output_lwe_dimension.0 as u32,
-        casting_ks_level.0 as u32,
-        casting_ks_base_log.0 as u32,
-        pbs_level.0 as u32,
-        pbs_base_log.0 as u32,
-        grouping_factor.0 as u32,
-        num_lwes_per_compact_list.as_ptr(),
-        is_boolean.as_ptr(),
-        num_compact_lists as u32,
-        message_modulus.0 as u32,
-        carry_modulus.0 as u32,
-        pbs_type as u32,
-        casting_key_type as u32,
-        true,
-        allocate_ms_noise_array,
-    );
-    cuda_expand_without_verification_64(
-        streams.ptr.as_ptr(),
-        streams.gpu_indexes_ptr(),
-        streams.len() as u32,
-        lwe_array_out.0.d_vec.as_mut_c_ptr(0),
-        lwe_flattened_compact_array_in.as_c_ptr(0),
-        mem_ptr,
-        bootstrapping_key.ptr.as_ptr(),
-        computing_ks_key.ptr.as_ptr(),
-        casting_key.ptr.as_ptr(),
-        &raw const ms_noise_reduction_key_ffi,
-    );
-    cleanup_expand_without_verification_64(
-        streams.ptr.as_ptr(),
-        streams.gpu_indexes_ptr(),
-        streams.len() as u32,
-        std::ptr::addr_of_mut!(mem_ptr),
-    );
-}
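The function removed above (and re-added in `tfhe/src/integer/gpu/mod.rs` below) documents a flattened input layout: one concatenated buffer of LWEs plus a per-list count array, instead of a vector of separate compact lists. A CPU-side sketch of that layout, with plain `Vec<u64>` standing in for device buffers and made-up sizes:

```rust
fn main() {
    // Three compact lists, each already serialized to raw u64 words
    // (sizes are made up for the example).
    let lists: Vec<Vec<u64>> = vec![vec![1; 8], vec![2; 16], vec![3; 8]];
    let lwe_size = 4; // words per LWE, an assumption for the sketch

    // One entry per compact list, like `num_lwes_per_compact_list`.
    let num_lwes_per_compact_list: Vec<u32> =
        lists.iter().map(|l| (l.len() / lwe_size) as u32).collect();

    // The single buffer handed to the GPU: plain concatenation, so the
    // kernel indexes one array instead of walking a vector of lists.
    let flattened: Vec<u64> = lists.into_iter().flatten().collect();

    assert_eq!(num_lwes_per_compact_list, vec![2, 4, 2]);
    assert_eq!(flattened.len(), 32);
}
```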
diff --git a/tfhe/src/integer/gpu/mod.rs b/tfhe/src/integer/gpu/mod.rs
index b2b38339f..d58ef46b8 100644
--- a/tfhe/src/integer/gpu/mod.rs
+++ b/tfhe/src/integer/gpu/mod.rs
@@ -10,6 +10,7 @@ pub mod zk;
 use crate::core_crypto::gpu::lwe_bootstrap_key::{
     prepare_cuda_ms_noise_reduction_key_ffi, CudaModulusSwitchNoiseReductionKey,
 };
+use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
 use crate::core_crypto::gpu::slice::{CudaSlice, CudaSliceMut};
 use crate::core_crypto::gpu::vec::CudaVec;
 use crate::core_crypto::gpu::CudaStreams;
@@ -19,7 +20,7 @@ use crate::core_crypto::prelude::{
 };
 use crate::integer::block_decomposition::{BlockDecomposer, DecomposableInto};
 use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
-use crate::integer::gpu::ciphertext::CudaRadixCiphertext;
+use crate::integer::gpu::ciphertext::{CudaRadixCiphertext, KsType};
 use crate::integer::server_key::radix_parallel::OutputFlag;
 use crate::integer::server_key::ScalarMultiplier;
 use crate::integer::{ClientKey, RadixClientKey};
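The `expand_async` re-added below opens with a series of `assert_eq!` checks that pin every device buffer to the GPU of the first stream before any FFI call is made. A distilled version of that consistency check; the `GpuIndex` newtype and `check_same_gpu` helper here are illustrative stand-ins, not the tfhe-rs API:

```rust
// Illustrative only: tfhe-rs has its own `GpuIndex`; this distills the
// shape of the checks at the top of `expand_async`.
#[derive(Clone, Copy, Debug, PartialEq)]
struct GpuIndex(u32);

fn check_same_gpu(first_stream: GpuIndex, buffers: &[(&str, GpuIndex)]) {
    for (name, gpu) in buffers {
        assert_eq!(
            first_stream, *gpu,
            "GPU error: first stream is on GPU {}, {} is on GPU {}",
            first_stream.0, name, gpu.0
        );
    }
}

fn main() {
    let first_stream = GpuIndex(0);
    // Every device buffer the FFI call will touch must already live on
    // the same GPU as the first stream.
    check_same_gpu(
        first_stream,
        &[
            ("output", GpuIndex(0)),
            ("flattened input", GpuIndex(0)),
            ("bootstrapping key", GpuIndex(0)),
            ("computing ksk", GpuIndex(0)),
            ("casting key", GpuIndex(0)),
        ],
    );
}
```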
@@ -6643,3 +6644,135 @@ pub unsafe fn noise_squashing_async(
         std::ptr::addr_of_mut!(mem_ptr),
     );
 }
+
+#[allow(clippy::too_many_arguments)]
+/// # Safety
+///
+/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
+///   not be dropped until the streams are synchronized
+///
+/// In this method, the input `lwe_flattened_compact_array_in` represents a flattened compact list.
+/// Instead of receiving a `Vec`, it takes a concatenation of all LWEs
+/// that were inside that vector of compact lists. Handling the input this way removes the need
+/// to process multiple compact lists separately, simplifying GPU-based operations. The variable
+/// name `lwe_flattened_compact_array_in` makes this intent explicit.
+pub unsafe fn expand_async<T: UnsignedInteger, B: Numeric>(
+    streams: &CudaStreams,
+    lwe_array_out: &mut CudaLweCiphertextList<T>,
+    lwe_flattened_compact_array_in: &CudaVec<T>,
+    bootstrapping_key: &CudaVec<B>,
+    computing_ks_key: &CudaVec<T>,
+    casting_key: &CudaVec<T>,
+    message_modulus: MessageModulus,
+    carry_modulus: CarryModulus,
+    computing_glwe_dimension: GlweDimension,
+    computing_polynomial_size: PolynomialSize,
+    computing_lwe_dimension: LweDimension,
+    computing_ks_level: DecompositionLevelCount,
+    computing_ks_base_log: DecompositionBaseLog,
+    casting_input_lwe_dimension: LweDimension,
+    casting_output_lwe_dimension: LweDimension,
+    casting_ks_level: DecompositionLevelCount,
+    casting_ks_base_log: DecompositionBaseLog,
+    pbs_level: DecompositionLevelCount,
+    pbs_base_log: DecompositionBaseLog,
+    pbs_type: PBSType,
+    casting_key_type: KsType,
+    grouping_factor: LweBskGroupingFactor,
+    num_lwes_per_compact_list: &[u32],
+    is_boolean: &[bool],
+    noise_reduction_key: Option<&CudaModulusSwitchNoiseReductionKey>,
+) {
+    assert_eq!(
+        streams.gpu_indexes[0],
+        lwe_array_out.0.d_vec.gpu_index(0),
+        "GPU error: first stream is on GPU {}, first output pointer is on GPU {}",
+        streams.gpu_indexes[0].get(),
+        lwe_array_out.0.d_vec.gpu_index(0).get(),
+    );
+    assert_eq!(
+        streams.gpu_indexes[0],
+        lwe_flattened_compact_array_in.gpu_index(0),
+        "GPU error: first stream is on GPU {}, first input pointer is on GPU {}",
+        streams.gpu_indexes[0].get(),
+        lwe_flattened_compact_array_in.gpu_index(0).get(),
+    );
+    assert_eq!(
+        streams.gpu_indexes[0],
+        bootstrapping_key.gpu_indexes[0],
+        "GPU error: first stream is on GPU {}, bootstrapping key is on GPU {}",
+        streams.gpu_indexes[0].get(),
+        bootstrapping_key.gpu_indexes[0].get(),
+    );
+    assert_eq!(
+        streams.gpu_indexes[0],
+        computing_ks_key.gpu_indexes[0],
+        "GPU error: first stream is on GPU {}, computing key switching key is on GPU {}",
+        streams.gpu_indexes[0].get(),
+        computing_ks_key.gpu_indexes[0].get(),
+    );
+    assert_eq!(
+        streams.gpu_indexes[0],
+        casting_key.gpu_indexes[0],
+        "GPU error: first stream is on GPU {}, casting key is on GPU {}",
+        streams.gpu_indexes[0].get(),
+        casting_key.gpu_indexes[0].get(),
+    );
+    let ct_modulus = lwe_array_out.ciphertext_modulus().raw_modulus_float();
+    let mut mem_ptr: *mut i8 = std::ptr::null_mut();
+    let num_compact_lists = num_lwes_per_compact_list.len();
+
+    let ms_noise_reduction_key_ffi =
+        prepare_cuda_ms_noise_reduction_key_ffi(noise_reduction_key, ct_modulus);
+    let allocate_ms_noise_array = noise_reduction_key.is_some();
+
+    scratch_cuda_expand_without_verification_64(
+        streams.ptr.as_ptr(),
+        streams.gpu_indexes_ptr(),
+        streams.len() as u32,
+        std::ptr::addr_of_mut!(mem_ptr),
+        computing_glwe_dimension.0 as u32,
+        computing_polynomial_size.0 as u32,
+        computing_glwe_dimension
+            .to_equivalent_lwe_dimension(computing_polynomial_size)
+            .0 as u32,
+        computing_lwe_dimension.0 as u32,
+        computing_ks_level.0 as u32,
+        computing_ks_base_log.0 as u32,
+        casting_input_lwe_dimension.0 as u32,
+        casting_output_lwe_dimension.0 as u32,
+        casting_ks_level.0 as u32,
+        casting_ks_base_log.0 as u32,
+        pbs_level.0 as u32,
+        pbs_base_log.0 as u32,
+        grouping_factor.0 as u32,
+        num_lwes_per_compact_list.as_ptr(),
+        is_boolean.as_ptr(),
+        num_compact_lists as u32,
+        message_modulus.0 as u32,
+        carry_modulus.0 as u32,
+        pbs_type as u32,
+        casting_key_type as u32,
+        true,
+        allocate_ms_noise_array,
+    );
+    cuda_expand_without_verification_64(
+        streams.ptr.as_ptr(),
+        streams.gpu_indexes_ptr(),
+        streams.len() as u32,
+        lwe_array_out.0.d_vec.as_mut_c_ptr(0),
+        lwe_flattened_compact_array_in.as_c_ptr(0),
+        mem_ptr,
+        bootstrapping_key.ptr.as_ptr(),
+        computing_ks_key.ptr.as_ptr(),
+        casting_key.ptr.as_ptr(),
+        &raw const ms_noise_reduction_key_ffi,
+    );
+    cleanup_expand_without_verification_64(
+        streams.ptr.as_ptr(),
+        streams.gpu_indexes_ptr(),
+        streams.len() as u32,
+        std::ptr::addr_of_mut!(mem_ptr),
+    );
+}
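As elsewhere in `integer/gpu/mod.rs`, the call sequence above is scratch, launch, cleanup: the `scratch_*` binding allocates temporary device memory through `mem_ptr`, the `cuda_*` binding consumes it, and the `cleanup_*` binding frees it. A self-contained sketch of that contract, with placeholder functions standing in for the real `tfhe_cuda_backend` bindings:

```rust
// Stand-in for `scratch_cuda_expand_without_verification_64`: allocates
// temporary memory and writes the handle through `mem_ptr`.
unsafe fn scratch(mem_ptr: *mut *mut i8) {
    *mem_ptr = Box::into_raw(Box::new(0i8));
}

// Stand-in for `cuda_expand_without_verification_64`: the launch that
// uses the scratch buffer.
unsafe fn launch(mem: *mut i8) {
    assert!(!mem.is_null(), "scratch must run before the launch");
}

// Stand-in for `cleanup_expand_without_verification_64`: frees the
// buffer and nulls the handle so a double cleanup is detectable.
unsafe fn cleanup(mem_ptr: *mut *mut i8) {
    drop(Box::from_raw(*mem_ptr));
    *mem_ptr = std::ptr::null_mut();
}

fn main() {
    let mut mem_ptr: *mut i8 = std::ptr::null_mut();
    unsafe {
        scratch(std::ptr::addr_of_mut!(mem_ptr));
        launch(mem_ptr);
        cleanup(std::ptr::addr_of_mut!(mem_ptr));
    }
    assert!(mem_ptr.is_null());
}
```

Keeping the three phases adjacent in one function, as `expand_async` does, is what makes the temporary allocation's lifetime easy to audit.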