diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression.h b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression.h
index e4c6269e0..297516cbb 100644
--- a/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression.h
@@ -65,6 +65,16 @@ void cleanup_cuda_integer_compress_radix_ciphertext_128(CudaStreamsFFI streams,
 void cleanup_cuda_integer_decompress_radix_ciphertext_128(
     CudaStreamsFFI streams, int8_t **mem_ptr_void);
+
+void cuda_integer_extract_glwe_128(
+    CudaStreamsFFI streams, void *glwe_array_out,
+    CudaPackedGlweCiphertextListFFI const *glwe_list,
+    uint32_t const glwe_index);
+
+void cuda_integer_extract_glwe_64(
+    CudaStreamsFFI streams, void *glwe_array_out,
+    CudaPackedGlweCiphertextListFFI const *glwe_list,
+    uint32_t const glwe_index);
 }
 #endif
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu
index 2c044a7af..7d2ffc123 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu
@@ -155,3 +155,24 @@ void cleanup_cuda_integer_decompress_radix_ciphertext_128(
   delete mem_ptr;
   *mem_ptr_void = nullptr;
 }
+
+void cuda_integer_extract_glwe_128(
+    CudaStreamsFFI streams, void *glwe_array_out,
+    CudaPackedGlweCiphertextListFFI const *glwe_list,
+    uint32_t const glwe_index) {
+
+  CudaStreams _streams = CudaStreams(streams);
+  host_extract<__uint128_t>(_streams.stream(0), _streams.gpu_index(0),
+                            (__uint128_t *)glwe_array_out, glwe_list,
+                            glwe_index);
+}
+
+void cuda_integer_extract_glwe_64(
+    CudaStreamsFFI streams, void *glwe_array_out,
+    CudaPackedGlweCiphertextListFFI const *glwe_list,
+    uint32_t const glwe_index) {
+
+  CudaStreams _streams = CudaStreams(streams);
+  host_extract<__uint64_t>(_streams.stream(0), _streams.gpu_index(0),
+                           (__uint64_t *)glwe_array_out, glwe_list, glwe_index);
+}
diff --git a/backends/tfhe-cuda-backend/src/bindings.rs b/backends/tfhe-cuda-backend/src/bindings.rs
index d58512a03..94f46e7b8 100644
--- a/backends/tfhe-cuda-backend/src/bindings.rs
+++ b/backends/tfhe-cuda-backend/src/bindings.rs
@@ -2349,6 +2349,22 @@ unsafe extern "C" {
         mem_ptr_void: *mut *mut i8,
     );
 }
+unsafe extern "C" {
+    pub fn cuda_integer_extract_glwe_128(
+        streams: CudaStreamsFFI,
+        glwe_array_out: *mut ffi::c_void,
+        glwe_list: *const CudaPackedGlweCiphertextListFFI,
+        glwe_index: u32,
+    );
+}
+unsafe extern "C" {
+    pub fn cuda_integer_extract_glwe_64(
+        streams: CudaStreamsFFI,
+        glwe_array_out: *mut ffi::c_void,
+        glwe_list: *const CudaPackedGlweCiphertextListFFI,
+        glwe_index: u32,
+    );
+}
 unsafe extern "C" {
     pub fn scratch_cuda_rerand_64(
         streams: CudaStreamsFFI,
diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs
index 9d8743cca..eaf188460 100644
--- a/tfhe/src/core_crypto/gpu/mod.rs
+++ b/tfhe/src/core_crypto/gpu/mod.rs
@@ -877,7 +877,7 @@ pub fn cuda_modulus_switch_ciphertext(
     Scalar: UnsignedInteger,
 {
     unsafe {
-        cuda_modulus_switch_ciphertext_async(streams, output_lwe_ciphertext, log_modulus);
+        cuda_modulus_switch_ciphertext_async(streams, &mut *output_lwe_ciphertext, log_modulus);
     }
     streams.synchronize();
 }
diff --git a/tfhe/src/integer/gpu/list_compression/server_keys.rs b/tfhe/src/integer/gpu/list_compression/server_keys.rs
index 6d3033eab..421f81903 100644
--- a/tfhe/src/integer/gpu/list_compression/server_keys.rs
+++ b/tfhe/src/integer/gpu/list_compression/server_keys.rs
@@ -1,4 +1,5 @@
 use crate::core_crypto::gpu::entities::lwe_packing_keyswitch_key::CudaLwePackingKeyswitchKey;
+use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
 use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
 use crate::core_crypto::gpu::vec::CudaVec;
 use crate::core_crypto::gpu::CudaStreams;
@@ -16,7 +17,8 @@ use crate::integer::gpu::ciphertext::CudaRadixCiphertext;
 use crate::integer::gpu::server_key::CudaBootstrappingKey;
 use crate::integer::gpu::{
     cuda_backend_compress, cuda_backend_decompress, cuda_backend_get_compression_size_on_gpu,
-    cuda_backend_get_decompression_size_on_gpu, cuda_memcpy_async_gpu_to_gpu, PBSType,
+    cuda_backend_get_decompression_size_on_gpu, cuda_memcpy_async_gpu_to_gpu, extract_glwe_async,
+    PBSType,
 };
 use crate::prelude::CastInto;
 use crate::shortint::ciphertext::{
@@ -197,6 +199,30 @@ impl<T: UnsignedInteger> CudaPackedGlweCiphertextList<T> {
             meta: self.meta,
         }
     }
+    pub fn extract_glwe(
+        &self,
+        glwe_index: usize,
+        streams: &CudaStreams,
+    ) -> CudaGlweCiphertextList<T> {
+        let meta = self
+            .meta
+            .as_ref()
+            .expect("CudaPackedGlweCiphertextList meta must be set to extract GLWE");
+
+        let mut output_cuda_glwe_list = CudaGlweCiphertextList::new(
+            meta.glwe_dimension,
+            meta.polynomial_size,
+            GlweCiphertextCount(1),
+            meta.ciphertext_modulus,
+            streams,
+        );
+
+        unsafe {
+            extract_glwe_async(streams, &mut output_cuda_glwe_list, self, glwe_index as u32);
+        }
+        streams.synchronize();
+        output_cuda_glwe_list
+    }
 }
 
 impl<T: UnsignedInteger> Clone for CudaPackedGlweCiphertextList<T> {
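Note (not part of the patch): the new accessor composes with the existing GPU compression API roughly as follows. This is a sketch under the assumption that `cuda_compression_key`, `compression_inputs`, and `streams` are set up as in the tests added further down in this diff.

```rust
// Assumed context (as in the tests below): `cuda_compression_key` is a
// CudaCompressionKey, `compression_inputs` a Vec<CudaRadixCiphertext>, and
// `streams` a CudaStreams.
let packed = cuda_compression_key.compress_ciphertexts_into_list(&compression_inputs, &streams);

// New in this diff: pull packed GLWE #0 back out as a standalone, device-side
// one-element list; extract_glwe synchronizes the streams before returning.
let extracted = packed.extract_glwe(0, &streams);

// Copy the result to the CPU for inspection or comparison.
let extracted_cpu = extracted.to_glwe_ciphertext_list(&streams);
```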
diff --git a/tfhe/src/integer/gpu/mod.rs b/tfhe/src/integer/gpu/mod.rs
index 14bdd6508..70898eb38 100644
--- a/tfhe/src/integer/gpu/mod.rs
+++ b/tfhe/src/integer/gpu/mod.rs
@@ -7,6 +7,7 @@ pub mod server_key;
 #[cfg(feature = "zk-pok")]
 pub mod zk;
 
+use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
 use crate::core_crypto::gpu::lwe_bootstrap_key::CudaModulusSwitchNoiseReductionConfiguration;
 use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
 use crate::core_crypto::gpu::lwe_compact_ciphertext_list::CudaLweCompactCiphertextList;
@@ -10423,3 +10424,44 @@ pub unsafe fn unchecked_small_scalar_mul_integer_async(
         carry_modulus.0 as u32,
     );
 }
+#[allow(clippy::too_many_arguments)]
+/// # Safety
+///
+/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
+///   is required
+pub unsafe fn extract_glwe_async<T: UnsignedInteger>(
+    streams: &CudaStreams,
+    glwe_array_out: &mut CudaGlweCiphertextList<T>,
+    glwe_list: &CudaPackedGlweCiphertextList<T>,
+    glwe_index: u32,
+) {
+    assert_eq!(
+        streams.gpu_indexes[0],
+        glwe_array_out.0.d_vec.gpu_index(0),
+        "GPU error: all data should reside on the same GPU."
+    );
+    assert_eq!(
+        streams.gpu_indexes[0],
+        glwe_list.data.gpu_index(0),
+        "GPU error: all data should reside on the same GPU."
+    );
+    let packed_glwe_list_ffi = prepare_cuda_packed_glwe_ct_ffi(glwe_list);
+
+    if T::BITS == 128 {
+        cuda_integer_extract_glwe_128(
+            streams.ffi(),
+            glwe_array_out.0.d_vec.as_mut_c_ptr(0),
+            &raw const packed_glwe_list_ffi,
+            glwe_index,
+        );
+    } else if T::BITS == 64 {
+        cuda_integer_extract_glwe_64(
+            streams.ffi(),
+            glwe_array_out.0.d_vec.as_mut_c_ptr(0),
+            &raw const packed_glwe_list_ffi,
+            glwe_index,
+        );
+    } else {
+        panic!("Unsupported integer size for CUDA GLWE extraction");
+    }
+}
diff --git a/tfhe/src/integer/gpu/server_key/radix/tests_noise_distribution/br_dp_packingks_ms.rs b/tfhe/src/integer/gpu/server_key/radix/tests_noise_distribution/br_dp_packingks_ms.rs
new file mode 100644
index 000000000..8094e80d3
--- /dev/null
+++ b/tfhe/src/integer/gpu/server_key/radix/tests_noise_distribution/br_dp_packingks_ms.rs
@@ -0,0 +1,756 @@
+use super::utils::noise_simulation::{CudaDynLwe, CudaSideResources};
+use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
+use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
+use crate::core_crypto::gpu::CudaStreams;
+use crate::core_crypto::prelude::{GlweCiphertext, LweCiphertext};
+use crate::integer::compression_keys::CompressionPrivateKeys;
+use crate::integer::gpu::list_compression::server_keys::CudaCompressionKey;
+use crate::integer::gpu::server_key::radix::tests_noise_distribution::utils::noise_simulation::cuda_glwe_list_to_glwe_ciphertext;
+use crate::integer::gpu::server_key::radix::tests_unsigned::create_gpu_parameterized_test;
+use crate::integer::gpu::server_key::radix::CudaUnsignedRadixCiphertext;
+use crate::integer::gpu::CudaServerKey;
+use crate::integer::{ClientKey, CompressedServerKey, IntegerCiphertext};
+use crate::shortint::ciphertext::{Ciphertext, Degree, NoiseLevel};
+use crate::shortint::client_key::atomic_pattern::AtomicPatternClientKey;
+use crate::shortint::engine::ShortintEngine;
+use crate::shortint::parameters::test_params::TEST_META_PARAM_CPU_2_2_KS_PBS_PKE_TO_SMALL_ZKV2_TUNIFORM_2M128;
+use crate::shortint::parameters::{CompressionParameters, MetaParameters, Variance};
+use crate::shortint::server_key::tests::noise_distribution::br_dp_packingks_ms::br_dp_packing_ks_ms;
+use crate::shortint::server_key::tests::noise_distribution::utils::noise_simulation::{
+    NoiseSimulationGlwe, NoiseSimulationLwe, NoiseSimulationLweFourierBsk,
+    NoiseSimulationLwePackingKeyswitchKey, NoiseSimulationModulus,
+};
+use crate::shortint::server_key::tests::noise_distribution::utils::{
+    expected_pfail_for_precision, mean_and_variance_check, normality_check, pfail_check,
+    precision_with_padding, update_ap_params_msg_and_carry_moduli, DecryptionAndNoiseResult,
+    NoiseSample, PfailAndPrecision, PfailTestMeta, PfailTestResult,
+};
+use crate::shortint::server_key::tests::noise_distribution::{
+    should_run_short_pfail_tests_debug, should_use_single_key_debug,
+};
+use crate::shortint::{
+    AtomicPatternParameters, CarryModulus, MessageModulus, ShortintEncoding, ShortintParameterSet,
+};
+use crate::GpuIndex;
+use rayon::iter::{IntoParallelIterator, ParallelIterator};
+
+fn sanity_check_encrypt_br_dp_packing_ks_ms(meta_params: MetaParameters) {
+    let (params, comp_params) = (
+        meta_params.compute_parameters,
+        meta_params.compression_parameters.unwrap(),
+    );
+    let gpu_index = 0;
+    let streams = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
+
+    let block_params: ShortintParameterSet = params.into();
+    let cks = crate::integer::ClientKey::new(block_params);
+    let compressed_server_key = CompressedServerKey::new_radix_compressed_server_key(&cks);
+    let cuda_sks = CudaServerKey::decompress_from_cpu(&compressed_server_key, &streams);
+
+    let private_compression_key = cks.new_compression_private_key(comp_params);
+    let (compressed_compression_key, _compressed_decompression_key) =
+        cks.new_compressed_compression_decompression_keys(&private_compression_key);
+    let cuda_compression_key = compressed_compression_key.decompress_to_cuda(&streams);
+    let lwe_per_glwe = cuda_compression_key.lwe_per_glwe;
+    // The multiplication done in the compression is made to move the message up at the top of the
+    // carry space, multiplying by the carry modulus achieves that
+    let dp_scalar = params.carry_modulus().0;
+    let br_input_modulus_log = cuda_sks.br_input_modulus_log();
+    let storage_modulus_log = cuda_compression_key.storage_log_modulus;
+
+    let id_lut = cuda_sks.generate_lookup_table(|x| x);
+    let d_accumulator = CudaGlweCiphertextList::from_glwe_ciphertext(&id_lut.acc, &streams);
+
+    let input_zeros: Vec<_> = (0..lwe_per_glwe.0)
+        .map(|_| {
+            cks.key
+                .encrypt_noiseless_pbs_input_dyn_lwe(br_input_modulus_log, 0)
+        })
+        .collect();
+    let d_input_zeros: Vec<_> = input_zeros
+        .iter()
+        .map(|ct| {
+            let d_ct_input = CudaLweCiphertextList::from_lwe_ciphertext(&ct.as_lwe_64(), &streams);
+            CudaDynLwe::U64(d_ct_input)
+        })
+        .collect();
+
+    let cuda_block_info = crate::integer::gpu::ciphertext::info::CudaBlockInfo {
+        degree: crate::shortint::ciphertext::Degree::new(params.message_modulus().0 - 1),
+        message_modulus: params.message_modulus(),
+        carry_modulus: params.carry_modulus(),
+        atomic_pattern: params.atomic_pattern(),
+        noise_level: crate::shortint::parameters::NoiseLevel::NOMINAL,
+    };
+    let mut cuda_side_resources: Vec<CudaSideResources> = (0..input_zeros.len())
+        .map(|_| CudaSideResources::new(&streams, cuda_block_info))
+        .collect();
+
+    let (d_before_packing, _after_packing, d_after_ms) = br_dp_packing_ks_ms(
+        d_input_zeros,
+        &cuda_sks,
+        &d_accumulator,
+        dp_scalar,
+        &cuda_compression_key.packing_key_switching_key,
+        storage_modulus_log,
+        &mut cuda_side_resources,
+    );
+
+    let compression_inputs: Vec<_> = d_before_packing
+        .into_iter()
+        .map(|(_input, pbs_result, _dp_result)| {
+            let pbs_result_list_cpu = pbs_result.as_lwe_64().to_lwe_ciphertext_list(&streams);
+            let pbs_result_cpu = LweCiphertext::from_container(
+                pbs_result_list_cpu.clone().into_container(),
+                pbs_result_list_cpu.ciphertext_modulus(),
+            );
+            let cpu_ct = Ciphertext::new(
+                pbs_result_cpu,
+                Degree::new(params.message_modulus().0 - 1),
+                NoiseLevel::NOMINAL,
+                params.message_modulus(),
+                params.carry_modulus(),
+                params.atomic_pattern(),
+            );
+            let radix_ct = crate::integer::RadixCiphertext::from_blocks(vec![cpu_ct]);
+            let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&radix_ct, &streams);
+            d_ct.ciphertext
+        })
+        .collect();
+
+    let gpu_compressed =
+        cuda_compression_key.compress_ciphertexts_into_list(&compression_inputs, &streams);
+
+    let gpu_extracted = gpu_compressed.extract_glwe(0, &streams);
+    let extracted_list = gpu_extracted.to_glwe_ciphertext_list(&streams);
+    let extracted_glwe = GlweCiphertext::from_container(
+        extracted_list.clone().into_container(),
+        extracted_list.polynomial_size(),
+        extracted_list.ciphertext_modulus(),
+    );
+    let after_ms_list = d_after_ms.to_glwe_ciphertext_list(&streams);
+    let mut after_ms = GlweCiphertext::from_container(
+        after_ms_list.clone().into_container(),
+        after_ms_list.polynomial_size(),
+        after_ms_list.ciphertext_modulus(),
+    );
+    // Bodies that were not filled by the packing keyswitch are zeroed so the comparison below
+    // only covers the occupied slots
+    after_ms.get_mut_body().as_mut()[lwe_per_glwe.0..].fill(0);
+
+    assert_eq!(after_ms.as_view(), extracted_glwe.as_view());
+}
+
+create_gpu_parameterized_test!(sanity_check_encrypt_br_dp_packing_ks_ms {
+    TEST_META_PARAM_CPU_2_2_KS_PBS_PKE_TO_SMALL_ZKV2_TUNIFORM_2M128,
+});
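Aside (not part of the patch): the `dp_scalar` comment above is easiest to see with the 2_2 parameters these tests run on. A hedged numeric illustration, with values chosen for the example rather than read from the parameter set:

```rust
// With message_modulus = 4 and carry_modulus = 4, the cleartext space holds
// 4 * 4 = 16 values and the message occupies the low 2 bits. Multiplying by
// dp_scalar = carry_modulus = 4 shifts the message 2 bits up, to the top of
// the carry space, which is where the compression storage expects it.
let message_modulus: u64 = 4;
let carry_modulus: u64 = 4;
let dp_scalar = carry_modulus;

let msg = 3u64; // 0b11
assert_eq!(msg * dp_scalar, 12); // 0b1100: message bits now sit on top
assert!(msg * dp_scalar < message_modulus * carry_modulus); // still in range
```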
+#[allow(clippy::type_complexity, clippy::too_many_arguments)]
+fn encrypt_br_dp_packing_ks_ms_inner_helper_gpu(
+    params: AtomicPatternParameters,
+    comp_params: CompressionParameters,
+    single_cks: &ClientKey,
+    single_cuda_sks: &CudaServerKey,
+    single_compression_private_key: &CompressionPrivateKeys,
+    single_cuda_compression_key: &CudaCompressionKey,
+    msg: u64,
+    streams: &CudaStreams,
+) -> (
+    Vec<(
+        DecryptionAndNoiseResult,
+        DecryptionAndNoiseResult,
+        DecryptionAndNoiseResult,
+    )>,
+    Vec<DecryptionAndNoiseResult>,
+    Vec<DecryptionAndNoiseResult>,
+) {
+    let mut engine = ShortintEngine::new();
+    let thread_cks: crate::integer::ClientKey;
+    let thread_cuda_sks: CudaServerKey;
+    let thread_compression_private_key;
+    let thread_cuda_compression_key;
+    let (cks, cuda_sks, compression_private_key, cuda_compression_key) =
+        if should_use_single_key_debug() {
+            (
+                single_cks,
+                single_cuda_sks,
+                single_compression_private_key,
+                single_cuda_compression_key,
+            )
+        } else {
+            let block_params: ShortintParameterSet = params.into();
+            thread_cks = crate::integer::ClientKey::new(block_params);
+            let compressed_server_key =
+                CompressedServerKey::new_radix_compressed_server_key(&thread_cks);
+            thread_cuda_sks = CudaServerKey::decompress_from_cpu(&compressed_server_key, streams);
+
+            thread_compression_private_key = thread_cks.new_compression_private_key(comp_params);
+            let (compressed_compression_key, _compressed_decompression_key) = thread_cks
+                .new_compressed_compression_decompression_keys(&thread_compression_private_key);
+            thread_cuda_compression_key = compressed_compression_key.decompress_to_cuda(streams);
+
+            (
+                &thread_cks,
+                &thread_cuda_sks,
+                &thread_compression_private_key,
+                &thread_cuda_compression_key,
+            )
+        };
+    let br_input_modulus_log = cuda_sks.br_input_modulus_log();
+    let lwe_per_glwe = cuda_compression_key.lwe_per_glwe;
+
+    let input_zeros: Vec<_> = (0..lwe_per_glwe.0)
+        .map(|_| {
+            cks.key.encrypt_noiseless_pbs_input_dyn_lwe_with_engine(
+                br_input_modulus_log,
+                msg,
+                &mut engine,
+            )
+        })
+        .collect();
+
+    let d_input_zeros: Vec<_> = input_zeros
+        .iter()
+        .map(|ct| {
+            let d_ct_input = CudaLweCiphertextList::from_lwe_ciphertext(&ct.as_lwe_64(), streams);
+            CudaDynLwe::U64(d_ct_input)
+        })
+        .collect();
+
+    let id_lut = cuda_sks.generate_lookup_table(|x| x);
+    let d_accumulator = CudaGlweCiphertextList::from_glwe_ciphertext(&id_lut.acc, streams);
+
+    let cuda_block_info = crate::integer::gpu::ciphertext::info::CudaBlockInfo {
+        degree: crate::shortint::ciphertext::Degree::new(params.message_modulus().0 - 1),
+        message_modulus: params.message_modulus(),
+        carry_modulus: params.carry_modulus(),
+        atomic_pattern: params.atomic_pattern(),
+        noise_level: crate::shortint::parameters::NoiseLevel::NOMINAL,
+    };
+    let mut cuda_side_resources: Vec<CudaSideResources> = (0..input_zeros.len())
+        .map(|_| CudaSideResources::new(streams, cuda_block_info))
+        .collect();
+
+    let dp_scalar = params.carry_modulus().0;
+    let storage_modulus_log = cuda_compression_key.storage_log_modulus;
+
+    let (d_before_packing, d_after_packing, d_after_ms) = br_dp_packing_ks_ms(
+        d_input_zeros,
+        cuda_sks,
+        &d_accumulator,
+        dp_scalar,
+        &cuda_compression_key.packing_key_switching_key,
+        storage_modulus_log,
+        &mut cuda_side_resources,
+    );
+
+    let compute_large_lwe_secret_key = cks.key.encryption_key();
+    let compression_glwe_secret_key = &compression_private_key.key.post_packing_ks_key;
+
+    let compute_encoding = cuda_sks.encoding();
+    let compression_encoding = ShortintEncoding {
+        carry_modulus: CarryModulus(1),
+        ..compute_encoding
+    };
+    let after_packing = cuda_glwe_list_to_glwe_ciphertext(&d_after_packing, streams);
+    let after_ms = cuda_glwe_list_to_glwe_ciphertext(&d_after_ms, streams);
+    (
+        d_before_packing
+            .into_iter()
+            .map(|(d_input, d_pbs_result, d_dp_result)| {
+                let input = d_input.as_ct_64_cpu(streams);
+                let pbs_result = d_pbs_result.as_ct_64_cpu(streams);
+                let dp_result = d_dp_result.as_ct_64_cpu(streams);
+                (
+                    match &cks.key.atomic_pattern {
+                        AtomicPatternClientKey::Standard(standard_atomic_pattern_client_key) => {
+                            DecryptionAndNoiseResult::new_from_lwe(
+                                &input,
+                                &standard_atomic_pattern_client_key.lwe_secret_key,
+                                msg,
+                                &compute_encoding,
+                            )
+                        }
+                        AtomicPatternClientKey::KeySwitch32(_ks32_atomic_pattern_client_key) => {
+                            panic!("KS32 Atomic Pattern not supported on GPU tests yet");
+                        }
+                    },
+                    DecryptionAndNoiseResult::new_from_lwe(
+                        &pbs_result,
+                        &compute_large_lwe_secret_key,
+                        msg,
+                        &compute_encoding,
+                    ),
+                    DecryptionAndNoiseResult::new_from_lwe(
+                        &dp_result,
+                        &compute_large_lwe_secret_key,
+                        msg,
+                        &compression_encoding,
+                    ),
+                )
+            })
+            .collect(),
+        DecryptionAndNoiseResult::new_from_glwe(
+            &after_packing,
+            compression_glwe_secret_key,
+            compression_private_key.key.params.lwe_per_glwe(),
+            msg,
+            &compression_encoding,
+        ),
+        DecryptionAndNoiseResult::new_from_glwe(
+            &after_ms,
+            compression_glwe_secret_key,
+            compression_private_key.key.params.lwe_per_glwe(),
+            msg,
+            &compression_encoding,
+        ),
+    )
+}
+
+#[allow(clippy::type_complexity, clippy::too_many_arguments)]
+fn encrypt_br_dp_packing_ks_ms_noise_helper_gpu(
+    params: AtomicPatternParameters,
+    comp_params: CompressionParameters,
+    single_cks: &ClientKey,
+    single_cuda_sks: &CudaServerKey,
+    single_compression_private_key: &CompressionPrivateKeys,
+    single_cuda_compression_key: &CudaCompressionKey,
+    msg: u64,
+    streams: &CudaStreams,
+) -> (
+    Vec<(NoiseSample, NoiseSample, NoiseSample)>,
+    Vec<NoiseSample>,
+    Vec<NoiseSample>,
+) {
+    let (before_packing, after_packing, after_ms) = encrypt_br_dp_packing_ks_ms_inner_helper_gpu(
+        params,
+        comp_params,
+        single_cks,
+        single_cuda_sks,
+        single_compression_private_key,
+        single_cuda_compression_key,
+        msg,
+        streams,
+    );
+
+    (
+        before_packing
+            .into_iter()
+            .map(|(input, after_pbs, after_dp)| {
+                (
+                    input
+                        .get_noise_if_decryption_was_correct()
+                        .expect("Decryption Failed"),
+                    after_pbs
+                        .get_noise_if_decryption_was_correct()
+                        .expect("Decryption Failed"),
+                    after_dp
+                        .get_noise_if_decryption_was_correct()
+                        .expect("Decryption Failed"),
+                )
+            })
+            .collect(),
+        after_packing
+            .into_iter()
+            .map(|x| {
+                x.get_noise_if_decryption_was_correct()
+                    .expect("Decryption Failed")
+            })
+            .collect(),
+        after_ms
+            .into_iter()
+            .map(|x| {
+                x.get_noise_if_decryption_was_correct()
+                    .expect("Decryption Failed")
+            })
+            .collect(),
+    )
+}
+#[allow(clippy::type_complexity, clippy::too_many_arguments)]
+fn encrypt_br_dp_packing_ks_ms_pfail_helper_gpu(
+    params: AtomicPatternParameters,
+    comp_params: CompressionParameters,
+    single_cks: &ClientKey,
+    single_cuda_sks: &CudaServerKey,
+    single_compression_private_key: &CompressionPrivateKeys,
+    single_cuda_compression_key: &CudaCompressionKey,
+    msg: u64,
+    streams: &CudaStreams,
+) -> Vec<DecryptionAndNoiseResult> {
+    let (_before_packing, _after_packing, after_ms) = encrypt_br_dp_packing_ks_ms_inner_helper_gpu(
+        params,
+        comp_params,
+        single_cks,
+        single_cuda_sks,
+        single_compression_private_key,
+        single_cuda_compression_key,
+        msg,
+        streams,
+    );
+
+    after_ms
+}
+
+fn noise_check_encrypt_br_dp_packing_ks_ms_noise_gpu(meta_params: MetaParameters) {
+    let (params, comp_params) = (
+        meta_params.compute_parameters,
+        meta_params.compression_parameters.unwrap(),
+    );
+    let gpu_index = 0;
+    let streams = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
+
+    let block_params: ShortintParameterSet = params.into();
+    let cks = crate::integer::ClientKey::new(block_params);
+    let compressed_server_key = CompressedServerKey::new_radix_compressed_server_key(&cks);
+    let cuda_sks = CudaServerKey::decompress_from_cpu(&compressed_server_key, &streams);
+
+    let private_compression_key = cks.new_compression_private_key(comp_params);
+    let (compressed_compression_key, _compressed_decompression_key) =
+        cks.new_compressed_compression_decompression_keys(&private_compression_key);
+    let compression_key = compressed_compression_key.decompress();
+    let cuda_compression_key = compressed_compression_key.decompress_to_cuda(&streams);
+
+    let noise_simulation_bsk =
+        NoiseSimulationLweFourierBsk::new_from_atomic_pattern_parameters(params);
+    let noise_simulation_packing_key =
+        NoiseSimulationLwePackingKeyswitchKey::new_from_comp_parameters(params, comp_params);
+
+    assert!(noise_simulation_bsk.matches_actual_bsk_gpu(&cuda_sks.bootstrapping_key));
+    assert!(noise_simulation_packing_key.matches_actual_shortint_comp_key(&compression_key.key));
+
+    // The multiplication done in the compression is made to move the message up at the top of the
+    // carry space, multiplying by the carry modulus achieves that
+    let dp_scalar = params.carry_modulus().0;
+
+    let noise_simulation_accumulator = NoiseSimulationGlwe::new(
+        noise_simulation_bsk.output_glwe_size().to_glwe_dimension(),
+        noise_simulation_bsk.output_polynomial_size(),
+        Variance(0.0),
+        noise_simulation_bsk.modulus(),
+    );
+
+    let lwe_per_glwe = cuda_compression_key.lwe_per_glwe;
+    let storage_modulus_log = cuda_compression_key.storage_log_modulus;
+    let br_input_modulus_log = cuda_sks.br_input_modulus_log();
+
+    let (_before_packing_sim, _after_packing_sim, after_ms_sim) = {
+        let noise_simulation = NoiseSimulationLwe::new(
+            cks.parameters().lwe_dimension(),
+            Variance(0.0),
+            NoiseSimulationModulus::from_ciphertext_modulus(cks.parameters().ciphertext_modulus()),
+        );
+        br_dp_packing_ks_ms(
+            vec![noise_simulation; lwe_per_glwe.0],
+            &noise_simulation_bsk,
+            &noise_simulation_accumulator,
+            dp_scalar,
+            &noise_simulation_packing_key,
+            storage_modulus_log,
+            &mut vec![(); lwe_per_glwe.0],
+        )
+    };
+
+    let input_zeros: Vec<_> = (0..lwe_per_glwe.0)
+        .map(|_| {
+            cks.key
+                .encrypt_noiseless_pbs_input_dyn_lwe(br_input_modulus_log, 0)
+        })
+        .collect();
+
+    let d_input_zeros: Vec<_> = input_zeros
+        .iter()
+        .map(|ct| {
+            let d_ct_input = CudaLweCiphertextList::from_lwe_ciphertext(&ct.as_lwe_64(), &streams);
+            CudaDynLwe::U64(d_ct_input)
+        })
+        .collect();
+
+    let id_lut = cuda_sks.generate_lookup_table(|x| x);
+    let d_accumulator = CudaGlweCiphertextList::from_glwe_ciphertext(&id_lut.acc, &streams);
+
+    let cuda_block_info = crate::integer::gpu::ciphertext::info::CudaBlockInfo {
+        degree: crate::shortint::ciphertext::Degree::new(params.message_modulus().0 - 1),
+        message_modulus: params.message_modulus(),
+        carry_modulus: params.carry_modulus(),
+        atomic_pattern: params.atomic_pattern(),
+        noise_level: crate::shortint::parameters::NoiseLevel::NOMINAL,
+    };
+    let mut cuda_side_resources: Vec<CudaSideResources> = (0..input_zeros.len())
+        .map(|_| CudaSideResources::new(&streams, cuda_block_info))
+        .collect();
+
+    // Check that the circuit is correct with respect to core implementation, i.e. does not crash
+    // on dimension checks
+    let (expected_glwe_size_out, expected_polynomial_size_out, expected_modulus_f64_out) = {
+        let (_before_packing_sim, _after_packing, after_ms) = br_dp_packing_ks_ms(
+            d_input_zeros,
+            &cuda_sks,
+            &d_accumulator,
+            dp_scalar,
+            &cuda_compression_key.packing_key_switching_key,
+            storage_modulus_log,
+            &mut cuda_side_resources,
+        );
+
+        (
+            after_ms.glwe_dimension().to_glwe_size(),
+            after_ms.polynomial_size(),
+            after_ms.ciphertext_modulus().raw_modulus_float(),
+        )
+    };
+
+    assert_eq!(after_ms_sim.glwe_size(), expected_glwe_size_out);
+    assert_eq!(after_ms_sim.polynomial_size(), expected_polynomial_size_out);
+    assert_eq!(after_ms_sim.modulus().as_f64(), expected_modulus_f64_out);
+
+    let cleartext_modulus = params.message_modulus().0 * params.carry_modulus().0;
+    let mut noise_samples_before_ms = vec![];
+    let mut noise_samples_after_ms = vec![];
+
+    let sample_count_per_msg = 1000usize;
+    let chunk_size = 8;
+    let vec_local_streams = (0..chunk_size)
+        .map(|_| CudaStreams::new_single_gpu(GpuIndex::new(gpu_index)))
+        .collect::<Vec<_>>();
+    for _ in 0..cleartext_modulus {
+        let (current_noise_samples_before_ms, current_noise_samples_after_ms): (Vec<_>, Vec<_>) =
+            (0..sample_count_per_msg)
+                .collect::<Vec<_>>()
+                .chunks(chunk_size)
+                .flat_map(|chunk| {
+                    chunk
+                        .into_par_iter()
+                        .map(|i| {
+                            let local_stream = &vec_local_streams[*i % chunk_size];
+                            let (_before_packing, after_packing, after_ms) =
+                                encrypt_br_dp_packing_ks_ms_noise_helper_gpu(
+                                    params,
+                                    comp_params,
+                                    &cks,
+                                    &cuda_sks,
+                                    &private_compression_key,
+                                    &cuda_compression_key,
+                                    0,
+                                    local_stream,
+                                );
+                            (after_packing, after_ms)
+                        })
+                        .collect::<Vec<_>>()
+                })
+                .unzip();
+
+        noise_samples_before_ms.extend(current_noise_samples_before_ms);
+        noise_samples_after_ms.extend(current_noise_samples_after_ms);
+    }
+
+    let noise_samples_before_ms_flattened: Vec<_> = noise_samples_before_ms
+        .into_iter()
+        .flatten()
+        .map(|x| x.value)
+        .collect();
+
+    let noise_samples_after_ms_flattened: Vec<_> = noise_samples_after_ms
+        .into_iter()
+        .flatten()
+        .map(|x| x.value)
+        .collect();
+
+    let before_ms_normality =
+        normality_check(&noise_samples_before_ms_flattened, "before ms", 0.01);
+
+    let after_ms_is_ok = mean_and_variance_check(
+        &noise_samples_after_ms_flattened,
+        "after_ms",
+        0.0,
+        after_ms_sim.variance_per_occupied_slot(),
+        comp_params.packing_ks_key_noise_distribution(),
+        after_ms_sim
+            .glwe_dimension()
+            .to_equivalent_lwe_dimension(after_ms_sim.polynomial_size()),
+        after_ms_sim.modulus().as_f64(),
+    );
+
+    assert!(before_ms_normality.null_hypothesis_is_valid && after_ms_is_ok);
+}
+create_gpu_parameterized_test!(noise_check_encrypt_br_dp_packing_ks_ms_noise_gpu {
+    TEST_META_PARAM_CPU_2_2_KS_PBS_PKE_TO_SMALL_ZKV2_TUNIFORM_2M128,
+});
+
+fn noise_check_encrypt_br_dp_packing_ks_ms_pfail_gpu(meta_params: MetaParameters) {
+    let (pfail_test_meta, params, comp_params) = {
+        let (mut params, comp_params) = (
+            meta_params.compute_parameters,
+            meta_params.compression_parameters.unwrap(),
+        );
+
+        let original_message_modulus = params.message_modulus();
+        let original_carry_modulus = params.carry_modulus();
+
+        // For now only allow 2_2 parameters, and see later for heuristics to use
+        assert_eq!(original_message_modulus.0, 4);
+        assert_eq!(original_carry_modulus.0, 4);
+
+        let noise_simulation_bsk =
+            NoiseSimulationLweFourierBsk::new_from_atomic_pattern_parameters(params);
+        let noise_simulation_packing_key =
+            NoiseSimulationLwePackingKeyswitchKey::new_from_comp_parameters(params, comp_params);
+
+        // The multiplication done in the compression is made to move the message up at the top of
+        // the carry space, multiplying by the carry modulus achieves that
+        let dp_scalar = params.carry_modulus().0;
+
+        let noise_simulation_accumulator = NoiseSimulationGlwe::new(
+            noise_simulation_bsk.output_glwe_size().to_glwe_dimension(),
+            noise_simulation_bsk.output_polynomial_size(),
+            Variance(0.0),
+            noise_simulation_bsk.modulus(),
+        );
+
+        let lwe_per_glwe = comp_params.lwe_per_glwe();
+        let storage_modulus_log = comp_params.storage_log_modulus();
+
+        let (_before_packing_sim, _after_packing_sim, after_ms_sim) = {
+            let noise_simulation = NoiseSimulationLwe::new(
+                params.lwe_dimension(),
+                Variance(0.0),
+                NoiseSimulationModulus::from_ciphertext_modulus(params.ciphertext_modulus()),
+            );
+            br_dp_packing_ks_ms(
+                vec![noise_simulation; lwe_per_glwe.0],
+                &noise_simulation_bsk,
+                &noise_simulation_accumulator,
+                dp_scalar,
+                &noise_simulation_packing_key,
+                storage_modulus_log,
+                &mut vec![(); lwe_per_glwe.0],
+            )
+        };
+
+        let expected_variance_after_storage = after_ms_sim.variance_per_occupied_slot();
+
+        let compression_carry_mod = CarryModulus(1);
+        let compression_message_mod = original_message_modulus;
+        let compression_precision_with_padding =
+            precision_with_padding(compression_message_mod, compression_carry_mod);
+        let expected_pfail_for_storage = expected_pfail_for_precision(
+            compression_precision_with_padding,
+            expected_variance_after_storage,
+        );
+
+        let original_pfail_and_precision = PfailAndPrecision::new(
+            expected_pfail_for_storage,
+            compression_message_mod,
+            compression_carry_mod,
+        );
+
+        // Here we update the message modulus only:
+        // - because the message modulus matches for the compression encoding and compute encoding
+        // - so that the carry modulus stays the same and we apply the same dot product as normal
+        //   for 2_2
+        // - so that the effective encoding after the storage is the one we used to evaluate the
+        //   pfail
+        let updated_message_mod = MessageModulus(1 << 6);
+        let updated_carry_mod = compression_carry_mod;
+
+        update_ap_params_msg_and_carry_moduli(&mut params, updated_message_mod, updated_carry_mod);
+
+        assert!(
+            (params.message_modulus().0 * params.carry_modulus().0).ilog2()
+                <= comp_params.storage_log_modulus().0 as u32,
+            "Compression storage modulus cannot store enough bits for pfail estimation"
+        );
+
+        let updated_precision_with_padding =
+            precision_with_padding(updated_message_mod, updated_carry_mod);
+
+        let new_expected_pfail_for_storage = expected_pfail_for_precision(
+            updated_precision_with_padding,
+            expected_variance_after_storage,
+        );
+
+        let new_expected_pfail_and_precision = PfailAndPrecision::new(
+            new_expected_pfail_for_storage,
+            updated_message_mod,
+            updated_carry_mod,
+        );
+
+        let pfail_test_meta = if should_run_short_pfail_tests_debug() {
+            // To have the same amount of keys generated as the case where a single run is a
+            // single sample
+            let expected_fails = 200 * lwe_per_glwe.0 as u32;
+            PfailTestMeta::new_with_desired_expected_fails(
+                original_pfail_and_precision,
+                new_expected_pfail_and_precision,
+                expected_fails,
+            )
+        } else {
+            // To guarantee 1_000_000 keysets are generated
+            let total_runs = 1_000_000 * lwe_per_glwe.0 as u32;
+            PfailTestMeta::new_with_total_runs(
+                original_pfail_and_precision,
+                new_expected_pfail_and_precision,
+                total_runs,
+            )
+        };
+
+        (pfail_test_meta, params, comp_params)
+    };
+    let gpu_index = 0;
+    let streams = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
+
+    let block_params: ShortintParameterSet = params.into();
+    let cks = crate::integer::ClientKey::new(block_params);
+    let compressed_server_key = CompressedServerKey::new_radix_compressed_server_key(&cks);
+    let cuda_sks = CudaServerKey::decompress_from_cpu(&compressed_server_key, &streams);
+
+    let private_compression_key = cks.new_compression_private_key(comp_params);
+    let (compressed_compression_key, _compressed_decompression_key) =
+        cks.new_compressed_compression_decompression_keys(&private_compression_key);
+
+    let cuda_compression_key = compressed_compression_key.decompress_to_cuda(&streams);
+
+    let lwe_per_glwe = cuda_compression_key.lwe_per_glwe;
+
+    let total_runs_for_expected_fails = pfail_test_meta
+        .total_runs_for_expected_fails()
+        .div_ceil(lwe_per_glwe.0.try_into().unwrap());
+
+    let chunk_size = 8;
+    let vec_local_streams = (0..chunk_size)
+        .map(|_| CudaStreams::new_single_gpu(GpuIndex::new(gpu_index)))
+        .collect::<Vec<_>>();
+
+    let measured_fails: f64 = (0..total_runs_for_expected_fails)
+        .collect::<Vec<_>>()
+        .chunks(chunk_size)
+        .flat_map(|chunk| {
+            chunk
+                .into_par_iter()
+                .map(|i| {
+                    let local_streams = &vec_local_streams[*i as usize % chunk_size];
+                    let after_ms_decryption_result = encrypt_br_dp_packing_ks_ms_pfail_helper_gpu(
+                        params,
+                        comp_params,
+                        &cks,
+                        &cuda_sks,
+                        &private_compression_key,
+                        &cuda_compression_key,
+                        0,
+                        local_streams,
+                    );
+                    after_ms_decryption_result
+                        .into_iter()
+                        .map(|result| result.failure_as_f64())
+                        .sum::<f64>()
+                })
+                .collect::<Vec<_>>()
+        })
+        .sum();
+
+    let test_result = PfailTestResult { measured_fails };
+
+    pfail_check(&pfail_test_meta, test_result);
+}
+
+create_gpu_parameterized_test!(noise_check_encrypt_br_dp_packing_ks_ms_pfail_gpu {
+    TEST_META_PARAM_CPU_2_2_KS_PBS_PKE_TO_SMALL_ZKV2_TUNIFORM_2M128,
+});
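Note (not part of the patch): in the pfail test above, every helper call yields `lwe_per_glwe` after-MS decryption results, so the run count is the ceiling of the required sample count divided by `lwe_per_glwe`. A toy calculation with made-up numbers (the real value comes from the compression parameters):

```rust
// Hypothetical figures, for illustration only.
let samples_needed: u64 = 1_000_000;
let lwe_per_glwe: u64 = 256;

// Each run of encrypt_br_dp_packing_ks_ms_pfail_helper_gpu contributes
// `lwe_per_glwe` samples, hence the div_ceil in the test.
let runs = samples_needed.div_ceil(lwe_per_glwe);
assert_eq!(runs, 3_907); // 3_907 * 256 = 1_000_192 >= 1_000_000
```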
diff --git a/tfhe/src/integer/gpu/server_key/radix/tests_noise_distribution/dp_ks_pbs_128_packingks.rs b/tfhe/src/integer/gpu/server_key/radix/tests_noise_distribution/dp_ks_pbs_128_packingks.rs
new file mode 100644
index 000000000..c04c8238d
--- /dev/null
+++ b/tfhe/src/integer/gpu/server_key/radix/tests_noise_distribution/dp_ks_pbs_128_packingks.rs
@@ -0,0 +1,872 @@
+use super::utils::noise_simulation::{CudaDynLwe, CudaSideResources};
+use crate::core_crypto::commons::noise_formulas::noise_simulation::{
+    NoiseSimulationLweFourier128Bsk, NoiseSimulationLwePackingKeyswitchKey,
+};
+use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
+use crate::core_crypto::gpu::CudaStreams;
+use crate::core_crypto::prelude::{GlweCiphertext, LweCiphertextCount};
+use crate::integer::gpu::CudaServerKey;
+use crate::integer::noise_squashing::NoiseSquashingPrivateKey;
+use crate::integer::CompressedServerKey;
+
+use crate::core_crypto::commons::parameters::CiphertextModulusLog;
+use crate::core_crypto::prelude::generate_programmable_bootstrap_glwe_lut;
+use crate::integer::ciphertext::NoiseSquashingCompressionPrivateKey;
+use crate::integer::gpu::list_compression::server_keys::CudaNoiseSquashingCompressionKey;
+use crate::integer::gpu::server_key::radix::tests_unsigned::create_gpu_parameterized_test;
+use crate::integer::gpu::server_key::radix::{CudaNoiseSquashingKey, CudaUnsignedRadixCiphertext};
+use crate::integer::gpu::unchecked_small_scalar_mul_integer_async;
+use crate::integer::IntegerCiphertext;
+use crate::shortint::client_key::atomic_pattern::AtomicPatternClientKey;
+use crate::shortint::parameters::noise_squashing::NoiseSquashingParameters;
+use crate::shortint::parameters::test_params::TEST_META_PARAM_CPU_2_2_KS_PBS_PKE_TO_SMALL_ZKV2_TUNIFORM_2M128;
+use crate::shortint::parameters::{
+    AtomicPatternParameters, MetaParameters, NoiseSquashingCompressionParameters, Variance,
+};
+use crate::shortint::server_key::tests::noise_distribution::dp_ks_pbs128_packingks::{
+    dp_ks_any_ms_standard_pbs128, dp_ks_any_ms_standard_pbs128_packing_ks,
+};
+use crate::shortint::server_key::tests::noise_distribution::should_use_single_key_debug;
+use crate::shortint::server_key::tests::noise_distribution::utils::noise_simulation::{
+    NoiseSimulationGlwe, NoiseSimulationLwe, NoiseSimulationLweFourierBsk,
+    NoiseSimulationLweKeyswitchKey, NoiseSimulationModulusSwitchConfig,
+};
+use crate::shortint::server_key::tests::noise_distribution::utils::{
+    mean_and_variance_check, DecryptionAndNoiseResult, NoiseSample,
+};
+use crate::shortint::{PaddingBit, ShortintEncoding, ShortintParameterSet};
+use crate::GpuIndex;
+use rayon::prelude::*;
+
+/// Test function to verify that the noise checking tools match the actual atomic patterns
+/// implemented in shortint for GPU
+fn sanity_check_encrypt_dp_ks_standard_pbs128_packing_ks_gpu(meta_params: MetaParameters) {
+    let (atomic_params, noise_squashing_params, noise_squashing_compression_params) = {
+        let meta_noise_squashing_params = meta_params.noise_squashing_parameters.unwrap();
+        (
+            meta_params.compute_parameters,
+            meta_noise_squashing_params.parameters,
+            meta_noise_squashing_params.compression_parameters.unwrap(),
+        )
+    };
+    let gpu_index = 0;
+    let streams = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
+
+    let block_params: ShortintParameterSet = atomic_params.into();
+    let cks = crate::integer::ClientKey::new(block_params);
+    let compressed_server_key = CompressedServerKey::new_radix_compressed_server_key(&cks);
+    let cuda_sks = CudaServerKey::decompress_from_cpu(&compressed_server_key, &streams);
+
+    let noise_squashing_private_key = NoiseSquashingPrivateKey::new(noise_squashing_params);
+    let compressed_noise_squashing_compression_key =
+        cks.new_compressed_noise_squashing_key(&noise_squashing_private_key);
+    let noise_squashing_key = compressed_noise_squashing_compression_key.decompress();
+    let cuda_noise_squashing_key =
+        compressed_noise_squashing_compression_key.decompress_to_cuda(&streams);
+    let noise_squashing_compression_private_key =
+        NoiseSquashingCompressionPrivateKey::new(noise_squashing_compression_params);
+    let noise_squashing_compression_key = noise_squashing_private_key
+        .new_noise_squashing_compression_key(&noise_squashing_compression_private_key);
+    let cuda_noise_squashing_compression_key =
+        CudaNoiseSquashingCompressionKey::from_noise_squashing_compression_key(
+            &noise_squashing_compression_key,
+            &streams,
+        );
+
+    let lwe_per_glwe = cuda_noise_squashing_compression_key.lwe_per_glwe;
+
+    let modulus_switch_config = cuda_noise_squashing_key.noise_simulation_modulus_switch_config();
+
+    let br_input_modulus_log = noise_squashing_key.key.br_input_modulus_log();
+
+    let u128_encoding = ShortintEncoding {
+        ciphertext_modulus: noise_squashing_params.ciphertext_modulus(),
+        message_modulus: noise_squashing_params.message_modulus(),
+        carry_modulus: noise_squashing_params.carry_modulus(),
+        padding_bit: PaddingBit::Yes,
+    };
+    let max_scalar_mul = cuda_sks.max_noise_level.get();
+
+    let id_lut_cpu = generate_programmable_bootstrap_glwe_lut(
+        noise_squashing_key.key.polynomial_size(),
+        noise_squashing_key.key.glwe_size(),
+        u128_encoding
+            .cleartext_space_without_padding()
+            .try_into()
+            .unwrap(),
+        u128_encoding.ciphertext_modulus,
+        u128_encoding.delta(),
+        |x| x,
+    );
+
+    let id_lut_gpu = CudaGlweCiphertextList::from_glwe_ciphertext(&id_lut_cpu, &streams);
+
+    let input_zeros: Vec<_> = (0..lwe_per_glwe.0).map(|_| cks.key.encrypt(0)).collect();
+
+    let cuda_block_info = crate::integer::gpu::ciphertext::info::CudaBlockInfo {
+        degree: crate::shortint::ciphertext::Degree::new(atomic_params.message_modulus().0 - 1),
+        message_modulus: atomic_params.message_modulus(),
+        carry_modulus: atomic_params.carry_modulus(),
+        atomic_pattern: atomic_params.atomic_pattern(),
+        noise_level: crate::shortint::parameters::NoiseLevel::NOMINAL,
+    };
+    let mut cuda_side_resources: Vec<CudaSideResources> = (0..input_zeros.len())
+        .map(|_| CudaSideResources::new(&streams, cuda_block_info))
+        .collect();
+
+    let input_zero_as_lwe: Vec<_> = input_zeros
+        .iter()
+        .map(|ct| {
+            let d_ct_input = CudaUnsignedRadixCiphertext::from_radix_ciphertext(
+                &crate::integer::RadixCiphertext::from_blocks(vec![ct.clone()]),
+                &streams,
+            );
+            CudaDynLwe::U64(d_ct_input.ciphertext.d_blocks)
+        })
+        .collect();
+
+    let (_before_packing, d_after_packing) = dp_ks_any_ms_standard_pbs128_packing_ks(
+        input_zero_as_lwe,
+        max_scalar_mul,
+        &cuda_sks,
+        modulus_switch_config,
+        &cuda_noise_squashing_key,
+        br_input_modulus_log,
+        &id_lut_gpu,
+        &cuda_noise_squashing_compression_key.packing_key_switching_key,
+        &mut cuda_side_resources,
+    );
+
+    let cuda_noise_squashed_cts: Vec<_> = input_zeros
+        .into_par_iter()
+        .map(|ct| {
+            let cloned_ct = ct;
+            let radix_ct = crate::integer::RadixCiphertext::from_blocks(vec![cloned_ct]);
+            let mut d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&radix_ct, &streams);
+            unsafe {
+                unchecked_small_scalar_mul_integer_async(
+                    &streams,
+                    &mut d_ct.ciphertext,
+                    max_scalar_mul,
+                    atomic_params.message_modulus(),
+                    atomic_params.carry_modulus(),
+                );
+            }
+            streams.synchronize();
+            cuda_noise_squashing_key.unchecked_squash_ciphertext_noise(
+                &d_ct.ciphertext,
+                &cuda_sks,
+                &streams,
+            )
+        })
+        .collect();
+
+    let gpu_compressed = cuda_noise_squashing_compression_key
+        .compress_noise_squashed_ciphertexts_into_list(&cuda_noise_squashed_cts, &streams);
+
+    let gpu_extracted = gpu_compressed.extract_glwe(0, &streams);
+    let extracted_list = gpu_extracted.to_glwe_ciphertext_list(&streams);
+    let extracted_glwe = GlweCiphertext::from_container(
+        extracted_list.clone().into_container(),
+        extracted_list.polynomial_size(),
+        extracted_list.ciphertext_modulus(),
+    );
+
+    let after_packing_list = d_after_packing.to_glwe_ciphertext_list(&streams);
+    let mut after_packing = GlweCiphertext::from_container(
+        after_packing_list.clone().into_container(),
+        after_packing_list.polynomial_size(),
+        after_packing_list.ciphertext_modulus(),
+    );
+    // Bodies that were not filled by the packing keyswitch are zeroed so the comparison below
+    // only covers the occupied slots
+    after_packing.get_mut_body().as_mut()[lwe_per_glwe.0..].fill(0);
+
+    assert_eq!(after_packing.as_view(), extracted_glwe.as_view());
+}
+
+/// Test function to verify that the noise checking tools match the actual atomic patterns
+/// implemented in shortint for GPU
+fn sanity_check_encrypt_dp_ks_standard_pbs128_gpu(meta_params: MetaParameters) {
+    let (params, noise_squashing_params) = {
+        let meta_noise_squashing_params = meta_params.noise_squashing_parameters.unwrap();
+        (
+            meta_params.compute_parameters,
+            meta_noise_squashing_params.parameters,
+        )
+    };
+    let gpu_index = 0;
+    let streams = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
+
+    let block_params: ShortintParameterSet = params.into();
+    let cks = crate::integer::ClientKey::new(block_params);
+    let compressed_server_key = CompressedServerKey::new_radix_compressed_server_key(&cks);
+    let cuda_sks = CudaServerKey::decompress_from_cpu(&compressed_server_key, &streams);
+
+    let noise_squashing_private_key = NoiseSquashingPrivateKey::new(noise_squashing_params);
+    let compressed_noise_squashing_compression_key =
+        cks.new_compressed_noise_squashing_key(&noise_squashing_private_key);
+    let noise_squashing_key = compressed_noise_squashing_compression_key.decompress();
+    let cuda_noise_squashing_key =
+        compressed_noise_squashing_compression_key.decompress_to_cuda(&streams);
+
+    let modulus_switch_config = cuda_noise_squashing_key.noise_simulation_modulus_switch_config();
+
+    let br_input_modulus_log = noise_squashing_key.key.br_input_modulus_log();
+
+    let u128_encoding = ShortintEncoding {
+        ciphertext_modulus: noise_squashing_params.ciphertext_modulus(),
+        message_modulus: noise_squashing_params.message_modulus(),
+        carry_modulus: noise_squashing_params.carry_modulus(),
+        padding_bit: PaddingBit::Yes,
+    };
+    let max_scalar_mul = cuda_sks.max_noise_level.get();
+
+    let id_lut_cpu = generate_programmable_bootstrap_glwe_lut(
+        noise_squashing_key.key.polynomial_size(),
+        noise_squashing_key.key.glwe_size(),
+        u128_encoding
+            .cleartext_space_without_padding()
+            .try_into()
+            .unwrap(),
+        u128_encoding.ciphertext_modulus,
+        u128_encoding.delta(),
+        |x| x,
+    );
+
+    let id_lut_gpu = CudaGlweCiphertextList::from_glwe_ciphertext(&id_lut_cpu, &streams);
+
+    let lwe_per_glwe = LweCiphertextCount(128);
+    let input_zeros: Vec<_> = (0..lwe_per_glwe.0).map(|_| cks.key.encrypt(0)).collect();
+
+    let cuda_block_info = crate::integer::gpu::ciphertext::info::CudaBlockInfo {
+        degree: crate::shortint::ciphertext::Degree::new(params.message_modulus().0 - 1),
+        message_modulus: params.message_modulus(),
+        carry_modulus: params.carry_modulus(),
+        atomic_pattern: params.atomic_pattern(),
+        noise_level: crate::shortint::parameters::NoiseLevel::NOMINAL,
+    };
+    let mut cuda_side_resources: Vec<CudaSideResources> = (0..input_zeros.len())
+        .map(|_| CudaSideResources::new(&streams, cuda_block_info))
+        .collect();
+
+    let input_zero_as_lwe: Vec<_> = input_zeros
+        .iter()
+        .map(|ct| {
+            let d_ct_input = CudaUnsignedRadixCiphertext::from_radix_ciphertext(
+                &crate::integer::RadixCiphertext::from_blocks(vec![ct.clone()]),
+                &streams,
+            );
+            CudaDynLwe::U64(d_ct_input.ciphertext.d_blocks)
+        })
+        .collect();
+
+    let res: Vec<_> = input_zero_as_lwe
+        .into_par_iter()
+        .zip(cuda_side_resources.par_iter_mut())
+        .map(|(input, side_resources)| {
+            let (input, after_dp, ks_result, drift_technique_result, ms_result, pbs_result) =
+                dp_ks_any_ms_standard_pbs128(
+                    input,
+                    max_scalar_mul,
+                    &cuda_sks,
+                    modulus_switch_config,
+                    &cuda_noise_squashing_key,
+                    br_input_modulus_log,
+                    &id_lut_gpu,
+                    side_resources,
+                );
+
+            (
+                input,
+                after_dp,
+                ks_result,
+                drift_technique_result,
+                ms_result,
+                pbs_result,
+            )
+        })
+        .collect();
+
+    let input_zeros_non_pattern: Vec<_> = input_zeros
+        .iter()
+        .map(|ct| {
+            CudaUnsignedRadixCiphertext::from_radix_ciphertext(
+                &crate::integer::RadixCiphertext::from_blocks(vec![ct.clone()]),
+                &streams,
+            )
+        })
+        .collect();
+
+    let vector_non_pattern: Vec<_> = input_zeros_non_pattern
+        .into_par_iter()
+        .map(|mut d_ct_input2| {
+            unsafe {
+                unchecked_small_scalar_mul_integer_async(
+                    &streams,
+                    &mut d_ct_input2.ciphertext,
+                    max_scalar_mul,
+                    params.message_modulus(),
+                    params.carry_modulus(),
+                );
+            }
+
+            streams.synchronize();
+
+            cuda_noise_squashing_key
+                .squash_radix_ciphertext_noise(&cuda_sks, &d_ct_input2.ciphertext, &streams)
+                .unwrap()
+        })
+        .collect();
+
+    let vector_pattern_cpu: Vec<_> = res
+        .into_iter()
+        .map(
+            |(_input, _after_dp, _ks_result, _drift_technique_result, _ms_result, pbs_result)| {
+                pbs_result.as_ct_128_cpu(&streams)
+            },
+        )
+        .collect();
+
+    let vector_non_pattern_cpu: Vec<_> = vector_non_pattern
+        .into_par_iter()
+        .map(|cuda_squashed_radix_ct| {
+            let squashed_noise_ct_cpu =
+                cuda_squashed_radix_ct.to_squashed_noise_radix_ciphertext(&streams);
+            squashed_noise_ct_cpu.packed_blocks()[0]
+                .lwe_ciphertext()
+                .clone()
+        })
+        .collect();
+
+    // Compare that all the results are equivalent
+    assert_eq!(vector_pattern_cpu.len(), vector_non_pattern_cpu.len());
+    for (a, b) in vector_pattern_cpu.iter().zip(vector_non_pattern_cpu.iter()) {
+        assert_eq!(a.as_view(), b.as_view());
+    }
+}
+
+#[allow(clippy::too_many_arguments)]
+#[allow(clippy::type_complexity)]
+fn encrypt_dp_ks_standard_pbs128_packing_ks_inner_helper_gpu(
+    params: AtomicPatternParameters,
+    noise_squashing_params: NoiseSquashingParameters,
+    noise_squashing_compression_params: NoiseSquashingCompressionParameters,
+    single_cks: &crate::integer::ClientKey,
+    single_cuda_sks: &CudaServerKey,
+    single_noise_squashing_private_key: &NoiseSquashingPrivateKey,
+    single_noise_squashing_key: &crate::integer::noise_squashing::NoiseSquashingKey,
+    single_cuda_noise_squashing_key: &CudaNoiseSquashingKey,
+    single_noise_squashing_compression_private_key: &NoiseSquashingCompressionPrivateKey,
+    single_cuda_noise_squashing_compression_key: &CudaNoiseSquashingCompressionKey,
+    msg: u64,
+    scalar_for_multiplication: u64,
+    br_input_modulus_log: CiphertextModulusLog,
+    streams: &CudaStreams,
+) -> (
+    Vec<(
+        DecryptionAndNoiseResult,
+        DecryptionAndNoiseResult,
+        DecryptionAndNoiseResult,
+        DecryptionAndNoiseResult,
+        DecryptionAndNoiseResult,
+        DecryptionAndNoiseResult,
+    )>,
+    Vec<DecryptionAndNoiseResult>,
+) {
+    let thread_cks: crate::integer::ClientKey;
+    let thread_cuda_sks: CudaServerKey;
+    let thread_noise_squashing_private_key: NoiseSquashingPrivateKey;
+    let thread_noise_squashing_key: crate::integer::noise_squashing::NoiseSquashingKey;
+    let thread_cuda_noise_squashing_key: CudaNoiseSquashingKey;
+    let thread_noise_squashing_compression_private_key: NoiseSquashingCompressionPrivateKey;
+    let thread_cuda_noise_squashing_compression_key: CudaNoiseSquashingCompressionKey;
+    let (
+        cks,
+        cuda_sks,
+        noise_squashing_private_key,
+        noise_squashing_key,
+        cuda_noise_squashing_key,
+        noise_squashing_compression_private_key,
+        cuda_noise_squashing_compression_key,
+    ) = if should_use_single_key_debug() {
+        (
+            single_cks,
+            single_cuda_sks,
+            single_noise_squashing_private_key,
+            single_noise_squashing_key,
+            single_cuda_noise_squashing_key,
+            single_noise_squashing_compression_private_key,
+            single_cuda_noise_squashing_compression_key,
+        )
+    } else {
+        let block_params: ShortintParameterSet = params.into();
+        thread_cks = crate::integer::ClientKey::new(block_params);
+        let thread_compressed_server_key =
+            CompressedServerKey::new_radix_compressed_server_key(&thread_cks);
+        thread_cuda_sks =
+            CudaServerKey::decompress_from_cpu(&thread_compressed_server_key, streams);
+
+        thread_noise_squashing_private_key = NoiseSquashingPrivateKey::new(noise_squashing_params);
+        let thread_compressed_noise_squashing_compression_key =
+            thread_cks.new_compressed_noise_squashing_key(&thread_noise_squashing_private_key);
+        thread_noise_squashing_key =
+            thread_compressed_noise_squashing_compression_key.decompress();
+        thread_cuda_noise_squashing_key =
+            thread_compressed_noise_squashing_compression_key.decompress_to_cuda(streams);
+        thread_noise_squashing_compression_private_key =
+            NoiseSquashingCompressionPrivateKey::new(noise_squashing_compression_params);
+        let thread_noise_squashing_compression_key = thread_noise_squashing_private_key
+            .new_noise_squashing_compression_key(&thread_noise_squashing_compression_private_key);
+        thread_cuda_noise_squashing_compression_key =
+            CudaNoiseSquashingCompressionKey::from_noise_squashing_compression_key(
+                &thread_noise_squashing_compression_key,
+                streams,
+            );
+        (
+            &thread_cks,
+            &thread_cuda_sks,
+            &thread_noise_squashing_private_key,
+            &thread_noise_squashing_key,
+            &thread_cuda_noise_squashing_key,
+            &thread_noise_squashing_compression_private_key,
+            &thread_cuda_noise_squashing_compression_key,
+        )
+    };
+
+    let modulus_switch_config = cuda_noise_squashing_key.noise_simulation_modulus_switch_config();
+
+    let bsk_polynomial_size = noise_squashing_key.key.polynomial_size();
+    let bsk_glwe_size = noise_squashing_key.key.glwe_size();
+
+    let u128_encoding = ShortintEncoding {
+        ciphertext_modulus: noise_squashing_params.ciphertext_modulus(),
+        message_modulus: noise_squashing_params.message_modulus(),
+        carry_modulus: noise_squashing_params.carry_modulus(),
+        padding_bit: PaddingBit::Yes,
+    };
+
+    let id_lut_cpu = generate_programmable_bootstrap_glwe_lut(
+        bsk_polynomial_size,
+        bsk_glwe_size,
+        u128_encoding
+            .cleartext_space_without_padding()
+            .try_into()
+            .unwrap(),
+        u128_encoding.ciphertext_modulus,
+        u128_encoding.delta(),
+        |x| x,
+    );
+    let id_lut_gpu = CudaGlweCiphertextList::from_glwe_ciphertext(&id_lut_cpu, streams);
+
+    let lwe_per_glwe = cuda_noise_squashing_compression_key.lwe_per_glwe;
+
+    let input_zeros: Vec<_> = (0..lwe_per_glwe.0).map(|_| cks.key.encrypt(msg)).collect();
+
+    let cuda_block_info = crate::integer::gpu::ciphertext::info::CudaBlockInfo {
+        degree: crate::shortint::ciphertext::Degree::new(params.message_modulus().0 - 1),
+        message_modulus: params.message_modulus(),
+        carry_modulus: params.carry_modulus(),
+        atomic_pattern: params.atomic_pattern(),
+        noise_level: crate::shortint::parameters::NoiseLevel::NOMINAL,
+    };
+    let mut cuda_side_resources: Vec<CudaSideResources> = (0..input_zeros.len())
+        .map(|_| CudaSideResources::new(streams, cuda_block_info))
+        .collect();
+
+    let input_zero_as_lwe: Vec<_> = input_zeros
+        .iter()
+        .map(|ct| {
+            let d_ct_input = CudaUnsignedRadixCiphertext::from_radix_ciphertext(
+                &crate::integer::RadixCiphertext::from_blocks(vec![ct.clone()]),
+                streams,
+            );
+            CudaDynLwe::U64(d_ct_input.ciphertext.d_blocks)
+        })
+        .collect();
+
+    let (before_packing_gpu, after_packing_gpu) = dp_ks_any_ms_standard_pbs128_packing_ks(
+        input_zero_as_lwe,
+        scalar_for_multiplication,
+        cuda_sks,
+        modulus_switch_config,
+        cuda_noise_squashing_key,
+        br_input_modulus_log,
+        &id_lut_gpu,
+        &cuda_noise_squashing_compression_key.packing_key_switching_key,
+        &mut cuda_side_resources,
+    );
+
+    let before_packing: Vec<_> = before_packing_gpu
+        .into_iter()
+        .map(
+            |(
+                input_gpu,
+                after_dp_gpu,
+                after_ks_gpu,
+                after_drift_gpu,
+                after_ms_gpu,
+                after_pbs128_gpu,
+            )| {
+                match &cks.key.atomic_pattern {
+                    AtomicPatternClientKey::Standard(standard_atomic_pattern_client_key) => {
+                        let params = standard_atomic_pattern_client_key.parameters;
+                        let u64_encoding = ShortintEncoding {
+                            ciphertext_modulus: params.ciphertext_modulus(),
+                            message_modulus: params.message_modulus(),
+                            carry_modulus: params.carry_modulus(),
+                            padding_bit: PaddingBit::Yes,
+                        };
+                        let large_lwe_secret_key =
+                            standard_atomic_pattern_client_key.large_lwe_secret_key();
+                        let small_lwe_secret_key =
+                            standard_atomic_pattern_client_key.small_lwe_secret_key();
+
+                        let input_ct = input_gpu.as_ct_64_cpu(streams);
+                        let after_dp_ct = after_dp_gpu.as_ct_64_cpu(streams);
+                        let after_ks_ct = after_ks_gpu.as_ct_64_cpu(streams);
+                        let before_ms_gpu: &CudaDynLwe =
+                            after_drift_gpu.as_ref().unwrap_or(&after_ks_gpu);
+                        let before_ms_ct = before_ms_gpu.as_ct_64_cpu(streams);
+                        let after_ms_ct = after_ms_gpu.as_ct_64_cpu(streams);
+                        let after_pbs128_ct = after_pbs128_gpu.as_ct_128_cpu(streams);
+                        (
+                            DecryptionAndNoiseResult::new_from_lwe(
+                                &input_ct,
+                                &large_lwe_secret_key,
+                                msg,
+                                &u64_encoding,
+                            ),
+                            DecryptionAndNoiseResult::new_from_lwe(
+                                &after_dp_ct,
+                                &large_lwe_secret_key,
+                                msg,
+                                &u64_encoding,
+                            ),
+                            DecryptionAndNoiseResult::new_from_lwe(
+                                &after_ks_ct,
+                                &small_lwe_secret_key,
+                                msg,
+                                &u64_encoding,
+                            ),
+                            DecryptionAndNoiseResult::new_from_lwe(
+                                &before_ms_ct,
+                                &small_lwe_secret_key,
+                                msg,
+                                &u64_encoding,
+                            ),
+                            DecryptionAndNoiseResult::new_from_lwe(
+                                &after_ms_ct,
+                                &small_lwe_secret_key,
+                                msg,
+                                &u64_encoding,
+                            ),
+                            DecryptionAndNoiseResult::new_from_lwe(
+                                &after_pbs128_ct,
+                                &noise_squashing_private_key
+                                    .key
+                                    .post_noise_squashing_lwe_secret_key(),
+                                msg.into(),
+                                &u128_encoding,
+                            ),
+                        )
+                    }
+                    AtomicPatternClientKey::KeySwitch32(_ks32_atomic_pattern_client_key) => {
+                        panic!("KS32 atomic pattern not supported for GPU yet");
+                    }
+                }
+            },
+        )
+        .collect();
+    let after_packing_list = after_packing_gpu.to_glwe_ciphertext_list(streams);
+    let after_packing = GlweCiphertext::from_container(
+        after_packing_list.clone().into_container(),
+        after_packing_list.polynomial_size(),
+        after_packing_list.ciphertext_modulus(),
+    );
+    let after_packing = DecryptionAndNoiseResult::new_from_glwe(
+        &after_packing,
+        noise_squashing_compression_private_key
+            .key
+            .post_packing_ks_key(),
+        lwe_per_glwe,
+        msg.into(),
+        &u128_encoding,
+    );
+
+    assert_eq!(after_packing.len(), lwe_per_glwe.0);
+
+    (before_packing, after_packing)
+}
+
+#[allow(clippy::too_many_arguments)]
+#[allow(clippy::type_complexity)]
+fn encrypt_dp_ks_standard_pbs128_packing_ks_noise_helper_gpu(
+    params: AtomicPatternParameters,
+    noise_squashing_params: NoiseSquashingParameters,
+    noise_squashing_compression_params: NoiseSquashingCompressionParameters,
+    single_cks: &crate::integer::ClientKey,
+    single_cuda_sks: &CudaServerKey,
+    single_noise_squashing_private_key: &NoiseSquashingPrivateKey,
+    single_noise_squashing_key: &crate::integer::noise_squashing::NoiseSquashingKey,
+    single_cuda_noise_squashing_key: &CudaNoiseSquashingKey,
+    single_noise_squashing_compression_private_key: &NoiseSquashingCompressionPrivateKey,
+    single_cuda_noise_squashing_compression_key: &CudaNoiseSquashingCompressionKey,
+    msg: u64,
+    scalar_for_multiplication: u64,
+    br_input_modulus_log: CiphertextModulusLog,
+    streams: &CudaStreams,
+) -> (
+    Vec<(
+        NoiseSample,
+        NoiseSample,
+        NoiseSample,
+        NoiseSample,
+        NoiseSample,
+        NoiseSample,
+    )>,
+    Vec<NoiseSample>,
+) {
+    let (before_compression, after_compression) =
+        encrypt_dp_ks_standard_pbs128_packing_ks_inner_helper_gpu(
+            params,
+            noise_squashing_params,
+            noise_squashing_compression_params,
+            single_cks,
+            single_cuda_sks,
+            single_noise_squashing_private_key,
+            single_noise_squashing_key,
+            single_cuda_noise_squashing_key,
+            single_noise_squashing_compression_private_key,
+            single_cuda_noise_squashing_compression_key,
+            msg,
+            scalar_for_multiplication,
+            br_input_modulus_log,
+            streams,
+        );
+
+    (
+        before_compression
+            .into_iter()
+            .map(
+                |(input, after_dp, after_ks, after_drift, after_ms, after_pbs)| {
+                    (
+                        input
+                            .get_noise_if_decryption_was_correct()
+                            .expect("Decryption Failed"),
+                        after_dp
+                            .get_noise_if_decryption_was_correct()
+                            .expect("Decryption Failed"),
+                        after_ks
+                            .get_noise_if_decryption_was_correct()
+                            .expect("Decryption Failed"),
+                        after_drift
+                            .get_noise_if_decryption_was_correct()
+                            .expect("Decryption Failed"),
+                        after_ms
+                            .get_noise_if_decryption_was_correct()
+                            .expect("Decryption Failed"),
+                        after_pbs
+                            .get_noise_if_decryption_was_correct()
+                            .expect("Decryption Failed"),
+                    )
+                },
+            )
+            .collect(),
+        after_compression
+            .into_iter()
+            .map(|after_compression| {
+                after_compression
+                    .get_noise_if_decryption_was_correct()
+                    .expect("Decryption Failed")
+            })
+            .collect(),
+    )
+}
+
+fn noise_check_encrypt_dp_ks_standard_pbs128_packing_ks_noise_gpu(meta_params: MetaParameters) {
+    let (atomic_params, noise_squashing_params, noise_squashing_compression_params) = {
+        let meta_noise_squashing_params = meta_params.noise_squashing_parameters.unwrap();
+        (
+            meta_params.compute_parameters,
+            meta_noise_squashing_params.parameters,
+            meta_noise_squashing_params.compression_parameters.unwrap(),
+        )
+    };
+    let gpu_index = 0;
+    let streams = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
+
+    let block_params: ShortintParameterSet = atomic_params.into();
+    let cks = crate::integer::ClientKey::new(block_params);
+    let compressed_server_key = CompressedServerKey::new_radix_compressed_server_key(&cks);
+    let cuda_sks = CudaServerKey::decompress_from_cpu(&compressed_server_key, &streams);
+
+    let noise_squashing_private_key = NoiseSquashingPrivateKey::new(noise_squashing_params);
+    let compressed_noise_squashing_compression_key =
+        cks.new_compressed_noise_squashing_key(&noise_squashing_private_key);
+    let noise_squashing_key = compressed_noise_squashing_compression_key.decompress();
+    let cuda_noise_squashing_key =
+        compressed_noise_squashing_compression_key.decompress_to_cuda(&streams);
+    let noise_squashing_compression_private_key =
+        NoiseSquashingCompressionPrivateKey::new(noise_squashing_compression_params);
+    let noise_squashing_compression_key = noise_squashing_private_key
+        .new_noise_squashing_compression_key(&noise_squashing_compression_private_key);
+    let cuda_noise_squashing_compression_key =
+        CudaNoiseSquashingCompressionKey::from_noise_squashing_compression_key(
+            &noise_squashing_compression_key,
+            &streams,
+        );
+
+    let noise_simulation_ksk =
+        NoiseSimulationLweKeyswitchKey::new_from_atomic_pattern_parameters(atomic_params);
+    let noise_simulation_bsk =
+        NoiseSimulationLweFourierBsk::new_from_atomic_pattern_parameters(atomic_params);
+    let noise_simulation_modulus_switch_config =
+        NoiseSimulationModulusSwitchConfig::new_from_atomic_pattern_parameters(atomic_params);
+    let noise_simulation_bsk128 =
+        NoiseSimulationLweFourier128Bsk::new_from_parameters(atomic_params, noise_squashing_params);
+    let noise_simulation_packing_key =
+        NoiseSimulationLwePackingKeyswitchKey::new_from_noise_squashing_parameters(
+            noise_squashing_params,
+
+fn noise_check_encrypt_dp_ks_standard_pbs128_packing_ks_noise_gpu(meta_params: MetaParameters) {
+    let (atomic_params, noise_squashing_params, noise_squashing_compression_params) = {
+        let meta_noise_squashing_params = meta_params.noise_squashing_parameters.unwrap();
+        (
+            meta_params.compute_parameters,
+            meta_noise_squashing_params.parameters,
+            meta_noise_squashing_params.compression_parameters.unwrap(),
+        )
+    };
+    let gpu_index = 0;
+    let streams = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
+
+    let block_params: ShortintParameterSet = atomic_params.into();
+    let cks = crate::integer::ClientKey::new(block_params);
+    let compressed_server_key = CompressedServerKey::new_radix_compressed_server_key(&cks);
+    let cuda_sks = CudaServerKey::decompress_from_cpu(&compressed_server_key, &streams);
+
+    let noise_squashing_private_key = NoiseSquashingPrivateKey::new(noise_squashing_params);
+    let compressed_noise_squashing_compression_key =
+        cks.new_compressed_noise_squashing_key(&noise_squashing_private_key);
+    let noise_squashing_key = compressed_noise_squashing_compression_key.decompress();
+    let cuda_noise_squashing_key =
+        compressed_noise_squashing_compression_key.decompress_to_cuda(&streams);
+    let noise_squashing_compression_private_key =
+        NoiseSquashingCompressionPrivateKey::new(noise_squashing_compression_params);
+    let noise_squashing_compression_key = noise_squashing_private_key
+        .new_noise_squashing_compression_key(&noise_squashing_compression_private_key);
+    let cuda_noise_squashing_compression_key =
+        CudaNoiseSquashingCompressionKey::from_noise_squashing_compression_key(
+            &noise_squashing_compression_key,
+            &streams,
+        );
+
+    let noise_simulation_ksk =
+        NoiseSimulationLweKeyswitchKey::new_from_atomic_pattern_parameters(atomic_params);
+    let noise_simulation_bsk =
+        NoiseSimulationLweFourierBsk::new_from_atomic_pattern_parameters(atomic_params);
+    let noise_simulation_modulus_switch_config =
+        NoiseSimulationModulusSwitchConfig::new_from_atomic_pattern_parameters(atomic_params);
+    let noise_simulation_bsk128 =
+        NoiseSimulationLweFourier128Bsk::new_from_parameters(atomic_params, noise_squashing_params);
+    let noise_simulation_packing_key =
+        NoiseSimulationLwePackingKeyswitchKey::new_from_noise_squashing_parameters(
+            noise_squashing_params,
+            noise_squashing_compression_params,
+        );
+
+    assert!(noise_simulation_bsk.matches_actual_bsk_gpu(&cuda_sks.bootstrapping_key));
+
+    assert!(noise_simulation_bsk128
+        .matches_actual_shortint_noise_squashing_key(&noise_squashing_key.key));
+    assert!(noise_simulation_packing_key.matches_actual_pksk(
+        noise_squashing_compression_key
+            .key
+            .packing_key_switching_key()
+    ));
+
+    let br_input_modulus_log = noise_squashing_key.key.br_input_modulus_log();
+
+    let max_scalar_mul = cuda_sks.max_noise_level.get();
+
+    let noise_simulation_accumulator = NoiseSimulationGlwe::new(
+        noise_simulation_bsk128
+            .output_glwe_size()
+            .to_glwe_dimension(),
+        noise_simulation_bsk128.output_polynomial_size(),
+        Variance(0.0),
+        noise_simulation_bsk128.modulus(),
+    );
+
+    let (_before_packing_sim, after_packing_sim) = {
+        let noise_simulation = NoiseSimulationLwe::encrypt(&cks.key, 0);
+        dp_ks_any_ms_standard_pbs128_packing_ks(
+            vec![noise_simulation; cuda_noise_squashing_compression_key.lwe_per_glwe.0],
+            max_scalar_mul,
+            &noise_simulation_ksk,
+            noise_simulation_modulus_switch_config.as_ref(),
+            &noise_simulation_bsk128,
+            br_input_modulus_log,
+            &noise_simulation_accumulator,
+            &noise_simulation_packing_key,
+            &mut vec![(); cuda_noise_squashing_compression_key.lwe_per_glwe.0],
+        )
+    };
+
+    let after_packing_sim = after_packing_sim.into_lwe();
+
+    // Check that the circuit is correct with respect to the core implementation, i.e. does
+    // not crash on dimension checks
+    let (expected_lwe_dimension_out, expected_modulus_f64_out) = {
+        let pksk = noise_squashing_compression_key
+            .key
+            .packing_key_switching_key();
+
+        let out_glwe_dim = pksk.output_key_glwe_dimension();
+        let out_poly_size = pksk.output_key_polynomial_size();
+
+        (
+            out_glwe_dim.to_equivalent_lwe_dimension(out_poly_size),
+            pksk.ciphertext_modulus().raw_modulus_float(),
+        )
+    };
+
+    assert_eq!(
+        after_packing_sim.lwe_dimension(),
+        expected_lwe_dimension_out
+    );
+    assert_eq!(
+        after_packing_sim.modulus().as_f64(),
+        expected_modulus_f64_out
+    );
+
+    let cleartext_modulus = atomic_params.message_modulus().0 * atomic_params.carry_modulus().0;
+    let mut noise_samples_after_packing = vec![];
+
+    // Each helper call yields `lwe_per_glwe` packed samples, so round up to collect at
+    // least 1000 samples per message value.
+    let sample_count_per_msg =
+        1000usize.div_ceil(cuda_noise_squashing_compression_key.lwe_per_glwe.0);
+    let chunk_size = 4;
+    let vec_local_streams = (0..chunk_size)
+        .map(|_| CudaStreams::new_single_gpu(GpuIndex::new(gpu_index)))
+        .collect::<Vec<_>>();
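The sampling loop below fans work out in chunks of `chunk_size`, and each parallel task reuses one of the pre-built streams in round-robin fashion, so at most `chunk_size` helper calls run concurrently and no stream is shared by two tasks at once. A minimal, dependency-light sketch of that pattern (plain `String`s stand in for `CudaStreams`; only the `rayon` crate is assumed):

```rust
use rayon::prelude::*;

fn main() {
    let chunk_size = 4;
    // Stand-ins for the per-slot CUDA streams built once up front.
    let streams: Vec<String> = (0..chunk_size).map(|i| format!("stream-{i}")).collect();
    let results: Vec<String> = (0..10usize)
        .collect::<Vec<_>>()
        .chunks(chunk_size)
        .flat_map(|chunk| {
            chunk
                .par_iter()
                .map(|i| {
                    // Round-robin: task i always lands on the same stream slot.
                    let stream = &streams[*i % chunk_size];
                    format!("sample {i} drawn on {stream}")
                })
                .collect::<Vec<_>>()
        })
        .collect();
    assert_eq!(results.len(), 10);
}
```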
+    for _i in 0..cleartext_modulus {
+        let current_noise_samples_after_packing: Vec<_> = (0..sample_count_per_msg)
+            .collect::<Vec<_>>()
+            .chunks(chunk_size)
+            .flat_map(|chunk| {
+                chunk
+                    .into_par_iter()
+                    .map(|i| {
+                        let local_stream = &vec_local_streams[*i % chunk_size];
+                        let (_before_packing, after_packing) =
+                            encrypt_dp_ks_standard_pbs128_packing_ks_noise_helper_gpu(
+                                atomic_params,
+                                noise_squashing_params,
+                                noise_squashing_compression_params,
+                                &cks,
+                                &cuda_sks,
+                                &noise_squashing_private_key,
+                                &noise_squashing_key,
+                                &cuda_noise_squashing_key,
+                                &noise_squashing_compression_private_key,
+                                &cuda_noise_squashing_compression_key,
+                                0,
+                                max_scalar_mul,
+                                br_input_modulus_log,
+                                local_stream,
+                            );
+                        after_packing
+                    })
+                    .collect::<Vec<_>>()
+            })
+            .collect();
+
+        noise_samples_after_packing.extend(current_noise_samples_after_packing);
+    }
+
+    let noise_samples_after_packing_flattened: Vec<_> = noise_samples_after_packing
+        .into_iter()
+        .flatten()
+        .map(|x| x.value)
+        .collect();
+
+    let after_packing_is_ok = mean_and_variance_check(
+        &noise_samples_after_packing_flattened,
+        "after_packing",
+        0.0,
+        after_packing_sim.variance(),
+        noise_squashing_compression_params.packing_ks_key_noise_distribution,
+        after_packing_sim.lwe_dimension(),
+        after_packing_sim.modulus().as_f64(),
+    );
+
+    assert!(after_packing_is_ok);
+}
+
+create_gpu_parameterized_test!(
+    noise_check_encrypt_dp_ks_standard_pbs128_packing_ks_noise_gpu {
+        TEST_META_PARAM_CPU_2_2_KS_PBS_PKE_TO_SMALL_ZKV2_TUNIFORM_2M128,
+    }
+);
+
+create_gpu_parameterized_test!(sanity_check_encrypt_dp_ks_standard_pbs128_packing_ks_gpu {
+    TEST_META_PARAM_CPU_2_2_KS_PBS_PKE_TO_SMALL_ZKV2_TUNIFORM_2M128,
+});
+
+create_gpu_parameterized_test!(sanity_check_encrypt_dp_ks_standard_pbs128_gpu {
+    TEST_META_PARAM_CPU_2_2_KS_PBS_PKE_TO_SMALL_ZKV2_TUNIFORM_2M128,
+});
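`mean_and_variance_check` compares the empirical mean (expected to be 0.0 here) and variance of the flattened samples against the variance predicted by the noise simulation. A self-contained sketch of the underlying estimators (illustrative only, not the crate's implementation, which also accounts for the noise distribution and statistical tolerances):

```rust
// Population mean and variance of a slice of noise samples.
fn mean_and_variance(samples: &[f64]) -> (f64, f64) {
    let n = samples.len() as f64;
    let mean = samples.iter().sum::<f64>() / n;
    let variance = samples.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / n;
    (mean, variance)
}

fn main() {
    let (mean, variance) = mean_and_variance(&[0.1, -0.2, 0.05, 0.0]);
    println!("mean = {mean}, variance = {variance}");
}
```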
diff --git a/tfhe/src/integer/gpu/server_key/radix/tests_noise_distribution/mod.rs b/tfhe/src/integer/gpu/server_key/radix/tests_noise_distribution/mod.rs
index c371e95ea..6ae93d65e 100644
--- a/tfhe/src/integer/gpu/server_key/radix/tests_noise_distribution/mod.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/tests_noise_distribution/mod.rs
@@ -1,3 +1,5 @@
 pub mod br_dp_ks_ms;
+pub mod br_dp_packingks_ms;
 pub mod dp_ks_ms;
+pub mod dp_ks_pbs_128_packingks;
 pub mod utils;
diff --git a/tfhe/src/integer/gpu/server_key/radix/tests_noise_distribution/utils/noise_simulation.rs b/tfhe/src/integer/gpu/server_key/radix/tests_noise_distribution/utils/noise_simulation.rs
index 200559a3e..52295810e 100644
--- a/tfhe/src/integer/gpu/server_key/radix/tests_noise_distribution/utils/noise_simulation.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/tests_noise_distribution/utils/noise_simulation.rs
@@ -1,7 +1,7 @@
 use crate::core_crypto::commons::noise_formulas::noise_simulation::traits::{
     AllocateCenteredBinaryShiftedStandardModSwitchResult,
     AllocateDriftTechniqueStandardModSwitchResult, AllocateLweBootstrapResult,
-    AllocateLweKeyswitchResult, AllocateStandardModSwitchResult,
+    AllocateLweKeyswitchResult, AllocateLwePackingKeyswitchResult, AllocateStandardModSwitchResult,
     CenteredBinaryShiftedStandardModSwitch, DriftTechniqueStandardModSwitch,
     LweClassicFftBootstrap, LweKeyswitch, ScalarMul, StandardModSwitch,
 };
@@ -13,6 +13,7 @@ use crate::core_crypto::gpu::cuda_modulus_switch_ciphertext;
 use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
 use crate::core_crypto::gpu::lwe_bootstrap_key::CudaModulusSwitchNoiseReductionConfiguration;
 use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
+use crate::core_crypto::gpu::lwe_packing_keyswitch_key::CudaLwePackingKeyswitchKey;
 use crate::core_crypto::gpu::vec::CudaVec;
 use crate::core_crypto::prelude::*;
 use crate::integer::gpu::ciphertext::info::CudaBlockInfo;
@@ -25,7 +26,7 @@ use crate::integer::gpu::{
     cuda_centered_modulus_switch_64, unchecked_small_scalar_mul_integer_async, CudaStreams,
 };
 use crate::shortint::server_key::tests::noise_distribution::utils::noise_simulation::NoiseSimulationModulusSwitchConfig;
-
+use crate::shortint::server_key::tests::noise_distribution::utils::traits::LwePackingKeyswitch;
 /// Side resources for CUDA operations in noise simulation
 #[derive(Clone)]
 pub struct CudaSideResources {
@@ -128,6 +129,19 @@ impl CudaDynLwe {
         }
     }
 
+    pub fn as_ct_128_cpu(&self, streams: &CudaStreams) -> LweCiphertext<Vec<u128>> {
+        match self {
+            Self::U32(_) => panic!("Tried getting a u32 CudaLweCiphertextList as u128."),
+            Self::U64(_) => panic!("Tried getting a u64 CudaLweCiphertextList as u128."),
+            Self::U128(_cuda_lwe) => {
+                let cpu_lwe_list = self.as_lwe_128().to_lwe_ciphertext_list(streams);
+                LweCiphertext::from_container(
+                    cpu_lwe_list.clone().into_container(),
+                    cpu_lwe_list.ciphertext_modulus(),
+                )
+            }
+        }
+    }
     pub fn from_lwe_32(cuda_lwe: CudaLweCiphertextList<u32>) -> Self {
         Self::U32(cuda_lwe)
     }
@@ -141,6 +155,19 @@
     }
 }
 
+/// Converts a CudaGlweCiphertextList to a GlweCiphertext<Vec<Scalar>>
+pub fn cuda_glwe_list_to_glwe_ciphertext<Scalar: UnsignedInteger>(
+    cuda_glwe_list: &CudaGlweCiphertextList<Scalar>,
+    streams: &CudaStreams,
+) -> GlweCiphertext<Vec<Scalar>> {
+    let cpu_glwe_list = cuda_glwe_list.to_glwe_ciphertext_list(streams);
+    GlweCiphertext::from_container(
+        cpu_glwe_list.clone().into_container(),
+        cpu_glwe_list.polynomial_size(),
+        cpu_glwe_list.ciphertext_modulus(),
+    )
+}
+
 impl ScalarMul for CudaDynLwe {
     type Output = Self;
     type SideResources = CudaSideResources;
@@ -313,13 +340,14 @@ impl StandardModSwitch for CudaDynLwe {
                 panic!("U32 modulus switch not implemented for CudaDynLwe - only U64 is supported");
             }
             (Self::U64(input), Self::U64(output_cuda_lwe)) => {
-                let internal_output = input.duplicate(&side_resources.streams);
+                let mut internal_output = input.duplicate(&side_resources.streams);
                 cuda_modulus_switch_ciphertext(
-                    &mut output_cuda_lwe.0.d_vec,
+                    &mut internal_output.0.d_vec,
                     output_modulus_log.0 as u32,
                     &side_resources.streams,
                 );
                 let mut cpu_lwe = internal_output.to_lwe_ciphertext_list(&side_resources.streams);
+
                 let shift_to_map_to_native = u64::BITS - output_modulus_log.0 as u32;
                 for val in cpu_lwe.as_mut_view().into_container().iter_mut() {
                     *val <<= shift_to_map_to_native;
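Two things happen in the hunk above: the modulus switch is now applied to `internal_output` (a duplicate of the input) instead of mutating the output buffer directly, and the switched value, which lives modulo 2^`output_modulus_log`, is shifted back so it occupies the top bits of the native 64-bit torus. A standalone illustration of that re-alignment with made-up numbers:

```rust
fn main() {
    let log_modulus = 12u32;
    // A 12-bit value as produced by the modulus switch.
    let mod_switched: u64 = 0xABC;
    let shift_to_map_to_native = u64::BITS - log_modulus;
    // Map it to the most significant bits of the native u64 modulus.
    let native = mod_switched << shift_to_map_to_native;
    // The shift is exactly reversible, so no information is lost.
    assert_eq!(native >> shift_to_map_to_native, mod_switched);
    println!("{native:#018x}");
}
```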
@@ -713,3 +741,193 @@ impl AllocateLweBootstrapResult for CudaGlweCiphertextList<u128> {
         CudaDynLwe::U128(cuda_lwe)
     }
 }
+
+// Implement LweClassicFft128Bootstrap for CudaNoiseSquashingKey using the 128-bit PBS CUDA
+// function
+impl
+    crate::core_crypto::commons::noise_formulas::noise_simulation::traits::LweClassicFft128Bootstrap<
+        CudaDynLwe,
+        CudaDynLwe,
+        CudaGlweCiphertextList<u128>,
+    > for crate::integer::gpu::noise_squashing::keys::CudaNoiseSquashingKey
+{
+    type SideResources = CudaSideResources;
+
+    fn lwe_classic_fft_128_pbs(
+        &self,
+        input: &CudaDynLwe,
+        output: &mut CudaDynLwe,
+        accumulator: &CudaGlweCiphertextList<u128>,
+        side_resources: &mut Self::SideResources,
+    ) {
+        use crate::core_crypto::gpu::algorithms::lwe_programmable_bootstrapping::cuda_programmable_bootstrap_128_lwe_ciphertext_async;
+        use crate::integer::gpu::server_key::CudaBootstrappingKey;
+
+        match (input, output) {
+            (CudaDynLwe::U64(input_cuda_lwe), CudaDynLwe::U128(output_cuda_lwe)) => {
+                // Get the bootstrap key from self - it is already the u128 variant
+                let bsk = match &self.bootstrapping_key {
+                    CudaBootstrappingKey::Classic(d_bsk) => d_bsk,
+                    CudaBootstrappingKey::MultiBit(_) => {
+                        panic!("MultiBit bootstrapping keys are not supported for 128-bit PBS");
+                    }
+                };
+
+                unsafe {
+                    cuda_programmable_bootstrap_128_lwe_ciphertext_async(
+                        input_cuda_lwe,
+                        output_cuda_lwe,
+                        accumulator,
+                        bsk,
+                        &side_resources.streams,
+                    );
+                    side_resources.streams.synchronize();
+                }
+            }
+            _ => panic!("128-bit PBS expects U64 input and U128 output for CudaDynLwe"),
+        }
+    }
+}
+
+impl AllocateLwePackingKeyswitchResult for CudaLwePackingKeyswitchKey<u64> {
+    type Output = CudaGlweCiphertextList<u64>;
+    type SideResources = CudaSideResources;
+
+    fn allocate_lwe_packing_keyswitch_result(
+        &self,
+        side_resources: &mut Self::SideResources,
+    ) -> Self::Output {
+        let glwe_dimension = self.output_glwe_size().to_glwe_dimension();
+        let polynomial_size = self.output_polynomial_size();
+        let ciphertext_modulus = self.ciphertext_modulus();
+
+        CudaGlweCiphertextList::new(
+            glwe_dimension,
+            polynomial_size,
+            GlweCiphertextCount(1),
+            ciphertext_modulus,
+            &side_resources.streams,
+        )
+    }
+}
+
+impl LwePackingKeyswitch<[&CudaDynLwe], CudaGlweCiphertextList<u64>>
+    for CudaLwePackingKeyswitchKey<u64>
+{
+    type SideResources = CudaSideResources;
+
+    fn keyswitch_lwes_and_pack_in_glwe(
+        &self,
+        input: &[&CudaDynLwe],
+        output: &mut CudaGlweCiphertextList<u64>,
+        side_resources: &mut CudaSideResources,
+    ) {
+        use crate::core_crypto::gpu::algorithms::lwe_packing_keyswitch::cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext_64;
+        let input_lwe_ciphertext_list = CudaLweCiphertextList::from_vec_cuda_lwe_ciphertexts_list(
+            input.iter().map(|ciphertext| ciphertext.as_lwe_64()),
+            &side_resources.streams,
+        );
+
+        cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext_64(
+            self,
+            &input_lwe_ciphertext_list,
+            output,
+            &side_resources.streams,
+        );
+    }
+}
+
+// Implement StandardModSwitch traits for CudaGlweCiphertextList
+impl AllocateStandardModSwitchResult for CudaGlweCiphertextList<u64> {
+    type Output = Self;
+    type SideResources = CudaSideResources;
+
+    fn allocate_standard_mod_switch_result(
+        &self,
+        side_resources: &mut Self::SideResources,
+    ) -> Self::Output {
+        Self::new(
+            self.glwe_dimension(),
+            self.polynomial_size(),
+            self.glwe_ciphertext_count(),
+            self.ciphertext_modulus(),
+            &side_resources.streams,
+        )
+    }
+}
+
+impl StandardModSwitch for CudaGlweCiphertextList<u64> {
+    type SideResources = CudaSideResources;
+
+    fn standard_mod_switch(
+        &self,
+        storage_log_modulus: CiphertextModulusLog,
+        output: &mut Self,
+        side_resources: &mut CudaSideResources,
+    ) {
+        let mut internal_output = self.duplicate(&side_resources.streams);
+
+        cuda_modulus_switch_ciphertext(
+            &mut internal_output.0.d_vec,
+            storage_log_modulus.0 as u32,
+            &side_resources.streams,
+        );
+        side_resources.streams.synchronize();
+        let mut cpu_glwe = internal_output.to_glwe_ciphertext_list(&side_resources.streams);
+
+        let shift_to_map_to_native = u64::BITS - storage_log_modulus.0 as u32;
+        for val in cpu_glwe.as_mut_view().into_container().iter_mut() {
+            *val <<= shift_to_map_to_native;
+        }
+        let d_after_ms = Self::from_glwe_ciphertext_list(&cpu_glwe, &side_resources.streams);
+
+        *output = d_after_ms;
+    }
+}
+
+impl AllocateLwePackingKeyswitchResult for CudaLwePackingKeyswitchKey<u128> {
+    type Output = CudaGlweCiphertextList<u128>;
+    type SideResources = CudaSideResources;
+
+    fn allocate_lwe_packing_keyswitch_result(
+        &self,
+        side_resources: &mut Self::SideResources,
+    ) -> Self::Output {
+        let glwe_dimension = self.output_glwe_size().to_glwe_dimension();
+        let polynomial_size = self.output_polynomial_size();
+        let ciphertext_modulus = self.ciphertext_modulus();
+
+        CudaGlweCiphertextList::new(
+            glwe_dimension,
+            polynomial_size,
+            GlweCiphertextCount(1),
+            ciphertext_modulus,
+            &side_resources.streams,
+        )
+    }
+}
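The 64-bit impl above and the 128-bit impl below realize the same operation for the two ciphertext widths used before and after noise squashing: n input LWE ciphertexts are keyswitched and packed into a single GLWE whose first n polynomial coefficients carry their messages. A no-crypto sketch of that packing layout:

```rust
// Conceptual layout only: real packing keyswitching operates on encrypted data.
fn main() {
    let polynomial_size = 8usize;
    let messages = [3u64, 1, 4]; // one message per input LWE ciphertext
    let mut glwe_body = vec![0u64; polynomial_size];
    // Coefficient i of the output GLWE receives the message of input LWE i.
    for (slot, &message) in glwe_body.iter_mut().zip(messages.iter()) {
        *slot = message;
    }
    assert_eq!(&glwe_body[..3], &[3, 1, 4]);
    println!("{glwe_body:?}");
}
```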
+
+impl LwePackingKeyswitch<[&CudaDynLwe], CudaGlweCiphertextList<u128>>
+    for CudaLwePackingKeyswitchKey<u128>
+{
+    type SideResources = CudaSideResources;
+
+    fn keyswitch_lwes_and_pack_in_glwe(
+        &self,
+        input: &[&CudaDynLwe],
+        output: &mut CudaGlweCiphertextList<u128>,
+        side_resources: &mut CudaSideResources,
+    ) {
+        use crate::core_crypto::gpu::algorithms::lwe_packing_keyswitch::cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext_128;
+        let input_lwe_ciphertext_list = CudaLweCiphertextList::from_vec_cuda_lwe_ciphertexts_list(
+            input.iter().map(|ciphertext| ciphertext.as_lwe_128()),
+            &side_resources.streams,
+        );
+
+        cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext_128(
+            self,
+            &input_lwe_ciphertext_list,
+            output,
+            &side_resources.streams,
+        );
+    }
+}
diff --git a/tfhe/src/shortint/server_key/tests/noise_distribution/dp_ks_pbs128_packingks.rs b/tfhe/src/shortint/server_key/tests/noise_distribution/dp_ks_pbs128_packingks.rs
index a8ee0a2fe..42e619518 100644
--- a/tfhe/src/shortint/server_key/tests/noise_distribution/dp_ks_pbs128_packingks.rs
+++ b/tfhe/src/shortint/server_key/tests/noise_distribution/dp_ks_pbs128_packingks.rs
@@ -27,7 +27,7 @@ use crate::shortint::server_key::ServerKey;
 use rayon::prelude::*;
 
 #[allow(clippy::too_many_arguments)]
-fn dp_ks_any_ms_standard_pbs128<
+pub fn dp_ks_any_ms_standard_pbs128<
     InputCt,
     ScalarMulResult,
     KsResult,
@@ -111,7 +111,7 @@ where
 
 #[allow(clippy::too_many_arguments)]
 #[allow(clippy::type_complexity)]
-fn dp_ks_any_ms_standard_pbs128_packing_ks<
+pub fn dp_ks_any_ms_standard_pbs128_packing_ks<
     InputCt,
     ScalarMulResult,
     KsResult,